# Stage 6

## Running the experiments

In the previous stage, our team selected `Sammon's error` function and `Genetic algorithm` and `Simulated annealing` for the experiments, so we wil use them here.

### 1. Loading and splitting the data

In [88]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [89]:
# Loading the data
df = pd.read_csv('combined-metrics.csv')

In [90]:
X = df.drop('Name', axis=1).values
y = df['Name'].values

In [91]:
print(X.shape)

(101, 35)


In [92]:
X

array([[5.42625369e+00, 3.77802360e+01, 2.97935103e+01, ...,
        1.81400000e+03, 1.58500000e+04, 0.00000000e+00],
       [2.02732240e+00, 9.52868852e+00, 6.37568306e+00, ...,
        9.22000000e+02, 2.64500000e+03, 0.00000000e+00],
       [1.67987805e+00, 1.17048780e+01, 8.17073171e+00, ...,
        2.79100000e+03, 1.14790000e+04, 0.00000000e+00],
       ...,
       [4.29198473e+00, 2.41488550e+01, 1.87232824e+01, ...,
        3.82000000e+02, 6.43600000e+03, 0.00000000e+00],
       [9.15789474e+00, 3.77017544e+01, 2.96842105e+01, ...,
        1.78000000e+02, 1.38800000e+03, 0.00000000e+00],
       [2.24528302e+00, 1.04371069e+01, 7.09119497e+00, ...,
        8.01000000e+02, 2.98200000e+03, 0.00000000e+00]])

In [93]:
y

array(['ART', 'BentoML', 'Bokeh', 'Camel', 'CatBoost', 'Causal ML',
       'Chainer', 'Computer Vision', 'D2L', 'Darts', 'Dash', 'DeepChem',
       'DeepMind Control', 'DeepPavlov', 'Detectron2', 'DIGITS',
       'DragGan', 'EasyOCR', 'ELI5', 'EvalAI', 'facenet', 'FaceSwap',
       'Fairseq', 'FastAI', 'FeatureTools', 'FiftyOne', 'gensim',
       'Giskard', 'Gluonts', 'Google Flax', 'Google JAX', 'GPT-Engineer',
       'GPTDiscord', 'Gradio', 'Gymnasium', 'Horovod', 'ImageAI',
       'imbalanced-learn', 'InsightFace', 'Kaolin', 'Kedro', 'Keras',
       'Kserve', 'Lightning', 'Ludwig', 'Mage-ai', 'Mars', 'Matplotlib',
       'metatransformer', 'Mindsdb', 'MLflow', 'Mycroft',
       'Neural Prophet', 'NNI', 'Numpy', 'ONNX', 'Open-Assistant',
       'OpenAI Baselines', 'OpenAI Python API library', 'OpenVINO',
       'Optuna', 'Paddle', 'Pandas', 'Pocker', 'Pybrain', 'PyCaret',
       'pycm', 'PyMC', 'Pyro', 'PyTensor', 'PyTorch',
       'Pytorch image models', 'qlib', 'Rasa', 'Ray', 'Reco

In [94]:
# Splitting the data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)

(80, 35)
(21, 35)


### 2. Determining the minimal subset of metrics on the training data

Sammon's error function that we will use as an objective

In [95]:
import numpy as np

# Sammon's error function
def sammons_error(X, Y):
    """
    X: Original high-dimensional data
    Y: Low-dimensional representation
    """
    # Calculate pairwise distances in X and Y
    dist_orig = np.sqrt(np.sum((X[:, np.newaxis, :] - X[np.newaxis, :, :]) ** 2, axis=-1))
    dist_lowd = np.sqrt(np.sum((Y[:, np.newaxis, :] - Y[np.newaxis, :, :]) ** 2, axis=-1))

    # Avoid division by zero
    epsilon = 1e-12

    numerator = np.sum(((dist_orig - dist_lowd) * 2) / (dist_orig + epsilon))
    denominator = np.sum(dist_orig)

    error = numerator / denominator

    return error

#### Genetic algorithm

In [96]:
def fitness_genetic(individual, X):
    # Create a subset of features based on the individual
    subset_X = X[:, np.array(individual).astype(bool)]

    # Calculate the fitness using the `sammons_error` function
    fitness = -sammons_error(X, subset_X)
    return fitness,

In [97]:
from deap import creator, base, tools, algorithms
import random
from sklearn.decomposition import PCA

# Set up the genetic algorithm
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)


toolbox = base.Toolbox()


# Functions to limit the number of features in the individual to 10
def create_individual():
    individual = [random.randint(0, 1) for _ in range(X.shape[1])]
    while sum(individual) > 10:
        indices = np.random.choice(np.arange(X.shape[1]), size=sum(individual) - 10, replace=False)
        for idx in indices:
            individual[idx] = 0
    return creator.Individual(individual)


def mate(ind1, ind2):
    child1, child2 = [toolbox.clone(ind) for ind in (ind1, ind2)]
    tools.cxTwoPoint(child1, child2)

    # Limit the number of features in the child to 10
    while sum(child1) > 10:
        indices = np.random.choice(np.arange(X.shape[1]), size=sum(child1) - 10, replace=False)
        for idx in indices:
            child1[idx] = 0
    while sum(child2) > 10:
        indices = np.random.choice(np.arange(X.shape[1]), size=sum(child2) - 10, replace=False)
        for idx in indices:
            child2[idx] = 0
    return child1, child2


def mutate(individual):
    child = toolbox.clone(individual)
    tools.mutFlipBit(child, indpb=0.05)
    while sum(child) > 10:
        indices = np.random.choice(np.arange(X.shape[1]), size=sum(child) - 10, replace=False)
        for idx in indices:
            child[idx] = 0
    return child,


toolbox.register("individual", create_individual)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", fitness_genetic, X=X_train)
toolbox.register("mate", mate)
toolbox.register("mutate", mutate)
toolbox.register("select", tools.selTournament, tournsize=3)



In [98]:
import numpy as np

# Run the genetic algorithm
population = toolbox.population(n=100)
hof = tools.HallOfFame(1)
stats = tools.Statistics(lambda ind: ind.fitness.values)
stats.register("avg", np.mean)
stats.register("std", np.std)
stats.register("min", np.min)
stats.register("max", np.max)

population, log = algorithms.eaSimple(population, toolbox, cxpb=0.5, mutpb=0.2, ngen=100, stats=stats, halloffame=hof, verbose=True)

gen	nevals	avg         	std        	min         	max         
0  	100   	-4.31971e-06	1.99471e-06	-7.11684e-06	-6.47558e-07
1  	67    	-2.53719e-06	1.7305e-06 	-6.44468e-06	-5.76285e-07
2  	62    	-1.43723e-06	1.17744e-06	-7.14823e-06	-5.75688e-07


3  	59    	-8.88344e-07	2.45486e-07	-1.6428e-06 	-1.42149e-07
4  	49    	-6.81458e-07	1.98324e-07	-1.12059e-06	-1.41985e-07
5  	63    	-8.03043e-07	8.90059e-07	-4.47984e-06	-1.41493e-07
6  	67    	-6.50613e-07	8.31126e-07	-5.57951e-06	-1.18825e-07
7  	62    	-5.13197e-07	7.99811e-07	-4.34602e-06	-4.73123e-08
8  	63    	-3.02316e-07	5.47433e-07	-4.10646e-06	-4.73123e-08
9  	57    	-3.14486e-07	7.02476e-07	-4.33258e-06	-4.73123e-08
10 	60    	-2.41189e-07	5.07083e-07	-3.96707e-06	-4.73123e-08
11 	62    	-3.53432e-07	8.67722e-07	-4.34514e-06	-2.80246e-08
12 	59    	-1.2649e-07 	3.32131e-07	-3.19841e-06	-1.33527e-08
13 	70    	-1.62408e-07	5.48405e-07	-3.3177e-06 	-1.33527e-08
14 	58    	-1.28638e-07	5.25719e-07	-3.92922e-06	-1.28161e-08
15 	55    	-2.92484e-07	8.53752e-07	-4.08706e-06	-1.28161e-08
16 	63    	-1.601e-07  	6.09567e-07	-3.89812e-06	-1.22859e-08
17 	64    	-2.27e-07   	7.51497e-07	-3.34765e-06	-1.22859e-08
18 	63    	-1.429e-07  	5.46898e-07	-3.27538e-06	-1.22859e-08
19 	67  

In [99]:
# Get the best individual (subset of metrics)
best_individual = hof.items[0]
best_subset_genetic = [df.columns[i+1] for i, bit in enumerate(best_individual) if bit]

print("Best subset of metrics:", best_subset_genetic)
print("Best fitness:", fitness_genetic(best_individual, X_train))
print("Length of best subset:", len(best_subset_genetic))

Best fitness: (-3.2787695694992544e-09,)
Length of best subset: 10


#### Simulated annealing

In [353]:
import math

def fitness_annealing(X, subset_X):
    return -sammons_error(X, subset_X)
    
def generate_random_subset(X, best_subset):
    # Generate a random subset of features based on the best subset
    subset = np.array(best_subset, dtype=int) if best_subset is not None else np.zeros(X.shape[1], dtype=int)

    # Randomly select features to add or remove
    for i in range(X.shape[1]):
        if random.random() < 0.1:
            subset[i] = 1 - subset[i]

    return subset

def annealing(X):
    # Simulated annealing parameters
    t = 1e-9 # Initial temperature
    t_min = 1e-100 # Minimum temperature
    cooling_factor = 0.9999 # Temperature damping factor
    max_iterations = 10000 # Maximum number of iterations
    best_subset = generate_random_subset(X, None)
    best_score = fitness_annealing(X, X[:,best_subset==1])
    tried_subsets = set()
    
    i = 0
    while i < max_iterations:
        # Generate a new random subset
        subset = generate_random_subset(X, best_subset)

        # Limit the number of features in the subset to 10
        if sum(subset) > 10:
            continue

        # Skip if subset already tried
        subset_hash = hash(str(subset))

        if subset_hash in tried_subsets:
            continue

        tried_subsets.add(subset_hash)

        score = fitness_annealing(X, X[:,subset==1]) # Evaluate subset

        if score > best_score:
            # Accept new better state unconditionally
            best_subset = subset
            best_score = score

        elif random.random() < math.exp((score - best_score) / t):
            # Accept new worse state with probability
            print("Accepting worse state with probability:", math.exp((score - best_score) / t))
            best_subset = subset
            best_score = score

        # Cool temperature
        t = t * cooling_factor

        # Check if cooled enough
        if t < t_min:
            break

        i += 1

    return best_subset, best_score

best_subset, best_score = annealing(X_train)


Accepting worse state with probability: 0.998559124585052
Accepting worse state with probability: 0.9999856554000673
Accepting worse state with probability: 0.9955725283756776
Accepting worse state with probability: 0.9982212176324723
Accepting worse state with probability: 0.6404976324095744
Accepting worse state with probability: 1.0
Accepting worse state with probability: 0.9969576433847761
Accepting worse state with probability: 0.9909566525721489
Accepting worse state with probability: 0.5819519985342155
Accepting worse state with probability: 0.9999935506320803
Accepting worse state with probability: 0.892947663942996
Accepting worse state with probability: 0.9999999394955846
Accepting worse state with probability: 1.0
Accepting worse state with probability: 0.9960664660226065
Accepting worse state with probability: 0.890981964796782
Accepting worse state with probability: 0.8403951452912932
Accepting worse state with probability: 0.9209622706479704
Accepting worse state with pro

In [357]:
print("Best subset:", best_subset)
print("Best sammon's error result:", -best_score)
print(sammons_error(X_train, X_train[:, best_subset==1]))

Best subset: [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 1 1 0 1 0 0 1 1 1 0]
Best sammon's error result: 3.2787695694992544e-09
3.2787695694992544e-09


In [358]:
best_subset_annealing = [df.columns[i+1] for i, bit in enumerate(best_subset) if bit]
print(best_subset_annealing)
print(len(best_subset_annealing))

10


In [359]:
print("Genetic algorithm result: ", best_subset_genetic)
print("Simulated annealing result:", best_subset_annealing)

X_genetic = X_train[:, np.array(best_individual).astype(bool)]
X_annealing = X_train[:, np.array(best_subset).astype(bool)]

X_genetic_test = X_test[:, np.array(best_individual).astype(bool)]
X_annealing_test = X_test[:, np.array(best_subset).astype(bool)]

print("Genetic training score:", sammons_error(X_train, X_genetic))
print("Annealing training score:", sammons_error(X_train, X_annealing))

print("Genetic validation score:", sammons_error(X_test, X_genetic_test))
print("Annealing validation score:", sammons_error(X_test, X_annealing_test))

Genetic training score: 3.2787695694992544e-09
Annealing training score: 3.2787695694992544e-09
Genetic validation score: 1.3740122716132385e-08
Annealing validation score: 1.3740122716132385e-08
