# Stage 6

## Running the experiments

In the previous stage, our team selected `Sammon's error` function and `Genetic algorithm` and `Simulated annealing` for the experiments, so we wil use them here.

### 1. Loading and splitting the data

In [57]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [58]:
# Loading the data
df = pd.read_csv('combined-metrics.csv')

In [59]:
X = df.drop('Name', axis=1).values
y = df['Name'].values

In [60]:
print(X.shape)

(101, 35)


In [61]:
X

array([[5.42625369e+00, 3.77802360e+01, 2.97935103e+01, ...,
        1.81400000e+03, 1.58500000e+04, 0.00000000e+00],
       [2.02732240e+00, 9.52868852e+00, 6.37568306e+00, ...,
        9.22000000e+02, 2.64500000e+03, 0.00000000e+00],
       [1.67987805e+00, 1.17048780e+01, 8.17073171e+00, ...,
        2.79100000e+03, 1.14790000e+04, 0.00000000e+00],
       ...,
       [4.29198473e+00, 2.41488550e+01, 1.87232824e+01, ...,
        3.82000000e+02, 6.43600000e+03, 0.00000000e+00],
       [9.15789474e+00, 3.77017544e+01, 2.96842105e+01, ...,
        1.78000000e+02, 1.38800000e+03, 0.00000000e+00],
       [2.24528302e+00, 1.04371069e+01, 7.09119497e+00, ...,
        8.01000000e+02, 2.98200000e+03, 0.00000000e+00]])

In [62]:
y

array(['ART', 'BentoML', 'Bokeh', 'Camel', 'CatBoost', 'Causal ML',
       'Chainer', 'Computer Vision', 'D2L', 'Darts', 'Dash', 'DeepChem',
       'DeepMind Control', 'DeepPavlov', 'Detectron2', 'DIGITS',
       'DragGan', 'EasyOCR', 'ELI5', 'EvalAI', 'facenet', 'FaceSwap',
       'Fairseq', 'FastAI', 'FeatureTools', 'FiftyOne', 'gensim',
       'Giskard', 'Gluonts', 'Google Flax', 'Google JAX', 'GPT-Engineer',
       'GPTDiscord', 'Gradio', 'Gymnasium', 'Horovod', 'ImageAI',
       'imbalanced-learn', 'InsightFace', 'Kaolin', 'Kedro', 'Keras',
       'Kserve', 'Lightning', 'Ludwig', 'Mage-ai', 'Mars', 'Matplotlib',
       'metatransformer', 'Mindsdb', 'MLflow', 'Mycroft',
       'Neural Prophet', 'NNI', 'Numpy', 'ONNX', 'Open-Assistant',
       'OpenAI Baselines', 'OpenAI Python API library', 'OpenVINO',
       'Optuna', 'Paddle', 'Pandas', 'Pocker', 'Pybrain', 'PyCaret',
       'pycm', 'PyMC', 'Pyro', 'PyTensor', 'PyTorch',
       'Pytorch image models', 'qlib', 'Rasa', 'Ray', 'Reco

In [63]:
# Splitting the data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)

(80, 35)
(21, 35)


### 2. Determining the minimal subset of metrics on the training data

Sammon's error function that we will use as an objective

In [64]:
import numpy as np

# Sammon's error function
def sammons_error(X, Y):
    """
    X: Original high-dimensional data
    Y: Low-dimensional representation
    """
    # Calculate pairwise distances in X and Y
    dist_orig = np.sqrt(np.sum((X[:, np.newaxis, :] - X[np.newaxis, :, :]) ** 2, axis=-1))
    dist_lowd = np.sqrt(np.sum((Y[:, np.newaxis, :] - Y[np.newaxis, :, :]) ** 2, axis=-1))

    # Avoid division by zero
    epsilon = 1e-12

    numerator = np.sum(((dist_orig - dist_lowd) * 2) / (dist_orig + epsilon))
    denominator = np.sum(dist_orig)

    error = numerator / denominator

    return error

#### Genetic algorithm

In [65]:
def fitness_genetic(individual, X):
    # Create a subset of features based on the individual
    subset_X = X[:, np.array(individual).astype(bool)]

    # Calculate the fitness using the `sammons_error` function
    fitness = -sammons_error(X, subset_X)
    return fitness,

In [66]:
from deap import creator, base, tools, algorithms
import random
from sklearn.decomposition import PCA

# Set up the genetic algorithm
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)


toolbox = base.Toolbox()


# Functions to limit the number of features in the individual to 10
def create_individual():
    individual = [random.randint(0, 1) for _ in range(X.shape[1])]
    while sum(individual) > 10:
        indices = np.random.choice(np.arange(X.shape[1]), size=sum(individual) - 10, replace=False)
        for idx in indices:
            individual[idx] = 0
    return creator.Individual(individual)


def mate(ind1, ind2):
    child1, child2 = [toolbox.clone(ind) for ind in (ind1, ind2)]
    tools.cxTwoPoint(child1, child2)

    # Limit the number of features in the child to 10
    while sum(child1) > 10:
        indices = np.random.choice(np.arange(X.shape[1]), size=sum(child1) - 10, replace=False)
        for idx in indices:
            child1[idx] = 0
    while sum(child2) > 10:
        indices = np.random.choice(np.arange(X.shape[1]), size=sum(child2) - 10, replace=False)
        for idx in indices:
            child2[idx] = 0
    return child1, child2


def mutate(individual):
    child = toolbox.clone(individual)
    tools.mutFlipBit(child, indpb=0.05)
    while sum(child) > 10:
        indices = np.random.choice(np.arange(X.shape[1]), size=sum(child) - 10, replace=False)
        for idx in indices:
            child[idx] = 0
    return child,


toolbox.register("individual", create_individual)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", fitness_genetic, X=X_train)
toolbox.register("mate", mate)
toolbox.register("mutate", mutate)
toolbox.register("select", tools.selTournament, tournsize=3)



In [67]:
import numpy as np

# Run the genetic algorithm
population = toolbox.population(n=100)
hof = tools.HallOfFame(100)
stats = tools.Statistics(lambda ind: ind.fitness.values)
stats.register("avg", np.mean)
stats.register("std", np.std)
stats.register("min", np.min)
stats.register("max", np.max)

population, log = algorithms.eaSimple(population, toolbox, cxpb=0.5, mutpb=0.2, ngen=100, stats=stats, halloffame=hof, verbose=True)

gen	nevals	avg         	std        	min         	max         
0  	100   	-4.41726e-06	2.12633e-06	-7.12144e-06	-2.59199e-07


1  	57    	-3.08783e-06	2.08875e-06	-6.89482e-06	-2.59199e-07
2  	67    	-1.62987e-06	1.59125e-06	-6.45965e-06	-2.47664e-07
3  	62    	-8.37881e-07	8.2284e-07 	-4.39582e-06	-2.13384e-07
4  	53    	-6.77807e-07	7.94533e-07	-4.58439e-06	-1.20072e-07
5  	66    	-5.19481e-07	7.09249e-07	-3.87091e-06	-1.20075e-07
6  	59    	-4.60476e-07	7.95388e-07	-4.58411e-06	-8.50291e-08
7  	55    	-4.06907e-07	7.11527e-07	-3.51324e-06	-8.50291e-08
8  	69    	-4.20278e-07	7.97879e-07	-4.8137e-06 	-6.22203e-08
9  	56    	-2.88706e-07	5.82562e-07	-4.22824e-06	-8.43835e-08
10 	63    	-2.21304e-07	5.09802e-07	-4.03802e-06	-8.43835e-08
11 	54    	-2.90028e-07	7.29204e-07	-4.03802e-06	-8.43835e-08
12 	55    	-2.03239e-07	4.92641e-07	-3.72519e-06	-8.43835e-08
13 	54    	-2.42596e-07	6.52346e-07	-4.24775e-06	-8.43716e-08
14 	61    	-2.87025e-07	7.76521e-07	-4.0366e-06 	-2.7398e-08 
15 	53    	-2.02567e-07	5.90093e-07	-4.03639e-06	-2.7398e-08 
16 	64    	-1.70478e-07	3.65089e-07	-3.36972e-06	-6.13184e-08
17 	61  

In [68]:
# Get the best individual (subset of metrics)
best_individual = hof.items[0]
best_subset_genetic = [df.columns[i+1] for i, bit in enumerate(best_individual) if bit]

print("Best subset of metrics:", best_subset_genetic)
print("Best fitness:", fitness_genetic(best_individual, X_train))
print("Length of best subset:", len(best_subset_genetic))

Best fitness: (-3.2787695694992544e-09,)
Length of best subset: 10


#### Simulated annealing

In [69]:
import math

def fitness_annealing(X, subset_X):
    return -sammons_error(X, subset_X)

def accept_probability(curr_score, best_score, temperature):
    if curr_score > best_score:
        return 1.0
    else:
        return math.exp((curr_score - best_score) / temperature)
    
def generate_random_subset(X):
    # Generate new random subset
    subset = np.random.randint(2, size=(35,)) # 0 or 1 masks

    # Limit the number of features in the subset to 10
    while sum(subset) > 10:
        indices = np.random.choice(np.arange(X.shape[1]), size=sum(subset) - 10, replace=False)
        for idx in indices:
            subset[idx] = 0

    return subset

def annealing(X):
    # Simulated annealing parameters
    t = 10000 # Initial temperature
    t_min = 1e-3 # Minimum temperature
    cooling_factor = 0.9990 # Temperature damping factor
    max_iterations = 10000 # Maximum number of iterations
    best_subset = None
    best_score = float("-inf")
    
    for _ in range(max_iterations):

        subset = generate_random_subset(X) # Generate random subset    

        score = fitness_annealing(X, X[:,subset==1]) # Evaluate subset

        # Update best, higher is better
        if score > best_score or random.random() < accept_probability(score, best_score, t):
            best_subset = subset
            best_score = score

        # Cool temperature
        t = t * cooling_factor

        # Check if cooled enough
        if t < t_min:
            break

    return best_subset, best_score

best_subset, best_score = annealing(X_train)

print("Best subset:", best_subset)
print("Best sammon's error result:", -best_score)

Best subset: [0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 1 1 0 0 1 0 0 0 0 0 0 0 1 1 0 0 1]
Best sammon's error result: 1.2091227099838297e-06


In [70]:
best_subset_annealing = [df.columns[i+1] for i, bit in enumerate(best_subset) if bit]
print(best_subset_annealing)
print(len(best_subset_annealing))

10


In [72]:
print("Genetic algorithm result: ", best_subset_genetic)
print("Simulated annealing result:", best_subset_annealing)

X_genetic = X_train[:, np.array(best_individual).astype(bool)]
X_annealing = X_train[:, np.array(best_subset).astype(bool)]

X_genetic_test = X_test[:, np.array(best_individual).astype(bool)]
X_annealing_test = X_test[:, np.array(best_subset).astype(bool)]

print("Genetic training score:", sammons_error(X_train, X_genetic))
print("Annealing training score:", sammons_error(X_train, X_annealing))

print("Genetic validation score:", sammons_error(X_test, X_genetic_test))
print("Annealing validation score:", sammons_error(X_test, X_annealing_test))

Genetic training score: 3.2787695694992544e-09
Annealing training score: 1.2091227099838297e-06
Genetic validation score: 1.3740122716132385e-08
Annealing validation score: 1.0553328848265607e-06
