# Stage 6

## Running the experiments

In the previous stage, our team selected `Sammon's error` function and `Genetic algorithm` and `Simulated annealing` for the experiments, so we wil use them here.

### 1. Loading the data

In [438]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [439]:
# Loading the data
df = pd.read_csv('combined-metrics.csv')

In [440]:
X = df.drop('Name', axis=1).values
y = df['Name'].values

In [441]:
print(X.shape)

(101, 38)


In [442]:
X

array([[5.42625369e+00, 3.77802360e+01, 2.97935103e+01, ...,
        1.81400000e+03, 1.58500000e+04, 0.00000000e+00],
       [2.02732240e+00, 9.52868852e+00, 6.37568306e+00, ...,
        9.22000000e+02, 2.64500000e+03, 0.00000000e+00],
       [1.67987805e+00, 1.17048780e+01, 8.17073171e+00, ...,
        2.79100000e+03, 1.14790000e+04, 0.00000000e+00],
       ...,
       [4.29198473e+00, 2.41488550e+01, 1.87232824e+01, ...,
        3.82000000e+02, 6.43600000e+03, 0.00000000e+00],
       [9.15789474e+00, 3.77017544e+01, 2.96842105e+01, ...,
        1.78000000e+02, 1.38800000e+03, 0.00000000e+00],
       [2.24528302e+00, 1.04371069e+01, 7.09119497e+00, ...,
        8.01000000e+02, 2.98200000e+03, 0.00000000e+00]])

In [443]:
y

array(['ART', 'BentoML', 'Bokeh', 'Camel', 'CatBoost', 'Causal ML',
       'Chainer', 'Computer Vision', 'D2L', 'Darts', 'Dash', 'DeepChem',
       'DeepMind Control', 'DeepPavlov', 'Detectron2', 'DIGITS',
       'DragGan', 'EasyOCR', 'ELI5', 'EvalAI', 'facenet', 'FaceSwap',
       'Fairseq', 'FastAI', 'FeatureTools', 'FiftyOne', 'gensim',
       'Giskard', 'Gluonts', 'Google Flax', 'Google JAX', 'GPT-Engineer',
       'GPTDiscord', 'Gradio', 'Gymnasium', 'Horovod', 'ImageAI',
       'imbalanced-learn', 'InsightFace', 'Kaolin', 'Kedro', 'Keras',
       'Kserve', 'Lightning', 'Ludwig', 'Mage-ai', 'Mars', 'Matplotlib',
       'metatransformer', 'Mindsdb', 'MLflow', 'Mycroft',
       'Neural Prophet', 'NNI', 'Numpy', 'ONNX', 'Open-Assistant',
       'OpenAI Baselines', 'OpenAI Python API library', 'OpenVINO',
       'Optuna', 'Paddle', 'Pandas', 'Pocker', 'Pybrain', 'PyCaret',
       'pycm', 'PyMC', 'Pyro', 'PyTensor', 'PyTorch',
       'Pytorch image models', 'qlib', 'Rasa', 'Ray', 'Reco

### 2. Determining the minimal subset of metrics on the training data

Sammon's error function that we will use as an objective

In [444]:
import numpy as np

# Sammon's error function
def sammons_error(X, Y):
    """
    X: Original high-dimensional data
    Y: Low-dimensional representation
    """
    # Calculate pairwise distances in X and Y
    dist_orig = np.sqrt(np.sum((X[:, np.newaxis, :] - X[np.newaxis, :, :]) ** 2, axis=-1))
    dist_lowd = np.sqrt(np.sum((Y[:, np.newaxis, :] - Y[np.newaxis, :, :]) ** 2, axis=-1))

    numerator = np.sum((dist_orig - dist_lowd) ** 2)
    denominator = np.sum(dist_orig ** 2)

    error = numerator / denominator

    return error

#### Genetic algorithm

In [445]:
def fitness(individual, X):
    # Create a subset of features based on the individual
    subset_X = X[:, np.array(individual).astype(bool)]

    # Calculate the fitness using the `sammons_error` function
    fitness = -sammons_error(X, subset_X)
    return fitness,

In [446]:
from deap import creator, base, tools, algorithms
import random
from sklearn.decomposition import PCA

# Set up the genetic algorithm
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)


toolbox = base.Toolbox()


# Functions to limit the number of features in the individual to 25
def create_individual():
    individual = [random.randint(0, 1) for _ in range(X.shape[1])]
    while sum(individual) > 25:
        indices = np.random.choice(np.arange(X.shape[1]), size=sum(individual) - 25, replace=False)
        for idx in indices:
            individual[idx] = 0
    return creator.Individual(individual)


def mate(ind1, ind2):
    child1, child2 = [toolbox.clone(ind) for ind in (ind1, ind2)]
    child1_features = sum(child1)
    child2_features = sum(child2)
    tools.cxTwoPoint(child1, child2)
    if child1_features > 25:
        indices = np.random.choice(np.arange(X.shape[1]), size=child1_features - 25, replace=False)
        for idx in indices:
            child1[idx] = 0
    if child2_features > 25:
        indices = np.random.choice(np.arange(X.shape[1]), size=child2_features - 25, replace=False)
        for idx in indices:
            child2[idx] = 0
    return child1, child2


def mutate(individual):
    child = toolbox.clone(individual)
    tools.mutFlipBit(child, indpb=0.05)
    child_features = sum(child)
    if child_features > 25:
        indices = np.random.choice(np.arange(X.shape[1]), size=child_features - 25, replace=False)
        for idx in indices:
            child[idx] = 0
    return child,


toolbox.register("individual", create_individual)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", fitness, X=X)
toolbox.register("mate", mate)
toolbox.register("mutate", mutate)
toolbox.register("select", tools.selTournament, tournsize=3)



In [447]:
import numpy as np

# Run the genetic algorithm
population = toolbox.population(n=100)
hof = tools.HallOfFame(100)
stats = tools.Statistics(lambda ind: ind.fitness.values)
stats.register("avg", np.mean)
stats.register("std", np.std)
stats.register("min", np.min)
stats.register("max", np.max)

population, log = algorithms.eaSimple(population, toolbox, cxpb=0.5, mutpb=0.2, ngen=10000, stats=stats, halloffame=hof, verbose=True)

gen	nevals	avg     	std     	min      	max         
0  	100   	-0.32193	0.267947	-0.898907	-0.000185331


1  	69    	-0.092915	0.160186	-0.812856	-1.2795e-05 
2  	57    	-0.0101008	0.0483705	-0.456573	-7.78715e-06
3  	61    	-0.000393121	0.00154609	-0.015645	-5.26216e-08
4  	72    	-0.000885395	0.00496287	-0.0363452	-5.26216e-08
5  	55    	-0.00940687 	0.063988  	-0.457429 	-5.26214e-08
6  	53    	-0.00981331 	0.0634664 	-0.452561 	-5.26214e-08
7  	59    	-0.0145001  	0.0772321 	-0.452882 	-5.26214e-08
8  	60    	-0.0210918  	0.0941008 	-0.551989 	-5.26214e-08
9  	49    	-0.00974516 	0.0637068 	-0.454559 	-5.26214e-08
10 	61    	-0.0157015  	0.078031  	-0.45249  	-5.26214e-08
11 	68    	-0.0153459  	0.077189  	-0.453    	-1.31496e-11
12 	69    	-0.000390512	0.00352782	-0.0354758	-1.29856e-11
13 	56    	-0.0108278  	0.0636396 	-0.454559 	-1.31496e-11
14 	64    	-0.0104582  	0.0637009 	-0.455146 	-1.29856e-11
15 	63    	-0.015527   	0.0829194 	-0.5452   	-1.29856e-11
16 	57    	-0.0149713  	0.0828248 	-0.547849 	-1.29856e-11
17 	66    	-0.0143842  	0.0774146 	-0.457695 	-1.29856e-11
18 	64  

In [448]:
# Get the best individual (subset of metrics)
best_individual = hof.items[0]
best_subset = [df.columns[i+1] for i, bit in enumerate(best_individual) if bit]

print("Best subset of metrics:", best_subset)
print("Length of best subset:", len(best_subset))

Length of best subset: 30
