## Synthetic Data Experiment Setup

In [1]:
from src.clustering import *
from src.pca_analysis import *
from src.synthetic_data import *
from sklearn.metrics import adjusted_rand_score
from pathlib import Path
import os
import json
import random

def run_experiment(experiment_name: str, model_class: type, param_to_vary: str, param_values: list, default_params: dict) -> list:
    """Sweep one parameter of a clustering experiment on synthetic 1-D Gaussian mixtures.

    For every value in ``param_values`` the value is merged into a copy of ``default_params``,
    ``num_trials`` independent datasets are generated and clustered, and the adjusted Rand
    scores of the trials are averaged. The results are written to
    ``results/<experiment_name>.json`` and also returned.

    Args:
        experiment_name (str): Label of experiment; used as the name of the JSON results file
        model_class (type): Clustering model class (KMeans or GaussianMixture) - NOT AN INSTANTIATION but a reference to the actual class.
            It is called as ``model_class(n_clusters=...)`` and must provide ``fit`` and ``predict``
        param_to_vary (str): Key of the parameter to sweep
        param_values (list): Values to set said parameter to
        default_params (dict): Fixed values for all other parameters. After merging in the swept
            parameter it must contain ``num_sources``, ``num_points``, ``sigma`` (a number, or the
            string ``"random"``), ``delta_mu`` and ``num_trials``. ``num_clusters`` is optional;
            when missing or ``None`` the model is given as many clusters as there are sources

    Returns:
        list: One ``{"parameters": ..., "avg_score": ...}`` dict per swept value, in sweep order

    Raises:
        KeyError: If a required parameter is missing
        ValueError: If ``num_trials`` is less than 1
    """
    scores = []
    for value in param_values:
        # Copy so the caller's defaults are never mutated by the sweep
        params = default_params.copy()
        params[param_to_vary] = value
        num_sources = params['num_sources']
        num_points = params['num_points']
        sigma = params['sigma']
        delta_mu = params['delta_mu']
        num_trials = params['num_trials']
        if num_trials < 1:
            # Fail early with a clear message instead of a ZeroDivisionError after the loop
            raise ValueError(f"num_trials must be at least 1, got {num_trials!r}")
        # The number of clusters the model guesses from is the same as the number of sources
        # the data is sampled from unless otherwise specified
        num_clusters = params.get('num_clusters')
        if num_clusters is None:
            num_clusters = num_sources
        # Source i is centred at i * delta_mu, so neighbouring means are delta_mu apart
        gaussian_means = [delta_mu * i for i in range(num_sources)]

        total_score = 0
        for _ in range(num_trials):
            if sigma == "random":
                # A fresh standard deviation in [0.75, 2.0) for every source on every trial
                std_devs = [0.75 + 1.25 * random.random() for _ in range(num_sources)]
            else:
                std_devs = [sigma] * num_sources
            # Sample points from the gaussians, and keep track of the true gaussian distribution each point came from
            X, y_true = generate_mixture(num_points_per_source=num_points, means=gaussian_means, stds=std_devs)
            # Models are fitted on a single-feature column: shape (n_samples, 1)
            X = X.reshape((X.shape[0], 1))
            y_true = y_true.flatten()
            # NOTE(review): assumes the project's model classes take ``n_clusters``;
            # scikit-learn's own GaussianMixture names this argument ``n_components`` - confirm
            model = model_class(n_clusters=num_clusters)
            model.fit(X)
            y_pred = model.predict(X).flatten()
            total_score += adjusted_rand_score(labels_true=y_true, labels_pred=y_pred)

        scores.append({
            "parameters": params,
            "avg_score": total_score / num_trials
        })

    results_path = Path(f"results/{experiment_name}.json")
    results_path.parent.mkdir(parents=True, exist_ok=True)
    with open(results_path, 'w', encoding='utf-8') as f:
        json.dump(scores, f, indent=4)
    return scores

## Experiment Runner

In [None]:
# Number of trials averaged per configuration; shared by every experiment in this cell
num_trials = 50

# Experiment 1: sweep the number of Gaussian sources.
# Fixed: 1000 points per source, standard deviation 1, neighbouring means 2 apart.
# num_clusters=None makes run_experiment use one cluster per source.
num_sources_list = [3, 5, 10]
num_points, sigma, delta_mu, num_clusters = 1000, 1, 2, None

# "num_sources" is left out on purpose: it is the parameter being varied
params_default = {
    "num_clusters": num_clusters,
    "num_points": num_points,
    "sigma": sigma,
    "delta_mu": delta_mu,
    "num_trials": num_trials,
}

run_experiment(
    "vary_num_sources_kmeans",
    model_class=KMeans,
    param_to_vary="num_sources",
    param_values=num_sources_list,
    default_params=params_default,
)
run_experiment(
    "vary_num_sources_mixed_gaussian",
    model_class=GaussianMixture,
    param_to_vary="num_sources",
    param_values=num_sources_list,
    default_params=params_default,
)


# Experiment 2: sweep the number of points sampled per source.
# Fixed: 5 sources, standard deviation 1, neighbouring means 2 apart, one cluster per source.
num_points_list = [100, 1000, 5000]
num_sources, sigma, delta_mu, num_clusters = 5, 1, 2, None

# "num_points" is left out on purpose: it is the parameter being varied
params_default = {
    "num_clusters": num_clusters,
    "num_sources": num_sources,
    "sigma": sigma,
    "delta_mu": delta_mu,
    "num_trials": num_trials,
}

run_experiment(
    "vary_num_points_kmeans",
    model_class=KMeans,
    param_to_vary="num_points",
    param_values=num_points_list,
    default_params=params_default,
)
run_experiment(
    "vary_num_points_mixed_gaussian",
    model_class=GaussianMixture,
    param_to_vary="num_points",
    param_values=num_points_list,
    default_params=params_default,
)

# Experiment 3: sweep the standard deviation shared by all sources.
# Fixed: 5 sources, 1000 points per source, neighbouring means 2 apart, one cluster per source.
sigma_list = [1, 2, 3]
num_sources, num_points, delta_mu, num_clusters = 5, 1000, 2, None

# "sigma" is left out on purpose: it is the parameter being varied
params_default = {
    "num_clusters": num_clusters,
    "num_sources": num_sources,
    "num_points": num_points,
    "delta_mu": delta_mu,
    "num_trials": num_trials,
}

run_experiment(
    "vary_sigma_kmeans",
    model_class=KMeans,
    param_to_vary="sigma",
    param_values=sigma_list,
    default_params=params_default,
)
run_experiment(
    "vary_sigma_mixed_gaussian",
    model_class=GaussianMixture,
    param_to_vary="sigma",
    param_values=sigma_list,
    default_params=params_default,
)

# Experiment 4: per-source random standard deviations.
# The single value "random" tells run_experiment to draw a fresh standard deviation
# for every source on every trial instead of using one fixed number.
# Fixed: 5 sources, 1000 points per source, neighbouring means 2 apart, one cluster per source.
sigma_list = ["random"]
num_sources, num_points, delta_mu, num_clusters = 5, 1000, 2, None

# "sigma" is left out on purpose: it is supplied through the sweep
params_default = {
    "num_clusters": num_clusters,
    "num_sources": num_sources,
    "num_points": num_points,
    "delta_mu": delta_mu,
    "num_trials": num_trials,
}

run_experiment(
    "random_sigma_kmeans",
    model_class=KMeans,
    param_to_vary="sigma",
    param_values=sigma_list,
    default_params=params_default,
)
run_experiment(
    "random_sigma_mixed_gaussian",
    model_class=GaussianMixture,
    param_to_vary="sigma",
    param_values=sigma_list,
    default_params=params_default,
)

# Experiment 5: sweep the spacing between neighbouring source means.
# Fixed: 3 sources, 1000 points per source, standard deviation 1, one cluster per source.
delta_mu_list = [0.5, 1, 2, 4]
num_sources, num_points, sigma, num_clusters = 3, 1000, 1, None

# "delta_mu" is left out on purpose: it is the parameter being varied
params_default = {
    "num_clusters": num_clusters,
    "num_sources": num_sources,
    "num_points": num_points,
    "sigma": sigma,
    "num_trials": num_trials,
}

run_experiment(
    "vary_delta_mu_kmeans",
    model_class=KMeans,
    param_to_vary="delta_mu",
    param_values=delta_mu_list,
    default_params=params_default,
)
run_experiment(
    "vary_delta_mu_mixed_gaussian",
    model_class=GaussianMixture,
    param_to_vary="delta_mu",
    param_values=delta_mu_list,
    default_params=params_default,
)

# Experiment 6: sweep the number of clusters the model is asked to fit,
# below, at and above the true number of sources.
# Fixed: 3 sources, 1000 points per source, standard deviation 1, neighbouring means 2 apart.
num_clusters_list = [2, 3, 6, 8]
num_sources, num_points, delta_mu, sigma = 3, 1000, 2, 1

# "num_clusters" is left out on purpose: it is the parameter being varied
params_default = {
    "sigma": sigma,
    "num_sources": num_sources,
    "num_points": num_points,
    "delta_mu": delta_mu,
    "num_trials": num_trials,
}

run_experiment(
    "vary_num_clusters_kmeans",
    model_class=KMeans,
    param_to_vary="num_clusters",
    param_values=num_clusters_list,
    default_params=params_default,
)
run_experiment(
    "vary_num_clusters_mixed_gaussian",
    model_class=GaussianMixture,
    param_to_vary="num_clusters",
    param_values=num_clusters_list,
    default_params=params_default,
)

########################################################################################################################################################################################

# Reached only after every run_experiment call above has returned
print("All experiments done!")