In [1]:
# Switch the working directory to the parent folder so that relative paths used
# further down (e.g. the CSV written under "tutorials/") resolve from there.
# NOTE(review): presumably this is also what makes the `imvc` package importable
# when the notebook is launched from a sub-directory -- confirm.
%cd ..

/home/alberto/PycharmProjects/incomplete_multiview_clustering


In [2]:
import pandas as pd
import numpy as np
from imvc.datasets import LoadDataset
from imvc.ampute import Amputer
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from tqdm.notebook import tqdm
from time import perf_counter
import itertools

In [3]:
from imvc.cluster import IMSR

In [4]:
def _root_mean_squared_error(y_true, y_pred):
    """Return the RMSE between two arrays, whatever scikit-learn version is installed.

    Recent scikit-learn releases provide ``root_mean_squared_error`` and no
    longer accept ``squared=False`` in ``mean_squared_error``; older releases
    only offer the latter. The import is done lazily so that it is attempted
    only when an RMSE is actually needed.
    """
    try:
        from sklearn.metrics import root_mean_squared_error
    except ImportError:
        # Older scikit-learn: fall back to the legacy keyword.
        return mean_squared_error(y_true=y_true, y_pred=y_pred, squared=False)
    return root_mean_squared_error(y_true=y_true, y_pred=y_pred)


def dataset_engine_comp(results_dict, Xs, y, n_clusters, estimator, engines, ps, n_times):
    """Benchmark one estimator across engines on a single dataset.

    For every proportion ``p`` in ``ps`` the data are amputed once (MCAR,
    fixed seed), then the estimator is fitted ``n_times`` per engine with
    ``random_state`` set to the run index. Timing, AMI and ARI are stored per
    run. When the estimator exposes ``embedding_``, pairwise RMSE / MAE between
    the embeddings of all successful runs are stored in run ``0`` of each
    engine, and in run ``0`` of an extra ``"both"`` entry that pools the
    embeddings of every engine.

    Parameters
    ----------
    results_dict : dict
        Filled in place as ``results_dict[percentage][engine][run] -> metrics``
        and returned.
    Xs : object
        Data handed to ``Amputer.fit_transform``.
    y : array-like
        Ground-truth labels used for AMI / ARI.
    n_clusters : int
        Forwarded to ``estimator.set_params``.
    estimator : object
        Must provide ``set_params(n_clusters=, engine=, random_state=)`` and
        ``fit_predict``; ``embedding_`` is optional.
    engines : sequence of str
        Engine names to compare.
    ps : iterable of float
        Proportions passed to ``Amputer`` as ``p``.
    n_times : int
        Number of runs per engine.

    Returns
    -------
    dict
        The same ``results_dict`` object. A run whose fit raised is kept as an
        empty dict (the exception message is printed).
    """
    engines = list(engines)  # accept tuples and other sequences as well as lists
    for p in ps:
        # round() rather than plain int(): e.g. 0.29 * 100 == 28.999999999999996.
        missing_percentage = int(round(p * 100))
        p_results = results_dict[missing_percentage] = {}
        matrices_comp = {}
        # Reset for every p so the flag cannot leak from a previous iteration
        # or be undefined when every fit fails.
        has_embeddings = False
        amputed_Xs = Amputer(p=p, mechanism="mcar", random_state=42).fit_transform(Xs)
        for engine in engines:
            engine_results = p_results[engine] = {}
            matrices_comp[engine] = []
            for i in range(n_times):
                run_results = engine_results[i] = {}
                estimator.set_params(n_clusters=n_clusters, engine=engine, random_state=i)
                start_time = perf_counter()
                try:
                    labels = estimator.fit_predict(amputed_Xs)
                except Exception as ex:
                    # Best effort: report the failure and leave this run empty.
                    print(ex)
                    continue
                run_results["Computing time"] = perf_counter() - start_time
                run_results["AMI"] = adjusted_mutual_info_score(labels_true=y, labels_pred=labels)
                run_results["ARI"] = adjusted_rand_score(labels_true=y, labels_pred=labels)
                try:
                    matrices_comp[engine].append(estimator.embedding_)
                except AttributeError:
                    # Estimator does not expose an embedding: nothing to compare.
                    pass
                else:
                    has_embeddings = True

        if not has_embeddings:
            continue

        p_results["both"] = {0: {}}
        for engine in engines + ["both"]:
            if engine == "both":
                mats = [mat for engine_mats in matrices_comp.values() for mat in engine_mats]
            else:
                mats = matrices_comp[engine]
            # combinations() already yields each unordered pair once, in a
            # deterministic order.
            pairs = list(itertools.combinations(mats, 2))
            # setdefault keeps this safe when n_times == 0 (no run 0 exists).
            summary = p_results[engine].setdefault(0, {})
            summary["RMSE"] = [_root_mean_squared_error(mat1, mat2) for mat1, mat2 in pairs]
            summary["MAE"] = [mean_absolute_error(y_true=mat1, y_pred=mat2) for mat1, mat2 in pairs]
    return results_dict

In [5]:
def engine_comp(datasets, estimator, engines, ps, n_times):
    """Run ``dataset_engine_comp`` on each dataset and gather the outcomes.

    A dataset identifier is split on ``"_"``: the first part is the name given
    to ``LoadDataset.load_dataset`` and the second selects the label column.
    Identifiers containing the token ``"simulated"`` are kept whole, and
    identifiers without a second part use the label column ``"0"``.

    Returns a dict mapping each dataset identifier to the results dict
    produced by ``dataset_engine_comp``.
    """
    all_results = {}
    for dataset in tqdm(datasets):
        parts = dataset.split("_")
        if "simulated" in parts:
            # Simulated datasets carry underscores in their own name.
            parts = ["_".join(parts)]

        if len(parts) > 1:
            x_name, y_name = parts
        else:
            x_name, y_name = parts[0], "0"

        Xs, targets = LoadDataset.load_dataset(dataset_name=x_name, return_y=True)
        target = targets[y_name]

        all_results[dataset] = dataset_engine_comp(
            results_dict={},
            Xs=Xs,
            y=target,
            n_clusters=int(target.nunique()),  # one cluster per distinct label
            estimator=estimator,
            engines=engines,
            ps=ps,
            n_times=n_times,
        )

    return all_results

In [6]:
# Compare the "python" and "matlab" engines of IMSR on five datasets, for
# amputation proportions generated by np.arange(0., 0.7, 0.2), with 50 runs
# per engine and proportion.
results = engine_comp(
    datasets=["nutrimouse_genotype", "buaa", "bdgp", "bbcsport", "sensIT300"],
    estimator=IMSR(),
    engines=["python", "matlab"],
    ps=np.arange(0., 0.7, 0.2),
    n_times=50,
)

  0%|          | 0/5 [00:00<?, ?it/s]

ARPACK error -1: No convergence (901 iterations, 8/10 eigenvectors converged)
ARPACK error -1: No convergence (901 iterations, 8/10 eigenvectors converged)
ARPACK error -1: No convergence (901 iterations, 8/10 eigenvectors converged)
ARPACK error -1: No convergence (901 iterations, 8/10 eigenvectors converged)
ARPACK error -1: No convergence (901 iterations, 8/10 eigenvectors converged)
ARPACK error -1: No convergence (901 iterations, 8/10 eigenvectors converged)
ARPACK error -1: No convergence (901 iterations, 8/10 eigenvectors converged)
ARPACK error -1: No convergence (901 iterations, 8/10 eigenvectors converged)
ARPACK error -1: No convergence (901 iterations, 8/10 eigenvectors converged)
ARPACK error -1: No convergence (901 iterations, 8/10 eigenvectors converged)
ARPACK error -1: No convergence (901 iterations, 8/10 eigenvectors converged)
ARPACK error -1: No convergence (901 iterations, 8/10 eigenvectors converged)
ARPACK error -1: No convergence (901 iterations, 7/10 eigenvecto

In [7]:
# Flatten the nested results (dataset -> percentage -> engine -> iteration)
# into one record per iteration so they can be loaded into a DataFrame.
flattened_data = [
    {
        'Dataset': dataset,
        # The backslash is part of the column name; it is escaped explicitly
        # because "\%" is an invalid escape sequence in a Python string.
        'Incomplete samples (\\%)': p,
        'Engine': engine,
        'Iteration': i,
        **iter_dict
    }
    for dataset, dataset_dict in results.items()
    for p, p_dict in dataset_dict.items()
    for engine, engine_dict in p_dict.items()
    for i, iter_dict in engine_dict.items()
]
# NOTE(review): `results` is rebound from the nested dict to a DataFrame, so
# this cell cannot be re-run without re-running the benchmark cell first.
results = pd.DataFrame(flattened_data)
# NOTE(review): the estimator used above is IMSR; "ISMR" looks like a typo but
# is kept because it determines the output file name -- confirm before renaming.
estimator_name = "ISMR"
# Path is relative to the current working directory.
results.to_csv(f"tutorials/engine_comparison_{estimator_name}.csv", index=False)
print("results", results.shape)
results.head()

results (2020, 9)


Unnamed: 0,Dataset,Incomplete samples (\%),Engine,Iteration,Computing time,AMI,ARI,RMSE,MAE
0,nutrimouse_genotype,0,python,0,0.09423,0.120905,0.031056,"[6.54252339334932e-16, 0.3162277660168379, 0.1...","[5.173921187318076e-16, 0.2590347212058302, 0...."
1,nutrimouse_genotype,0,python,1,0.093705,0.120905,0.031056,,
2,nutrimouse_genotype,0,python,2,0.080905,0.120905,0.031056,,
3,nutrimouse_genotype,0,python,3,0.073203,0.120905,0.031056,,
4,nutrimouse_genotype,0,python,4,0.085842,0.120905,0.031056,,
