In [228]:
import itertools
import os.path
import argparse

import numpy as np
from pandarallel import pandarallel
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer
from sklearn.cluster import KMeans
from mvlearn.decomposition import AJIVE, GroupPCA
from mvlearn.cluster import MultiviewSpectralClustering, MultiviewCoRegSpectralClustering
from imvc.datasets import LoadDataset
from imvc.transformers import MultiViewTransformer, ConcatenateViews
from imvc.algorithms import NMFC

from utils import Utils


folder_results = "results"
folder_subresults = "subresults"
filelame = "complete_algorithms_evaluation.csv"
file_path = os.path.join(folder_results, filelame)
subresults_path = os.path.join(folder_results, folder_subresults)
logs_file = os.path.join(folder_results, 'logs.txt')
error_file = os.path.join(folder_results, 'error.txt')

random_state = 42

# parser = argparse.ArgumentParser()
# parser.add_argument('-continue_benchmarking', default= False, action='store_true')
# parser.add_argument('-n_jobs', default= 1, type= int)
# args = parser.parse_args()

continue_benchmarking = False
n_jobs = 1
if n_jobs > 1:
    pandarallel.initialize(nb_workers= args.n_jobs)

datasets = [
    # "simulated_InterSIM",
    # "simulated_netMUG",
    "nutrimouse_genotype",
    "nutrimouse_diet",
    "bbcsport",
    "buaa",
    "metabric",
    "digits",
    "bdgp",
    "tcga",
    "caltech101",
    "nuswide",
]
amputation_mechanisms = ["EDM", 'MCAR', 'MAR', 'MNAR']
probs = np.arange(100, step= 10)
imputation = [True, False]
runs_per_alg = np.arange(10)
algorithms = {
    "Concat": {"alg": make_pipeline(ConcatenateViews(),
                                    StandardScaler().set_output(transform='pandas'),
                                    KMeans()), "params": {}},
    "NMFC": {"alg": make_pipeline(ConcatenateViews(),
                                  MinMaxScaler().set_output(transform='pandas'),
                                  NMFC().set_output(transform='pandas')), "params": {}},
    "MVSpectralClustering": {"alg": make_pipeline(MultiViewTransformer(StandardScaler().set_output(transform= "pandas")),
                                                  MultiviewSpectralClustering()),
                             "params": {}},
    "MVCoRegSpectralClustering": {"alg": make_pipeline(MultiViewTransformer(StandardScaler().set_output(transform= "pandas")),
                                                       MultiviewCoRegSpectralClustering()),
                                  "params": {}},
    "GroupPCA": {"alg": make_pipeline(MultiViewTransformer(StandardScaler()), GroupPCA(), StandardScaler(), KMeans()),
                 "params": {}},
    "AJIVE": {"alg": make_pipeline(MultiViewTransformer(StandardScaler()), AJIVE(), MultiViewTransformer(FunctionTransformer(pd.DataFrame)),
                                   ConcatenateViews(), StandardScaler(), KMeans()),
              "params": {}},
    "SNF": {},
    "intNMF": {},
    "jNMF": {},
}
indexes_results = {"dataset": datasets, "algorithm": list(algorithms.keys()), "missing_percentage": probs,
                   "amputation_mechanism": amputation_mechanisms, "imputation": imputation, "run_n": runs_per_alg}
indexes_names = list(indexes_results.keys())

if not continue_benchmarking:
    if not eval(input("Are you sure you want to start benchmarking and delete previous results? (True/False)")):
        raise Exception
    results = pd.DataFrame(datasets, columns= ["dataset"])
    for k,v in {k:v for k,v in indexes_results.items() if k != "dataset"}.items():
        results = results.merge(pd.Series(v, name= k), how= "cross")
    results.loc[(results["amputation_mechanism"] == "EDM") & (
                results["missing_percentage"] == 0), "amputation_mechanism"] = "'None'"
    results = results.set_index(indexes_names)

    idx_to_drop = results.xs(0, level="missing_percentage",
                             drop_level=False).xs(True, level="imputation", drop_level=False).index
    results = results.drop(idx_to_drop)
    for amputation_mechanism in amputation_mechanisms[1:]:
        idx_to_drop = results.xs(0, level="missing_percentage",
                                 drop_level=False).xs(amputation_mechanism, level="amputation_mechanism", drop_level=False).index
        results = results.drop(idx_to_drop)
    results_amputation_mechanism_none = results.xs(0, level="missing_percentage", drop_level=False)
    results_amputation_mechanism_none_tochange = results_amputation_mechanism_none.index.to_frame()
    results_amputation_mechanism_none_tochange["amputation_mechanism"] = "None"
    results.loc[results_amputation_mechanism_none.index].index = pd.MultiIndex.from_frame(results_amputation_mechanism_none_tochange)

    for amputation_mechanism, dataset in itertools.product(["MAR", "MNAR"], ["nutrimouse_genotype",
                                                                             "nutrimouse_diet",
                                                                             "metabric",
                                                                             "bdgp",
                                                                             "buaa",
                                                                             # "simulated_netMUG",
                                                                             ]):
        idx_to_drop = results.xs(dataset, level="dataset",
                                 drop_level=False).xs(amputation_mechanism, level="amputation_mechanism", drop_level=False).index
        results = results.drop(idx_to_drop)

    results[["finished", "completed"]] = False
    results.to_csv(file_path)
    raise

    shutil.rmtree(subresults_path, ignore_errors=True)
    os.mkdir(subresults_path)

    os.remove(logs_file) if os.path.exists(logs_file) else None
    os.remove(error_file) if os.path.exists(error_file) else None
    open(logs_file, 'w').close()
    open(error_file, 'w').close()
else:
    results = pd.read_csv(file_path, index_col= indexes_names)
    results = Utils.collect_subresults(results=results, subresults_path=subresults_path, indexes_names=indexes_names)

unfinished_results = results.loc[~results["finished"]]

for dataset_name in unfinished_results.index.get_level_values("dataset").unique():
    names = dataset_name.split("_")
    if "simulated" in names:
        names = ["_".join(names)]
    x_name,y_name = names if len(names) > 1 else (names[0], "0")
    Xs, y = LoadDataset.load_dataset(dataset_name=x_name, return_y=True, shuffle= False)
    y = y[y_name]
    n_clusters = y.nunique()
    unfinished_results_dataset = unfinished_results.loc[[dataset_name]]

    if n_jobs == 1:
        iterator = pd.DataFrame(unfinished_results_dataset.index.to_list(), columns=indexes_names)
        iterator.apply(lambda x: Utils.run_iteration(idx= x, results= results, Xs=Xs, y=y, n_clusters=n_clusters,
                                                     algorithms=algorithms, random_state=random_state, subresults_path=subresults_path,
                                                     logs_file=logs_file, error_file=error_file), axis= 1)
    else:
        try:
            unfinished_results_dataset_idx = unfinished_results_dataset.xs(0, level="missing_percentage", drop_level=False).index
            iterator = pd.DataFrame(unfinished_results_dataset_idx.to_list(), columns= indexes_names)
            iterator.parallel_apply(lambda x: Utils.run_iteration(idx= x, results= results, Xs=Xs, y=y,
                                                                  n_clusters=n_clusters,
                                                                  algorithms=algorithms,
                                                                  random_state=random_state,
                                                                  subresults_path=subresults_path,
                                                                  logs_file=logs_file,
                                                                  error_file=error_file), axis= 1)
            results = Utils.collect_subresults(results=results, subresults_path=subresults_path,
                                               indexes_names=indexes_names)
            results.to_csv(file_path)

            unfinished_results_dataset_idx = unfinished_results_dataset.drop(unfinished_results_dataset_idx).index
            iterator = pd.DataFrame(unfinished_results_dataset_idx.to_list(), columns=indexes_names)
        except KeyError:
            iterator = pd.DataFrame(unfinished_results_dataset.index.to_list(), columns=indexes_names)

        iterator.parallel_apply(lambda x: Utils.run_iteration(idx= x, results= results, Xs=Xs, y=y,
                                                                            n_clusters=n_clusters,
                                                                            algorithms=algorithms,
                                                                            random_state=random_state,
                                                                            subresults_path=subresults_path,
                                                                            logs_file=logs_file,
                                                                            error_file=error_file), axis= 1)
        results = Utils.collect_subresults(results=results, subresults_path=subresults_path,
                                           indexes_names=indexes_names)
        results.to_csv(file_path)


Are you sure you want to start benchmarking and delete previous results? (True/False) True


RuntimeError: No active exception to reraise

In [1]:
import pandas as pd
from imvc.datasets import LoadDataset
import numpy as np
Xs, y = LoadDataset.load_dataset(dataset_name="caltech101", return_y=True, shuffle= False)
Xs[0]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,38,39,40,41,42,43,44,45,46,47
0,15.044314,31.836497,40.997210,34.390896,7.736058,17.370114,25.391800,29.198195,5.377202,14.717360,...,0.037056,0.040868,0.011358,0.020355,0.028393,0.035676,0.013326,0.022278,0.029836,0.033345
1,28.504582,53.951531,62.548719,84.996720,22.503153,45.091515,59.365519,90.261557,23.175209,61.951504,...,0.061419,0.057223,0.023957,0.039699,0.045649,0.054297,0.021933,0.031987,0.038050,0.047752
2,15.947757,19.039989,7.196868,1.775596,8.391891,10.695923,5.011634,2.016594,6.809082,7.125768,...,0.012585,0.016084,0.007273,0.008067,0.008445,0.010746,0.007856,0.009487,0.007954,0.007093
3,13.135696,28.928518,38.080037,55.357058,7.541382,15.375125,19.820881,38.400375,7.608873,16.643876,...,0.042706,0.056762,0.014946,0.022783,0.030973,0.039278,0.014053,0.021040,0.027943,0.040719
4,9.598944,17.575873,23.974306,56.529306,7.932658,15.406384,20.424018,35.888609,8.962675,19.784140,...,0.053178,0.069009,0.019230,0.030706,0.040181,0.052220,0.013286,0.019051,0.024898,0.035943
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9139,16.226805,69.214218,90.441376,68.436279,15.313380,61.463430,77.947826,66.263331,14.241716,57.003396,...,0.048697,0.045058,0.017056,0.036708,0.048240,0.038688,0.017844,0.038576,0.051452,0.044039
9140,12.503711,32.980913,68.166927,79.869925,11.415075,29.113732,52.375020,78.490055,12.988374,31.037411,...,0.033886,0.054953,0.010437,0.018520,0.031220,0.050817,0.010087,0.018301,0.032092,0.049505
9141,4.972273,10.890188,31.764150,62.417002,4.454803,8.302686,20.663929,49.419153,2.767894,4.794864,...,0.011959,0.023671,0.006361,0.008392,0.011643,0.019725,0.007316,0.010598,0.019110,0.029684
9142,18.828697,47.957211,75.842385,147.081311,10.364175,38.001792,73.559660,125.925574,9.352314,34.360579,...,0.038313,0.059128,0.011211,0.024217,0.038212,0.055324,0.011959,0.026034,0.040628,0.064625


In [2]:
Xs[3]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983
0,0.261661,0.400000,0.400000,0.400000,0.343632,0.400000,0.329767,0.400000,0.167035,0.394578,...,0.158582,0.228006,0.187460,0.172032,0.139624,0.293360,0.187875,0.254355,0.223806,0.189381
1,0.014952,0.053008,0.138431,0.391251,0.352965,0.217056,0.023932,0.016740,0.004268,0.194551,...,0.226811,0.209077,0.000000,0.024079,0.108644,0.199408,0.093917,0.129881,0.140024,0.222356
2,0.000000,0.000000,0.102156,0.302656,0.292165,0.000000,0.000000,0.000000,0.000000,0.134023,...,0.000000,0.000000,0.000000,0.000000,0.000422,0.297752,0.359863,0.011706,0.000000,0.000000
3,0.168027,0.188040,0.072884,0.163923,0.400000,0.238917,0.020608,0.006927,0.124900,0.113634,...,0.210533,0.074725,0.250818,0.315082,0.323974,0.102916,0.000000,0.000000,0.000000,0.000000
4,0.328875,0.391070,0.275135,0.030786,0.029746,0.040101,0.048513,0.024524,0.032631,0.028272,...,0.282160,0.127711,0.187435,0.278200,0.214201,0.259537,0.265158,0.290671,0.330026,0.244864
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9139,0.039068,0.007367,0.084344,0.381240,0.390044,0.120257,0.016993,0.024952,0.014783,0.014419,...,0.245075,0.207252,0.001578,0.144414,0.054743,0.077117,0.084481,0.280723,0.212753,0.262370
9140,0.011986,0.291720,0.319846,0.328607,0.286253,0.143306,0.000000,0.000000,0.001630,0.086209,...,0.168624,0.024420,0.013202,0.103529,0.146153,0.116770,0.137495,0.121783,0.144657,0.000000
9141,0.149710,0.099214,0.203577,0.200623,0.079059,0.098057,0.151877,0.138999,0.136936,0.148251,...,0.293888,0.348856,0.300672,0.323372,0.332723,0.315752,0.340056,0.354810,0.340426,0.344751
9142,0.000000,0.000000,0.000070,0.000099,0.000000,0.000190,0.000465,0.000000,0.000595,0.033671,...,0.248481,0.222414,0.001434,0.072547,0.103934,0.063300,0.057856,0.129774,0.207614,0.186621


In [7]:
pd.read_csv("imvc/datasets/data/caltech101/upload/caltech101_3.csv")

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983
0,0.261661,0.400000,0.400000,0.400000,0.343632,0.400000,0.329767,0.400000,0.167035,0.394578,...,0.158582,0.228006,0.187460,0.172032,0.139624,0.293360,0.187875,0.254355,0.223806,0.189381
1,0.014952,0.053008,0.138431,0.391251,0.352965,0.217056,0.023932,0.016740,0.004268,0.194551,...,0.226811,0.209077,0.000000,0.024079,0.108644,0.199408,0.093917,0.129881,0.140024,0.222356
2,0.000000,0.000000,0.102156,0.302656,0.292165,0.000000,0.000000,0.000000,0.000000,0.134023,...,0.000000,0.000000,0.000000,0.000000,0.000422,0.297752,0.359863,0.011706,0.000000,0.000000
3,0.168027,0.188040,0.072884,0.163923,0.400000,0.238917,0.020608,0.006927,0.124900,0.113634,...,0.210533,0.074725,0.250818,0.315082,0.323974,0.102916,0.000000,0.000000,0.000000,0.000000
4,0.328875,0.391070,0.275135,0.030786,0.029746,0.040101,0.048513,0.024524,0.032631,0.028272,...,0.282160,0.127711,0.187435,0.278200,0.214201,0.259537,0.265158,0.290671,0.330026,0.244864
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9148,0.039068,0.007367,0.084344,0.381240,0.390044,0.120257,0.016993,0.024952,0.014783,0.014419,...,0.245075,0.207252,0.001578,0.144414,0.054743,0.077117,0.084481,0.280723,0.212753,0.262370
9149,0.011986,0.291720,0.319846,0.328607,0.286253,0.143306,0.000000,0.000000,0.001630,0.086209,...,0.168624,0.024420,0.013202,0.103529,0.146153,0.116770,0.137495,0.121783,0.144657,0.000000
9150,0.149710,0.099214,0.203577,0.200623,0.079059,0.098057,0.151877,0.138999,0.136936,0.148251,...,0.293888,0.348856,0.300672,0.323372,0.332723,0.315752,0.340056,0.354810,0.340426,0.344751
9151,0.000000,0.000000,0.000070,0.000099,0.000000,0.000190,0.000465,0.000000,0.000595,0.033671,...,0.248481,0.222414,0.001434,0.072547,0.103934,0.063300,0.057856,0.129774,0.207614,0.186621


In [245]:
Xs[3].iloc[:500].to_csv(f"imvc/datasets/data/caltech101/caltech101_3_{i}.csv", index= False)

In [6]:
for i, mini_X in enumerate(np.array_split(Xs[3], 10)):
    mini_X.to_csv(f"imvc/datasets/data/caltech101/upload/caltech101_3_{i}.csv", index= False)
for i, mini_X in enumerate(np.array_split(Xs[5], 10)):
    mini_X.to_csv(f"imvc/datasets/data/caltech101/upload/caltech101_5_{i}.csv", index= False)

In [205]:
results.xs(0, level= "missing_percentage", drop_level=False).xs(True, level= "imputation", drop_level=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,finished,completed,n_samples,n_incomplete_samples,n_complete_samples,time,n_clustered_samples,percentage_clustered_samples,comments,stratified,...,bal_acc,ami,ari,completeness,random_acc,random_f1,silhouette,vrc,db,dunn
dataset,algorithm,missing_percentage,imputation,run_n,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
simulated_InterSIM,Concat,0,True,0,False,False,,,,,,,,,...,,,,,,,,,,
simulated_InterSIM,Concat,0,True,1,False,False,,,,,,,,,...,,,,,,,,,,
simulated_InterSIM,Concat,0,True,2,False,False,,,,,,,,,...,,,,,,,,,,
simulated_InterSIM,Concat,0,True,3,False,False,,,,,,,,,...,,,,,,,,,,
simulated_InterSIM,Concat,0,True,4,False,False,,,,,,,,,...,,,,,,,,,,
simulated_InterSIM,Concat,0,True,5,False,False,,,,,,,,,...,,,,,,,,,,
simulated_InterSIM,Concat,0,True,6,False,False,,,,,,,,,...,,,,,,,,,,
simulated_InterSIM,Concat,0,True,7,False,False,,,,,,,,,...,,,,,,,,,,
simulated_InterSIM,Concat,0,True,8,False,False,,,,,,,,,...,,,,,,,,,,
simulated_InterSIM,Concat,0,True,9,False,False,,,,,,,,,...,,,,,,,,,,


In [109]:
unfinished_results = results.loc[~results["finished"]]
unfinished_results.drop(unfinished_results.loc[["bbcsport"]].xs(0, level=2, drop_level=False).index)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,finished,completed,n_samples,n_incomplete_samples,n_complete_samples,time,n_clustered_samples,percentage_clustered_samples,comments,stratified,...,MCC_artificial_performance,F1_artificial_performance,precision_artificial_performance,recall_artificial_performance,bal_acc_artificial_performance,ami_artificial_performance,ari_artificial_performance,completeness_artificial_performance,random_acc_artificial_performance,random_f1_artificial_performance
dataset,algorithm,missing_percentage,imputation,run_n,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
nutrimouse_diet,AJIVE,30,False,8,False,False,,,,,,,,,...,,,,,,,,,,
nutrimouse_diet,AJIVE,30,False,9,False,False,,,,,,,,,...,,,,,,,,,,
nutrimouse_diet,AJIVE,40,True,0,False,False,,,,,,,,,...,,,,,,,,,,
nutrimouse_diet,AJIVE,40,True,1,False,False,,,,,,,,,...,,,,,,,,,,
nutrimouse_diet,AJIVE,40,True,2,False,False,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
metabric,jNMF,90,False,5,False,False,,,,,,,,,...,,,,,,,,,,
metabric,jNMF,90,False,6,False,False,,,,,,,,,...,,,,,,,,,,
metabric,jNMF,90,False,7,False,False,,,,,,,,,...,,,,,,,,,,
metabric,jNMF,90,False,8,False,False,,,,,,,,,...,,,,,,,,,,


In [197]:
for i in os.listdir("imvc/datasets/data/buaa"):
    if "y.csv" in i:
        continue
    if ".csv" in i:
        pd.read_csv(os.path.join("imvc/datasets/data/buaa", i), index_col= 0).iloc[:90].to_csv(os.path.join("imvc/datasets/data/buaa", i), index= False)

In [33]:
import os
import pandas as pd
for i in os.listdir("imvc/datasets/data/"):
    if i == "metabric":
        continue
    print(i)
    for j in os.listdir(os.path.join("imvc/datasets/data/", i)):
        if os.path.isfile(os.path.join("imvc/datasets/data/", i, j)):
            a = pd.read_csv(os.path.join("imvc/datasets/data/", i, j))
            print(j, a.shape)
    print()

simulated_netMUG
simulated_netMUG_1.csv (1000, 2000)
simulated_netMUG_0.csv (1000, 2000)
simulated_netMUG_y.csv (1000, 1)

.ipynb_checkpoints

simulated_InterSIM
simulated_InterSIM_1.csv (500, 131)
simulated_InterSIM_y.csv (500, 1)
simulated_InterSIM_0.csv (500, 367)
simulated_InterSIM_2.csv (500, 160)

bdgp
bdgp_1.csv (2500, 79)
bdgp_y.csv (2500, 2)
metadata.json (0, 2)
bdgp_0.csv (2500, 1750)

tcga
tcga_2.csv (2437, 132)
tcga_3.csv (2437, 2044)
tcga_y.csv (2437, 2)
tcga_0.csv (2437, 216)
metadata.json (0, 14)
tcga_1.csv (2437, 16117)

nuswide
nuswide_4.csv (30000, 129)
nuswide_3.csv (30000, 74)
nuswide_y.csv (30000, 2)
nuswide_1.csv (30000, 226)
nuswide_0.csv (30000, 65)
nuswide_2.csv (30000, 145)

nutrimouse
nutrimouse_y.csv (40, 3)
nutrimouse_0.csv (40, 120)
nutrimouse_1.csv (40, 21)
metadata.json (0, 9)

digits
digits_5.csv (2000, 240)
digits_3.csv (2000, 47)
digits_4.csv (2000, 76)
digits_y.csv (2000, 2)
digits_0.csv (2000, 6)
digits_2.csv (2000, 216)
digits_1.csv (2000, 64)
meta

In [10]:
finished_results = results[results["finished"]]
finished_results.loc[["nutrimouse_genotype"]].xs("SNF", level=1, drop_level=False).xs(False, level=3, drop_level=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,finished,completed,n_samples,n_incomplete_samples,n_complete_samples,time,n_clustered_samples,percentage_clustered_samples,comments,stratified,...,MCC_artificial_performance,F1_artificial_performance,precision_artificial_performance,recall_artificial_performance,bal_acc_artificial_performance,ami_artificial_performance,ari_artificial_performance,completeness_artificial_performance,random_acc_artificial_performance,random_f1_artificial_performance
dataset,algorithm,missing_percentage,imputation,run_n,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
nutrimouse_genotype,SNF,0,False,0,True,True,40.0,0.0,40.0,0.038531,40.0,100.0,{},False,...,,,,,,,,,,
nutrimouse_genotype,SNF,0,False,1,True,True,40.0,0.0,40.0,0.034154,40.0,100.0,{},False,...,,,,,,,,,,
nutrimouse_genotype,SNF,0,False,2,True,True,40.0,0.0,40.0,0.42229,40.0,100.0,{},False,...,,,,,,,,,,
nutrimouse_genotype,SNF,0,False,3,True,True,40.0,0.0,40.0,0.659519,40.0,100.0,{},False,...,,,,,,,,,,
nutrimouse_genotype,SNF,0,False,4,True,True,40.0,0.0,40.0,0.591319,40.0,100.0,{},False,...,,,,,,,,,,
nutrimouse_genotype,SNF,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nutrimouse_genotype,SNF,90,False,5,True,False,,,,,,,{'ValueError: Percentiles must be in the range...,,...,,,,,,,,,,
nutrimouse_genotype,SNF,90,False,6,True,False,,,,,,,{'ValueError: Percentiles must be in the range...,,...,,,,,,,,,,
nutrimouse_genotype,SNF,90,False,7,True,False,,,,,,,{'ValueError: Percentiles must be in the range...,,...,,,,,,,,,,
nutrimouse_genotype,SNF,90,False,8,True,False,,,,,,,{'ValueError: Percentiles must be in the range...,,...,,,,,,,,,,


In [None]:
%tb

In [93]:
iterator = finished_results.loc[["nutrimouse_genotype"]].xs("SNF", level=1, drop_level=False).xs(False, level=3, drop_level=False).index

In [94]:
import time
from collections import defaultdict
from datetime import datetime
import numpy as np
import pandas as pd
from bignmf.models.jnmf.integrative import IntegrativeJnmf
from bignmf.models.jnmf.standard import StandardJnmf
from reval.utils import kuhn_munkres_algorithm
from sklearn import metrics
from sklearn.cluster import spectral_clustering
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from snf import compute
from validclust import dunn

from imvc.transformers import MultiViewTransformer, ConcatenateViews
from imvc.utils import DatasetUtils

for idx in iterator:
    row = results.loc[[idx]]
    row_index = row.index
    alg_name, impute, p, run_n = (
        row_index.get_level_values("algorithm")[0],
        row_index.get_level_values("imputation")[0],
        row_index.get_level_values("missing_percentage")[0] / 100,
        row_index.get_level_values("run_n")[0])

    print(row.drop(columns=row.columns).reset_index().to_dict(orient="records")[0], "\t", datetime.now())
    alg = algorithms[alg_name]
    train_Xs = DatasetUtils.shuffle_imvd(Xs=Xs, random_state=random_state + run_n)
    y_train = y.loc[train_Xs[0].index]
    errors_dict = defaultdict(int)
    if p != 0:
        try:
            assert n_clusters < len(train_Xs[0]) * (1-p)
        except AssertionError as exception:
            errors_dict[f"{type(exception).__name__}: {exception}; n_clusters < len(train_Xs[0]) * (1-p)"] += 1
            # print(errors_dict, row.drop(columns=row.columns).reset_index().to_dict(orient="records")[0])
            results.loc[idx, ["finished", "comments"]] = True, errors_dict
            results.to_csv(file_path)
            # return results.loc[idx]
        try:
            train_Xs = DatasetUtils.add_random_noise_to_views(Xs=train_Xs, p=round(p, 2),
                                                              random_state=random_state + run_n,
                                                              assess_percentage=True, stratify=y_train)
            strat = True
        except ValueError:
            try:
                train_Xs = DatasetUtils.add_random_noise_to_views(Xs=train_Xs, p=round(p, 2),
                                                                  random_state=random_state + run_n,
                                                                  assess_percentage=True)
                strat = False
            except Exception as exception:
                errors_dict[f"{type(exception).__name__}: {exception}"] += 1
                # print(errors_dict, row.drop(columns=row.columns).reset_index().to_dict(orient="records")[0])
                results.loc[idx, ["finished", "comments"]] = True, errors_dict
                results.to_csv(file_path)
                # return results.loc[idx]
    else:
        strat = False

    if impute:
        train_Xs = MultiViewTransformer(SimpleImputer(strategy="mean").set_output(transform="pandas")).fit_transform(
            train_Xs)
    else:
        train_Xs = DatasetUtils.select_complete_samples(Xs=train_Xs)
        y_train = y_train.loc[train_Xs[0].index]

    try:
        start_time = time.perf_counter()
        if alg_name == "SNF":
            preprocessing_step = MultiViewTransformer(StandardScaler().set_output(transform="pandas"))
            train_Xs = preprocessing_step.fit_transform(train_Xs)
            k_snf = np.ceil(len(y_train)/10).astype(int)
            print(k_snf)
            affinities = compute.make_affinity(train_Xs, normalize=False, K= k_snf)
            fused = compute.snf(affinities, K= k_snf)
            clusters = spectral_clustering(fused, n_clusters=n_clusters, random_state=random_state + run_n)
        elif alg_name == "intNMF":
            preprocessing_step = MultiViewTransformer(MinMaxScaler().set_output(transform="pandas"))
            train_Xs = preprocessing_step.fit_transform(train_Xs)
            model = IntegrativeJnmf({k: v for k, v in enumerate(train_Xs)}, k=n_clusters, lamb=0.1)
            model.run(trials=50, iterations=100, verbose=False)
            model.cluster_data()
            clusters = np.argmax(model.w_cluster, axis=1)
        elif alg_name == "jNMF":
            preprocessing_step = make_pipeline(MultiViewTransformer(MinMaxScaler().set_output(transform="pandas")))
            train_Xs = preprocessing_step.fit_transform(train_Xs)
            model = StandardJnmf({k: v for k, v in enumerate(train_Xs)}, k=n_clusters)
            model.run(trials=50, iterations=100, verbose=False)
            model.cluster_data()
            clusters = np.argmax(model.w_cluster, axis=1)
        else:
            model, params = alg["alg"], alg["params"]
            if alg_name == "GroupPCA":
                model[1].set_params(n_components=n_clusters, random_state=random_state + run_n, multiview_output=False)
            elif alg_name == "AJIVE":
                model[1].set_params(random_state=random_state + run_n)
            if alg_name == "NMFC":
                model[-1].set_params(n_components=n_clusters, random_state=random_state + run_n)
            else:
                model[-1].set_params(n_clusters=n_clusters, random_state=random_state + run_n)
            clusters = model.fit_predict(train_Xs)
    except ValueError as exception:
        if alg_name == "AJIVE" and len(y_train) < 5:
            errors_dict[f"{type(exception).__name__}: {exception}"] += 1
            # print(errors_dict, row.drop(columns=row.columns).reset_index().to_dict(orient="records")[0])
            results.loc[idx, ["finished", "comments"]] = True, errors_dict
            results.to_csv(file_path)
            # return results.loc[idx]
        if alg_name == "SNF" and len(y_train) < 17:
            errors_dict[f"{type(exception).__name__}: {exception}"] += 1
            # print(errors_dict, row.drop(columns=row.columns).reset_index().to_dict(orient="records")[0])
            results.loc[idx, ["finished", "comments"]] = True, errors_dict
            results.to_csv(file_path)
            # return results.loc[idx]
        if alg_name == "intNMF" and len(y_train) < 5:
            errors_dict[f"{type(exception).__name__}: {exception}"] += 1
            # print(errors_dict, row.drop(columns=row.columns).reset_index().to_dict(orient="records")[0])
            results.loc[idx, ["finished", "comments"]] = True, errors_dict
            results.to_csv(file_path)
            # return results.loc[idx]
        else:
            raise

    clusters = pd.Series(clusters, index=y_train.index)

    elapsed_time = time.perf_counter() - start_time

    if alg_name in ["NMFC"]:
        train_X = model.transform(train_Xs)
    elif alg_name in ["SNF"]:
        train_X = preprocessing_step.transform(train_Xs)
    elif alg_name in ["intNMF", "jNMF"]:
        train_X = model.w
    else:
        train_X = model[:-1].transform(train_Xs)
    if isinstance(train_X, list):
        train_X = ConcatenateViews().fit_transform(train_X)
    if not isinstance(train_X, pd.DataFrame):
        train_X = pd.DataFrame(train_X, index=y_train.index)

    assert train_X.index.equals(y_train.index)
    assert train_Xs[0].index.equals(y_train.index)

    if p > 0:
        best_solution = pd.MultiIndex.from_arrays(
            [[row_index.get_level_values(level=level)[0]] if level != "missing_percentage" else [0]
             for level in row_index.names], names=row_index.names)
        best_solution = results.loc[best_solution].iloc[0]
        y_train_total = pd.Series(best_solution["y_true"], index=best_solution["y_pred_idx"])
        best_solution = pd.Series(best_solution["y_pred"], index=best_solution["y_pred_idx"])
    else:
        best_solution = None
        y_train_total = None

    dict_results = Utils.save_record(train_Xs=train_Xs, train_X=train_X, clusters=clusters, y=y_train, p=p, y_true_total=y_train_total,
                               best_solution=best_solution, elapsed_time=elapsed_time, strat=strat,
                               random_state=random_state, errors_dict=errors_dict)


{'dataset': 'nutrimouse_genotype', 'algorithm': 'SNF', 'missing_percentage': 0, 'imputation': False, 'run_n': 0} 	 2023-12-05 14:40:29.658451
4
{'dataset': 'nutrimouse_genotype', 'algorithm': 'SNF', 'missing_percentage': 0, 'imputation': False, 'run_n': 1} 	 2023-12-05 14:40:29.841562
4
{'dataset': 'nutrimouse_genotype', 'algorithm': 'SNF', 'missing_percentage': 0, 'imputation': False, 'run_n': 2} 	 2023-12-05 14:40:30.179149
4
{'dataset': 'nutrimouse_genotype', 'algorithm': 'SNF', 'missing_percentage': 0, 'imputation': False, 'run_n': 3} 	 2023-12-05 14:40:30.319688
4
{'dataset': 'nutrimouse_genotype', 'algorithm': 'SNF', 'missing_percentage': 0, 'imputation': False, 'run_n': 4} 	 2023-12-05 14:40:30.516572
4
{'dataset': 'nutrimouse_genotype', 'algorithm': 'SNF', 'missing_percentage': 0, 'imputation': False, 'run_n': 5} 	 2023-12-05 14:40:30.680208
4
{'dataset': 'nutrimouse_genotype', 'algorithm': 'SNF', 'missing_percentage': 0, 'imputation': False, 'run_n': 6} 	 2023-12-05 14:40:30.9

In [85]:
train_Xs = DatasetUtils.shuffle_imvd(Xs=Xs, random_state=random_state + run_n)
y_train = y.loc[train_Xs[0].index]
train_Xs = DatasetUtils.add_random_noise_to_views(Xs=train_Xs, p=round(p, 2),
                                                              random_state=random_state + run_n,
                                                              assess_percentage=True, stratify=y_train)
train_Xs = DatasetUtils.select_complete_samples(Xs=train_Xs)
y_train = y_train.loc[train_Xs[0].index]

In [86]:
preprocessing_step = MultiViewTransformer(StandardScaler().set_output(transform="pandas"))
train_Xs = preprocessing_step.fit_transform(train_Xs)
k_snf = np.ceil(len(y_train)/10).astype(int)
aff = compute.make_affinity(train_Xs, normalize=False, K= k_snf)
aff[1].shape

(16, 16)

In [87]:
compute.snf(aff, K= k_snf)

array([[8.17888478e-01, 3.65788169e-02, 2.43160808e-01, 2.38397328e-02,
        2.74530611e-03, 7.32549852e-03, 1.36156155e-01, 3.64192253e-04,
        3.67052593e-02, 2.17831412e-03, 1.53018811e-01, 2.99521108e-03,
        2.46207380e-03, 3.64147880e-04, 2.14004989e-02, 2.48877638e-03],
       [3.65788169e-02, 8.43450897e-01, 4.89686247e-02, 4.40409230e-03,
        1.55403572e-02, 4.40224789e-02, 7.10256111e-03, 7.45412247e-03,
        2.59595045e-01, 1.08554513e-02, 9.42230365e-03, 4.62753688e-02,
        1.35503984e-02, 7.43400467e-03, 1.31759921e-01, 1.29612998e-02],
       [2.43160808e-01, 4.89686247e-02, 8.21065943e-01, 1.96412592e-02,
        3.40814059e-03, 9.38110542e-03, 1.19007067e-01, 5.48592142e-04,
        5.08643961e-02, 2.32211758e-03, 1.31357059e-01, 4.20535775e-03,
        2.88066982e-03, 5.48419839e-04, 2.86873768e-02, 2.75469464e-03],
       [2.38397328e-02, 4.40409230e-03, 1.96412592e-02, 9.71374665e-01,
        4.24767808e-02, 2.32350094e-02, 7.81886899e-02, 4.344

In [72]:
Wk = [0] * len(aff)
Wsum = np.zeros(aff[0].shape)

# get number of modalities informing each subject x subject affinity
n_aff = len(aff) - np.sum([np.isnan(a) for a in aff], axis=0)

for n, mat in enumerate(aff):
    print(1)
    # normalize affinity matrix based on strength of edges
    mat = mat / np.nansum(mat, axis=1, keepdims=True)
    aff[n] = check_symmetric(mat, raise_warning=False)
    # apply KNN threshold to normalized affinity matrix
    Wk[n] = _find_dominate_set(aff[n], int(K))

1
1


In [62]:
Wk = aff[n].copy()
cutoff = 100 - (100 * (K / len(Wk)))
cutoff

68.75

In [69]:
parsed2 = rdata.parser.parse_file("imvc/datasets/data/metabric/METABRIC_validation.RData")
converted2 = rdata.conversion.convert(parsed2)
converted2

{'survivalDFS':         PatientID     Survival  Death
 MB-0002   MB-0002  2539.000000    0.0
 MB-0005   MB-0005  4893.000000    1.0
 MB-0006   MB-0006  4947.999999    0.0
 MB-0010   MB-0010   234.000000    1.0
 MB-0014   MB-0014  4929.999999    0.0
 ...           ...          ...    ...
 MB-0291   MB-0291  1176.000000    1.0
 MB-0444   MB-0444  1664.000000    1.0
 MB-0482   MB-0482   746.000000    1.0
 MB-0519   MB-0519  3329.000001    0.0
 MB-0593   MB-0593  1040.000000    1.0
 
 [1980 rows x 3 columns],
 'survival':         PatientID     Survival  Death
 MB-0002   MB-0002  2539.000000    0.0
 MB-0005   MB-0005  4893.000000    1.0
 MB-0006   MB-0006  4947.999999    0.0
 MB-0010   MB-0010   234.000000    1.0
 MB-0014   MB-0014  4929.999999    0.0
 ...           ...          ...    ...
 MB-0291   MB-0291  1176.000000    1.0
 MB-0444   MB-0444  1664.000000    1.0
 MB-0482   MB-0482   746.000000    1.0
 MB-0519   MB-0519  3329.000001    0.0
 MB-0593   MB-0593  1040.000000    1.0
 
 [1980 

In [76]:
mydatGE_dis = pd.DataFrame(converted["mydatGE"], index = converted["mydatGE"].dim_0, columns = converted["mydatGE"].dim_1)
mydatGE_dis.head()

Unnamed: 0,ILMN_1802380,ILMN_1893287,ILMN_1736104,ILMN_1792389,ILMN_1854015,ILMN_1904757,ILMN_1740305,ILMN_1665168,ILMN_2375156,ILMN_1705423,...,ILMN_1659781,ILMN_1908807,ILMN_1701127,ILMN_1751164,ILMN_1843643,ILMN_1852347,ILMN_1909210,ILMN_1693941,ILMN_1846115,ILMN_1709472
MB-0362,8.676978,5.298711,5.430877,6.075331,5.595625,5.453928,5.49024,4.994525,5.83827,5.498484,...,5.161796,5.504426,6.353215,5.472163,4.836483,5.319774,5.075572,7.304643,5.251843,5.049591
MB-0346,9.653589,5.378801,5.199253,6.687887,6.010127,5.454185,5.15021,5.34601,5.600876,5.454868,...,5.197392,5.276526,6.132355,5.289956,5.316819,5.141559,5.420024,7.933324,5.450611,5.31679
MB-0386,9.033589,5.606122,5.449121,5.910885,5.683969,5.501577,5.385091,5.247467,6.030718,5.296832,...,8.087722,5.322883,6.366335,5.672556,5.466419,5.508364,5.349676,7.580336,5.235394,5.461617
MB-0574,8.814855,5.316155,5.309371,5.62874,5.479983,5.471941,5.416758,5.316523,5.849428,5.373278,...,5.780062,5.466534,6.424048,5.492811,5.19315,5.419906,5.124697,6.903654,5.091927,5.22713
MB-0185,8.736406,5.303613,5.438538,6.392422,6.0135,5.525027,5.769779,5.032408,5.542133,5.552146,...,5.327687,5.495221,6.252966,5.441239,5.29907,5.489531,5.299731,6.848395,5.238651,5.057761


In [77]:
mydatGE_val = pd.DataFrame(converted2["mydatGE"], index = converted2["mydatGE"].dim_0, columns = converted2["mydatGE"].dim_1)
mydatGE_val.head()

Unnamed: 0,ILMN_1802380,ILMN_1893287,ILMN_1736104,ILMN_1792389,ILMN_1854015,ILMN_1904757,ILMN_1740305,ILMN_1665168,ILMN_2375156,ILMN_1705423,...,ILMN_1868532,ILMN_1659781,ILMN_1908807,ILMN_1701127,ILMN_1751164,ILMN_1843643,ILMN_1852347,ILMN_1909210,ILMN_1693941,ILMN_1846115
MB-0005,7.963493,5.579494,5.213908,5.553056,5.786583,5.472386,5.607341,5.265556,5.793398,5.296342,...,5.453946,5.306294,5.569,6.306927,5.419689,5.362207,5.440643,5.373503,7.397672,5.372138
MB-0048,8.452693,5.456827,5.206483,6.219375,5.747841,5.40522,5.565285,5.327176,5.689668,5.387294,...,5.561369,5.17656,5.547286,6.065291,5.563191,5.180065,5.54761,5.464853,5.651414,5.35856
MB-0025,7.588018,5.350564,5.418576,5.755084,5.733067,5.429913,5.547107,5.204948,5.679817,5.472628,...,5.615135,5.390555,5.400563,6.019489,5.600126,5.206286,5.555487,5.396562,6.809422,5.513906
MB-0083,8.248024,5.284935,5.326922,5.838722,5.594793,5.377686,5.669123,5.073629,5.684246,5.471562,...,5.715747,5.629181,5.636301,6.115126,5.529249,5.052481,5.658572,5.537335,6.611526,5.55259
MB-0053,7.78906,5.364712,5.322772,6.122633,5.683303,5.32607,5.473555,5.233173,5.693977,5.756432,...,5.610843,5.238755,5.61157,5.878613,5.561742,5.317208,5.350093,5.38336,5.646037,5.352785


In [79]:
mydatGE_val.index.intersection(converted2["mydatCNV"].T.index)

Index(['MB-0005', 'MB-0048', 'MB-0025', 'MB-0083', 'MB-0053', 'MB-0056',
       'MB-0068', 'MB-0093', 'MB-0079', 'MB-0108',
       ...
       'MB-6192', 'MB-4820', 'MB-5527', 'MB-5167', 'MB-5465', 'MB-5453',
       'MB-5471', 'MB-5127', 'MB-4313', 'MB-4823'],
      dtype='object', length=995)

In [None]:
pd.DataFrame(converted2["mydatGE"].dim_0)

In [46]:
parsed = rdata.parser.parse_file("imvc/datasets/data/metabric/METABRIC_discovery.RData")
converted = rdata.conversion.convert(parsed)
converted

{'survivalDFS':         PatientID     Survival  Death
 MB-0002   MB-0002  2539.000000    0.0
 MB-0005   MB-0005  4893.000000    1.0
 MB-0006   MB-0006  4947.999999    0.0
 MB-0010   MB-0010   234.000000    1.0
 MB-0014   MB-0014  4929.999999    0.0
 ...           ...          ...    ...
 MB-0291   MB-0291  1176.000000    1.0
 MB-0444   MB-0444  1664.000000    1.0
 MB-0482   MB-0482   746.000000    1.0
 MB-0519   MB-0519  3329.000001    0.0
 MB-0593   MB-0593  1040.000000    1.0
 
 [1980 rows x 3 columns],
 'survival':         PatientID     Survival  Death
 MB-0002   MB-0002  2539.000000    0.0
 MB-0005   MB-0005  4893.000000    1.0
 MB-0006   MB-0006  4947.999999    0.0
 MB-0010   MB-0010   234.000000    1.0
 MB-0014   MB-0014  4929.999999    0.0
 ...           ...          ...    ...
 MB-0291   MB-0291  1176.000000    1.0
 MB-0444   MB-0444  1664.000000    1.0
 MB-0482   MB-0482   746.000000    1.0
 MB-0519   MB-0519  3329.000001    0.0
 MB-0593   MB-0593  1040.000000    1.0
 
 [1980 

In [48]:
pd.DataFrame(converted["mydatGE"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48793,48794,48795,48796,48797,48798,48799,48800,48801,48802
0,8.676978,5.298711,5.430877,6.075331,5.595625,5.453928,5.490240,4.994525,5.838270,5.498484,...,5.161796,5.504426,6.353215,5.472163,4.836483,5.319774,5.075572,7.304643,5.251843,5.049591
1,9.653589,5.378801,5.199253,6.687887,6.010127,5.454185,5.150210,5.346010,5.600876,5.454868,...,5.197392,5.276526,6.132355,5.289956,5.316819,5.141559,5.420024,7.933324,5.450611,5.316790
2,9.033589,5.606122,5.449121,5.910885,5.683969,5.501577,5.385091,5.247467,6.030718,5.296832,...,8.087722,5.322883,6.366335,5.672556,5.466419,5.508364,5.349676,7.580336,5.235394,5.461617
3,8.814855,5.316155,5.309371,5.628740,5.479983,5.471941,5.416758,5.316523,5.849428,5.373278,...,5.780062,5.466534,6.424048,5.492811,5.193150,5.419906,5.124697,6.903654,5.091927,5.227130
4,8.736406,5.303613,5.438538,6.392422,6.013500,5.525027,5.769779,5.032408,5.542133,5.552146,...,5.327687,5.495221,6.252966,5.441239,5.299070,5.489531,5.299731,6.848395,5.238651,5.057761
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
992,8.850144,5.333344,5.331756,6.440723,5.814739,5.210231,5.321674,5.419619,5.828949,5.724069,...,5.218539,5.736583,6.166678,5.284162,5.325168,5.337517,5.569117,6.972185,5.157459,5.539762
993,9.132276,5.119812,5.760381,6.315227,6.113891,5.449235,5.066106,5.347050,5.936932,5.603672,...,5.157233,5.578294,5.926179,6.104279,5.399878,5.406930,5.106386,5.499776,5.392051,5.370991
994,8.463972,5.427502,5.417951,5.928433,6.192337,5.413672,5.223702,5.211153,5.814744,5.414783,...,5.316751,5.435352,6.207164,6.017801,5.190806,5.273789,5.180971,5.538318,5.380449,5.189992
995,9.431141,5.240360,5.258674,8.670881,5.937419,5.537914,5.328730,5.233245,5.734515,5.651650,...,5.184897,5.369399,6.290866,5.603239,5.295244,5.543862,5.513684,8.374683,5.374718,5.242508


In [51]:
converted["mydatCNV"]

Unnamed: 0,chrom,start,end,geneid,genename,MB-0002,MB-0008,MB-0010,MB-0035,MB-0036,...,MB-5634,MB-5635,MB-5638,MB-5642,MB-5645,MB-5646,MB-5647,MB-5651,MB-5653,MB-5654
1,1,1743,6658,643635,LOC643635,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1,3411,6658,643643,LOC643643,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,1,3757,20510,653635,LOC653635,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24,1,23390,25938,645520,LOC645520,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40,1,42315,43258,79504,OR4G4P,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
867315,Y,27104950,27105882,360009,LOC360009,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
867317,Y,27150218,27190190,347613,PARP4P,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
867319,Y,57410778,57411705,352905,LOC352905,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
867327,Y,57617003,57617949,644939,LOC644939,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [54]:
converted["clinical"]["CLAUDIN_SUBTYPE"]

MB-0002           LumA
MB-0005           LumB
MB-0006           LumB
MB-0010           LumB
MB-0014           LumB
              ...     
MB-0291           LumB
MB-0444           LumB
MB-0482         Normal
MB-0519    claudin-low
MB-0593           Her2
Name: CLAUDIN_SUBTYPE, Length: 1980, dtype: string

In [45]:
pd.Series(converted["data"]["cluster"]).to_csv("imvc/datasets/data/simulated_netMUG/simulated_netMUG_y.csv")

In [41]:
from sklearn.utils.validation import (check_array, check_symmetric,
                                      check_consistent_length)
from snf.compute import _find_dominate_set

In [None]:
import rpy2.robjects as robjects
robjects.r['load']("imvc/datasets/data/simulated_netMUG/original/SimulationData.Rdata")


In [33]:
pd.read_csv("imvc/datasets/data/simulated_netMUG/simulated_netMUG_y.csv").to_csv("imvc/datasets/data/simulated_netMUG/simulated_netMUG_y.csv")

In [None]:
model.initialize_wh()

In [None]:
model.update_weights()

In [None]:
model.w

In [None]:
dict(my_dict)

In [None]:
results.select_dtypes(object).drop(columns= ["comments"])

In [None]:
from datetime import datetime

# datetime object containing current date and time
now = datetime.now()
print(now)

In [None]:
def fun(x: int):
    "4" / 3
    return 

In [None]:
try:
    fun()
except Exception as exception:
    print(type(exception).__name__, ":", exception)

In [None]:
defaultdict(defaultdict)

In [None]:
import os.path
import time
from collections import defaultdict
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer
from sklearn.cluster import KMeans, spectral_clustering
from mvlearn.decomposition import AJIVE, GroupPCA
from mvlearn.cluster import MultiviewSpectralClustering, MultiviewCoRegSpectralClustering
from snf import compute
from bignmf.models.jnmf.integrative import IntegrativeJnmf
from bignmf.models.jnmf.standard import StandardJnmf
from imvc.datasets import LoadDataset
from imvc.utils import DatasetUtils
from imvc.transformers import MultiViewTransformer, ConcatenateViews
from imvc.algorithms import NMFC

from utils import save_record
# from utils import GroupPCA

folder_name = "results"
filelame = "complete_algorithms_evaluation.csv"
file_path = os.path.join(folder_name, filelame)

random_state = 42
START_BENCHMARKING = False

datasets = ["nutrimouse_genotype", "nutrimouse_diet", "bbcsport", "bdgp", "caltech101", "digits", "tcga_tissue", "tcga_survival", "nuswide", "metabric"]
probs = np.arange(100, step= 10)
imputation = [True, False]
runs_per_alg = np.arange(10)
algorithms = {
    "Concat": {"alg": make_pipeline(ConcatenateViews(),
                                    StandardScaler().set_output(transform='pandas'),
                                    KMeans()), "params": {}},
    "NMFC": {"alg": make_pipeline(ConcatenateViews(),
                                  MinMaxScaler().set_output(transform='pandas'),
                                  NMFC().set_output(transform='pandas')), "params": {}},
    "MVSpectralClustering": {"alg": make_pipeline(MultiViewTransformer(StandardScaler().set_output(transform= "pandas")),
                                                  MultiviewSpectralClustering()),
                             "params": {}},
    "MVCoRegSpectralClustering": {"alg": make_pipeline(MultiViewTransformer(StandardScaler().set_output(transform= "pandas")),
                                                       MultiviewCoRegSpectralClustering()),
                                  "params": {}},
    "GroupPCA": {"alg": make_pipeline(MultiViewTransformer(StandardScaler()), GroupPCA(), StandardScaler(), KMeans()),
                 "params": {}},
    "AJIVE": {"alg": make_pipeline(MultiViewTransformer(StandardScaler()), AJIVE(), MultiViewTransformer(FunctionTransformer(pd.DataFrame)),
                                   ConcatenateViews(), StandardScaler(), KMeans()),
              "params": {}},
    "SNF": {},
    "intNMF": {},
    "jNMF": {},
}
indexes_results = {"dataset": datasets, "algorithm": list(algorithms.keys()),
                   "missing_percentage": probs, "imputation": imputation, "run_n": runs_per_alg}


if START_BENCHMARKING:
    results = pd.DataFrame(datasets, columns= ["dataset"])
    for k,v in {k:v for k,v in indexes_results.items() if k != "dataset"}.items():
        results = results.merge(pd.Series(v, name= k), how= "cross")
    results = results.set_index(list(indexes_results.keys()))
    results[["finished", "completed"]] = False
else:
    results = pd.read_csv(file_path, index_col= list(indexes_results.keys()))
    results_ = results.select_dtypes(object).drop(columns= "comments").replace(np.nan, "np.nan")
    for col in results_.columns:
        results[col] = results_[col].apply(eval)

unfinished_results = results.loc[~results["finished"]]

for dataset_name in unfinished_results.index.get_level_values("dataset").unique():
    Xs, y = LoadDataset.load_dataset(dataset_name=dataset_name.split("_")[0], return_y=True, shuffle= False)
    y = pd.DataFrame(y)
    for target in y.columns:
        y_series = y[target].squeeze()
        n_clusters = y_series.nunique()

        for idx_iterator in unfinished_results.loc[unfinished_results.index.get_level_values("dataset") == dataset_name].itertuples():
            idx = idx_iterator[0]
            row = results.loc[[idx]]
            row_index = row.index
            print(row.drop(columns= row.columns).reset_index().to_dict(orient="records")[0])
            alg_name, impute, p, run_n = (
                row_index.get_level_values("algorithm")[0],
                row_index.get_level_values("imputation")[0],
                row_index.get_level_values("missing_percentage")[0]/100,
                row_index.get_level_values("run_n")[0])

            alg = algorithms[alg_name]
            train_Xs = DatasetUtils.shuffle_imvd(Xs=Xs, random_state= random_state + run_n)
            y_train = y_series.loc[train_Xs[0].index]
            errors_dict = defaultdict(int)
            if p != 0:
                # if n_clusters > len(train_Xs[0])*p:
                #     continue
                try:
                    train_Xs = DatasetUtils.add_random_noise_to_views(Xs=train_Xs, p= round(p, 2),
                                                                      random_state =random_state + run_n, 
                                                                      assess_percentage = True, stratify = y_train)
                except Exception as exception:
                    errors_dict[type(exception).__name__] += 1
                    print(errors_dict)
                    results.loc[idx, "finished"] = True
                    results.to_csv(file_path)
                    continue

            if impute:
                train_Xs = MultiViewTransformer(SimpleImputer(strategy="mean").set_output(transform= "pandas")).fit_transform(train_Xs)
            else:
                train_Xs = DatasetUtils.select_complete_samples(Xs = train_Xs)
                y_train = y_train.loc[train_Xs[0].index]

            try:
                start_time = time.perf_counter()
                if alg_name == "SNF":
                    preprocessing_step = MultiViewTransformer(StandardScaler().set_output(transform= "pandas"))
                    train_Xs = preprocessing_step.fit_transform(train_Xs)
                    affinities = compute.make_affinity(train_Xs, normalize= False)
                    fused = compute.snf(affinities)
                    clusters = spectral_clustering(fused, n_clusters=n_clusters, random_state=random_state + run_n)
                elif alg_name == "intNMF":
                    preprocessing_step = MultiViewTransformer(MinMaxScaler().set_output(transform= "pandas"))
                    model = IntegrativeJnmf({k:v for k,v in enumerate(train_Xs)}, k= n_clusters, lamb = 0.1)
                    raise
                    model.run(trials = 50, iterations = 100, verbose=False)
                    model.cluster_data()
                    clusters = np.argmax(model.w_cluster, axis= 1)
                elif alg_name == "jNMF":
                    pipeline = make_pipeline(MultiViewTransformer(MinMaxScaler().set_output(transform= "pandas")))
                    model = StandardJnmf({k:v for k,v in enumerate(train_Xs)}, k= n_clusters)
                    model.run(trials = 50, iterations = 100, verbose=False)
                    model.cluster_data()
                    clusters = np.argmax(model.w_cluster, axis= 1)
                else:
                    model, params = alg["alg"], alg["params"]
                    if alg_name == "GroupPCA":
                        model[1].set_params(n_components=n_clusters, random_state=random_state + run_n, multiview_output=False)
                    elif alg_name == "AJIVE":
                        model[1].set_params(random_state=random_state + run_n)
                    if alg_name == "NMFC":
                        model[-1].set_params(n_components=n_clusters, random_state=random_state + run_n)
                    else:
                        model[-1].set_params(n_clusters=n_clusters, random_state=random_state + run_n)
                    clusters = model.fit_predict(train_Xs)
            except ValueError as exception:
                if alg_name == "AJIVE" and len(y_train) < 5:
                    errors_dict[type(exception).__name__] += 1
                    print(errors_dict)
                    results.loc[idx, "finished"] = True
                    results.to_csv(file_path)
                    continue
                if alg_name == "SNF" and len(y_train) < 17:
                    errors_dict[type(exception).__name__] += 1
                    print(errors_dict)
                    results.loc[idx, "finished"] = True
                    results.to_csv(file_path)
                    continue
                if alg_name == "intNMF" and len(y_train) < 5:
                    errors_dict[type(exception).__name__] += 1
                    print(errors_dict)
                    results.loc[idx, "finished"] = True
                    results.to_csv(file_path)
                    continue
                else:
                    raise

            clusters = pd.Series(clusters, index= y_train.index)

            elapsed_time = time.perf_counter() - start_time

            if alg_name in ["NMFC"]:
                train_X = model.transform(train_Xs)
            elif alg_name in ["SNF"]:
                train_X = preprocessing_step.transform(train_Xs)
            elif alg_name in ["intNMF", "jNMF"]:
                train_X = model.w
            else:
                train_X = model[:-1].transform(train_Xs)
            if isinstance(train_X, list):
                train_X = ConcatenateViews().fit_transform(train_X)
            if not isinstance(train_X, pd.DataFrame):
                train_X = pd.DataFrame(train_X, index= y_train.index)
                
            assert train_X.index.equals(y_train.index)
            assert train_Xs[0].index.equals(y_train.index)

            if p > 0:
                best_solution = pd.MultiIndex.from_arrays([[row_index.get_level_values(level= level)[0]] if level != "missing_percentage" else [0]
                                                           for level in row_index.names], names= row_index.names)
                best_solution = results.loc[best_solution].iloc[0]
                best_solution = pd.Series(best_solution["y_pred"], index= best_solution["y_pred_idx"])
                best_solution = best_solution.loc[train_X.index]
            else:
                best_solution = None

            dict_results = save_record(train_Xs=train_Xs, train_X=train_X, clusters=clusters, y=y_train, p= p,
                              best_solution = best_solution, elapsed_time=elapsed_time,
                              random_state=random_state, errors_dict=errors_dict)
            dict_results = pd.DataFrame(pd.Series(dict_results), columns= row_index).T
            results.loc[[idx], dict_results.columns] = dict_results
            results.loc[idx, "finished"] = True
            results.to_csv(file_path)


In [276]:
    results = pd.DataFrame(datasets, columns= ["dataset"])
    for k,v in {k:v for k,v in indexes_results.items() if k != "dataset"}.items():
        results = results.merge(pd.Series(v, name= k), how= "cross")
    results = results.set_index(list(indexes_results.keys()))
    results[["finished", "completed"]] = False
results

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,finished,completed
dataset,algorithm,missing_percentage,imputation,run_n,Unnamed: 5_level_1,Unnamed: 6_level_1
simulated_InterSIM,Concat,0,True,0,False,False
simulated_InterSIM,Concat,0,True,1,False,False
simulated_InterSIM,Concat,0,True,2,False,False
simulated_InterSIM,Concat,0,True,3,False,False
simulated_InterSIM,Concat,0,True,4,False,False
...,...,...,...,...,...,...
nuswide,jNMF,90,False,5,False,False
nuswide,jNMF,90,False,6,False,False
nuswide,jNMF,90,False,7,False,False
nuswide,jNMF,90,False,8,False,False


In [None]:
np.random.rand(list(model.x.values())[0].shape[0], model.k).shape

In [None]:
self.v = {}
self.h = {}
for key in self.x:
    self.h[key] = np.random.rand(self.k, self.x[key].shape[1])
    self.v[key] = np.random.rand(number_of_samples, self.k)

In [None]:
from sklearn import metrics
metrics.silhouette_score(train_X, clusters, random_state=random_state)

In [None]:
for i in range(1000):
    try:
        model.run(trials = 50, iterations = 100, verbose=0)
        # clusters = model.fit_predict(train_Xs)
        print(i)
    except Exception as ex:
        print(i, ex)
        pass


In [None]:
results[results["finished"] == True]

In [None]:
best_solution.loc[train_X.index]

In [None]:
a = pd.read_csv(file_path, index_col= list(indexes_results.keys()))
a

In [None]:
a["label_sizes"].apply(str).str.replace("nan", "np.nan").apply(eval)

In [None]:
b = a.select_dtypes(object).replace(np.nan, "np.nan").drop(columns= "comments")
b

In [None]:
for i in b.columns:
    print(i)
    b[i].apply(eval)

In [None]:
b.apply(eval, axis= 1)

In [None]:
a = pd.DataFrame(datasets, columns= ["dataset"])
for k,v in {k:v for k,v in indexes_results.items() if k != "dataset"}.items():
    a = a.merge(pd.Series(v, name= k), how= "cross")
a = a.set_index(list(indexes_results.keys()))
a["finished"] = False
for i in a.itertuples():
    print(len(i))
    print(i)
    break

In [None]:
for i in unfinished_results.iterrows():
    print(len(i))
    print(i)
    break

In [None]:
len(best_solution.loc[train_X.index])

In [None]:
i[0]

In [None]:
import scipy.io
import pandas as pd
import numpy as np
import os
from imvc.datasets import LoadDataset
from imvc.utils import DatasetUtils
import copy
from sklearn.model_selection import train_test_split


In [None]:
Xs = DatasetUtils.add_random_noise_to_views(Xs=Xs, p= 0.95, assess_percentage= True, random_state= 42, stratify=y.iloc[:, 1])
Xs[0].head()

In [None]:
datasets = ["nutrimouse", "bbcsport", "bdgp", "caltech101", "digits", "tcga", "nuswide", "metabric"]
probs = np.arange(0., 1., step= 0.1).round(1) * 100
imputation = [True, False]
runs_per_alg = np.arange(10)

results = pd.DataFrame(datasets, columns= ["dataset"]).merge(
    pd.Series(probs, name= "missing_percentage"), how= "cross").merge(
    pd.Series(imputation, name= "imputation"), how= "cross").merge(
    pd.Series(runs_per_alg, name= "run_n"), how= "cross")
results = results.set_index(['dataset', 'missing_percentage',"imputation", 'run_n'])
results["finished"] = False
results.to_csv("pr.csv")

In [None]:
pd.read_csv("pr.csv", index_col= ['dataset', 'missing_percentage',"imputation", 'run_n'])

In [None]:
Xs[0]

In [None]:
Xs[1]

In [None]:
DatasetUtils.convert_mvd_into_imvd(Xs, p= 0.1, assess_percentage= True, random_state= random_state)

In [None]:
def split_into_groups(num_elements, num_groups):
    # Calculate the base number of elements per group
    base_count = num_elements // num_groups
    remaining = num_elements % num_groups
    
    # Initialize the list to store the number of elements in each group
    groups_count = [base_count] * num_groups
    
    # Distribute the remaining elements equally among the groups
    for i in range(remaining):
        groups_count[i] += 1
    
    return groups_count

# Total number of elements
total_elements = 103
# Number of groups to split into
num_of_groups = 4

# Splitting elements into groups
result = split_into_groups(total_elements, num_of_groups)

# Displaying the number of elements in each group
for i, count in enumerate(result):
    print(f"Group {i + 1}: {count} elements")

In [None]:
103 % 4

In [None]:
base_count = 103 // 4
remaining = 103 % 4

# Initialize the list to store the number of elements in each group
groups_count = [base_count] * num_groups

In [None]:
for dataset_name in [
    # "nutrimouse",
    "tcga"
]:
    Xs, y = LoadDataset.load_dataset(dataset_name = dataset_name, return_y = True, p= 0.8, assess_percentage=True)
    # Xs = GetCompleteSamples().fit_transform(Xs)
    # aaaaaaaaaaaaaa
    # y.to_csv(os.path.join("imvc/datasets/data/", dataset_name, f"{dataset_name}_y.csv"))

In [None]:
survs = []
for dataset_name in os.listdir("imvc/datasets/data/tcga/original/"):
    if "survival" in dataset_name:
        print(dataset_name)
        surv = pd.read_csv(os.path.join("imvc/datasets/data/tcga/original/", dataset_name), sep= '""', index_col= 0, engine='python')
        if surv.shape[1] <2:
            surv = pd.read_csv(os.path.join("imvc/datasets/data/tcga/original/", "survival_lung.csv"), sep= "\t", index_col= 0)
        surv.index = surv.index.str.replace(".", "-").str.upper()
        surv.index = surv.index.str.extract(pat=r"(TCGA.{8})", expand=False)
        surv.index.name = None
        survs.append(surv[["Survival", "Death"]])
survs = pd.concat(survs).drop_duplicates()

In [None]:
y.index.intersection(survs.index)

In [None]:
_,met = LoadDataset.load_dataset(dataset_name = "tcga", return_metadata= True, p= 0.8, assess_percentage=True)
met

In [None]:
y[y == 0].dropna()

In [None]:
pattern = r"(TCGA.{8})"
matches = surv.index.str.extract(pat=pattern, expand=False)
matches

In [None]:
for dataset_name in [i for i in sorted(os.listdir("imvc/datasets/data/")[1:]) if i != "metabric"]:
    Xs = LoadDataset.load_dataset(dataset_name = dataset_name, return_y = False)
    for p in np.arange(0., 1., step= 0.1).round(1):
        imvd = DatasetUtils.convert_mvd_into_imvd(Xs, p= p, random_state = 42, assess_percentage = True)
        complete_Xs = GetCompleteSamples().fit_transform(imvd)
        print(dataset_name, "\t", p, "\t", len(Xs), "\t", len(DatasetUtils.get_sample_names(imvd)), "\t", len(DatasetUtils.get_sample_names(complete_Xs)))
    print()

In [None]:
probs = np.arange(0., 1., step= 0.1).round(1)
algorithms = ["Concat", "MOFA", "NMFC", "MONET", "MSNE", "SUMO", "NEMO"]
runs_per_alg = np.arange(10).tolist()
results = pd.DataFrame(os.listdir("imvc/datasets/data/"), columns= ["dataset"]).merge(
    pd.Series(algorithms, name= "algorithm"), how= "cross").merge(
    pd.Series(probs, name= "missing_percentage"), how= "cross").merge(
    pd.Series(runs_per_alg, name= "run_n"), how= "cross")
results = results.set_index(['dataset', 'algorithm', 'missing_percentage', 'run_n'])
results["finished"] = False
results

In [281]:
path = "results/subresults"
files = pd.Series(os.listdir(path)).apply(lambda x: os.path.join(path, x))
files = files[files.apply(os.path.isfile)]
files = pd.concat(files.apply(pd.read_csv).to_list())

In [282]:
files

Unnamed: 0,dataset,algorithm,missing_percentage,imputation,run_n,finished,completed,comments,n_samples,n_incomplete_samples,...,bal_acc,ami,ari,completeness,random_acc,random_f1,silhouette,vrc,db,dunn
0,simulated_InterSIM,GroupPCA,60,False,5,True,False,,,,...,,,,,,,,,,
0,simulated_InterSIM,Concat,90,True,2,True,False,,,,...,,,,,,,,,,
0,simulated_InterSIM,Concat,60,False,8,True,False,,,,...,,,,,,,,,,
0,simulated_InterSIM,Concat,60,True,2,True,False,,,,...,,,,,,,,,,
0,simulated_InterSIM,Concat,80,False,6,True,False,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,simulated_InterSIM,Concat,50,False,4,True,False,,,,...,,,,,,,,,,
0,simulated_InterSIM,jNMF,0,False,7,True,True,{},500.0,0.0,...,0.197778,-0.001763,-0.002331,0.001938,0.4,0.190476,-0.02149,0.561153,39.720961,0.000909
0,simulated_InterSIM,AJIVE,40,True,8,True,False,,,,...,,,,,,,,,,
0,simulated_InterSIM,AJIVE,10,True,4,True,False,,,,...,,,,,,,,,,


In [283]:
files = files.set_index(list(indexes_results.keys()))
results.loc[files.index, files.columns] = files
results

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,finished,completed,comments,n_samples,n_incomplete_samples,n_complete_samples,time,n_clustered_samples,percentage_clustered_samples,stratified,...,bal_acc,ami,ari,completeness,random_acc,random_f1,silhouette,vrc,db,dunn
dataset,algorithm,missing_percentage,imputation,run_n,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
simulated_InterSIM,Concat,0,True,0,False,False,,,,,,,,,...,,,,,,,,,,
simulated_InterSIM,Concat,0,True,1,False,False,,,,,,,,,...,,,,,,,,,,
simulated_InterSIM,Concat,0,True,2,False,False,,,,,,,,,...,,,,,,,,,,
simulated_InterSIM,Concat,0,True,3,False,False,,,,,,,,,...,,,,,,,,,,
simulated_InterSIM,Concat,0,True,4,False,False,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nuswide,jNMF,90,False,5,False,False,,,,,,,,,...,,,,,,,,,,
nuswide,jNMF,90,False,6,False,False,,,,,,,,,...,,,,,,,,,,
nuswide,jNMF,90,False,7,False,False,,,,,,,,,...,,,,,,,,,,
nuswide,jNMF,90,False,8,False,False,,,,,,,,,...,,,,,,,,,,


In [6]:
from imvc.datasets import LoadDataset
from imvc.transformers import Ampute
from imvc.utils import DatasetUtils
from pyampute.ampute import MultivariateAmputation
import warnings

Xs = LoadDataset.load_dataset(dataset_name = "bbcsport")

missing_view_panel = DatasetUtils.get_missing_view_panel(Xs)
Xs = Xs.float()
# amp = Ampute(p = 0.5, mechanism= "MAR")
amp = MultivariateAmputation(patterns= [{"incomplete_vars": range(1, missing_view_panel.shape[1]), "mechanism": "MAR"}])
amp.fit(missing_view_panel)
X = amp.fit_transform(missing_view_panel)
# DatasetUtils.get_n_complete_samples(iXs), DatasetUtils.get_n_incomplete_samples(iXs)

In [22]:
import pandas as pd
rng = np.random.default_rng(seed = 42)
len(Xs[0])
m = 1000
n = 10
X_compl = rng.standard_normal((len(Xs[0]), len(Xs)))
X_compl = pd.DataFrame(X_compl)
amp = MultivariateAmputation(patterns= [{"incomplete_vars": range(1, X_compl.shape[1]), "mechanism": "MAR"}])
X = amp.fit_transform(X_compl)
X[X.notnull()] = 1
X = X.fillna(0).astype(int)
X

Unnamed: 0,0,1,2,3
0,1,1,1,1
1,1,1,1,1
2,1,1,1,1
3,1,0,0,0
4,1,0,0,0
...,...,...,...,...
111,1,0,0,0
112,1,0,0,0
113,1,1,1,1
114,1,0,0,0


In [212]:
row = results.iloc[[5000]]
row_index = row.index
multiidx = []
for level in row_index.names:
    if level == "missing_percentage":
        multiidx_value = 0
    elif level == "amputation_mechanism":
        multiidx_value = "None"
    elif level == "imputation":
        multiidx_value = False
    else:
        multiidx_value = row_index.get_level_values(level=level)[0]
    multiidx.append([multiidx_value])
best_solution = pd.MultiIndex.from_arrays(multiidx, names=row_index.names)
best_solution

MultiIndex([(5000,)],
           )

In [211]:
row

Unnamed: 0,dataset,algorithm,missing_percentage,amputation_mechanism,imputation,run_n,finished,completed,n_samples,n_incomplete_samples,...,bal_acc,ami,ari,completeness,random_acc,random_f1,silhouette,vrc,db,dunn
5000,nutrimouse_diet,GroupPCA,50,MCAR,True,0,False,False,,,...,,,,,,,,,,


In [205]:
(results["amputation_mechanism"] == "EDM") & (results["missing_percentage"] == 0)

0         True
1         True
2         True
3         True
4         True
         ...  
49495    False
49496    False
49497    False
49498    False
49499    False
Length: 49500, dtype: bool

In [208]:
results["amputation_mechanism"].value_counts()

amputation_mechanism
EDM     16200
MCAR    16200
MAR      8100
MNAR     8100
None      900
Name: count, dtype: int64

In [216]:
results = pd.read_csv("results/complete_algorithms_evaluation.csv", index_col = [0,1,2,3,4,5])
# results = results.reset_index()
# results.loc[results.xs(0, level="missing_percentage", drop_level=False).index].index = results.xs(0, level="missing_percentage", drop_level=False).index.to_frame(index= False).replace("EDM", "None").set_index(results.index.names).index
results


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,finished,completed
dataset,algorithm,missing_percentage,amputation_mechanism,imputation,run_n,Unnamed: 6_level_1,Unnamed: 7_level_1
nutrimouse_genotype,Concat,0,,False,0,False,False
nutrimouse_genotype,Concat,0,,False,1,False,False
nutrimouse_genotype,Concat,0,,False,2,False,False
nutrimouse_genotype,Concat,0,,False,3,False,False
nutrimouse_genotype,Concat,0,,False,4,False,False
...,...,...,...,...,...,...,...
nuswide,jNMF,90,MNAR,False,5,False,False
nuswide,jNMF,90,MNAR,False,6,False,False
nuswide,jNMF,90,MNAR,False,7,False,False
nuswide,jNMF,90,MNAR,False,8,False,False


In [159]:
results = pd.read_csv("results/complete_algorithms_evaluation.csv", index_col = [0,1,2,3,4,5])
results_amputation_mechanism_complete = results.xs(0, level="missing_percentage", drop_level=False)
results_amputation_mechanism_none = results_amputation_mechanism_complete.index.to_frame(index= False).replace("EDM", "None")
results_amputation_mechanism_none = results_amputation_mechanism_none.set_index(results_amputation_mechanism_none.columns.to_list())
results_amputation_mechanism_none_2.index = pd.MultiIndex.from_frame(results_amputation_mechanism_none_tochange)

# results.loc[results_amputation_mechanism_none.index].index = pd.MultiIndex.from_frame(results_amputation_mechanism_none_tochange)
results_amputation_mechanism_none

dataset,algorithm,amputation_mechanism,missing_percentage,imputation,run_n
nutrimouse_genotype,Concat,,0,False,0
nutrimouse_genotype,Concat,,0,False,1
nutrimouse_genotype,Concat,,0,False,2
nutrimouse_genotype,Concat,,0,False,3
nutrimouse_genotype,Concat,,0,False,4
...,...,...,...,...,...
nuswide,jNMF,,0,False,5
nuswide,jNMF,,0,False,6
nuswide,jNMF,,0,False,7
nuswide,jNMF,,0,False,8


In [162]:
results.loc[results_amputation_mechanism_complete.index].index = results_amputation_mechanism_complete.index
results.loc[results_amputation_mechanism_complete.index].index

MultiIndex([('nutrimouse_genotype', 'Concat', 'EDM', 0, False, 0),
            ('nutrimouse_genotype', 'Concat', 'EDM', 0, False, 1),
            ('nutrimouse_genotype', 'Concat', 'EDM', 0, False, 2),
            ('nutrimouse_genotype', 'Concat', 'EDM', 0, False, 3),
            ('nutrimouse_genotype', 'Concat', 'EDM', 0, False, 4),
            ('nutrimouse_genotype', 'Concat', 'EDM', 0, False, 5),
            ('nutrimouse_genotype', 'Concat', 'EDM', 0, False, 6),
            ('nutrimouse_genotype', 'Concat', 'EDM', 0, False, 7),
            ('nutrimouse_genotype', 'Concat', 'EDM', 0, False, 8),
            ('nutrimouse_genotype', 'Concat', 'EDM', 0, False, 9),
            ...
            (            'nuswide',   'jNMF', 'EDM', 0, False, 0),
            (            'nuswide',   'jNMF', 'EDM', 0, False, 1),
            (            'nuswide',   'jNMF', 'EDM', 0, False, 2),
            (            'nuswide',   'jNMF', 'EDM', 0, False, 3),
            (            'nuswide',   'jNMF', 

In [154]:
a = results.reset_index("amputation_mechanism")
a["amputation_mechanism"] = 

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,amputation_mechanism,finished,completed,n_samples,n_incomplete_samples,n_complete_samples,time,n_clustered_samples,percentage_clustered_samples,comments,...,MCC_performance,F1_performance,precision_performance,recall_performance,bal_acc_performance,ami_performance,ari_performance,completeness_performance,random_acc_performance,random_f1_performance
dataset,algorithm,missing_percentage,imputation,run_n,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
nutrimouse_genotype,Concat,0,False,0,EDM,True,True,40.0,0.0,40.0,1.418447,40.0,100.0,{},...,,,,,,,,,,
nutrimouse_genotype,Concat,0,False,1,EDM,True,True,40.0,0.0,40.0,1.707200,40.0,100.0,{},...,,,,,,,,,,
nutrimouse_genotype,Concat,0,False,2,EDM,True,True,40.0,0.0,40.0,1.532106,40.0,100.0,{},...,,,,,,,,,,
nutrimouse_genotype,Concat,0,False,3,EDM,True,True,40.0,0.0,40.0,1.413640,40.0,100.0,{},...,,,,,,,,,,
nutrimouse_genotype,Concat,0,False,4,EDM,True,True,40.0,0.0,40.0,1.645764,40.0,100.0,{},...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nuswide,jNMF,0,False,5,EDM,False,False,,,,,,,,...,,,,,,,,,,
nuswide,jNMF,0,False,6,EDM,False,False,,,,,,,,...,,,,,,,,,,
nuswide,jNMF,0,False,7,EDM,False,False,,,,,,,,...,,,,,,,,,,
nuswide,jNMF,0,False,8,EDM,False,False,,,,,,,,...,,,,,,,,,,


In [143]:
results.loc[results_amputation_mechanism_none_tochange.index].index = results_amputation_mechanism_complete.index
results

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,finished,completed
dataset,algorithm,amputation_mechanism,missing_percentage,imputation,run_n,Unnamed: 6_level_1,Unnamed: 7_level_1
nutrimouse_genotype,Concat,EDM,0,False,0,False,False
nutrimouse_genotype,Concat,EDM,0,False,1,False,False
nutrimouse_genotype,Concat,EDM,0,False,2,False,False
nutrimouse_genotype,Concat,EDM,0,False,3,False,False
nutrimouse_genotype,Concat,EDM,0,False,4,False,False
...,...,...,...,...,...,...,...
nuswide,jNMF,MNAR,90,False,5,False,False
nuswide,jNMF,MNAR,90,False,6,False,False
nuswide,jNMF,MNAR,90,False,7,False,False
nuswide,jNMF,MNAR,90,False,8,False,False


In [125]:
results.loc[results_amputation_mechanism_none.index, :] = results_amputation_mechanism_none_2
results.loc[results_amputation_mechanism_none.index]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,finished,completed
dataset,algorithm,amputation_mechanism,missing_percentage,imputation,run_n,Unnamed: 6_level_1,Unnamed: 7_level_1
nutrimouse_genotype,Concat,EDM,0,False,0,,
nutrimouse_genotype,Concat,EDM,0,False,1,,
nutrimouse_genotype,Concat,EDM,0,False,2,,
nutrimouse_genotype,Concat,EDM,0,False,3,,
nutrimouse_genotype,Concat,EDM,0,False,4,,
...,...,...,...,...,...,...,...
nuswide,jNMF,EDM,0,False,5,,
nuswide,jNMF,EDM,0,False,6,,
nuswide,jNMF,EDM,0,False,7,,
nuswide,jNMF,EDM,0,False,8,,


In [118]:
results_amputation_mechanism_none_2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,finished,completed
dataset,algorithm,amputation_mechanism,missing_percentage,imputation,run_n,Unnamed: 6_level_1,Unnamed: 7_level_1
nutrimouse_genotype,Concat,,0,False,0,False,False
nutrimouse_genotype,Concat,,0,False,1,False,False
nutrimouse_genotype,Concat,,0,False,2,False,False
nutrimouse_genotype,Concat,,0,False,3,False,False
nutrimouse_genotype,Concat,,0,False,4,False,False
...,...,...,...,...,...,...,...
nuswide,jNMF,,0,False,5,False,False
nuswide,jNMF,,0,False,6,False,False
nuswide,jNMF,,0,False,7,False,False
nuswide,jNMF,,0,False,8,False,False


In [113]:
results.loc[results_amputation_mechanism_none.index].index = pd.MultiIndex.from_frame(results_amputation_mechanism_none_tochange)
results

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,finished,completed
dataset,algorithm,amputation_mechanism,missing_percentage,imputation,run_n,Unnamed: 6_level_1,Unnamed: 7_level_1
nutrimouse_genotype,Concat,EDM,0,False,0,False,False
nutrimouse_genotype,Concat,EDM,0,False,1,False,False
nutrimouse_genotype,Concat,EDM,0,False,2,False,False
nutrimouse_genotype,Concat,EDM,0,False,3,False,False
nutrimouse_genotype,Concat,EDM,0,False,4,False,False
...,...,...,...,...,...,...,...
nuswide,jNMF,MNAR,90,False,5,False,False
nuswide,jNMF,MNAR,90,False,6,False,False
nuswide,jNMF,MNAR,90,False,7,False,False
nuswide,jNMF,MNAR,90,False,8,False,False


In [101]:
pd.MultiIndex.from_frame(results_amputation_mechanism_none_tochange).values

array([('nutrimouse_genotype', 'Concat', 'None', 0, False, 0),
       ('nutrimouse_genotype', 'Concat', 'None', 0, False, 1),
       ('nutrimouse_genotype', 'Concat', 'None', 0, False, 2),
       ('nutrimouse_genotype', 'Concat', 'None', 0, False, 3),
       ('nutrimouse_genotype', 'Concat', 'None', 0, False, 4),
       ('nutrimouse_genotype', 'Concat', 'None', 0, False, 5),
       ('nutrimouse_genotype', 'Concat', 'None', 0, False, 6),
       ('nutrimouse_genotype', 'Concat', 'None', 0, False, 7),
       ('nutrimouse_genotype', 'Concat', 'None', 0, False, 8),
       ('nutrimouse_genotype', 'Concat', 'None', 0, False, 9),
       ('nutrimouse_genotype', 'NMFC', 'None', 0, False, 0),
       ('nutrimouse_genotype', 'NMFC', 'None', 0, False, 1),
       ('nutrimouse_genotype', 'NMFC', 'None', 0, False, 2),
       ('nutrimouse_genotype', 'NMFC', 'None', 0, False, 3),
       ('nutrimouse_genotype', 'NMFC', 'None', 0, False, 4),
       ('nutrimouse_genotype', 'NMFC', 'None', 0, False, 5),
    

In [63]:
b

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,dataset,algorithm,amputation_mechanism,missing_percentage,imputation,run_n
dataset,algorithm,amputation_mechanism,missing_percentage,imputation,run_n,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
nutrimouse_genotype,Concat,ED,0,False,0,nutrimouse_genotype,Concat,,0,False,0
nutrimouse_genotype,Concat,ED,0,False,1,nutrimouse_genotype,Concat,,0,False,1
nutrimouse_genotype,Concat,ED,0,False,2,nutrimouse_genotype,Concat,,0,False,2
nutrimouse_genotype,Concat,ED,0,False,3,nutrimouse_genotype,Concat,,0,False,3
nutrimouse_genotype,Concat,ED,0,False,4,nutrimouse_genotype,Concat,,0,False,4
...,...,...,...,...,...,...,...,...,...,...,...
nuswide,jNMF,MNAR,0,False,5,nuswide,jNMF,,0,False,5
nuswide,jNMF,MNAR,0,False,6,nuswide,jNMF,,0,False,6
nuswide,jNMF,MNAR,0,False,7,nuswide,jNMF,,0,False,7
nuswide,jNMF,MNAR,0,False,8,nuswide,jNMF,,0,False,8


In [10]:
import numpy as np
from scipy import stats

amp.wss_per_pattern = []
amp.probs_per_pattern = []

X = amp._validate_data(missing_view_panel)
num_samples = X.shape[0]

# split complete_data in groups
# the number of groups is defined by the number of patterns
X_incomplete = X.copy()
X_indices = np.arange(num_samples)
# set seed for choice, if None it will be random.
rng = np.random.default_rng(amp.seed)
amp.assigned_group_number = rng.choice(
    a=amp.num_patterns, size=num_samples, p=amp.freqs
)
pattern_idx = 0
group_indices = X_indices[amp.assigned_group_number == pattern_idx]
pattern = np.squeeze(
    np.asarray(amp.observed_var_indicator[pattern_idx, :])
)
data_group = (
    X[group_indices] if isinstance(X, np.ndarray) else X.iloc[group_indices]
)
# data_group = stats.zscore(data_group)
wss = np.dot(data_group, amp.weights[pattern_idx, :].T)
wss

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [3]:
len(data_group)

116

In [1368]:
a = pd.DataFrame(np.zeros((10,2))) + 1
if missing_view_panel.shape[1] > 2:
    n_views_to_remove = round(missing_view_panel.shape[1]*0.5 + pd.Series([0.1,-0.1]).sample(1, random_state=random_state).iloc[0])
else:
    n_views_to_remove = 1
views_to_remove = missing_view_panel.columns.to_series().sample(n= n_views_to_remove, random_state=random_state)
views_to_remove = missing_view_panel.columns[views_to_remove]
amp = MultivariateAmputation(patterns= [{"incomplete_vars": views_to_remove}])
amp.fit_transform(a).fillna(0).astype(int)



AssertionError: Cannot ampute all features under MAR, since all vars will be missing.

In [1166]:
!pip install ampute

[31mERROR: Could not find a version that satisfies the requirement ampute (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for ampute[0m[31m
[0m

In [1165]:
a = pd.DataFrame(np.zeros((100,10))) + 1
amp = MultivariateAmputation(prop = 30)
amp.fit_transform(a).dropna().shape



(62, 10)

In [242]:
def f(arg):
    arg["asd"] = 2
    arg["asdlkñ"] = 3
    return arg

In [233]:
a = pd.concat([pd.read_csv("imvc/datasets/data/nutrimouse/original/genotype.csv"), pd.read_csv("imvc/datasets/data/nutrimouse/original/diet.csv")], axis= 1)
le = LabelEncoder()
a.iloc[:,0] = le.fit_transform(a.iloc[:,0])
le_ = LabelEncoder()
a.iloc[:,1] = le_.fit_transform(a.iloc[:,1])
a.to_csv("imvc/datasets/data/nutrimouse/nutrimouse_y.csv", index= False)

In [232]:
metadata = {"modality": {0: "gene", 1: "lipid"}, "labels": {"genotype": pd.Series(le.classes_).to_dict(), "diet": pd.Series(le_.classes_).to_dict()}}

with open(os.path.join("imvc/datasets/data/nutrimouse", 'metadata.json'), 'w') as fp:
    json.dump(metadata, fp)

In [None]:
probs = np.arange(0., 1., step= 0.1).round(1)
runs_per_alg = np.arange(10).tolist()
pd.DataFrame(probs).merge(pd.DataFrame(runs_per_alg), how= "cross")

In [None]:
a = pd.DataFrame(["alg", "pad"])
b = pd.DataFrame(["alg1", "pad2"])
a.merge(b, how= "cross")

In [None]:
metadata = {}
metadata = {"modality": {0: "morphological features", 1: "Karhunen-Love coefficients", 2: "profile correlations", 3: "Zernike moments", 4: "Fourier coefficients of the character shapes", 5: "pixel averages of the images from 2x3 windows"}}
import json

with open(os.path.join("imvc/datasets/data/digits", 'metadata.json'), 'w') as fp:
    json.dump(metadata, fp)

In [14]:
import scipy
import pandas as pd

In [148]:
a

Unnamed: 0.1,Unnamed: 0,cg00003994,cg00024396,cg00035623,cg00047050,cg00079563,cg00095674,cg00103783,cg00112517,cg00117172,...,cg27529628,cg27543230,cg27544190,cg27555365,cg27560922,cg27574244,cg27601582,cg27626299,cg27654142,cg27662877
0,TCGA-13-0799,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,TCGA-13-0800,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,TCGA-13-0801,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,TCGA-24-1924,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,TCGA-24-1930,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2432,TCGA-CV-7434,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0
2433,TCGA-CV-7435,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2434,TCGA-CV-7437,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2435,TCGA-CV-7438,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [182]:
import os
import pandas as pd
for i in os.listdir("imvc/datasets/data/"):
    print(i)
    for j in os.listdir(os.path.join("imvc/datasets/data/", i)):
        if "_y.csv" in j:
            try:
                a = pd.read_csv(os.path.join("imvc/datasets/data/", i, j), index_col = 0)
                if a.shape[1] < 2:
                    print(j)
                    a.columns = list(range(a.shape[1]))
                    a.to_csv(os.path.join("imvc/datasets/data/", i, j), index= False)
                else:
                    a.to_csv(os.path.join("imvc/datasets/data/", i, j), index= False)
            except:
                pass
    print()

simulated_netMUG
simulated_netMUG_y.csv

.ipynb_checkpoints

simulated_InterSIM
simulated_InterSIM_y.csv

buaa
buaa_y.csv

bdgp
bdgp_y.csv

tcga
tcga_y.csv

nuswide
nuswide_y.csv

metabric
metabric_y.csv

nutrimouse
nutrimouse_y.csv

digits
digits_y.csv

caltech101
caltech101_y.csv

bbcsport
bbcsport_y.csv



In [180]:
a.to_csv(os.path.join("imvc/datasets/data/", i, j), index= False, columns= False)

TypeError: 'bool' object is not iterable

In [187]:
df = pd.read_csv("imvc/datasets/data/tcga/tcga_y.csv", index_col = 0)
df.to_csv("imvc/datasets/data/tcga/tcga_y.csv", index= False)

In [112]:
pd.DataFrame(mat["CNA_s"], index = pd.DataFrame(mat["gene_s"]).squeeze().apply(lambda x: x[0].replace('"', "")).to_list()).T.to_csv("imvc/datasets/data/metabric/metabric_1.csv")

In [193]:
le = LabelEncoder()
pd.Series(le.fit_transform(df)).to_csv("imvc/datasets/data/metabric/metabric_y.csv", index= False)

In [191]:
df = pd.read_excel("imvc/datasets/data/metabric/original/Metabric_breast.xlsx")["CLAUDIN_SUBTYPE"]

In [195]:
with open(os.path.join("imvc/datasets/data/metabric", 'metadata.json'), 'r') as fp:
    metadata = json.load(fp)

metadata = {**metadata, "samples": pd.read_excel("imvc/datasets/data/metabric/original/Metabric_breast.xlsx")["Patient Identifier"].str.replace(".", "-").to_dict()}
with open(os.path.join("imvc/datasets/data/metabric", 'metadata.json'), 'w') as fp:
    json.dump(metadata, fp)

In [194]:
metadata = {}
metadata = {"modality": {0: "GE", 1: "CNA"}, "labels": {i:j for i,j in enumerate(le.classes_)}}
import json, os

with open(os.path.join("imvc/datasets/data/metabric", 'metadata.json'), 'w') as fp:
    json.dump(metadata, fp)

In [170]:
pd.read_excel("imvc/datasets/data/metabric/original/Metabric_breast.xlsx")

Unnamed: 0,Patient Identifier,Unnamed: 1,OS_STATUS,OS_MONTHS,Unnamed: 4,Unnamed: 5,CLAUDIN_SUBTYPE
0,MB.0362,,DECEASED,47.033333,3,C,LumA
1,MB.0346,,DECEASED,20.433333,4,D,Her2
2,MB.0386,,LIVING,60.000000,3,C,LumA
3,MB.0574,,LIVING,60.000000,4,D,LumA
4,MB.0503,,LIVING,60.000000,3,C,LumA
...,...,...,...,...,...,...,...
1899,MB.5465,,DECEASED,18.800000,3,C,Basal
1900,MB.5453,,DECEASED,57.300000,2,B,Normal
1901,MB.5471,,LIVING,60.000000,4,D,LumA
1902,MB.5127,,LIVING,60.000000,3,C,LumB


In [114]:
aa = pd.read_excel("imvc/datasets/data/metabric/Metabric_breast.xlsx")
aa["Patient Identifier"] = aa["Patient Identifier"].str.replace(".", "-")
bb = pd.read_csv("imvc/datasets/data/metabric/brca_metabric_clinical_data.tsv", sep= "\t")

In [127]:
from sklearn.preprocessing import LabelEncoder

In [121]:
aa.sort_values("Patient Identifier")[["Patient Identifier", "CLAUDIN_SUBTYPE"]]

Unnamed: 0,Patient Identifier,Unnamed: 1,OS_STATUS,OS_MONTHS,Unnamed: 4,Unnamed: 5,CLAUDIN_SUBTYPE
1791,MB.0000,,LIVING,60.000000,2,B,claudin-low
290,MB.0002,,LIVING,60.000000,4,D,LumA
960,MB.0005,,LIVING,60.000000,1,A,LumB
969,MB.0006,,LIVING,60.000000,3,C,LumB
215,MB.0008,,DECEASED,41.366667,1,A,LumB
...,...,...,...,...,...,...,...
1700,MB.7295,,LIVING,60.000000,2,B,LumA
1703,MB.7296,,DECEASED,44.733333,4,D,LumB
1701,MB.7297,,LIVING,60.000000,3,C,LumB
1625,MB.7298,,LIVING,60.000000,1,A,LumB


In [124]:
bb[["Patient ID", "Pam50 + Claudin-low subtype"]]

Unnamed: 0,Patient ID,Pam50 + Claudin-low subtype
0,MB-0000,claudin-low
1,MB-0002,LumA
2,MB-0005,LumB
3,MB-0006,LumB
4,MB-0008,LumB
...,...,...
2504,MTS-T2428,
2505,MTS-T2429,
2506,MTS-T2430,
2507,MTS-T2431,


In [24]:
pd.Series(mat["classLabel"][:,0]).to_csv("imvc/datasets/data/buaa/buaa_y.csv")

1      9
95     9
97     9
98     9
99     9
      ..
51     9
52     9
53     9
54     9
150    9
Name: count, Length: 150, dtype: int64

In [None]:
mat = scipy.io.loadmat('imvc/datasets/data/caltech101/Caltech101-all.mat')
for i,x in enumerate(mat["X"][0]):
    print(x.shape)
    pd.DataFrame(x).to_csv(f'imvc/datasets/data/caltech101/caltech101_{i}.csv', index= False)
pd.DataFrame(mat["Y"]).to_csv(f'imvc/datasets/data/caltech101/caltech101_y.csv', index= False)

In [None]:
x = pd.read_csv("imvc/datasets/data/tcga/PanCan.miRNAseq.RPM.215-MIMATs-most-variant-25pc.4229-samples.NMF-input.BCGSC.20140603.csv", index_col= 0)
x.index.name= None
x.columns = x.columns.to_series().apply(lambda x: x.split("_")[1]).str[:12]
x = x.T
x = x[~x.index.duplicated(keep='first')]
print(x.shape)
x.to_csv(os.path.join(path, "tcga_0.csv"))
x.head()

In [None]:
x = pd.read_csv("imvc/datasets/data/tcga/PanCan12.3602-corrected-v3.txt", sep= "\t", index_col= 0, header= [0,1])
x.index.name= None
x.columns = x.columns.droplevel(0).str[:12]
x = x.T
x = x[~x.index.duplicated(keep='first')]
print(x.shape)
x.to_csv(os.path.join(path, "tcga_1.csv"))
x.head()

In [None]:
metadata = {}
metadata = {"modality": {0: "visual", 1: "mRNA", 2: "text", 3: "methyl"}, "labels": classes}
import json

with open(os.path.join(path, 'metadata.json'), 'w') as fp:
    json.dump(metadata, fp)

In [None]:
x = pd.read_csv("imvc/datasets/data/tcga/PanCan11_RBN_RPPA_without_Duplicates_20130325.csv", index_col= 0)
x.index.name= None
x = x[x.columns[5:]]
x = x[~x.index.duplicated(keep='first')]
print(x.shape)
x.to_csv(os.path.join(path, "tcga_2.csv"))
x.head()

In [None]:
x = pd.read_csv("imvc/datasets/data/tcga/DNAmethylationClusteringMatrix.csv", index_col= 0)
x.columns = x.columns.str[:12]
x = x.T
x = x[~x.index.duplicated(keep='first')]
print(x.shape)
x.to_csv(os.path.join(path, "tcga_3.csv"))
x.head()

In [None]:
b = pd.read_csv(os.path.join(path, f"tcga_y.csv"), index_col= 0)
classes = pd.Series(b.iloc[:, 0].unique()).to_dict()
b.iloc[:, 0].replace({v:k for k,v in classes.items()}).to_csv(os.path.join(path, "tcga_y.csv"))

In [None]:
b

In [None]:
i = 1
a = pd.read_csv(os.path.join(path, f"tcga_{i}.csv"), index_col= 0).index
print(len(a))
for i in range(4):
    a = a.intersection(pd.read_csv(os.path.join(path, f"tcga_{i}.csv"), index_col= 0).index)
    print(len(a))

In [None]:
for i in range(4):
    b = pd.read_csv(os.path.join(path, f"tcga_{i}.csv"), index_col= 0)
    aa = a.intersection(b.index)
    print(i, "\t", b.shape, "\t", b.drop_duplicates().shape, "\t", b.loc[aa].shape, "\t", b.loc[aa].drop_duplicates().shape)

In [None]:
c = b.loc["TCGA-13-0791"]
c[c.columns[c.iloc[0] != c.iloc[1]]]

In [None]:
b.loc[a].shape

In [None]:
b.loc[a].index.difference(a)

In [None]:
b.drop_duplicates().loc[a.drop_duplicates()].drop_duplicates()

In [None]:
b.index.intersection(a)

In [None]:
a.intersection(b.index)

In [None]:
x = pd.read_csv("imvc/datasets/data/tcga/mrna_y.csv", index_col= 0)
x

In [None]:
x1 = pd.read_csv("imvc/datasets/data/tcga/mrna_y.csv", index_col =0)
x2 = pd.read_csv("imvc/datasets/data/tcga/mirna_y.csv", index_col =0)
x

In [None]:
x1.index

In [None]:
x2.index

In [None]:
x1.index.intersection(x2.index)

In [None]:
x.T.to_csv(os.path.join(path, "mrna.csv"))

In [None]:
x.columns.droplevel(1).to_series().to_csv(os.path.join(path, "mirna_y.csv"))

In [None]:
a = x.columns.to_frame().set_index(1)
# a.columns = [''] * len(a.columns)
a.index.name= None
a.index = a.index.str[:12]
a.to_csv(os.path.join(path, "mrna_y.csv"))

In [None]:
pd.read_csv("imvc/datasets/data/tcga/PanCan.miRNAseq.RPM.215-MIMATs-most-variant-25pc.4229-samples.NMF-input.BCGSC.20140603.csv", index_col= 0)

In [None]:
pd.read_csv("imvc/datasets/data/tcga/PanCan.miRNAseq.RPM.215-MIMATs-most-variant-25pc.4229-samples.NMF-input.BCGSC.20140603.csv").columns.to_series().apply(lambda x: x.split("_")[0]).value_counts()

In [None]:
for i,name in enumerate(["X", "Ya"]):
    x = scipy.io.loadmat(f'imvc/datasets/data/bdgp/{name}.mat')[name]
    print(x.shape)
    pd.DataFrame(x).to_csv(f'imvc/datasets/data/bdgp/bdgp_{i}.csv', index= False)
x = scipy.io.loadmat(f'imvc/datasets/data/bdgp/Yc.mat')["Yc"]
pd.DataFrame(x.argmax(1)).to_csv(f'imvc/datasets/data/bdgp/bdgp_y.csv', index= False)

In [None]:
path = "imvc/datasets/data/tcga"
files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
for i,x in enumerate(["exp", "methy", "mirna"]):
    target = []
    files_x = [os.path.join(path, file) for file in files if file.startswith(x)]
    ds = []
    for file_x in files_x:
        d_x = pd.read_csv(file_x, index_col= 0).T
        print(file_x, d_x.shape)
        target.extend([file_x.split("_")[-1]]* d_x.shape[0])
        ds.append(d_x)
    d = pd.concat(ds)
    print(x, d.shape)
    d = d.dropna(axis= 1)
    print(x, d.shape)
    d.to_csv(os.path.join(path, f'tcga_{i}.csv'))
pd.Series(target).to_csv(os.path.join(path, 'tcga_y.csv'))

In [None]:
path = "imvc/datasets/data/tcga"
files = [os.path.join(path, f) for f in os.listdir(path) if f.startswith("tcga_")]
d = pd.concat([pd.read_csv(file) for file in files], axis= 1)
for i,file in enumerate(files):
    d_x = pd.read_csv(file)
    print(file, d_x.shape, d_x.loc[d.index])
    d_x.loc[d.index].to_csv(file)

In [None]:
path = "imvc/datasets/data/tcga"
for i,x in enumerate([f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]):
    with ZipFile(os.path.join(path, x)) as zf:
        for file in zf.namelist():
            with zf.open(file) as f2:
                d = pd.read_csv(f2, sep= " ")
                print(file, d.shape)
                d.to_csv(f"{os.path.join(path, file)}_{x.split('.')[0]}.csv")

In [None]:
path = "imvc/datasets/data/digits"
for i,x in enumerate([f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]):
    d = pd.read_csv(os.path.join(path, x))
    print(d.shape)
    d.iloc[:, :-1].to_csv(os.path.join(path, f"digits_{i}.csv"), index= False)
d.iloc[:, -1].to_csv(os.path.join(path, f"digits_y.csv"), index= False)

In [None]:
x = scipy.io.loadmat(f'imvc/datasets/data/bdgp/Yc.mat')["Yc"]
x.argmax(1)

In [None]:
[f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]

In [None]:
os.listdir(path)

In [None]:
mat["X"][0][0].shape

In [None]:
mat["X"][0][1].shape

In [None]:
import pyreadr

In [None]:
mat = pyreadr.read_r('imvc/datasets/data/metabric/METABRIC_discovery')
mat

In [None]:
mat["mydatCNV"]

In [None]:
import pandas as pd

In [None]:
pd.DataFrame(mat["Y"]).squeeze().value_counts()

In [None]:
mat = scipy.io.loadmat('imvc/datasets/data/bdgp/X.mat')
mat["X"].shape

In [None]:
mat = scipy.io.loadmat('imvc/datasets/data/bdgp/Yc.mat')
mat["Yc"][0]

In [None]:
mat["Yc"].shape