In [1]:
import os.path
import argparse
import shutil
import numpy as np
from pandarallel import pandarallel
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer
from sklearn.cluster import KMeans
from mvlearn.decomposition import AJIVE, GroupPCA
from mvlearn.cluster import MultiviewSpectralClustering, MultiviewCoRegSpectralClustering
from imvc.datasets import LoadDataset
from imvc.transformers import MultiViewTransformer, ConcatenateViews
from imvc.algorithms import NMFC

from utils.getresult import GetResult


folder_results = "results"
folder_subresults = "subresults"
filelame = "complete_algorithms_evaluation.csv"
file_path = os.path.join(folder_results, filelame)
subresults_path = os.path.join(folder_results, folder_subresults)
logs_file = os.path.join(folder_results, 'logs.txt')
error_file = os.path.join(folder_results, 'errors.txt')

random_state = 42

args = lambda: None
args.continue_benchmarking, args.n_jobs = False, 2
# parser = argparse.ArgumentParser()
# parser.add_argument('-continue_benchmarking', default= False, action='store_true')
# parser.add_argument('-n_jobs', default= 1, type= int)
# args = parser.parse_args()

if args.n_jobs > 1:
    pandarallel.initialize(nb_workers= args.n_jobs)

datasets = [
    "simulated_gm",
    "simulated_InterSIM",
    "simulated_netMUG",
    "nutrimouse_genotype",
    "nutrimouse_diet",
    "bbcsport",
    "buaa",
    "metabric",
    "digits",
    "bdgp",
    "tcga",
    "caltech101",
    "nuswide",
]
two_view_datasets = ["simulated_gm", "nutrimouse_genotype", "nutrimouse_diet", "metabric", "bdgp",
                     "buaa", "simulated_netMUG"]
amputation_mechanisms = ["EDM", 'MCAR', 'MAR', 'MNAR']
# probs = np.arange(100, step= 10)
probs = np.arange(100, step= 40)
imputation = [True, False]
# runs_per_alg = np.arange(10)
runs_per_alg = np.arange(2)
algorithms = {
    "Concat": {"alg": make_pipeline(ConcatenateViews(),
                                    StandardScaler().set_output(transform='pandas'),
                                    KMeans()), "params": {}},
    # "NMFC": {"alg": make_pipeline(ConcatenateViews(),
    #                               MinMaxScaler().set_output(transform='pandas'),
    #                               NMFC().set_output(transform='pandas')), "params": {}},
    # "MVSpectralClustering": {"alg": make_pipeline(MultiViewTransformer(StandardScaler().set_output(transform= "pandas")),
    #                                               MultiviewSpectralClustering()),
    #                          "params": {}},
    # "MVCoRegSpectralClustering": {"alg": make_pipeline(MultiViewTransformer(StandardScaler().set_output(transform= "pandas")),
    #                                                    MultiviewCoRegSpectralClustering()),
    #                               "params": {}},
    # "GroupPCA": {"alg": make_pipeline(MultiViewTransformer(StandardScaler()), GroupPCA(), StandardScaler(), KMeans()),
    #              "params": {}},
    "AJIVE": {"alg": make_pipeline(MultiViewTransformer(StandardScaler()), AJIVE(), MultiViewTransformer(FunctionTransformer(pd.DataFrame)),
                                   ConcatenateViews(), StandardScaler(), KMeans()),
              "params": {}},
    # "SNF": {"alg": MultiViewTransformer(StandardScaler().set_output(transform="pandas")), "params": {}},
    # "IntNMF": {"alg": MultiViewTransformer(MinMaxScaler().set_output(transform="pandas")), "params": {}},
    # "COCA": {"alg": MultiViewTransformer(StandardScaler().set_output(transform="pandas")), "params": {}},
}
indexes_results = {"dataset": datasets, "algorithm": list(algorithms.keys()), "missing_percentage": probs,
                   "amputation_mechanism": amputation_mechanisms, "imputation": imputation, "run_n": runs_per_alg}
indexes_names = list(indexes_results.keys())
results = GetResult.create_results_table(datasets=datasets, indexes_results=indexes_results,
                                         indexes_names=indexes_names, amputation_mechanisms=amputation_mechanisms,
                                         two_view_datasets=two_view_datasets)

if not args.continue_benchmarking:
    if not eval(input("Are you sure you want to start benchmarking and delete previous results? (True/False)")):
        raise Exception
    results.to_csv(file_path)

    shutil.rmtree(subresults_path, ignore_errors=True)
    os.mkdir(subresults_path)

    os.remove(logs_file) if os.path.exists(logs_file) else None
    os.remove(error_file) if os.path.exists(error_file) else None
    open(logs_file, 'w').close()
    open(error_file, 'w').close()
else:
    finished_results = pd.read_csv(file_path, index_col= indexes_names)
    finished_results = GetResult.collect_subresults(results=finished_results, subresults_path=subresults_path, indexes_names=indexes_names)
    results.loc[finished_results.index, finished_results.columns] = finished_results

unfinished_results = results.loc[~results["finished"]]

for dataset_name in unfinished_results.index.get_level_values("dataset").unique():
    names = dataset_name.split("_")
    if "simulated" in names:
        names = ["_".join(names)]
    x_name,y_name = names if len(names) > 1 else (names[0], "0")
    Xs, y = LoadDataset.load_dataset(dataset_name=x_name, return_y=True, shuffle= False)
    y = y[y_name]
    n_clusters = y.nunique()
    unfinished_results_dataset = unfinished_results.loc[[dataset_name]]

    if args.n_jobs == 1:
        iterator = pd.DataFrame(unfinished_results_dataset.index.to_list(), columns=indexes_names)
        iterator.apply(lambda x: GetResult.run_iteration(idx= x, results= results, Xs=Xs, y=y, n_clusters=n_clusters,
                                                         algorithms=algorithms, random_state=random_state, subresults_path=subresults_path,
                                                         logs_file=logs_file, error_file=error_file), axis= 1)
    else:
        try:
            unfinished_results_dataset_idx = unfinished_results_dataset.xs(0, level="missing_percentage", drop_level=False).index
            iterator = pd.DataFrame(unfinished_results_dataset_idx.to_list(), columns= indexes_names)
            iterator.parallel_apply(lambda x: GetResult.run_iteration(idx= x, results= results, Xs=Xs, y=y,
                                                                      n_clusters=n_clusters,
                                                                      algorithms=algorithms,
                                                                      random_state=random_state,
                                                                      subresults_path=subresults_path,
                                                                      logs_file=logs_file,
                                                                      error_file=error_file), axis= 1)
            results = GetResult.collect_subresults(results=results, subresults_path=subresults_path,
                                                   indexes_names=indexes_names)
            results.to_csv(file_path)

            unfinished_results_dataset_idx = unfinished_results_dataset.drop(unfinished_results_dataset_idx).index
            iterator = pd.DataFrame(unfinished_results_dataset_idx.to_list(), columns=indexes_names)
        except KeyError:
            iterator = pd.DataFrame(unfinished_results_dataset.index.to_list(), columns=indexes_names)

        iterator.parallel_apply(lambda x: GetResult.run_iteration(idx= x, results= results, Xs=Xs, y=y,
                                                                  n_clusters=n_clusters,
                                                                  algorithms=algorithms,
                                                                  random_state=random_state,
                                                                  subresults_path=subresults_path,
                                                                  logs_file=logs_file,
                                                                  error_file=error_file), axis= 1)
        results = GetResult.collect_subresults(results=results, subresults_path=subresults_path,
                                               indexes_names=indexes_names)
        results.to_csv(file_path)


INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


Are you sure you want to start benchmarking and delete previous results? (True/False) True


In [2]:
filelame = "complete_algorithms_evaluation.csv"
file_path = os.path.join(folder_results, filelame)
results = pd.read_csv(file_path, index_col= indexes_names)
print("results", results.shape)
results.head()

results (660, 46)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,finished,completed,comments,n_samples,n_views,n_incomplete_samples,n_complete_samples,time,n_clustered_samples,percentage_clustered_samples,...,MCC_performance,F1_performance,precision_performance,recall_performance,bal_acc_performance,ami_performance,ari_performance,completeness_performance,random_acc_performance,random_f1_performance
dataset,algorithm,missing_percentage,amputation_mechanism,imputation,run_n,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
simulated_gm,Concat,0,'None',False,0,True,True,{},100.0,2.0,0.0,100.0,0.184676,100.0,100.0,...,,,,,,,,,,
simulated_gm,Concat,0,'None',False,1,True,True,{},100.0,2.0,0.0,100.0,0.212941,100.0,100.0,...,,,,,,,,,,
simulated_gm,Concat,40,EDM,True,0,True,True,{},100.0,2.0,0.0,100.0,0.322496,100.0,100.0,...,0.558412,0.759133,0.784784,0.773737,0.773737,0.249745,0.26303,0.259068,0.55,0.354839
simulated_gm,Concat,40,EDM,True,1,True,True,{},100.0,2.0,0.0,100.0,0.128544,100.0,100.0,...,0.902671,0.948849,0.958333,0.944444,0.055556,0.756418,0.808081,0.766758,0.55,0.354839
simulated_gm,Concat,40,EDM,False,0,True,True,{},60.0,2.0,0.0,60.0,0.217929,60.0,100.0,...,0.935414,0.96663,0.966667,0.96875,0.96875,0.819196,0.868899,0.820112,0.533333,0.347826


In [54]:
results = GetResult.collect_subresults(results=results, subresults_path=subresults_path, indexes_names=indexes_names)
results

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,finished,completed,n_samples,n_views,n_incomplete_samples,n_complete_samples,time,n_clustered_samples,percentage_clustered_samples,comments,...,MCC_performance,F1_performance,precision_performance,recall_performance,bal_acc_performance,ami_performance,ari_performance,completeness_performance,random_acc_performance,random_f1_performance
dataset,algorithm,missing_percentage,amputation_mechanism,imputation,run_n,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
simulated_gm,Concat,0,'None',False,0,True,True,100.0,2.0,0.0,100.0,0.637139,100.0,100.0,{},...,,,,,,,,,,
simulated_gm,Concat,0,'None',False,1,True,True,100.0,2.0,0.0,100.0,0.495935,100.0,100.0,{},...,,,,,,,,,,
simulated_gm,Concat,0,'None',False,2,True,True,100.0,2.0,0.0,100.0,0.246209,100.0,100.0,{},...,,,,,,,,,,
simulated_gm,Concat,0,'None',False,3,True,True,100.0,2.0,0.0,100.0,0.530780,100.0,100.0,{},...,,,,,,,,,,
simulated_gm,Concat,0,'None',False,4,True,True,100.0,2.0,0.0,100.0,0.565211,100.0,100.0,{},...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nuswide,Concat,80,MNAR,False,0,False,False,,,,,,,,,...,,,,,,,,,,
nuswide,Concat,80,MNAR,False,1,False,False,,,,,,,,,...,,,,,,,,,,
nuswide,Concat,80,MNAR,False,2,False,False,,,,,,,,,...,,,,,,,,,,
nuswide,Concat,80,MNAR,False,3,False,False,,,,,,,,,...,,,,,,,,,,


In [3]:
results = results[results["finished"]]
print("results", results.shape)

results (660, 46)


In [4]:
results["comments"]

dataset       algorithm  missing_percentage  amputation_mechanism  imputation  run_n
simulated_gm  Concat     0                   'None'                False       0                                                       {}
                                                                               1                                                       {}
                                                                               2                                                       {}
                                                                               3                                                       {}
                                                                               4                                                       {}
                                                                                                              ...                        
caltech101    Concat     0                   'None'                False       0       

In [19]:
results = results.reset_index()
results = results[~results["completed"]]
results

Unnamed: 0,dataset,algorithm,missing_percentage,amputation_mechanism,imputation,run_n,finished,completed,n_samples,n_views,...,MCC_performance,F1_performance,precision_performance,recall_performance,bal_acc_performance,ami_performance,ari_performance,completeness_performance,random_acc_performance,random_f1_performance
655,caltech101,Concat,0,'None',False,0,True,False,9144.0,6.0,...,,,,,,,,,,
656,caltech101,Concat,0,'None',False,3,True,False,9144.0,6.0,...,,,,,,,,,,


In [21]:
row = results.iloc[:1]
row

Unnamed: 0,dataset,algorithm,missing_percentage,amputation_mechanism,imputation,run_n,finished,completed,n_samples,n_views,...,MCC_performance,F1_performance,precision_performance,recall_performance,bal_acc_performance,ami_performance,ari_performance,completeness_performance,random_acc_performance,random_f1_performance
655,caltech101,Concat,0,'None',False,0,True,False,9144.0,6.0,...,,,,,,,,,,


In [41]:
row[["finished", "comments"]] = False, {"asdasd": 1}

In [47]:
row["comments"] = [{"asdasd": "1"}]

In [52]:
type(row["comments"].iloc[0])

dict

In [48]:
row[["finished", "comments"]]

Unnamed: 0,finished,comments
655,False,{'asdasd': '1'}


In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer
from sklearn import metrics
from reval.utils import kuhn_munkres_algorithm
import copy
from pyampute import MultivariateAmputation
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans
from mvlearn.cluster import MultiviewSpectralClustering, MultiviewCoRegSpectralClustering
from mvlearn.decomposition import AJIVE, GroupPCA
from collections import defaultdict
from sklearn.impute import SimpleImputer

from imvc.transformers import MultiViewTransformer, ConcatenateViews, Ampute
from imvc.utils import DatasetUtils
from imvc.datasets import LoadDataset
from imvc.algorithms import NMFC

from models import Model
from utils import Utils


In [None]:
algorithms = {
    "Concat": {"alg": make_pipeline(ConcatenateViews(),
                                    StandardScaler().set_output(transform='pandas'),
                                    KMeans()), "params": {}},
    "NMFC": {"alg": make_pipeline(ConcatenateViews(),
                                  MinMaxScaler().set_output(transform='pandas'),
                                  NMFC().set_output(transform='pandas')), "params": {}},
    "MVSpectralClustering": {"alg": make_pipeline(MultiViewTransformer(StandardScaler().set_output(transform= "pandas")),
                                                  MultiviewSpectralClustering()),
                             "params": {}},
    "MVCoRegSpectralClustering": {"alg": make_pipeline(MultiViewTransformer(StandardScaler().set_output(transform= "pandas")),
                                                       MultiviewCoRegSpectralClustering()),
                                  "params": {}},
    "GroupPCA": {"alg": make_pipeline(MultiViewTransformer(StandardScaler()), GroupPCA(), StandardScaler(), KMeans()),
                 "params": {}},
    "AJIVE": {"alg": make_pipeline(MultiViewTransformer(StandardScaler()), AJIVE(), MultiViewTransformer(FunctionTransformer(pd.DataFrame)),
                                   ConcatenateViews(), StandardScaler(), KMeans()),
              "params": {}},
    "SNF": {"alg": MultiViewTransformer(StandardScaler().set_output(transform="pandas")), "params": {}},
    "IntNMF": {"alg": MultiViewTransformer(MinMaxScaler().set_output(transform="pandas")), "params": {}},
    "COCA": {"alg": MultiViewTransformer(StandardScaler().set_output(transform="pandas")), "params": {}},
    # "intNMF": {},
    # "jNMF": {},
}


In [None]:
dataset_name = "nutrimouse_genotype"
amputation_mechanism = "EDM"
random_state = 42
run_n = 0
p = 0.2
impute = False
alg_name = "COCA"
alg = algorithms[alg_name]
errors_dict = defaultdict(int)
names = dataset_name.split("_")
if "simulated" in names:
    names = ["_".join(names)]
x_name,y_name = names if len(names) > 1 else (names[0], "0")
Xs, y = LoadDataset.load_dataset(dataset_name=x_name, return_y=True, shuffle= False)
y = y[y_name]
n_clusters = y.nunique()
train_Xs = DatasetUtils.shuffle_imvd(Xs=Xs, random_state=random_state + run_n)
y_train = y.loc[train_Xs[0].index]
strat = False
if p != 0:
    if amputation_mechanism == "EDM":
        try:
            assert n_clusters < len(train_Xs[0]) * (1-p)
        except AssertionError as exception:
            raise AssertionError(f"{exception}; n_clusters < len(train_Xs[0]) * (1-p)")
        amp = Ampute(p=round(p, 2), mechanism=amputation_mechanism, random_state=random_state + run_n,
                     assess_percentage=True, stratify=y_train)
        try:
            train_Xs = amp.fit_transform(train_Xs)
            strat = True
        except ValueError:
            amp.set_params(**{"stratify": None})
            train_Xs = amp.fit_transform(train_Xs)
    else:
        amp = Ampute(p=round(p, 2), mechanism=amputation_mechanism, random_state=random_state + run_n)
        train_Xs = amp.fit_transform(train_Xs)
if impute:
    train_Xs = MultiViewTransformer(SimpleImputer(strategy="mean").set_output(
        transform="pandas")).fit_transform(train_Xs)
else:
    train_Xs = DatasetUtils.select_complete_samples(Xs=train_Xs)
    y_train = y_train.loc[train_Xs[0].index]
DatasetUtils.get_n_complete_samples(Xs=train_Xs), DatasetUtils.get_n_incomplete_samples(Xs=train_Xs), len(train_Xs)

In [None]:
model = Model(alg_name=alg_name, alg=alg)
clusters, model = model.method(train_Xs=train_Xs, y_train=y_train, n_clusters=n_clusters, random_state=random_state, run_n=run_n)
clusters = pd.Series(clusters, index=y_train.index)
metrics.matthews_corrcoef(y_true= y_train, y_pred= kuhn_munkres_algorithm(true_lab=y_train, pred_lab=clusters))

In [None]:
train_Xs = make_pipeline(MultiViewTransformer(MinMaxScaler().set_output(transform="pandas"))).fit_transform(train_Xs)
mask = [X.notnull().astype(int) for X in train_Xs]
mask = Utils.convert_df_to_r_object(mask)
train_Xs = Utils.convert_df_to_r_object(train_Xs)


In [None]:
nnTensor = importr("nnTensor")
clusters = nnTensor.jNMF(train_Xs, M= mask, J= n_clusters)
clusters = np.array(clusters[0]).argmax(axis=1)
clusters = pd.Series(clusters, index=y_train.index)
metrics.matthews_corrcoef(y_true= y_train, y_pred= kuhn_munkres_algorithm(true_lab=y_train, pred_lab=clusters))

In [None]:
Xs, y, met = LoadDataset.load_dataset(dataset_name="tcga", return_y=True, shuffle= False, return_metadata=True)

In [None]:
%load_ext rpy2.ipython

In [None]:
%%R

install.packages("nnTensor")

In [None]:
%%R

library("nnTensor")

In [2]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer
import pandas as pd
import json

In [3]:
meth = pd.read_csv("imvc/datasets/data/tcga/original/DNAmethylationClusteringMatrix.csv", index_col= 0).T
meth.index = meth.index.str[:12]
print(meth.shape)
meth = meth.loc[:, (meth.isna().sum() < meth.shape[1]*0.01).values]
print(meth.shape)
meth.head()

(4923, 2043)
(4923, 1739)


Unnamed: 0,cg00003994,cg00024396,cg00035623,cg00047050,cg00079563,cg00095674,cg00112517,cg00117172,cg00119079,cg00121640,...,cg27522780,cg27529628,cg27543230,cg27544190,cg27555365,cg27560922,cg27574244,cg27626299,cg27654142,cg27662877
TCGA-09-0364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-09-0365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
TCGA-09-0366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
TCGA-09-0367,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
TCGA-09-0369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
mirna = pd.read_csv("imvc/datasets/data/tcga/original/PanCan.miRNAseq.RPM.215-MIMATs-most-variant-25pc.4229-samples.NMF-input.BCGSC.20140603.csv", index_col= 0).T
mirna.index = mirna.index.str.split("_").str[1].str[:12]
mirna.columns.name = None
print(mirna.shape)
mirna.head()

(4229, 215)


Unnamed: 0,hsa-mir-21.MIMAT0000076,hsa-mir-143.MIMAT0000435,hsa-mir-99b.MIMAT0000689,hsa-mir-10b.MIMAT0000254,hsa-mir-10a.MIMAT0000253,hsa-mir-30a.MIMAT0000087,hsa-mir-92a.MIMAT0000092,hsa-mir-148a.MIMAT0000243,hsa-mir-203.MIMAT0000264,hsa-mir-192.MIMAT0000222,...,hsa-mir-885.MIMAT0004947,hsa-mir-520a.MIMAT0002834,hsa-mir-140.MIMAT0000431,hsa-mir-505.MIMAT0002876,hsa-mir-30c-2.MIMAT0004550,hsa-mir-296.MIMAT0000690,hsa-mir-181c.MIMAT0000258,hsa-let-7a.MIMAT0004481,hsa-mir-324.MIMAT0000762,hsa-mir-410.MIMAT0002171
TCGA-04-1331,6884.365081,16581.469418,173467.412456,8768.470483,14049.034949,2745.645025,135990.811569,1972.32189,501.964246,28.577172,...,22.116246,0.0,1.987977,100.889843,194.076275,130.709502,48.705442,2.981966,129.715513,20.128269
TCGA-04-1332,724.02167,14939.100532,522492.699748,21839.519336,523.504889,1285.948937,39289.282078,2748.400669,144.083914,26.415384,...,22.813286,0.0,8.404895,157.291607,79.246153,190.911187,27.616084,2.401399,124.872726,8.404895
TCGA-04-1336,1960.23071,695.088668,468593.285739,1552.857661,15101.036583,1361.943559,94755.240054,3171.594133,582.153565,111.590637,...,10.755724,1.344466,0.0,43.022896,21.511448,660.132564,116.968499,4.033397,258.137377,0.0
TCGA-04-1337,2672.891647,14117.312109,492836.314754,7617.075721,9419.642831,4813.403924,119954.331109,1658.477476,959.440215,45.715185,...,11.573465,13.309484,1.73602,122.678725,88.537004,126.729437,21.989583,2.893366,98.374449,12.152138
TCGA-04-1341,1994.877742,2036.094225,408525.090375,57221.159401,10739.747083,894.080616,76951.806518,3682.217117,1946.686163,102.724156,...,1660.073086,0.0,0.0,137.599641,16.486593,434.992413,22.19349,2.536399,332.268257,3.170499


In [5]:
prot = pd.read_csv("imvc/datasets/data/tcga/original/PanCan11_RBN_RPPA_without_Duplicates_20130325.csv", index_col= 0)
prot = prot.drop(columns= ["Set","Sample_Source","Sample_description","UUID"])
prot.index.name = None
print(prot.shape)
prot.head()

(3467, 132)


Unnamed: 0,Tumor,1433EPSILON,4EBP1,4EBP1PS65,4EBP1PT37T46,53BP1,ACCPS79,ACC1,AKT,AKTPS473,...,XRCC1,YAP,YAPPS127,YB1,YB1PS102,JNKPT183Y185,PAI1,MTORPS2448,ASNS,EGFR
TCGA-02-0003,GBM,0.183081,-0.449019,0.001121,-0.123657,-0.381269,-0.442297,-0.959635,0.103691,-0.129588,...,-0.385654,-0.007476,-1.046517,-0.884312,0.13165,0.940622,2.01127,0.197016,-0.790724,-0.183051
TCGA-02-0004,GBM,0.219945,-0.679506,-0.25624,0.110517,-1.094812,-0.658515,-1.122084,-0.757821,0.755576,...,-0.379615,0.140171,-0.97627,-0.234118,0.330947,0.588233,3.77634,0.036879,-0.437159,0.09497
TCGA-02-0011,GBM,0.419958,-0.001036,0.032761,0.447925,-0.850915,-0.147996,-0.215953,-0.231629,0.830394,...,-0.556713,-0.068282,-0.63385,-1.024676,-0.086476,0.152389,-0.150585,-0.062152,-0.33302,0.381839
TCGA-02-0014,GBM,0.219633,0.050384,0.756086,1.526763,0.29684,-0.29145,-0.547863,0.25034,1.218512,...,-0.568351,0.496743,-1.287725,-1.498109,0.497103,1.477672,-0.610491,-0.177288,-0.143429,-0.328967
TCGA-02-0068,GBM,1.106023,-0.796095,-0.193148,0.408208,-1.461406,0.993927,0.232175,-0.178059,0.469241,...,-0.332685,-0.491739,-0.053597,0.842944,-0.143879,0.002885,0.602278,0.286717,-0.786658,-0.318299


In [6]:
exp = pd.read_csv("imvc/datasets/data/tcga/original/PanCan12.3602-corrected-v3.txt", sep= "\t", index_col= 0, skiprows=1).T
exp.index = exp.index.str[:12]
print(exp.shape)
exp = exp.loc[:, (exp.isna().sum() < exp.shape[1]*0.01).values]
print(exp.shape)
exp.head()

(3602, 16116)
(3602, 12726)


Unnamed: 0,?|10357,?|10431,?|155060,?|57714,?|653553,?|8225,A1BG|1,A2LD1|87769,A2M|2,A4GALT|53947,...,ZW10|9183,ZWILCH|55055,ZWINT|11130,ZXDA|7789,ZXDB|158586,ZXDC|79364,ZYG11B|79699,ZYX|7791,ZZEF1|23140,ZZZ3|26009
TCGA-BL-A0C8,0.18,0.35,-0.21,0.02,2.262156,-0.48,-2.533269,-0.37,1.03,-1.09,...,-0.89,0.480988,0.68,0.18,0.14551,-0.07,1.09,-0.41,0.290227,-0.12
TCGA-BL-A13I,0.84,0.76,-0.98,-0.64,0.332156,0.88,0.106731,-0.86,1.02,-0.48,...,0.01,0.490988,0.89,-0.58,-1.30449,-0.93,-0.58,1.73,0.260227,-0.16
TCGA-BL-A13J,0.51,0.63,0.23,-0.95,2.592156,-1.1,-1.303269,-0.22,-0.16,-0.49,...,0.22,0.860988,-0.77,-0.8,-0.93449,0.11,-0.16,-0.18,-1.179773,0.09
TCGA-BL-A3JM,0.79,0.26,0.37,-0.15,-0.457844,1.68,-1.75327,0.71,-1.26,0.57,...,0.85,0.660988,1.64,-1.46,-0.90449,0.58,-0.18,-1.73,0.950227,0.07
TCGA-BT-A0S7,1.55,0.36,-0.41,-0.17,1.342156,0.63,0.106731,0.85,-0.48,-1.6,...,0.67,0.420988,1.32,-0.11,0.06551,-0.04,0.09,-1.0,0.500227,-0.17


In [7]:
idxs = meth.index.intersection(prot.index).intersection(mirna.index).intersection(exp.index)
meth, prot = meth.reset_index().drop_duplicates(subset="index").set_index("index"), prot.reset_index().drop_duplicates(subset="index").set_index("index")
mirna, exp = mirna.reset_index().drop_duplicates(subset="index").set_index("index"), exp.reset_index().drop_duplicates(subset="index").set_index("index")
prot = prot.loc[idxs]
meth = meth.loc[idxs]
mirna = mirna.loc[idxs]
exp = exp.loc[idxs, (exp - exp.median(axis=0)).abs().median().sort_values(ascending= False).iloc[:2000].index]

In [8]:
oe = OrdinalEncoder(dtype= int).set_output(transform= "pandas")
tumor = oe.fit_transform(prot[["Tumor"]]).squeeze()
prot = prot.drop(columns= "Tumor")
tumor.name = None
tumor

TCGA-13-0799    7
TCGA-13-0800    7
TCGA-13-0801    7
TCGA-24-1924    7
TCGA-24-1930    7
               ..
TCGA-CV-7434    3
TCGA-CV-7435    3
TCGA-CV-7437    3
TCGA-CV-7438    3
TCGA-CV-7440    3
Length: 2437, dtype: int64

In [9]:
met = {
    "modality": {0: "miRNA", 1: "mRNA", 2: "RPPA", 3: "methyl"},
    "labels": {i:j for i,j in enumerate(oe.categories_[0])},
    "samples": {i:j for i,j in enumerate(tumor.index)},
}
met

{'modality': {0: 'miRNA', 1: 'mRNA', 2: 'RPPA', 3: 'methyl'},
 'labels': {0: 'BLCA',
  1: 'BRCA',
  2: 'COAD',
  3: 'HNSC',
  4: 'KIRC',
  5: 'LUAD',
  6: 'LUSC',
  7: 'OVCA',
  8: 'READ',
  9: 'UCEC'},
 'samples': {0: 'TCGA-13-0799',
  1: 'TCGA-13-0800',
  2: 'TCGA-13-0801',
  3: 'TCGA-24-1924',
  4: 'TCGA-24-1930',
  5: 'TCGA-24-2020',
  6: 'TCGA-24-2023',
  7: 'TCGA-24-2026',
  8: 'TCGA-24-2027',
  9: 'TCGA-30-1860',
  10: 'TCGA-30-1891',
  11: 'TCGA-31-1944',
  12: 'TCGA-31-1951',
  13: 'TCGA-61-1721',
  14: 'TCGA-61-1724',
  15: 'TCGA-61-1743',
  16: 'TCGA-61-1918',
  17: 'TCGA-61-1919',
  18: 'TCGA-09-2054',
  19: 'TCGA-09-2056',
  20: 'TCGA-23-2077',
  21: 'TCGA-23-2081',
  22: 'TCGA-24-2024',
  23: 'TCGA-24-2033',
  24: 'TCGA-24-2036',
  25: 'TCGA-24-2038',
  26: 'TCGA-24-2254',
  27: 'TCGA-24-2261',
  28: 'TCGA-61-1736',
  29: 'TCGA-61-1995',
  30: 'TCGA-61-2000',
  31: 'TCGA-61-2009',
  32: 'TCGA-61-2012',
  33: 'TCGA-61-2088',
  34: 'TCGA-61-2092',
  35: 'TCGA-61-2094',
  36

In [10]:
print(meth.shape)
print(prot.shape)
print(mirna.shape)
print(exp.shape)
print(tumor.shape)

(2437, 1739)
(2437, 131)
(2437, 215)
(2437, 2000)
(2437,)


In [11]:
knn = KNNImputer().set_output(transform="pandas")
meth = knn.fit_transform(meth)
meth.head()

Unnamed: 0,cg00003994,cg00024396,cg00035623,cg00047050,cg00079563,cg00095674,cg00112517,cg00117172,cg00119079,cg00121640,...,cg27522780,cg27529628,cg27543230,cg27544190,cg27555365,cg27560922,cg27574244,cg27626299,cg27654142,cg27662877
TCGA-13-0799,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
TCGA-13-0800,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
TCGA-13-0801,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
TCGA-24-1924,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
TCGA-24-1930,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
exp = knn.fit_transform(exp)
exp.head()

Unnamed: 0,KIAA1324|57535,ESR1|2099,CP|1356,LTF|4057,LYPD3|27076,MAPT|4137,S100A1|6271,CRABP2|1382,IGF2BP2|10644,MYB|4602,...,RGPD5|84220,IGSF3|3321,TIAM1|7074,CD74|972,PAG1|55824,FKBP1B|2281,LYPD5|284348,SKA3|221150,PCOLCE|5118,LOC339524|339524
TCGA-13-0799,-3.589232,3.20439,-1.101034,-4.117261,-1.500277,-2.866501,2.03,-1.252664,0.996037,1.1,...,-5.976818,-1.66,1.18,-0.643078,-1.895562,0.167036,1.322826,1.052315,-2.020328,1.99
TCGA-13-0800,1.140768,3.69439,-0.431034,-1.667261,0.119723,-0.056501,2.6,1.727336,3.776037,0.06,...,-8.066818,-0.9,0.24,-3.403078,-2.835562,2.197036,0.762826,1.622315,-0.220328,2.05
TCGA-13-0801,2.940768,3.504391,0.708966,2.322739,1.849723,-0.196501,5.78,2.127336,-5.303963,2.01,...,-6.808818,-2.46,1.0,1.936922,-1.685562,1.917036,-0.087174,1.732315,-0.820328,0.16
TCGA-24-1924,1.000768,4.65439,2.618966,-0.467261,1.869723,-4.576501,6.28,3.877336,0.236037,2.24,...,-5.236818,-2.25,0.96,1.336922,-0.845562,1.387036,-0.437174,1.092315,-1.110329,1.61
TCGA-24-1930,-6.129232,4.424391,4.018966,-1.387261,-1.860278,-1.926501,4.95,3.117336,0.296037,1.03,...,-5.836818,-2.03,-0.06,1.526922,0.164438,-0.282964,1.692826,0.182315,1.059671,0.16


In [13]:
meth.to_csv("imvc/datasets/data/tcga/tcga_3.csv", index= False)
prot.to_csv("imvc/datasets/data/tcga/tcga_2.csv", index= False)
mirna.to_csv("imvc/datasets/data/tcga/tcga_0.csv", index= False)
exp.to_csv("imvc/datasets/data/tcga/tcga_1.csv", index= False)
tumor.to_csv("imvc/datasets/data/tcga/tcga_y.csv", index= False)
with open('imvc/datasets/data/tcga/metadata.json', 'w') as fp:
    json.dump(met, fp)

In [None]:
from mvlearn.datasets import make_gaussian_mixture
import numpy as np
n_samples = 100
centers = [[0,1], [0,-1]]
covariances = [np.eye(2), np.eye(2)]
Xs, y = make_gaussian_mixture(n_samples, centers, covariances,
                              shuffle=True, shuffle_random_state=42)
print(y)

In [None]:
pd.Series(y).astype(int).to_csv(f"imvc/datasets/data/simulated_gm/simulated_gm_y.csv",  index= False)

In [None]:
[pd.DataFrame(X).to_csv(f"imvc/datasets/data/simulated_gm/simulated_gm_{i}.csv", index= False) for i,X in enumerate(Xs)]

In [None]:
%%R

library("coca")

clusters = coca(arrays, K = 2)

In [None]:
Model(alg_name = "IntNMF", alg = )

In [None]:
for i in os.listdir("imvc/datasets/data/buaa"):
    if "y.csv" in i:
        continue
    if ".csv" in i:
        pd.read_csv(os.path.join("imvc/datasets/data/buaa", i), index_col= 0).iloc[:90].to_csv(os.path.join("imvc/datasets/data/buaa", i), index= False)

In [None]:
import os
import pandas as pd
for i in os.listdir("imvc/datasets/data/"):
    print(i)
    for j in os.listdir(os.path.join("imvc/datasets/data/", i)):
        if os.path.isfile(os.path.join("imvc/datasets/data/", i, j)):
            a = pd.read_csv(os.path.join("imvc/datasets/data/", i, j))
            print(j, a.shape)
    print()

In [None]:
import os
import pandas as pd
for i in os.listdir("imvc/datasets/data/"):
    print(i)
    for j in os.listdir(os.path.join("imvc/datasets/data/", i)):
        if os.path.isfile(os.path.join("imvc/datasets/data/", i, j)):
            a = pd.read_csv(os.path.join("imvc/datasets/data/", i, j))
            print(j, a.shape)
    print()

In [None]:
metadata = {}
metadata = {"modality": {0: "morphological features", 1: "Karhunen-Love coefficients", 2: "profile correlations", 3: "Zernike moments", 4: "Fourier coefficients of the character shapes", 5: "pixel averages of the images from 2x3 windows"}}
import json

with open(os.path.join("imvc/datasets/data/digits", 'metadata.json'), 'w') as fp:
    json.dump(metadata, fp)

In [None]:
bb[["Patient ID", "Pam50 + Claudin-low subtype"]]

In [None]:
for i,name in enumerate(["X", "Ya"]):
    x = scipy.io.loadmat(f'imvc/datasets/data/bdgp/{name}.mat')[name]
    print(x.shape)
    pd.DataFrame(x).to_csv(f'imvc/datasets/data/bdgp/bdgp_{i}.csv', index= False)
x = scipy.io.loadmat(f'imvc/datasets/data/bdgp/Yc.mat')["Yc"]
pd.DataFrame(x.argmax(1)).to_csv(f'imvc/datasets/data/bdgp/bdgp_y.csv', index= False)

In [None]:
path = "imvc/datasets/data/tcga"
files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
for i,x in enumerate(["exp", "methy", "mirna"]):
    target = []
    files_x = [os.path.join(path, file) for file in files if file.startswith(x)]
    ds = []
    for file_x in files_x:
        d_x = pd.read_csv(file_x, index_col= 0).T
        print(file_x, d_x.shape)
        target.extend([file_x.split("_")[-1]]* d_x.shape[0])
        ds.append(d_x)
    d = pd.concat(ds)
    print(x, d.shape)
    d = d.dropna(axis= 1)
    print(x, d.shape)
    d.to_csv(os.path.join(path, f'tcga_{i}.csv'))
pd.Series(target).to_csv(os.path.join(path, 'tcga_y.csv'))

In [None]:
path = "imvc/datasets/data/tcga"
files = [os.path.join(path, f) for f in os.listdir(path) if f.startswith("tcga_")]
d = pd.concat([pd.read_csv(file) for file in files], axis= 1)
for i,file in enumerate(files):
    d_x = pd.read_csv(file)
    print(file, d_x.shape, d_x.loc[d.index])
    d_x.loc[d.index].to_csv(file)

In [None]:
path = "imvc/datasets/data/tcga"
for i,x in enumerate([f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]):
    with ZipFile(os.path.join(path, x)) as zf:
        for file in zf.namelist():
            with zf.open(file) as f2:
                d = pd.read_csv(f2, sep= " ")
                print(file, d.shape)
                d.to_csv(f"{os.path.join(path, file)}_{x.split('.')[0]}.csv")

In [None]:
path = "imvc/datasets/data/digits"
for i,x in enumerate([f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]):
    d = pd.read_csv(os.path.join(path, x))
    print(d.shape)
    d.iloc[:, :-1].to_csv(os.path.join(path, f"digits_{i}.csv"), index= False)
d.iloc[:, -1].to_csv(os.path.join(path, f"digits_y.csv"), index= False)

In [None]:
x = scipy.io.loadmat(f'imvc/datasets/data/bdgp/Yc.mat')["Yc"]
x.argmax(1)

In [None]:
[f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]

In [None]:
os.listdir(path)

In [None]:
mat["X"][0][0].shape

In [None]:
mat["X"][0][1].shape

In [None]:
import pyreadr

In [None]:
mat = pyreadr.read_r('imvc/datasets/data/metabric/METABRIC_discovery')
mat

In [None]:
mat["mydatCNV"]

In [None]:
import pandas as pd

In [None]:
pd.DataFrame(mat["Y"]).squeeze().value_counts()

In [None]:
mat = scipy.io.loadmat('imvc/datasets/data/bdgp/X.mat')
mat["X"].shape

In [None]:
mat = scipy.io.loadmat('imvc/datasets/data/bdgp/Yc.mat')
mat["Yc"][0]

In [None]:
mat["Yc"].shape