In [None]:
import os.path
import time
from collections import defaultdict
import argparse
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer
from sklearn.cluster import KMeans, spectral_clustering
from mvlearn.decomposition import AJIVE, GroupPCA
from mvlearn.cluster import MultiviewSpectralClustering, MultiviewCoRegSpectralClustering
from snf import compute
from bignmf.models.jnmf.integrative import IntegrativeJnmf
from bignmf.models.jnmf.standard import StandardJnmf
from imvc.datasets import LoadDataset
from imvc.utils import DatasetUtils
from imvc.transformers import MultiViewTransformer, ConcatenateViews
from imvc.algorithms import NMFC

from utils import save_record, run_iteration

# Every artefact of the benchmark (results table, log and error files) is
# written under this folder, relative to the working directory.
folder_name = "results"
# NOTE: "filelame" looks like a typo for "filename"; the name is kept because
# other cells of this notebook define and use the same spelling.
filelame = "complete_algorithms_evaluation.csv"
file_path = os.path.join(folder_name, filelame)
logs_file = os.path.join(folder_name, 'logs.txt')
error_file = os.path.join(folder_name, 'error.txt')

# Base seed; the per-run seed is derived further down as random_state + run_n.
random_state = 42

parser = argparse.ArgumentParser()
parser.add_argument('start_benchmarking', default= False, type= bool)
args = parser.parse_args()

# Experiment grid. A dataset entry of the form "<name>_<target>" is split on
# "_" by the driver loop below: the first part is passed to the loader and the
# second part selects the label column.
datasets = ["nutrimouse_genotype", "nutrimouse_diet", "bbcsport", "bdgp", "caltech101", "digits", "tcga_tissue", "tcga_survival", "nuswide", "metabric"]
# Missing-data percentages 0, 10, ..., 90.
probs = np.arange(100, step= 10)
# Each configuration is recorded both with imputation=True and imputation=False.
imputation = [True, False]
# Run indices 0..9.
runs_per_alg = np.arange(10)
# Algorithm registry: name -> {"alg": pipeline, "params": {}}.
# "SNF", "intNMF" and "jNMF" map to empty dicts: they have no pipeline here and
# only contribute their names to the result index built below.
algorithms = {
    "Concat": {"alg": make_pipeline(ConcatenateViews(),
                                    StandardScaler().set_output(transform='pandas'),
                                    KMeans()), "params": {}},
    "NMFC": {"alg": make_pipeline(ConcatenateViews(),
                                  MinMaxScaler().set_output(transform='pandas'),
                                  NMFC().set_output(transform='pandas')), "params": {}},
    "MVSpectralClustering": {"alg": make_pipeline(MultiViewTransformer(StandardScaler().set_output(transform= "pandas")),
                                                  MultiviewSpectralClustering()),
                             "params": {}},
    "MVCoRegSpectralClustering": {"alg": make_pipeline(MultiViewTransformer(StandardScaler().set_output(transform= "pandas")),
                                                       MultiviewCoRegSpectralClustering()),
                                  "params": {}},
    "GroupPCA": {"alg": make_pipeline(MultiViewTransformer(StandardScaler()), GroupPCA(), StandardScaler(), KMeans()),
                 "params": {}},
    "AJIVE": {"alg": make_pipeline(MultiViewTransformer(StandardScaler()), AJIVE(), MultiViewTransformer(FunctionTransformer(pd.DataFrame)),
                                   ConcatenateViews(), StandardScaler(), KMeans()),
              "params": {}},
    "SNF": {},
    "intNMF": {},
    "jNMF": {},
}
# Levels of the results MultiIndex, in index order.
indexes_results = {"dataset": datasets, "algorithm": list(algorithms.keys()),
                   "missing_percentage": probs, "imputation": imputation, "run_n": runs_per_alg}


if args.start_benchmarking:
    # Fresh start: build the full cartesian product of all index levels, one
    # row per (dataset, algorithm, missing_percentage, imputation, run_n).
    results = pd.DataFrame(datasets, columns= ["dataset"])
    for k,v in {k:v for k,v in indexes_results.items() if k != "dataset"}.items():
        results = results.merge(pd.Series(v, name= k), how= "cross")
    results = results.set_index(list(indexes_results.keys()))
    results[["finished", "completed"]] = False
else:
    # Resume: reload the table written by a previous run.
    results = pd.read_csv(file_path, index_col= list(indexes_results.keys()))
    # Object columns (except "comments" and "stratified") are turned back into
    # Python objects; missing cells become the text "np.nan" so that they
    # evaluate to np.nan.
    # SECURITY NOTE(review): eval() executes whatever text is stored in the
    # CSV. Only load result files this script wrote itself.
    results_ = results.select_dtypes(object).drop(columns= ["comments", "stratified"]).replace(np.nan, "np.nan")
    for col in results_.columns:
        results[col] = results_[col].apply(eval)
        
    # Opening in 'w' mode truncates (or creates) the log and error files. This
    # happens only on the resume branch, not when start_benchmarking is set.
    open(logs_file, 'w').close()
    open(error_file, 'w').close()

# Rows still to be computed.
unfinished_results = results.loc[~results["finished"]]

# Run every pending configuration, one dataset at a time so each dataset is
# loaded only once.
pending_dataset_level = unfinished_results.index.get_level_values("dataset")

for dataset_name in pending_dataset_level.unique():
    # "<name>_<target>" selects a label column; a bare "<name>" uses column "0".
    name_parts = dataset_name.split("_")
    if len(name_parts) > 1:
        x_name, y_name = name_parts
    else:
        x_name, y_name = name_parts[0], "0"

    Xs, y = LoadDataset.load_dataset(dataset_name=x_name, return_y=True, shuffle= False)
    y = y[y_name]
    n_clusters = y.nunique()

    # Index tuples of the pending rows belonging to this dataset.
    rows_of_dataset = unfinished_results.loc[pending_dataset_level == dataset_name]
    iterator = pd.Series(rows_of_dataset.index.to_list())
    for pending_idx in iterator:
        run_iteration(idx= pending_idx, results= results, Xs=Xs, y=y, n_clusters=n_clusters,
                      algorithms=algorithms, random_state=random_state, file_path=file_path,
                      logs_file=logs_file, error_file=error_file)


In [None]:
from datetime import datetime

In [None]:
y["0"]

In [None]:
# Inline copy of the per-row iteration body, unrolled into a loop over
# `iterator` (the pending index tuples of the last dataset processed by the
# driver cell) so intermediate variables can be inspected.
#
# The early exits of the function version were left as commented-out `return`s.
# In a loop they must be `continue`: without them a failed row ran on with an
# undefined `strat` / `clusters`, and the chained `if ... if ... if ... else:
# raise` re-raised AJIVE and SNF errors it had just recorded.

# A ValueError raised by one of these algorithms is an expected failure (it is
# recorded, not re-raised) when fewer samples than the stated minimum remain.
min_samples_by_algorithm = {"AJIVE": 5, "SNF": 17, "intNMF": 5}

for idx in iterator:
    row = results.loc[[idx]]
    row_index = row.index
    alg_name, impute, p, run_n = (
        row_index.get_level_values("algorithm")[0],
        row_index.get_level_values("imputation")[0],
        row_index.get_level_values("missing_percentage")[0] / 100,  # percentage -> fraction
        row_index.get_level_values("run_n")[0])

    # Progress print. p is a fraction and run_n an integer run index, so with
    # the grids configured in this notebook they are equal only when both are 0.
    if p == run_n and impute:
        print(row.drop(columns=row.columns).reset_index().to_dict(orient="records")[0], "\t", datetime.now())
    alg = algorithms[alg_name]
    train_Xs = DatasetUtils.shuffle_imvd(Xs=Xs, random_state=random_state + run_n)
    y_train = y.loc[train_Xs[0].index]
    errors_dict = defaultdict(int)
    if p != 0:
        try:
            assert n_clusters < len(train_Xs[0]) * (1-p)
        except AssertionError as exception:
            errors_dict[f"{type(exception).__name__}: {exception}; n_clusters < len(train_Xs[0]) * (1-p)"] += 1
            results.loc[idx, ["finished", "comments"]] = True, errors_dict
            results.to_csv(file_path)
            continue
        try:
            # First attempt: call with stratify=y_train.
            train_Xs = DatasetUtils.add_random_noise_to_views(Xs=train_Xs, p=round(p, 2),
                                                              random_state=random_state + run_n,
                                                              assess_percentage=True, stratify=y_train)
            strat = True
        except ValueError:
            # The stratified call raised ValueError: retry without stratify.
            try:
                train_Xs = DatasetUtils.add_random_noise_to_views(Xs=train_Xs, p=round(p, 2),
                                                                  random_state=random_state + run_n,
                                                                  assess_percentage=True)
                strat = False
            except Exception as exception:
                errors_dict[f"{type(exception).__name__}: {exception}"] += 1
                results.loc[idx, ["finished", "comments"]] = True, errors_dict
                results.to_csv(file_path)
                continue
    else:
        strat = False

    if impute:
        train_Xs = MultiViewTransformer(SimpleImputer(strategy="mean").set_output(transform="pandas")).fit_transform(
            train_Xs)
    else:
        train_Xs = DatasetUtils.select_complete_samples(Xs=train_Xs)
        y_train = y_train.loc[train_Xs[0].index]

    try:
        start_time = time.perf_counter()
        if alg_name == "SNF":
            preprocessing_step = MultiViewTransformer(StandardScaler().set_output(transform="pandas"))
            train_Xs = preprocessing_step.fit_transform(train_Xs)
            affinities = compute.make_affinity(train_Xs, normalize=False)
            fused = compute.snf(affinities)
            clusters = spectral_clustering(fused, n_clusters=n_clusters, random_state=random_state + run_n)
        elif alg_name == "intNMF":
            preprocessing_step = MultiViewTransformer(MinMaxScaler().set_output(transform="pandas"))
            train_Xs = preprocessing_step.fit_transform(train_Xs)
            model = IntegrativeJnmf({k: v for k, v in enumerate(train_Xs)}, k=n_clusters, lamb=0.1)
            model.run(trials=50, iterations=100, verbose=False)
            model.cluster_data()
            clusters = np.argmax(model.w_cluster, axis=1)
        elif alg_name == "jNMF":
            preprocessing_step = make_pipeline(MultiViewTransformer(MinMaxScaler().set_output(transform="pandas")))
            train_Xs = preprocessing_step.fit_transform(train_Xs)
            model = StandardJnmf({k: v for k, v in enumerate(train_Xs)}, k=n_clusters)
            model.run(trials=50, iterations=100, verbose=False)
            model.cluster_data()
            clusters = np.argmax(model.w_cluster, axis=1)
        else:
            model, params = alg["alg"], alg["params"]
            if alg_name == "GroupPCA":
                model[1].set_params(n_components=n_clusters, random_state=random_state + run_n, multiview_output=False)
            elif alg_name == "AJIVE":
                model[1].set_params(random_state=random_state + run_n)
            # The last pipeline step is the clusterer; NMFC is configured with
            # n_components, every other one with n_clusters.
            if alg_name == "NMFC":
                model[-1].set_params(n_components=n_clusters, random_state=random_state + run_n)
            else:
                model[-1].set_params(n_clusters=n_clusters, random_state=random_state + run_n)
            clusters = model.fit_predict(train_Xs)
    except ValueError as exception:
        if len(y_train) < min_samples_by_algorithm.get(alg_name, 0):
            # Expected failure on a tiny sample: record it and move on.
            errors_dict[f"{type(exception).__name__}: {exception}"] += 1
            results.loc[idx, ["finished", "comments"]] = True, errors_dict
            results.to_csv(file_path)
            continue
        # Anything else is unexpected and must surface.
        raise

    clusters = pd.Series(clusters, index=y_train.index)

    elapsed_time = time.perf_counter() - start_time

    # Feature matrix handed to save_record together with the cluster labels.
    if alg_name in ["NMFC"]:
        train_X = model.transform(train_Xs)
    elif alg_name in ["SNF"]:
        train_X = preprocessing_step.transform(train_Xs)
    elif alg_name in ["intNMF", "jNMF"]:
        train_X = model.w
    else:
        train_X = model[:-1].transform(train_Xs)
    if isinstance(train_X, list):
        train_X = ConcatenateViews().fit_transform(train_X)
    if not isinstance(train_X, pd.DataFrame):
        train_X = pd.DataFrame(train_X, index=y_train.index)

    assert train_X.index.equals(y_train.index)
    assert train_Xs[0].index.equals(y_train.index)

    if p > 0:
        # Reference row: same configuration with missing_percentage == 0.
        best_solution = pd.MultiIndex.from_arrays(
            [[row_index.get_level_values(level=level)[0]] if level != "missing_percentage" else [0]
             for level in row_index.names], names=row_index.names)
        best_solution = results.loc[best_solution].iloc[0]
        y_train_total = pd.Series(best_solution["y_true"], index=best_solution["y_pred_idx"])
        best_solution = pd.Series(best_solution["y_pred"], index=best_solution["y_pred_idx"])
    else:
        best_solution = None
        y_train_total = None

    # NOTE(review): dict_results is computed but not written back to `results`
    # in this cell; presumably intentional for debugging - confirm.
    dict_results = save_record(train_Xs=train_Xs, train_X=train_X, clusters=clusters, y=y_train, p=p, y_true_total=y_train_total,
                               best_solution=best_solution, elapsed_time=elapsed_time, strat=strat,
                               random_state=random_state, errors_dict=errors_dict)


In [None]:
train_Xs[0]

In [None]:
model.initialize_wh()

In [None]:
model.update_weights()

In [None]:
model.w

In [None]:
dict(my_dict)

In [None]:
results.select_dtypes(object).drop(columns= ["comments"])

In [None]:
from datetime import datetime

# datetime object containing current date and time
now = datetime.now()
print(now)

In [None]:
def fun(x: int):
    """Always fail, so the next cell can demonstrate exception reporting.

    Dividing a ``str`` by an ``int`` raises ``TypeError``. The argument ``x``
    is accepted but never used.
    """
    text_operand, divisor = "4", 3
    text_operand / divisor  # raises TypeError on every call
    return None

In [None]:
# Demonstrates the "<ExceptionName> : <message>" format.
# NOTE: fun is called here without an argument, while the definition in this
# notebook declares one required parameter; the TypeError printed is therefore
# the missing-argument error, not the one raised inside the function body.
try:
    fun()
except Exception as exception:
    print(type(exception).__name__, ":", exception)

In [None]:
defaultdict(defaultdict)

In [None]:
import os.path
import time
from collections import defaultdict
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer
from sklearn.cluster import KMeans, spectral_clustering
from mvlearn.decomposition import AJIVE, GroupPCA
from mvlearn.cluster import MultiviewSpectralClustering, MultiviewCoRegSpectralClustering
from snf import compute
from bignmf.models.jnmf.integrative import IntegrativeJnmf
from bignmf.models.jnmf.standard import StandardJnmf
from imvc.datasets import LoadDataset
from imvc.utils import DatasetUtils
from imvc.transformers import MultiViewTransformer, ConcatenateViews
from imvc.algorithms import NMFC

from utils import save_record
# from utils import GroupPCA

# Second copy of the benchmark configuration. It differs from the first cell of
# this notebook in that the start flag is a constant instead of a command-line
# argument and no log/error files are used.
folder_name = "results"
filelame = "complete_algorithms_evaluation.csv"
file_path = os.path.join(folder_name, filelame)

# Base seed; the per-run seed is random_state + run_n.
random_state = 42
# True: build an empty results grid. False: resume from the CSV at file_path.
START_BENCHMARKING = False

datasets = ["nutrimouse_genotype", "nutrimouse_diet", "bbcsport", "bdgp", "caltech101", "digits", "tcga_tissue", "tcga_survival", "nuswide", "metabric"]
# Missing-data percentages 0, 10, ..., 90.
probs = np.arange(100, step= 10)
imputation = [True, False]
# Run indices 0..9.
runs_per_alg = np.arange(10)
# name -> {"alg": pipeline, "params": {}}; "SNF", "intNMF" and "jNMF" have no
# pipeline and are handled by explicit branches in the loop below.
algorithms = {
    "Concat": {"alg": make_pipeline(ConcatenateViews(),
                                    StandardScaler().set_output(transform='pandas'),
                                    KMeans()), "params": {}},
    "NMFC": {"alg": make_pipeline(ConcatenateViews(),
                                  MinMaxScaler().set_output(transform='pandas'),
                                  NMFC().set_output(transform='pandas')), "params": {}},
    "MVSpectralClustering": {"alg": make_pipeline(MultiViewTransformer(StandardScaler().set_output(transform= "pandas")),
                                                  MultiviewSpectralClustering()),
                             "params": {}},
    "MVCoRegSpectralClustering": {"alg": make_pipeline(MultiViewTransformer(StandardScaler().set_output(transform= "pandas")),
                                                       MultiviewCoRegSpectralClustering()),
                                  "params": {}},
    "GroupPCA": {"alg": make_pipeline(MultiViewTransformer(StandardScaler()), GroupPCA(), StandardScaler(), KMeans()),
                 "params": {}},
    "AJIVE": {"alg": make_pipeline(MultiViewTransformer(StandardScaler()), AJIVE(), MultiViewTransformer(FunctionTransformer(pd.DataFrame)),
                                   ConcatenateViews(), StandardScaler(), KMeans()),
              "params": {}},
    "SNF": {},
    "intNMF": {},
    "jNMF": {},
}
# Levels of the results MultiIndex, in index order.
indexes_results = {"dataset": datasets, "algorithm": list(algorithms.keys()),
                   "missing_percentage": probs, "imputation": imputation, "run_n": runs_per_alg}


if START_BENCHMARKING:
    # Cartesian product of all index levels, one row per configuration.
    results = pd.DataFrame(datasets, columns= ["dataset"])
    for k,v in {k:v for k,v in indexes_results.items() if k != "dataset"}.items():
        results = results.merge(pd.Series(v, name= k), how= "cross")
    results = results.set_index(list(indexes_results.keys()))
    results[["finished", "completed"]] = False
else:
    results = pd.read_csv(file_path, index_col= list(indexes_results.keys()))
    # Object columns except "comments" are turned back into Python objects;
    # missing cells become the text "np.nan" so they evaluate to np.nan.
    # SECURITY NOTE(review): eval() executes whatever text is stored in the
    # CSV. Only load result files this notebook wrote itself.
    results_ = results.select_dtypes(object).drop(columns= "comments").replace(np.nan, "np.nan")
    for col in results_.columns:
        results[col] = results_[col].apply(eval)

# Rows still to be computed.
unfinished_results = results.loc[~results["finished"]]

# Inline benchmark loop: for each pending row, corrupt the data, optionally
# impute, cluster, and write the metrics back to `results` and to file_path.
#
# Fixes relative to the previous revision of this cell:
#   * a leftover bare `raise` in the intNMF branch aborted every intNMF run
#     (with no active exception it raises RuntimeError, which the ValueError
#     handler below does not catch);
#   * the MinMax preprocessing for intNMF / jNMF was constructed but never
#     applied to the data; it is now fitted and applied, as the SNF branch
#     already does with its scaler;
#   * the three identical error-handling branches are driven by one table.

# A ValueError raised by one of these algorithms is an expected failure (it is
# recorded, not re-raised) when fewer samples than the stated minimum remain.
min_samples_by_algorithm = {"AJIVE": 5, "SNF": 17, "intNMF": 5}

for dataset_name in unfinished_results.index.get_level_values("dataset").unique():
    Xs, y = LoadDataset.load_dataset(dataset_name=dataset_name.split("_")[0], return_y=True, shuffle= False)
    y = pd.DataFrame(y)
    # NOTE(review): every label column is evaluated against the same result
    # rows, so with several targets the last column overwrites the earlier
    # ones. The "<name>_<target>" suffix of dataset_name is not used to pick a
    # column here - confirm whether that is intended.
    for target in y.columns:
        y_series = y[target].squeeze()
        n_clusters = y_series.nunique()

        pending_rows = unfinished_results.loc[unfinished_results.index.get_level_values("dataset") == dataset_name]
        for idx in pending_rows.index:
            row = results.loc[[idx]]
            row_index = row.index
            print(row.drop(columns= row.columns).reset_index().to_dict(orient="records")[0])
            alg_name, impute, p, run_n = (
                row_index.get_level_values("algorithm")[0],
                row_index.get_level_values("imputation")[0],
                row_index.get_level_values("missing_percentage")[0]/100,  # percentage -> fraction
                row_index.get_level_values("run_n")[0])

            alg = algorithms[alg_name]
            train_Xs = DatasetUtils.shuffle_imvd(Xs=Xs, random_state= random_state + run_n)
            y_train = y_series.loc[train_Xs[0].index]
            errors_dict = defaultdict(int)
            if p != 0:
                try:
                    train_Xs = DatasetUtils.add_random_noise_to_views(Xs=train_Xs, p= round(p, 2),
                                                                      random_state =random_state + run_n,
                                                                      assess_percentage = True, stratify = y_train)
                except Exception as exception:
                    errors_dict[type(exception).__name__] += 1
                    print(errors_dict)
                    results.loc[idx, "finished"] = True
                    results.to_csv(file_path)
                    continue

            if impute:
                train_Xs = MultiViewTransformer(SimpleImputer(strategy="mean").set_output(transform= "pandas")).fit_transform(train_Xs)
            else:
                train_Xs = DatasetUtils.select_complete_samples(Xs = train_Xs)
                y_train = y_train.loc[train_Xs[0].index]

            try:
                start_time = time.perf_counter()
                if alg_name == "SNF":
                    preprocessing_step = MultiViewTransformer(StandardScaler().set_output(transform= "pandas"))
                    train_Xs = preprocessing_step.fit_transform(train_Xs)
                    affinities = compute.make_affinity(train_Xs, normalize= False)
                    fused = compute.snf(affinities)
                    clusters = spectral_clustering(fused, n_clusters=n_clusters, random_state=random_state + run_n)
                elif alg_name == "intNMF":
                    preprocessing_step = MultiViewTransformer(MinMaxScaler().set_output(transform= "pandas"))
                    train_Xs = preprocessing_step.fit_transform(train_Xs)
                    model = IntegrativeJnmf({k:v for k,v in enumerate(train_Xs)}, k= n_clusters, lamb = 0.1)
                    model.run(trials = 50, iterations = 100, verbose=False)
                    model.cluster_data()
                    clusters = np.argmax(model.w_cluster, axis= 1)
                elif alg_name == "jNMF":
                    preprocessing_step = make_pipeline(MultiViewTransformer(MinMaxScaler().set_output(transform= "pandas")))
                    train_Xs = preprocessing_step.fit_transform(train_Xs)
                    model = StandardJnmf({k:v for k,v in enumerate(train_Xs)}, k= n_clusters)
                    model.run(trials = 50, iterations = 100, verbose=False)
                    model.cluster_data()
                    clusters = np.argmax(model.w_cluster, axis= 1)
                else:
                    model, params = alg["alg"], alg["params"]
                    if alg_name == "GroupPCA":
                        model[1].set_params(n_components=n_clusters, random_state=random_state + run_n, multiview_output=False)
                    elif alg_name == "AJIVE":
                        model[1].set_params(random_state=random_state + run_n)
                    # The last pipeline step is the clusterer; NMFC is
                    # configured with n_components, the others with n_clusters.
                    if alg_name == "NMFC":
                        model[-1].set_params(n_components=n_clusters, random_state=random_state + run_n)
                    else:
                        model[-1].set_params(n_clusters=n_clusters, random_state=random_state + run_n)
                    clusters = model.fit_predict(train_Xs)
            except ValueError as exception:
                if len(y_train) < min_samples_by_algorithm.get(alg_name, 0):
                    # Expected failure on a tiny sample: record it and move on.
                    errors_dict[type(exception).__name__] += 1
                    print(errors_dict)
                    results.loc[idx, "finished"] = True
                    results.to_csv(file_path)
                    continue
                # Anything else is unexpected and must surface.
                raise

            clusters = pd.Series(clusters, index= y_train.index)

            elapsed_time = time.perf_counter() - start_time

            # Feature matrix handed to save_record together with the labels.
            if alg_name in ["NMFC"]:
                train_X = model.transform(train_Xs)
            elif alg_name in ["SNF"]:
                train_X = preprocessing_step.transform(train_Xs)
            elif alg_name in ["intNMF", "jNMF"]:
                train_X = model.w
            else:
                train_X = model[:-1].transform(train_Xs)
            if isinstance(train_X, list):
                train_X = ConcatenateViews().fit_transform(train_X)
            if not isinstance(train_X, pd.DataFrame):
                train_X = pd.DataFrame(train_X, index= y_train.index)

            assert train_X.index.equals(y_train.index)
            assert train_Xs[0].index.equals(y_train.index)

            if p > 0:
                # Reference row: same configuration with missing_percentage == 0,
                # restricted to the samples that are present in this run.
                best_solution = pd.MultiIndex.from_arrays([[row_index.get_level_values(level= level)[0]] if level != "missing_percentage" else [0]
                                                           for level in row_index.names], names= row_index.names)
                best_solution = results.loc[best_solution].iloc[0]
                best_solution = pd.Series(best_solution["y_pred"], index= best_solution["y_pred_idx"])
                best_solution = best_solution.loc[train_X.index]
            else:
                best_solution = None

            dict_results = save_record(train_Xs=train_Xs, train_X=train_X, clusters=clusters, y=y_train, p= p,
                              best_solution = best_solution, elapsed_time=elapsed_time,
                              random_state=random_state, errors_dict=errors_dict)
            # Write the metrics into the row and persist after every iteration
            # so an interrupted run can be resumed.
            dict_results = pd.DataFrame(pd.Series(dict_results), columns= row_index).T
            results.loc[[idx], dict_results.columns] = dict_results
            results.loc[idx, "finished"] = True
            results.to_csv(file_path)


In [None]:
np.random.rand(list(model.x.values())[0].shape[0], model.k).shape

In [None]:
# Scratch fragment that reads like the body of a method: `self` and
# `number_of_samples` are not defined anywhere in this cell, so it cannot run
# on its own.
# For every key of self.x it draws a random (k, n_columns) matrix into
# self.h[key] and a random (number_of_samples, k) matrix into self.v[key].
self.v = {}
self.h = {}
for key in self.x:
    self.h[key] = np.random.rand(self.k, self.x[key].shape[1])
    self.v[key] = np.random.rand(number_of_samples, self.k)

In [None]:
from sklearn import metrics
metrics.silhouette_score(train_X, clusters, random_state=random_state)

In [None]:
# Stress test: call model.run 1000 times and print the attempt number, followed
# by the exception message for the attempts that fail. `model` must already
# exist in the kernel (it is not created in this cell).
for i in range(1000):
    try:
        model.run(trials = 50, iterations = 100, verbose=0)
        # clusters = model.fit_predict(train_Xs)
        print(i)
    except Exception as ex:
        # Failures are reported, not re-raised, so the loop keeps counting.
        print(i, ex)
        pass


In [None]:
results[results["finished"] == True]

In [None]:
best_solution.loc[train_X.index]

In [None]:
a = pd.read_csv(file_path, index_col= list(indexes_results.keys()))
a

In [None]:
a["label_sizes"].apply(str).str.replace("nan", "np.nan").apply(eval)

In [None]:
b = a.select_dtypes(object).replace(np.nan, "np.nan").drop(columns= "comments")
b

In [None]:
for i in b.columns:
    print(i)
    b[i].apply(eval)

In [None]:
b.apply(eval, axis= 1)

In [None]:
a = pd.DataFrame(datasets, columns= ["dataset"])
for k,v in {k:v for k,v in indexes_results.items() if k != "dataset"}.items():
    a = a.merge(pd.Series(v, name= k), how= "cross")
a = a.set_index(list(indexes_results.keys()))
a["finished"] = False
for i in a.itertuples():
    print(len(i))
    print(i)
    break

In [None]:
for i in unfinished_results.iterrows():
    print(len(i))
    print(i)
    break

In [None]:
len(best_solution.loc[train_X.index])

In [None]:
i[0]

In [None]:
import scipy.io
import pandas as pd
import numpy as np
import os
from imvc.datasets import LoadDataset
from imvc.utils import DatasetUtils
import copy
from sklearn.model_selection import train_test_split


In [None]:
Xs = DatasetUtils.add_random_noise_to_views(Xs=Xs, p= 0.95, assess_percentage= True, random_state= 42, stratify=y.iloc[:, 1])
Xs[0].head()

In [None]:
# Build an empty progress grid (one row per dataset / missing percentage /
# imputation flag / run) and save it to "pr.csv".
datasets = ["nutrimouse", "bbcsport", "bdgp", "caltech101", "digits", "tcga", "nuswide", "metabric"]
probs = np.arange(0., 1., step=0.1).round(1) * 100
imputation = [True, False]
runs_per_alg = np.arange(10)

# Index levels after "dataset", in the order in which they are cross-joined.
grid_levels = {"missing_percentage": probs, "imputation": imputation, "run_n": runs_per_alg}

results = pd.DataFrame({"dataset": datasets})
for level_name, level_values in grid_levels.items():
    results = results.merge(pd.Series(level_values, name=level_name), how="cross")
results = results.set_index(["dataset", *grid_levels])
results["finished"] = False
results.to_csv("pr.csv")

In [None]:
pd.read_csv("pr.csv", index_col= ['dataset', 'missing_percentage',"imputation", 'run_n'])

In [None]:
Xs[0]

In [None]:
Xs[1]

In [None]:
DatasetUtils.convert_mvd_into_imvd(Xs, p= 0.1, assess_percentage= True, random_state= random_state)

In [None]:
def split_into_groups(num_elements, num_groups):
    """Return the sizes of ``num_groups`` groups that share ``num_elements`` items.

    Sizes differ by at most one; the larger groups come first.
    Example: ``split_into_groups(103, 4) -> [26, 26, 26, 25]``.

    Raises:
        ZeroDivisionError: if ``num_groups`` is 0.
    """
    # Every group gets `per_group`; the first `leftover` groups get one extra.
    per_group, leftover = divmod(num_elements, num_groups)
    return [per_group + 1] * leftover + [per_group] * (num_groups - leftover)

# Example: how many elements land in each of 4 groups when splitting 103.
total_elements, num_of_groups = 103, 4

result = split_into_groups(total_elements, num_of_groups)

# One line per group, numbered from 1.
for group_number, count in enumerate(result, start=1):
    print(f"Group {group_number}: {count} elements")

In [None]:
103 % 4

In [None]:
# Scratch re-computation of the first steps of split_into_groups for 103
# elements and 4 groups. `num_groups` exists only as a parameter of that
# function, so it is defined here to keep the cell runnable on a fresh kernel.
num_elements, num_groups = 103, 4
base_count = num_elements // num_groups
remaining = num_elements % num_groups

# Initialize the list to store the number of elements in each group
groups_count = [base_count] * num_groups

In [None]:
# Load the listed datasets with the loader's p=0.8 / assess_percentage=True
# options. Only "tcga" is active; after the loop Xs and y hold the result for
# the last dataset in the list.
for dataset_name in [
    # "nutrimouse",
    "tcga"
]:
    Xs, y = LoadDataset.load_dataset(dataset_name = dataset_name, return_y = True, p= 0.8, assess_percentage=True)
    # Disabled steps kept for reference:
    # Xs = GetCompleteSamples().fit_transform(Xs)
    # y.to_csv(os.path.join("imvc/datasets/data/", dataset_name, f"{dataset_name}_y.csv"))

In [None]:
# Collect the "Survival" / "Death" columns of every survival table found in the
# original TCGA folder into one frame indexed by patient barcode.
tcga_original_dir = "imvc/datasets/data/tcga/original/"

survs = []
# sorted(): os.listdir returns entries in arbitrary order, and the order decides
# which row drop_duplicates() keeps at the end.
for dataset_name in sorted(os.listdir(tcga_original_dir)):
    if "survival" not in dataset_name:
        continue
    print(dataset_name)
    surv_path = os.path.join(tcga_original_dir, dataset_name)
    surv = pd.read_csv(surv_path, sep= '""', index_col= 0, engine='python')
    if surv.shape[1] <2:
        # The first separator did not split the columns: re-read THIS file as
        # tab-separated. (Previously "survival_lung.csv" was hard-coded here,
        # so any other file taking this path was replaced by the lung data.)
        surv = pd.read_csv(surv_path, sep= "\t", index_col= 0)
    # Normalise barcodes: literal dots -> dashes, upper case. regex=False is
    # explicit because the default of this parameter differs between pandas
    # versions and "." as a regular expression matches every character.
    surv.index = surv.index.str.replace(".", "-", regex=False).str.upper()
    # Keep "TCGA" plus the following 8 characters.
    surv.index = surv.index.str.extract(pat=r"(TCGA.{8})", expand=False)
    surv.index.name = None
    survs.append(surv[["Survival", "Death"]])
# NOTE(review): drop_duplicates() compares the column values, not the index, so
# two different patients with identical Survival/Death are collapsed into one
# row - confirm this is intended.
survs = pd.concat(survs).drop_duplicates()

In [None]:
y.index.intersection(survs.index)

In [None]:
_,met = LoadDataset.load_dataset(dataset_name = "tcga", return_metadata= True, p= 0.8, assess_percentage=True)
met

In [None]:
y[y == 0].dropna()

In [None]:
pattern = r"(TCGA.{8})"
matches = surv.index.str.extract(pat=pattern, expand=False)
matches

In [None]:
# For every dataset except "metabric", print how many samples remain overall
# and how many complete samples remain at each missing-data proportion.
data_dir = "imvc/datasets/data/"
# Select dataset folders by name. The previous `os.listdir(...)[1:]` dropped
# whichever entry the OS happened to list first (listing order is arbitrary),
# which can silently skip a real dataset or keep a hidden folder.
dataset_names = sorted(
    entry for entry in os.listdir(data_dir)
    if os.path.isdir(os.path.join(data_dir, entry))
    and not entry.startswith((".", "_"))
    and entry != "metabric"
)

for dataset_name in dataset_names:
    Xs = LoadDataset.load_dataset(dataset_name = dataset_name, return_y = False)
    for p in np.arange(0., 1., step= 0.1).round(1):
        imvd = DatasetUtils.convert_mvd_into_imvd(Xs, p= p, random_state = 42, assess_percentage = True)
        # NOTE(review): GetCompleteSamples is not imported in any cell visible
        # in this notebook - confirm where it comes from.
        complete_Xs = GetCompleteSamples().fit_transform(imvd)
        # Columns: dataset, p, len(Xs), samples in imvd, samples in complete_Xs.
        print(dataset_name, "\t", p, "\t", len(Xs), "\t", len(DatasetUtils.get_sample_names(imvd)), "\t", len(DatasetUtils.get_sample_names(complete_Xs)))
    print()

In [None]:
# Empty results grid: one row per (dataset, algorithm, missing_percentage, run_n).
probs = np.arange(0., 1., step= 0.1).round(1)
algorithms = ["Concat", "MOFA", "NMFC", "MONET", "MSNE", "SUMO", "NEMO"]
runs_per_alg = np.arange(10).tolist()

data_dir = "imvc/datasets/data/"
# Only real dataset folders: a raw os.listdir() also returns entries such as
# ".ipynb_checkpoints", which then became "datasets" in the grid.
dataset_names = sorted(
    entry for entry in os.listdir(data_dir)
    if os.path.isdir(os.path.join(data_dir, entry)) and not entry.startswith((".", "_"))
)

results = pd.DataFrame(dataset_names, columns= ["dataset"]).merge(
    pd.Series(algorithms, name= "algorithm"), how= "cross").merge(
    pd.Series(probs, name= "missing_percentage"), how= "cross").merge(
    pd.Series(runs_per_alg, name= "run_n"), how= "cross")
results = results.set_index(['dataset', 'algorithm', 'missing_percentage', 'run_n'])
results["finished"] = False
results

In [None]:
results.index.get_level_values("dataset") == ".ipynb_checkpoints"

In [None]:
for _ in a:
    print(_.shape)

In [None]:
GetCompleteSamples().fit_transform(a)

In [None]:
# json is imported here because no cell above this one imports it; the cell
# previously relied on an import that only happens in a later cell.
import json

# Modality names per view index and, for each label column, a mapping from
# integer code to original label value (in order of first appearance).
# `a` must be a frame with "genotype" and "diet" columns; it is not created in
# this cell.
metadata = {"modality": {0: "gene", 1: "lipid"}, "labels": {"genotype": pd.Series(a["genotype"].unique()).to_dict(), "diet": pd.Series(a["diet"].unique()).to_dict()}}

with open(os.path.join("imvc/datasets/data/nutrimouse", 'metadata.json'), 'w') as fp:
    json.dump(metadata, fp)

In [None]:
probs = np.arange(0., 1., step= 0.1).round(1)
runs_per_alg = np.arange(10).tolist()
pd.DataFrame(probs).merge(pd.DataFrame(runs_per_alg), how= "cross")

In [None]:
a = pd.DataFrame(["alg", "pad"])
b = pd.DataFrame(["alg1", "pad2"])
a.merge(b, how= "cross")

In [None]:
import json

# Description of each view of the digits dataset, in view order.
digits_modalities = (
    "morphological features",
    "Karhunen-Love coefficients",
    "profile correlations",
    "Zernike moments",
    "Fourier coefficients of the character shapes",
    "pixel averages of the images from 2x3 windows",
)
metadata = {"modality": dict(enumerate(digits_modalities))}

with open(os.path.join("imvc/datasets/data/digits", 'metadata.json'), 'w') as fp:
    json.dump(metadata, fp)

In [None]:
mat = scipy.io.loadmat('imvc/datasets/data/caltech101/Caltech101-all.mat')
mat

In [None]:
# Convert the Caltech101 .mat file into one CSV per view plus a label CSV.
mat = scipy.io.loadmat('imvc/datasets/data/caltech101/Caltech101-all.mat')
# Each element of mat["X"][0] is written as its own view file
# caltech101_<i>.csv, without index column.
for i,x in enumerate(mat["X"][0]):
    print(x.shape)
    pd.DataFrame(x).to_csv(f'imvc/datasets/data/caltech101/caltech101_{i}.csv', index= False)
# mat["Y"] is written as the label file.
pd.DataFrame(mat["Y"]).to_csv(f'imvc/datasets/data/caltech101/caltech101_y.csv', index= False)

In [None]:
# Export the miRNAseq table as view file "tcga_0.csv".
# NOTE: `path` is not defined in this cell; it must already exist in the kernel.
x = pd.read_csv("imvc/datasets/data/tcga/PanCan.miRNAseq.RPM.215-MIMATs-most-variant-25pc.4229-samples.NMF-input.BCGSC.20140603.csv", index_col= 0)
x.index.name= None
# Column labels: take the part after the first "_" and keep its first 12 characters.
x.columns = x.columns.to_series().apply(lambda x: x.split("_")[1]).str[:12]
# Transpose so the (shortened) column labels become the row index.
x = x.T
# Keep only the first row for each repeated index label.
x = x[~x.index.duplicated(keep='first')]
print(x.shape)
x.to_csv(os.path.join(path, "tcga_0.csv"))
x.head()

In [None]:
# Export the PanCan12 table as view file "tcga_1.csv".
# NOTE: `path` is not defined in this cell; it must already exist in the kernel.
x = pd.read_csv("imvc/datasets/data/tcga/PanCan12.3602-corrected-v3.txt", sep= "\t", index_col= 0, header= [0,1])
x.index.name= None
# Two header rows: drop the first level and keep the first 12 characters of the second.
x.columns = x.columns.droplevel(0).str[:12]
# Transpose so the (shortened) column labels become the row index.
x = x.T
# Keep only the first row for each repeated index label.
x = x[~x.index.duplicated(keep='first')]
print(x.shape)
x.to_csv(os.path.join(path, "tcga_1.csv"))
x.head()

In [None]:
# Write metadata.json (modality names per view index and the label mapping)
# into `path`.
# NOTE: neither `path` nor `classes` is defined in this cell; `classes` is
# assigned by the cell that re-encodes tcga_y.csv, which appears further down.
# NOTE(review): "visual" and "text" look out of place next to "mRNA" and
# "methyl" - confirm the modality names against the tcga_<i>.csv view files.
metadata = {}
metadata = {"modality": {0: "visual", 1: "mRNA", 2: "text", 3: "methyl"}, "labels": classes}
import json

with open(os.path.join(path, 'metadata.json'), 'w') as fp:
    json.dump(metadata, fp)

In [None]:
# Build tcga_2.csv from the PanCan11 RPPA table.
# NOTE(review): `path` is assigned in another cell — confirm it is set before running this.
x = pd.read_csv(
    "imvc/datasets/data/tcga/PanCan11_RBN_RPPA_without_Duplicates_20130325.csv",
    index_col=0,
)
x.index.name = None
# Discard the first five columns and keep the rest.
kept_columns = x.columns[5:]
x = x[kept_columns]
# Keep the first row of each repeated index label.
x = x.loc[~x.index.duplicated(keep='first')]
print(x.shape)
x.to_csv(os.path.join(path, "tcga_2.csv"))
x.head()

In [None]:
# Build tcga_3.csv from the DNA-methylation clustering matrix.
# NOTE(review): `path` is assigned in another cell — confirm it is set before running this.
x = pd.read_csv("imvc/datasets/data/tcga/DNAmethylationClusteringMatrix.csv", index_col=0)
# Cut every column label to its first 12 characters.
x.columns = x.columns.str.slice(0, 12)
# After transposing, those labels become the row index; keep the first row of each
# repeated label.
x = x.T.loc[lambda frame: ~frame.index.duplicated(keep='first')]
print(x.shape)
x.to_csv(os.path.join(path, "tcga_3.csv"))
x.head()

In [None]:
# Replace the values in the first column of tcga_y.csv by integer codes and write the
# result back to the same file; `classes` keeps the code -> original value mapping.
# NOTE(review): the file is overwritten in place, so running this cell a second time
# derives `classes` from the already-encoded values — confirm that is acceptable.
labels_file = os.path.join(path, "tcga_y.csv")
b = pd.read_csv(labels_file, index_col=0)
classes = pd.Series(b.iloc[:, 0].unique()).to_dict()
code_by_value = {value: code for code, value in classes.items()}
encoded = b.iloc[:, 0].replace(code_by_value)
encoded.to_csv(labels_file)

In [None]:
# Display `b` as the cell output.
b

In [None]:
# Narrow `a`, which starts as the row index of tcga_1.csv, to the labels shared by
# tcga_0.csv .. tcga_3.csv, printing how many remain after each step.
a = pd.read_csv(os.path.join(path, "tcga_1.csv"), index_col=0).index
print(len(a))
for i in range(4):
    view_index = pd.read_csv(os.path.join(path, f"tcga_{i}.csv"), index_col=0).index
    a = a.intersection(view_index)
    print(len(a))

In [None]:
# For each tcga_<i>.csv print: its shape, its shape without duplicate rows, the shape of
# the rows whose labels are also in `a`, and that last shape without duplicate rows.
# NOTE(review): `a` and `path` come from other cells — run those first.
for i in range(4):
    b = pd.read_csv(os.path.join(path, f"tcga_{i}.csv"), index_col=0)
    aa = a.intersection(b.index)
    shared_rows = b.loc[aa]
    print(
        i, "\t",
        b.shape, "\t",
        b.drop_duplicates().shape, "\t",
        shared_rows.shape, "\t",
        shared_rows.drop_duplicates().shape,
    )

In [None]:
# Select the rows of `b` labelled "TCGA-13-0791" and show only the columns in which the
# first and the second of those rows differ.
# NOTE(review): this relies on that label matching at least two rows of `b` — confirm.
c = b.loc["TCGA-13-0791"]
c[c.columns[c.iloc[0] != c.iloc[1]]]

In [None]:
# Shape of `b` after selecting the row labels held in `a`.
b.loc[a].shape

In [None]:
# Row labels of that selection that do not occur in `a`.
b.loc[a].index.difference(a)

In [None]:
# Drop duplicate rows of `b`, select the distinct labels of `a`, then drop duplicate rows
# of the result once more.
b.drop_duplicates().loc[a.drop_duplicates()].drop_duplicates()

In [None]:
# Labels common to the row index of `b` and `a`, computed from the side of `b`.
b.index.intersection(a)

In [None]:
# Labels common to `a` and the row index of `b`, computed from the side of `a`.
a.intersection(b.index)

In [None]:
# Read mrna_y.csv, using its first column as the row index, and display it.
x = pd.read_csv("imvc/datasets/data/tcga/mrna_y.csv", index_col= 0)
x

In [None]:
# Read mrna_y.csv and mirna_y.csv, each with its first column as the row index.
x1 = pd.read_csv("imvc/datasets/data/tcga/mrna_y.csv", index_col =0)
x2 = pd.read_csv("imvc/datasets/data/tcga/mirna_y.csv", index_col =0)
# Display the table loaded in this cell. The cell used to display `x`, which it never
# assigns; `x1` is read from the same file with the same arguments as the `x` of the
# preceding cell, so the output is unchanged while the cell no longer needs that cell.
x1

In [None]:
# Row index of `x1`.
x1.index

In [None]:
# Row index of `x2`.
x2.index

In [None]:
# Row labels present in both `x1` and `x2`.
x1.index.intersection(x2.index)

In [None]:
# Write the transpose of `x` to mrna.csv under `path`.
# NOTE(review): `x` and `path` are assigned in other cells — confirm which `x` is meant.
x.T.to_csv(os.path.join(path, "mrna.csv"))

In [None]:
# Save the column labels of `x`, with level 1 removed, to mirna_y.csv under `path`.
# NOTE(review): this needs an `x` whose columns have more than one level; `x` and `path`
# are assigned in other cells — confirm which `x` is meant.
x.columns.droplevel(1).to_series().to_csv(os.path.join(path, "mirna_y.csv"))

In [None]:
# Turn the column labels of `x` into a frame and use its column `1` as the row index.
# NOTE(review): `x` and `path` are assigned in other cells — confirm which `x` is meant.
a = x.columns.to_frame().set_index(1)
# a.columns = [''] * len(a.columns)
a.index.name= None
# Cut every index label to its first 12 characters before saving to mrna_y.csv.
a.index = a.index.str[:12]
a.to_csv(os.path.join(path, "mrna_y.csv"))

In [None]:
# Read the pan-cancer miRNA-seq table, with its first column as the row index, and display it.
pd.read_csv("imvc/datasets/data/tcga/PanCan.miRNAseq.RPM.215-MIMATs-most-variant-25pc.4229-samples.NMF-input.BCGSC.20140603.csv", index_col= 0)

In [None]:
# Count how many column names of the miRNA-seq table share the same text before their
# first "_". No index column is set here, so the first CSV column is counted as well.
pd.read_csv("imvc/datasets/data/tcga/PanCan.miRNAseq.RPM.215-MIMATs-most-variant-25pc.4229-samples.NMF-input.BCGSC.20140603.csv").columns.to_series().apply(lambda x: x.split("_")[0]).value_counts()

In [None]:
# Export the BDGP MATLAB files to CSV: X.mat -> bdgp_0.csv, Ya.mat -> bdgp_1.csv, and the
# per-row argmax of Yc.mat -> bdgp_y.csv.
for i, name in enumerate(["X", "Ya"]):
    mat_contents = scipy.io.loadmat(f'imvc/datasets/data/bdgp/{name}.mat')
    x = mat_contents[name]
    print(x.shape)
    view_frame = pd.DataFrame(x)
    view_frame.to_csv(f'imvc/datasets/data/bdgp/bdgp_{i}.csv', index=False)

x = scipy.io.loadmat('imvc/datasets/data/bdgp/Yc.mat')["Yc"]
# Position of the largest entry in each row is stored as the label of that row.
labels_frame = pd.DataFrame(x.argmax(axis=1))
labels_frame.to_csv('imvc/datasets/data/bdgp/bdgp_y.csv', index=False)

In [None]:
# Assemble one table per data type ("exp", "methy", "mirna") by stacking every file in
# `path` whose name starts with that prefix, and write the labels derived from the file names.
path = "imvc/datasets/data/tcga"
# os.listdir returns names in arbitrary order. The listing is sorted so that the files of
# every data type are stacked in the same order, which keeps the rows of the views and
# the labels aligned and makes the output reproducible.
files = sorted(f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f)))
for i,x in enumerate(["exp", "methy", "mirna"]):
    # Rebuilt for every data type; the labels written below are those of the last one.
    target = []
    files_x = [os.path.join(path, file) for file in files if file.startswith(x)]
    ds = []
    for file_x in files_x:
        # Files are transposed on load so that the stacked table grows by rows.
        d_x = pd.read_csv(file_x, index_col= 0).T
        print(file_x, d_x.shape)
        # Label = text after the last "_" of the file name, without the extension
        # (previously the label kept a trailing ".csv").
        label = os.path.splitext(os.path.basename(file_x))[0].split("_")[-1]
        target.extend([label] * d_x.shape[0])
        ds.append(d_x)
    d = pd.concat(ds)
    print(x, d.shape)
    # Keep only the columns that have a value in every stacked row.
    d = d.dropna(axis= 1)
    print(x, d.shape)
    d.to_csv(os.path.join(path, f'tcga_{i}.csv'))
pd.Series(target).to_csv(os.path.join(path, 'tcga_y.csv'))

In [None]:
# Re-read every "tcga_"-prefixed file in `path`, select rows by the index of their
# column-wise concatenation, and overwrite each file with the selection.
path = "imvc/datasets/data/tcga"
# The "tcga_" prefix would also match a file named tcga_y.csv.
files = [os.path.join(path, f) for f in os.listdir(path) if f.startswith("tcga_")]
# All tables side by side; no index column is set, so rows are aligned on the default index.
d = pd.concat([pd.read_csv(file) for file in files], axis= 1)
for i,file in enumerate(files):
    d_x = pd.read_csv(file)
    # NOTE(review): this prints the whole selected frame, not just its shape — confirm intended.
    print(file, d_x.shape, d_x.loc[d.index])
    # NOTE(review): written with the default index, so the file gains an extra leading
    # column each time this cell runs — confirm intended.
    d_x.loc[d.index].to_csv(file)

In [None]:
from zipfile import ZipFile, is_zipfile

# Convert every member of every zip archive in `path` to a CSV file written next to the
# archives, named "<member>_<archive name up to its first '.'>.csv".
path = "imvc/datasets/data/tcga"
# Only real zip archives are opened. The CSV files produced below land in the same folder,
# so without this check a second run would hand them to ZipFile and fail with BadZipFile.
archives = [
    f for f in os.listdir(path)
    if os.path.isfile(os.path.join(path, f)) and is_zipfile(os.path.join(path, f))
]
for x in archives:
    with ZipFile(os.path.join(path, x)) as zf:
        for file in zf.namelist():
            with zf.open(file) as f2:
                # Members are read as space-separated tables.
                d = pd.read_csv(f2, sep= " ")
                print(file, d.shape)
                d.to_csv(f"{os.path.join(path, file)}_{x.split('.')[0]}.csv")

In [None]:
def _is_digits_output(name):
    """Return True for files generated into the digits folder (digits_<n>.csv, digits_y.csv, metadata.json)."""
    if name == "metadata.json":
        return True
    stem, ext = os.path.splitext(name)
    if ext != ".csv" or not stem.startswith("digits_"):
        return False
    suffix = stem[len("digits_"):]
    return suffix.isdigit() or suffix == "y"


# Split every source table in `path` into features (all columns but the last), written to
# digits_<i>.csv, and write the last column of the last table read to digits_y.csv.
path = "imvc/datasets/data/digits"
# Generated files live in the same folder as the sources; skip them so that a re-run does
# not feed digits_*.csv or metadata.json back in as if they were source tables.
# NOTE(review): os.listdir order is arbitrary, so which source becomes digits_<i> is not
# fixed — confirm it matches the view order recorded in metadata.json.
sources = [
    f for f in os.listdir(path)
    if os.path.isfile(os.path.join(path, f)) and not _is_digits_output(f)
]
for i,x in enumerate(sources):
    d = pd.read_csv(os.path.join(path, x))
    print(d.shape)
    d.iloc[:, :-1].to_csv(os.path.join(path, f"digits_{i}.csv"), index= False)
# Labels come from the last column of the last table read.
d.iloc[:, -1].to_csv(os.path.join(path, "digits_y.csv"), index= False)

In [None]:
# Load the "Yc" entry of Yc.mat and show, for each row, the position of its largest value.
x = scipy.io.loadmat(f'imvc/datasets/data/bdgp/Yc.mat')["Yc"]
x.argmax(1)

In [None]:
# Names of the regular files (directories excluded) directly inside `path`.
[f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]

In [None]:
# Every entry directly inside `path`.
os.listdir(path)

In [None]:
# Shape of the first array stored in mat["X"][0].
mat["X"][0][0].shape

In [None]:
# Shape of the second array stored in mat["X"][0].
mat["X"][0][1].shape

In [None]:
import pyreadr

In [None]:
# Read the METABRIC discovery file with pyreadr and show what was loaded.
mat = pyreadr.read_r('imvc/datasets/data/metabric/METABRIC_discovery')
mat

In [None]:
# Show the "mydatCNV" entry of `mat`.
mat["mydatCNV"]

In [None]:
import pandas as pd

In [None]:
# Count how often each value occurs in mat["Y"].
# NOTE(review): several cells assign `mat` from different files — confirm which one this
# cell expects to have run last.
pd.DataFrame(mat["Y"]).squeeze().value_counts()

In [None]:
# Load X.mat of the BDGP data and show the shape of its "X" entry.
mat = scipy.io.loadmat('imvc/datasets/data/bdgp/X.mat')
mat["X"].shape

In [None]:
# Load Yc.mat of the BDGP data and show the first row of its "Yc" entry.
mat = scipy.io.loadmat('imvc/datasets/data/bdgp/Yc.mat')
mat["Yc"][0]

In [None]:
# Shape of the "Yc" entry of `mat`.
mat["Yc"].shape