In [1]:
import os
import sys
import numpy as np
from sklearn.decomposition import PCA

from umap import UMAP
sys.path.insert(1, os.getenv("MOMAPS_HOME"))
from src.common.lib.utils import load_config_file
from src.datasets.dataset_spd import DatasetSPD
from src.common.lib.embeddings_utils import load_embeddings
import src.common.lib.synthetic_multiplexing as synthetic_multiplexing

from matplotlib import cm
import matplotlib.pyplot as plt

from sklearn.metrics import precision_recall_curve, average_precision_score, roc_curve, auc, RocCurveDisplay, \
    balanced_accuracy_score, roc_auc_score, classification_report, confusion_matrix, multilabel_confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
import matplotlib
from collections import defaultdict, Counter

from sklearn.preprocessing import label_binarize

from sklearn.base import clone

%matplotlib inline
%load_ext autoreload 
%autoreload 2





  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


MOMAPS_HOME: /home/labs/hornsteinlab/Collaboration/MOmaps_Sagy/MOmaps


# utils

In [2]:
def fit_and_plot_multiclass_cv_auc_roc_curves(estimator, cv_k_fold, X, y, n_classes,
                                              class_names, colors, sample_weight=None, title="", minimal_legend=False):
    """
    Cross-validate a classifier and plot one one-vs-rest ROC curve per class per fold.

    A fresh clone of ``estimator`` is fitted on every training fold and scored on the
    matching test fold, so (n_classes * n_folds) curves are drawn, together with the
    mean curve over all of them and a +/- 1 std. dev. band.

    :param estimator: unfitted sklearn classifier exposing ``predict_proba``
    :param cv_k_fold: sklearn splitter providing ``split(X, y)``
    :param X: feature matrix, indexed with the index arrays yielded by the splitter
    :param y: label array, indexed the same way
    :param n_classes: number of classes to draw (the first ``n_classes`` entries of ``class_names``)
    :param class_names: class labels, used to binarize ``y`` and as legend names
    :param colors: one color per class
    :param sample_weight: accepted for interface compatibility; currently ignored
    :param title: figure title
    :param minimal_legend: accepted for interface compatibility; currently ignored
    :return: numpy array with the AUC of every (fold, class) curve, in fold-major order
    :raises ValueError: if a requested class was not seen by the fitted fold estimator
    """

    tprs, aucs = [], []
    mean_fpr = np.linspace(0, 1, 100)
    fig, ax = plt.subplots(figsize=(5, 5))

    for fold_idx, (train_fold, test_fold) in enumerate(cv_k_fold.split(X, y)):
        # Train an independent copy so folds never share fitted state.
        estimator_copy = clone(estimator)
        estimator_copy.fit(X[train_fold], y[train_fold])

        # Binarize the true labels in a one-vs-all fashion (one column per class).
        y_true = label_binarize(y[test_fold], classes=class_names)
        if y_true.shape[1] == 1 and len(class_names) == 2:
            # label_binarize collapses the two-class case into a single column
            # (indicator of the second class); expand it back to one column per class.
            y_true = np.hstack([1 - y_true, y_true])

        y_proba = estimator_copy.predict_proba(X[test_fold])
        # predict_proba columns follow estimator.classes_, which is not guaranteed
        # to be in the same order as class_names.
        fitted_classes = getattr(estimator_copy, "classes_", None)

        for j, class_color, class_name in zip(range(n_classes), colors, class_names):
            if fitted_classes is None:
                proba_col = j
            else:
                matches = np.flatnonzero(np.asarray(fitted_classes) == class_name)
                if matches.size == 0:
                    raise ValueError(
                        "Class {!r} is missing from the estimator fitted on fold {}".format(class_name, fold_idx + 1))
                proba_col = int(matches[0])

            fold_fpr, fold_tpr, _ = roc_curve(y_true[:, j], y_proba[:, proba_col],
                                              drop_intermediate=True)
            fold_auc = auc(fold_fpr, fold_tpr)

            viz = RocCurveDisplay(fpr=fold_fpr, tpr=fold_tpr, roc_auc=fold_auc, estimator_name=class_name)
            # One legend entry per fold and per class.
            viz.plot(ax=ax, name='{} fold {}'.format(class_name, fold_idx + 1), alpha=0.8, lw=1.8, linestyle=':',
                     color=class_color)

            # Resample every curve on a common FPR grid so the curves can be averaged.
            interp_tpr = np.interp(mean_fpr, fold_fpr, fold_tpr)
            interp_tpr[0] = 0.0
            tprs.append(interp_tpr)
            aucs.append(fold_auc)

    ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='dimgray', label='Chance', alpha=.5)

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    ax.plot(mean_fpr, mean_tpr, color='black',
            label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
            lw=2, alpha=1.0)

    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                    label=r'$\pm$ 1 std. dev.')

    ax.set(xlim=[-0.01, 1.05], ylim=[-0.01, 1.05])

    ax.set_title(title, color='black', fontsize=20)
    # Both axes are fractions in [0, 1], not percentages.
    ax.set_xlabel('False Positive Rate', fontsize=20, color='black')
    ax.set_ylabel('True Positive Rate', fontsize=20, color='black')

    ax.legend(loc="lower right", prop={"size": 14}, bbox_to_anchor=(1.5, 1.5))

    plt.show()
    plt.close(fig)
    return np.asarray(aucs)

def plot_conf_matrix(clf, X_test, y_test, normalize='true'):
    """
    Predict ``X_test`` with a fitted classifier and draw the confusion matrix as a heatmap.

    :param clf: fitted sklearn classifier (``classes_`` and ``predict`` are used)
    :param X_test: feature matrix to predict
    :param y_test: true labels for ``X_test``
    :param normalize: forwarded to ``sklearn.metrics.confusion_matrix``;
                      ``None`` shows raw counts, any other value shows percentages
    :return: None (prints the class labels and shows the figure)
    """
    from sklearn.metrics import confusion_matrix
    import seaborn as sns

    unique_labels = np.unique(clf.classes_)
    print(unique_labels)

    predictions = clf.predict(X_test)
    conf_matrix = confusion_matrix(y_true=y_test, y_pred=predictions, labels=unique_labels, normalize=normalize)

    # Raw counts are integers; normalized matrices are rendered as percentages.
    cell_format = 'd' if normalize is None else '.2%'
    ax = sns.heatmap(conf_matrix, annot=True, fmt=cell_format, cmap='Blues',
                     xticklabels=unique_labels, yticklabels=unique_labels)
    # confusion_matrix puts true labels on rows and predictions on columns.
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    plt.show()

def validate_classifier(model, X, y, default_model_scorer, class_names, colors, sample_weight=None, title='', show_plot=False):
    """
    Print and plot a full evaluation of a fitted multi-class classifier on (X, y).

    Reports balanced accuracy, one-vs-one and one-vs-rest ROC AUC, per-class ROC and
    precision-recall curves, the classification report, confusion matrices and the
    per-class rates derived from them.

    :param model: fitted sklearn classifier exposing ``predict``, ``predict_proba`` and ``score``
    :param X: features matrix in a numpy matrix
    :param y: true labels in a numpy array (multi-class)
    :param default_model_scorer: string, name of the scorer; only echoed in the final message
    :param class_names: list of class labels (used as ``labels`` for the reports and matrices)
    :param colors: one color per class, forwarded to the curve plots
    :param sample_weight: accepted for interface compatibility; currently ignored
    :param title: text printed in the header line
    :param show_plot: accepted for interface compatibility; currently ignored
    :return: ``model.score(X, y)``
    """

    print("Validating - " + str(title) + "\n\n------------------")

    # -------- -------- -------- -------- -------- --------
    # Class probabilities (for the AUC metrics) and hard predictions (for the reports).
    y_pred_prob = model.predict_proba(X)
    y_pred = model.predict(X)
    # -------- -------- -------- -------- -------- --------

    bal_acc = balanced_accuracy_score(y_true=y, y_pred=y_pred, adjusted=False)
    bal_acc_adjusted = balanced_accuracy_score(y_true=y, y_pred=y_pred, adjusted=True)
    print("\nBalanced accuracy score: " + str(bal_acc)
          + ". \nBalanced accuracy score (adjusted): " + str(bal_acc_adjusted))
    # -------- -------- -------- -------- -------- --------
    macro_roc_auc_ovo = roc_auc_score(y, y_pred_prob, multi_class="ovo", average="macro")
    weighted_roc_auc_ovo = roc_auc_score(y, y_pred_prob, multi_class="ovo", average="weighted")

    macro_roc_auc_ovr = roc_auc_score(y, y_pred_prob, multi_class="ovr", average="macro")
    weighted_roc_auc_ovr = roc_auc_score(y, y_pred_prob, multi_class="ovr", average="weighted")

    print("\n\nOne-vs-One ROC AUC scores:\n{:.6f} (macro),\n{:.6f} "
          "(weighted by prevalence)"
          .format(macro_roc_auc_ovo, weighted_roc_auc_ovo))
    print("\n\nOne-vs-Rest ROC AUC scores:\n{:.6f} (macro),\n{:.6f} "
          "(weighted by prevalence)"
          .format(macro_roc_auc_ovr, weighted_roc_auc_ovr))
    # -------- -------- -------- -------- -------- --------
    # ROC AUC curves for each class (one-vs-rest)
    _roc_auc_scores = plot_multiclass_area_under_curves(y_true=y, y_proba=y_pred_prob, n_classes=len(class_names),
                                                        class_names=class_names, colors=colors)

    print("\nHELDOUT SET - AUC ROC scores of each class: " + str(_roc_auc_scores))
    # -------- -------- -------- -------- -------- --------
    # `labels` pins the row order so target_names cannot drift when a class is absent from y.
    print("\nHELDOUT SET - Classification Report: \n" + str(
        classification_report(y_true=y, y_pred=y_pred, labels=class_names,
                              target_names=[str(name) for name in class_names])))
    # -------- -------- -------- -------- -------- --------
    print("HELDOUT SET - Multilabel Confusion Matrix: \n" + str(
        multilabel_confusion_matrix(y_true=y, y_pred=y_pred, labels=class_names)))
    # -------- -------- -------- -------- -------- --------
    # Apply the small font only to this heatmap instead of mutating global rcParams.
    with matplotlib.rc_context({'font.size': 8}):
        plot_conf_matrix(model, X, y)
    plt.close()
    # -------- -------- -------- -------- -------- --------
    cnf_matrix = confusion_matrix(y_true=y, y_pred=y_pred, labels=class_names)
    print("\nHELDOUT SET - One-vs-rest Confusion Matrix:\n" + str(cnf_matrix))

    # Per-class one-vs-rest counts derived from the multi-class matrix.
    TP = np.diag(cnf_matrix).astype(float)
    FP = cnf_matrix.sum(axis=0).astype(float) - TP
    FN = cnf_matrix.sum(axis=1).astype(float) - TP
    TN = float(cnf_matrix.sum()) - (FP + FN + TP)

    # A class with an empty denominator yields nan; silence the numpy warning for it.
    with np.errstate(divide='ignore', invalid='ignore'):
        TPR = TP / (TP + FN)
        TNR = TN / (TN + FP)
        PPV = TP / (TP + FP)
        NPV = TN / (TN + FN)
        FPR = FP / (FP + TN)
        FNR = FN / (TP + FN)
        FDR = FP / (TP + FP)
        ACC = (TP + TN) / (TP + FP + FN + TN)

    print("Sensitivity, hit rate, recall, or true positive rate: " + str(TPR))
    print("Specificity or true negative rate: " + str(TNR))
    print("Precision or positive predictive value: " + str(PPV))
    print("Negative predictive value: " + str(NPV))
    print("Fall out or false positive rate: " + str(FPR))
    print("False negative rate: " + str(FNR))
    print("False discovery rate: " + str(FDR))
    print("Overall accuracy: " + str(ACC))

    # -------- -------- -------- -------- -------- --------
    # model.score() is the estimator's own default metric, not `default_model_scorer`;
    # the split size is decided by the caller, so it is not stated here.
    test_score = model.score(X, y)

    print(
        "Estimator default score (model.score; configured scorer: " + str(default_model_scorer)
        + ") on the held-out test-set: " + str(test_score))

    return test_score




def plot_multiclass_area_under_curves(y_true, y_proba, n_classes, class_names, colors):
    """
    Plot per-class ROC and precision-recall curves for a multi-class classifier.

    Three figures are shown: per-class ROC curves, per-class precision-recall curves,
    and an extended ROC plot that adds the micro- and macro-average curves.

    Note: ``y_true`` is binarized here; ``y_proba`` must hold class probabilities
    (from ``predict_proba()``) with columns in the same order as ``class_names``.

    :param y_true: numpy array of true labels
    :param y_proba: numpy array of class probabilities, one column per class
    :param n_classes: int, number of classes
    :param class_names: class labels, used for binarization and legend entries
    :param colors: one color per class

    :return: dict of ROC AUC values keyed by class index, plus "micro" and "macro"
    """

    # Binarize labels in a one-vs-all fashion
    y = label_binarize(y_true, classes=class_names)
    if y.shape[1] == 1 and len(class_names) == 2:
        # label_binarize returns a single column for two classes; expand to one per class.
        y = np.hstack([1 - y, y])

    tpr, fpr, roc_auc = dict(), dict(), dict()
    precision, recall, precision_score = dict(), dict(), dict()

    for i in range(n_classes):
        # ROC curve and its area for class i vs. the rest
        fpr[i], tpr[i], _ = roc_curve(y[:, i], y_proba[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
        # Scores are passed positionally: the keyword was renamed from `probas_pred`
        # to `y_score` across scikit-learn releases.
        precision[i], recall[i], _ = precision_recall_curve(y[:, i], y_proba[:, i])
        precision_score[i] = average_precision_score(y_true=y[:, i], y_score=y_proba[:, i], average="weighted")

    # ----------------------------------------------------------------
    # Compute micro-average ROC curve and ROC area
    fpr["micro"], tpr["micro"], _ = roc_curve(y.ravel(), y_proba.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # ----------------------------------------------------------------
    # Macro-average: interpolate every class curve on the union of FPR points, then average
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # ----------------------------------------------------------------
    # Draw ROC curves for all classes
    fig = plt.figure(figsize=(5, 5))
    for i, color, name in zip(range(n_classes), colors, class_names):
        plt.plot(fpr[i], tpr[i], color=color, lw=1.4,
                 label='{0} (AUC = {1:0.2f})'.format(name, roc_auc[i]))

    plt.plot([0, 1], [0, 1], 'k--', lw=1)
    plt.xlim([-0.05, 1.0])
    plt.ylim([0.0, 1.05])
    ax = fig.gca()
    ax.tick_params(axis='both', which='major', labelsize=14)
    plt.xlabel('False Positive Rate', fontsize=20)
    plt.ylabel('True Positive Rate', fontsize=20)
    plt.title("ROC multi-class prediction in held-out data", fontsize=20, color='black')
    plt.legend(loc="best", prop={'size': 14})
    plt.show(block=True)
    plt.close()

    # ----------------------------------------------------------------
    # Draw Precision-Recall curves
    fig = plt.figure(figsize=(5, 5))
    for i, color, name in zip(range(n_classes), colors, class_names):
        plt.plot(recall[i], precision[i], color=color, lw=1.4,
                 label=' {0} (weighted avg. precision = {1:0.2f})'.format(name, precision_score[i]))

    ax = fig.gca()
    ax.tick_params(axis='both', which='major', labelsize=14)
    plt.xlabel("Recall", fontsize=20)
    plt.ylabel("Precision", fontsize=20)

    plt.legend(loc="best", prop={'size': 14})
    plt.suptitle("Precision-Recall curves in held-out data", fontsize=20, color='black')
    plt.show(block=True)
    plt.close()

    # ----------------------------------------------------------------
    # Plot all ROC curves (extended plot: adds micro/macro averages)
    # ----------------------------------------------------------------
    fig = plt.figure(figsize=(5, 5))

    plt.plot(fpr["micro"], tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]),
             color='deeppink', linestyle=':', linewidth=1.4)

    plt.plot(fpr["macro"], tpr["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]),
             color='navy', linestyle=':', linewidth=1.4)

    for i, color, name in zip(range(n_classes), colors, class_names):
        plt.plot(fpr[i], tpr[i], color=color, lw=1.4,
                 label='ROC curve of class {0} (area = {1:0.2f})'.format(name, roc_auc[i]))

    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])

    ax1 = fig.gca()
    ax1.tick_params(axis='both', which='major', labelsize=14)

    plt.xlabel('False Positive Rate', fontsize=20)
    plt.ylabel('True Positive Rate', fontsize=20)
    plt.title("ROC multi-class prediction in held-out data", fontsize=20, color='black')
    plt.legend(loc="best", prop={'size': 14})

    plt.show(block=True)
    plt.close()

    return roc_auc

def val_classifier(clf, X, y, unique_labels):
    """
    Run validate_classifier() on a fitted classifier with the notebook's default settings.

    :param clf: fitted sklearn classifier
    :param X: feature matrix to evaluate on
    :param y: true labels for ``X``
    :param unique_labels: class labels; one 'Set1' color is drawn per label
    :return: None
    """
    # plt.get_cmap(name, lut) replaces matplotlib.cm.get_cmap, which newer Matplotlib removed.
    class_colors = plt.get_cmap('Set1', len(unique_labels)).colors
    validate_classifier(clf, X, y, 'roc_auc_ovr_weighted',
                        unique_labels, sample_weight=None, colors=class_colors,
                        title='Validate classifier', show_plot=True)

def fit_eval(clf, X, y, cv=5, random_state=1, scorer_name=None):
    """
    Split (X, y) 80/20, cross-validate ``clf`` on the training part, then fit on the
    whole training part and evaluate on the held-out part.

    :param clf: unfitted sklearn classifier; it is cloned, never fitted in place
    :param X: feature matrix (numpy array)
    :param y: labels; flattened to one dimension
    :param cv: number of stratified folds used on the training part
    :param random_state: seed for both the train/test split and the fold shuffling
    :param scorer_name: label printed next to the cross-validation scores; when None,
                        falls back to the global ``model_scorer`` if it exists,
                        otherwise to 'roc_auc_ovr_weighted'
    :return: None (results are printed and plotted)
    """
    if scorer_name is None:
        # Keep honoring the notebook-level `model_scorer` without failing when that cell was not run.
        scorer_name = globals().get('model_scorer', 'roc_auc_ovr_weighted')

    # Accept lists and column vectors as well as flat arrays.
    y = np.asarray(y).reshape(-1,)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, stratify=y, random_state=random_state, shuffle=True)
    print(f"X_train shape {X_train.shape}, X_test shape: {X_test.shape}, y_train shape: {y_train.shape}, y_test shape: {y_test.shape}")

    unique_labels = np.unique(y)
    # plt.get_cmap(name, lut) replaces matplotlib.cm.get_cmap, which newer Matplotlib removed.
    class_colors = plt.get_cmap('Set1', len(unique_labels)).colors

    skf = StratifiedKFold(n_splits=cv, random_state=random_state, shuffle=True)
    result = fit_and_plot_multiclass_cv_auc_roc_curves(clone(clf), skf, X_train, y_train,
                                                        n_classes=len(unique_labels),
                                                        class_names=unique_labels,
                                                        colors=class_colors,
                                                        title='ROC multi-class one-vs-rest in training')

    print("\n\nTraining dataset (80%): K-Fold Cross Validation score: " + str(scorer_name) + " " + str(
        result) + " Mean: " + str(result.mean()))

    # Final model: a fresh clone trained on the full training part, scored on the held-out part.
    final_clf = clone(clf).fit(X_train, y_train)
    val_classifier(final_clf, X_test, y_test, unique_labels)
    



# Configs

In [3]:
# Config identifiers passed to load_config_file() in the "Load data" cell.
# NOTE(review): both are relative ("./src/...") — presumably resolved against MOMAPS_HOME; confirm.
config_path_model = "./src/models/neuroself/configs/model_config/TLNeuroselfB78NoDSModelConfig"
config_path_data = "./src/datasets/configs/embeddings_data_config/EmbeddingsB9VQ1DatasetConfig"

In [4]:
# Scorer label that fit_eval() prints next to the cross-validation scores (read there as a global).
model_scorer = 'roc_auc_ovr_weighted'

# Load data

In [5]:
# The model config is loaded first; its CONFIGS_USED_FOLDER is passed on when loading the data config.
config_model = load_config_file(config_path_model, 'model')
config_data = load_config_file(config_path_data, 'data', config_model.CONFIGS_USED_FOLDER)

2023-09-03 17:43:34 INFO [TLNeuroselfB78NoDSModelConfig] Init (log path: /home/labs/hornsteinlab/Collaboration/MOmaps_Sagy/MOmaps/outputs/models_outputs_batch78_nods_tl_ep23/logs/030923_174334_085968.log)
2023-09-03 17:43:34 INFO [TLNeuroselfB78NoDSModelConfig] MOMAPS_HOME=/home/labs/hornsteinlab/Collaboration/MOmaps_Sagy/MOmaps, MOMAPS_DATA_HOME=/home/labs/hornsteinlab/Collaboration/MOmaps/input


In [6]:
config_data.SAMPLE_PCT

1

In [7]:
config_data.INPUT_FOLDERS

['/home/labs/hornsteinlab/Collaboration/MOmaps/input/images/processed/spd2/SpinningDisk/batch9_16bit_no_downsample']

In [8]:
config_data.EMBEDDINGS_LAYER

'vqvec1'

In [9]:
# Load the embeddings and their labels for the configured model/dataset (embeddings_type='all').
embeddings, labels = load_embeddings(embeddings_type='all', config_model=config_model, config_data=config_data)

2023-09-03 17:43:34 INFO [load_embeddings] Model: preloaded                    Dataset: preloaded,                        embeddings_type: all
2023-09-03 17:43:34 INFO [load_embeddings] experiment_type = neurons
2023-09-03 17:43:34 INFO [load_embeddings] embeddings_layer = vqvec1
2023-09-03 17:43:34 INFO Input folder: /home/labs/hornsteinlab/Collaboration/MOmaps_Sagy/MOmaps/outputs/models_outputs_batch78_nods_tl_ep23/embeddings/neurons/vqvec1/batch9_16bit_no_downsample, depth used: 4
2023-09-03 17:43:34 INFO Running in parallel: 52 processes
2023-09-03 17:43:35 INFO [_load_stored_embeddings] Loading stored embeddings of label FUSHeterozygous_Untreated_FUS of shape (270, 64, 25, 25) 
2023-09-03 17:43:35 INFO [_load_stored_embeddings] Loading stored embeddings of label OPTN_Untreated_CLTC of shape (379, 64, 25, 25) 
2023-09-03 17:43:35 INFO [_load_stored_embeddings] Loading stored embeddings of label FUSHeterozygous_Untreated_TOMM20 of shape (468, 64, 25, 25) 
2023-09-03 17:43:35 INFO [_

2023-09-03 17:43:36 INFO [_load_stored_embeddings] Loading stored embeddings of label FUSRevertant_Untreated_SQSTM1 of shape (755, 64, 25, 25) 
2023-09-03 17:43:36 INFO [_load_stored_embeddings] Loading stored embeddings of label FUSHomozygous_Untreated_FUS of shape (797, 64, 25, 25) 
2023-09-03 17:43:36 INFO [_load_stored_embeddings] Loading stored embeddings of label FUSRevertant_Untreated_FMRP of shape (751, 64, 25, 25) 
2023-09-03 17:43:36 INFO [_load_stored_embeddings] Loading stored embeddings of label FUSRevertant_Untreated_CD41 of shape (882, 64, 25, 25) 
2023-09-03 17:43:36 INFO [_load_stored_embeddings] Loading stored embeddings of label FUSHomozygous_Untreated_FMRP of shape (888, 64, 25, 25) 
2023-09-03 17:43:36 INFO [_load_stored_embeddings] Loading stored embeddings of label FUSRevertant_Untreated_mitotracker of shape (925, 64, 25, 25) 
2023-09-03 17:43:36 INFO [_load_stored_embeddings] Loading stored embeddings of label FUSHomozygous_Untreated_SQSTM1 of shape (888, 64, 25

In [10]:
# Keep an untouched copy: `embeddings` is reassigned by the multiplexing cell further down.
# Note that this duplicates the whole array in memory.
embeddings_raw = embeddings.copy()

In [11]:
embeddings_raw.shape

(320151, 64, 25, 25)

---

# Single marker

In [19]:
# ERROR: cannot assign slice from input of different size - Because of too many features (embeddings_marker.shape[1] is huge)...
# Scratch cell: `embeddings_marker` is only assigned by the per-marker loop cell that follows,
# so this cell depends on state left in the kernel and raises NameError on a fresh top-to-bottom run.

reducer = UMAP(n_components=100, random_state=1)
reducer.fit(embeddings_marker[:,:10000])

In [20]:

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

sm_dict = {}

unique_ll = np.unique(labels)
labels_npy = np.asarray(labels)

# The marker is the last "_"-separated token of a label. Several labels share one marker,
# so iterate over each marker once instead of recomputing it for every label.
unique_markers = list(dict.fromkeys(str(label).split('_')[-1] for label in unique_ll))

for marker in unique_markers:
    print('---------------------')
    print(f"Marker: {marker}")
    print('---------------------')
    # Rows whose label ends with "_<marker>", across all phenotypes/conditions.
    marker_indexes = np.where(np.char.endswith(labels_npy, f'_{marker}'))[0]
    embeddings_marker, labels_marker = embeddings[marker_indexes], labels_npy[marker_indexes]

    print(np.unique(labels_marker))
    print(f"[{marker}] embeddings_marker shape: {embeddings_marker.shape}, labels_marker shape: {labels_marker.shape}")

    print(f"[{marker}] SM")
    df = synthetic_multiplexing.__embeddings_to_df(embeddings_marker, labels_marker.reshape(-1,), dataset_conf=config_data)
    # From here on `embeddings_marker` holds the multiplexed embeddings.
    embeddings_marker, y_marker, unique_groups_marker = synthetic_multiplexing.__get_multiplexed_embeddings(df, random_state=config_data.SEED)

    print(f"[{marker}] Save to dict")
    sm_dict[marker] = {'embeddings': embeddings_marker, 'y': y_marker, 'unique_groups': unique_groups_marker}

    print(f"[{marker}] UMAP...")
    reducer = UMAP(n_components=100, random_state=1)

    print("embeddings_marker shape: ", embeddings_marker.shape, embeddings_marker.reshape(embeddings_marker.shape[0], -1).shape)
    print("!!!!!!!!! TAKING ONLY FIRST 10000 FEATURES since otherwise there for bigger numbers there is an error (cannot assign slice from input of different size) !!!!!!")
    # Known limitation: only the first 10000 feature columns are reduced (see the message above).
    X_marker = reducer.fit_transform(embeddings_marker[:,:10000])

    print(f"[{marker}] Print X to dict..")
    sm_dict[marker]['X'] = X_marker

    print(f"[{marker}] Init model")
    mlp = MLPClassifier(hidden_layer_sizes=(20,), activation='relu', random_state=1)
    # rfc = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=1)

    print(f"[{marker}] fit_eval")
    fit_eval(mlp, X_marker, y_marker)


---------------------
Marker: ANXA11
---------------------
['FUSHeterozygous_Untreated_ANXA11' 'FUSHomozygous_Untreated_ANXA11'
 'FUSRevertant_Untreated_ANXA11' 'OPTN_Untreated_ANXA11'
 'SCNA_Untreated_ANXA11' 'TBK1_Untreated_ANXA11' 'TDP43_Untreated_ANXA11'
 'WT_Untreated_ANXA11' 'WT_stress_ANXA11']
[ANXA11] embeddings_marker shape: (14115, 64, 25, 25), labels_marker shape: (14115, 1)
[ANXA11] SM


2023-09-03 18:32:04 INFO [SM] Common markers: {'ANXA11'}
2023-09-03 18:32:04 INFO Pheno: FUSHeterozygous_Untreated
2023-09-03 18:32:04 INFO Detected 1199 subgroups
2023-09-03 18:32:04 INFO 1/1199
2023-09-03 18:32:04 INFO [1/1199] Shape: (1199, 3)
2023-09-03 18:32:04 INFO 2/1199
2023-09-03 18:32:04 INFO [2/1199] Shape: (1198, 3)
2023-09-03 18:32:04 INFO 3/1199
2023-09-03 18:32:04 INFO [3/1199] Shape: (1197, 3)
2023-09-03 18:32:04 INFO 4/1199
2023-09-03 18:32:04 INFO [4/1199] Shape: (1196, 3)
2023-09-03 18:32:04 INFO 5/1199
2023-09-03 18:32:04 INFO [5/1199] Shape: (1195, 3)
2023-09-03 18:32:04 INFO 6/1199
2023-09-03 18:32:04 INFO [6/1199] Shape: (1194, 3)
2023-09-03 18:32:04 INFO 7/1199
2023-09-03 18:32:04 INFO [7/1199] Shape: (1193, 3)
2023-09-03 18:32:04 INFO 8/1199
2023-09-03 18:32:04 INFO [8/1199] Shape: (1192, 3)
2023-09-03 18:32:04 INFO 9/1199
2023-09-03 18:32:04 INFO [9/1199] Shape: (1191, 3)
2023-09-03 18:32:04 INFO 10/1199
2023-09-03 18:32:04 INFO [10/1199] Shape: (1190, 3)
2023

[ANXA11] Save to dict
[ANXA11] UMAP...
embeddings_marker shape:  (14115, 40000) (14115, 40000)
!!!!!!!!! TAKING ONLY FIRST 10000 FEATURES since otherwise there for bigger numbers there is an error (cannot assign slice from input of different size) !!!!!!
[ANXA11] Print X to dict..
[ANXA11] Init model
[ANXA11] fit_eval
X_train shape (11292, 100), X_test shape: (2823, 100), y_train shape: (11292,), y_test shape: (2823,)


In [16]:
# NOTE: `unique_ll[len(unique_ll):]` is an empty slice, so as written this loop body never runs.
# Unlike the per-marker loop cell above, this variant feeds every feature column to UMAP
# (no [:, :10000] truncation). It relies on unique_ll, labels_npy, sm_dict and MLPClassifier
# defined by that earlier cell.
for l in unique_ll[len(unique_ll):]:
    # The marker is the last "_"-separated token of the label.
    marker = l.split('_')[-1]
    print('---------------------')
    print(f"Marker: {marker}")
    print('---------------------')
    # Rows whose label ends with "_<marker>".
    marker_indexes = np.where(np.char.endswith(labels_npy, f'_{marker}'))[0]
    embeddings_marker, labels_marker = embeddings[marker_indexes], labels_npy[marker_indexes]
    
    print(np.unique(labels_marker))
    print(f"[{marker}] embeddings_marker shape: {embeddings_marker.shape}, labels_marker shape: {labels_marker.shape}")
    
    print(f"[{marker}] SM")
    df = synthetic_multiplexing.__embeddings_to_df(embeddings_marker, labels_marker.reshape(-1,), dataset_conf=config_data)
    # `embeddings_marker` is rebound to the multiplexed embeddings here.
    embeddings_marker, y_marker, unique_groups_marker = synthetic_multiplexing.__get_multiplexed_embeddings(df, random_state=config_data.SEED)

    print(f"[{marker}] Save to dict")
    sm_dict[marker] = {'embeddings': embeddings_marker, 'y': y_marker, 'unique_groups': unique_groups_marker}

    print(f"[{marker}] UMAP...")
    reducer = UMAP(n_components=100, random_state=1)
    X_marker = reducer.fit_transform(embeddings_marker.reshape(embeddings_marker.shape[0], -1))
    
    print(f"[{marker}] Print X to dict..")
    sm_dict[marker]['X'] = X_marker
    
    print(f"[{marker}] Init model")
    mlp = MLPClassifier(hidden_layer_sizes=(20,), activation='relu', random_state=1)
    # rfc = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=1)
    
    print(f"[{marker}] fit_eval")
    fit_eval(mlp, X_marker, y_marker)

-------------------------

# Multiplex

In [None]:
print(embeddings.shape)

In [None]:

# Multiplex from the pristine copy (`embeddings_raw`) so that re-running this cell does not
# feed the already-multiplexed `embeddings` back in. np.asarray lets `labels` be a list too.
df = synthetic_multiplexing.__embeddings_to_df(embeddings_raw, np.asarray(labels).reshape(-1,), dataset_conf=config_data)
embeddings, y, unique_groups = synthetic_multiplexing.__get_multiplexed_embeddings(df, random_state=config_data.SEED)


In [None]:
# Build the sandbox path from MOMAPS_HOME instead of a hard-coded absolute path.
# NOTE(review): the data config loaded above points at batch9, but these files are named "b5" — confirm
# the intended batch before running, otherwise the b5 files get overwritten.
sandbox_dir = os.path.join(os.getenv("MOMAPS_HOME"), "sandbox")
np.save(os.path.join(sandbox_dir, "sm_embeddings_b5_vqvec1"), embeddings)
np.save(os.path.join(sandbox_dir, "sm_labels_b5_vqvec1"), y)

In [None]:
# Build the sandbox path from MOMAPS_HOME instead of a hard-coded absolute path.
sandbox_dir = os.path.join(os.getenv("MOMAPS_HOME"), "sandbox")
embeddings9 = np.load(os.path.join(sandbox_dir, "sm_embeddings_b9_vqvec1.npy"))
y9 = np.load(os.path.join(sandbox_dir, "sm_labels_b9_vqvec1.npy"))
embeddings9.shape, y9.shape

In [None]:
# Build the sandbox path from MOMAPS_HOME instead of a hard-coded absolute path.
sandbox_dir = os.path.join(os.getenv("MOMAPS_HOME"), "sandbox")
embeddings5 = np.load(os.path.join(sandbox_dir, "sm_embeddings_b5_vqvec1.npy"))
y5 = np.load(os.path.join(sandbox_dir, "sm_labels_b5_vqvec1.npy"))
embeddings5.shape, y5.shape

In [None]:
embeddings.shape

# Dim reduction

In [None]:

# from sklearn.manifold import MDS
# from umap import UMAP

# reducer = UMAP(n_components=100, random_state=1)#, max_iter=10)
# # reducer.fit(embeddings.reshape(embeddings.shape[0], -1))
# # print(reducer.explained_variance_ratio_)
# reducer.fit(embeddings9.reshape(embeddings9.shape[0], -1))
# X9 = reducer.transform(embeddings9.reshape(embeddings9.shape[0], -1))
# X5 = reducer.transform(embeddings5.reshape(embeddings5.shape[0], -1))

# X9.shape, X5.shape

In [None]:

# from sklearn.manifold import MDS
# from umap import UMAP

# reducer = UMAP(n_components=100, random_state=1)#, max_iter=10)
# # reducer.fit(embeddings.reshape(embeddings.shape[0], -1))
# # print(reducer.explained_variance_ratio_)
# X = reducer.fit_transform(embeddings.reshape(embeddings.shape[0], -1))
# X.shape

-------- 

#### Save to file

In [None]:
# Build the sandbox path from MOMAPS_HOME instead of a hard-coded absolute path.
# NOTE(review): `X` is only assigned inside the commented-out "Dim reduction" cells, so this cell
# needs one of them re-enabled (or X left in the kernel) to run.
sandbox_dir = os.path.join(os.getenv("MOMAPS_HOME"), "sandbox")
np.save(os.path.join(sandbox_dir, "sm_embeddings_b5_vqvec1_umap100"), X)
# np.save("/home/labs/hornsteinlab/Collaboration/MOmaps_Sagy/MOmaps/sandbox/mul_label_all_all", label_data)

#### Load from file

In [None]:
# Build the sandbox path from MOMAPS_HOME instead of a hard-coded absolute path.
# Note: this rebinds `y9`, which an earlier cell loaded from a different file (sm_labels_b9_vqvec1.npy).
sandbox_dir = os.path.join(os.getenv("MOMAPS_HOME"), "sandbox")
X9 = np.load(os.path.join(sandbox_dir, "sm_embeddings_b9_new_umap100.npy"))
y9 = np.load(os.path.join(sandbox_dir, "sm_labels_b9_new.npy"))
X9.shape, y9.shape

In [None]:
# Build the sandbox path from MOMAPS_HOME instead of a hard-coded absolute path.
# Note: this rebinds `y5`, which an earlier cell loaded from a different file (sm_labels_b5_vqvec1.npy).
sandbox_dir = os.path.join(os.getenv("MOMAPS_HOME"), "sandbox")
X5 = np.load(os.path.join(sandbox_dir, "sm_embeddings_b5_umap100.npy"))
y5 = np.load(os.path.join(sandbox_dir, "sm_labels_b5.npy"))
X5.shape, y5.shape

### Models

In [None]:
# Both estimators are only constructed here (unfitted); fitting happens in the Eval / Fit & Eval cells.
from sklearn.neural_network import MLPClassifier

# Single hidden layer of 20 units; seeded for reproducibility.
mlp = MLPClassifier(hidden_layer_sizes=(20,), activation='relu', random_state=1)
# mlp.fit(X_train, y_train)

from sklearn.ensemble import RandomForestClassifier

# Small, shallow forest (10 trees, depth 3); seeded for reproducibility.
rfc = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=1)
# rfc.fit(X_train, y_train)

### Eval

In [None]:
# Flatten the labels before fitting, matching how y5 is flattened in the evaluation cell;
# a no-op when y9 is already one-dimensional.
mlp = mlp.fit(X9, np.ravel(y9))

In [None]:

val_classifier(mlp, X5, y5.reshape(-1,), np.unique(y5.reshape(-1,)))

### Fit & Eval

#### vqvec1

In [None]:
fit_eval(mlp, X, y)

In [None]:
fit_eval(mlp, X, y)

#### vqvec2

In [None]:
fit_eval(mlp, X, y)

In [None]:
fit_eval(mlp, X, y)

#### y shuffled

##### vqvec1

In [None]:
# Negative control: evaluate on labels permuted independently of X.
# A seeded generator makes the permutation reproducible; permutation() returns a shuffled copy.
y_shuffled = np.random.default_rng(1).permutation(y)
fit_eval(rfc, X, y_shuffled)

##### vqvec2

In [None]:
# Negative control: evaluate on labels permuted independently of X.
# A seeded generator makes the permutation reproducible; permutation() returns a shuffled copy.
y_shuffled = np.random.default_rng(1).permutation(y)
fit_eval(mlp, X, y_shuffled)

---------------------

# Old

In [None]:
# from sklearn.model_selection import StratifiedKFold

# clf = mlp
# unique_labels = np.unique(y)

# skf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
# result = fit_and_plot_multiclass_cv_auc_roc_curves(clf, skf, X_train, y_train,
#                                                     n_classes=len(unique_labels),
#                                                     class_names=unique_labels,
#                                                     colors=plt.cm.get_cmap('Set1', len(unique_labels)).colors,
#                                                     title='ROC multi-class one-vs-rest in training')

# print("\n\nTraining dataset (75%): K-Fold Cross Validation score: " + str(model_scorer) + " " + str(
#     result) + " Mean: " + str(result.mean()))


In [None]:


# clf_trained = clf.fit(X_train, y_train)
# validate_classifier(clf_trained, X_test, y_test, 'roc_auc_ovr_weighted',
#                     unique_labels, sample_weight=None, colors=plt.cm.get_cmap('Set1', len(unique_labels)).colors,
#                     title='Validate classifier', show_plot=True)

-----------------------

# Plot

In [None]:

# Scatter of the first two embedding dimensions, one color per group.
# NOTE(review): `labels_flat` is first assigned in a later cell and `x` is not assigned in any
# visible cell — this cell depends on kernel state.
groups = np.unique(labels_flat)
# plt.get_cmap replaces matplotlib.cm.get_cmap, which newer Matplotlib removed.
colors = plt.get_cmap('Set1').colors

for i, g in enumerate(groups):
    ind = np.where(labels_flat == g)
    # `color=` avoids a single RGB tuple being read as per-point values; the modulo wraps the palette.
    plt.scatter(x[ind,0], x[ind,1], color=colors[i % len(colors)])
plt.legend(groups)
plt.show()

In [None]:

# Scatter of the first two embedding dimensions, one color per group.
# NOTE(review): `labels_flat` is first assigned in a later cell and `x` is not assigned in any
# visible cell — this cell depends on kernel state.
groups = np.unique(labels_flat)
# plt.get_cmap replaces matplotlib.cm.get_cmap, which newer Matplotlib removed.
colors = plt.get_cmap('Set1').colors

for i, g in enumerate(groups):
    ind = np.where(labels_flat == g)
    # `color=` avoids a single RGB tuple being read as per-point values; the modulo wraps the palette.
    plt.scatter(x[ind,0], x[ind,1], color=colors[i % len(colors)])
plt.legend(groups)
plt.show()

In [None]:

# Scatter of the first two embedding dimensions, one color per group.
# NOTE(review): `labels_flat` is first assigned in a later cell and `x` is not assigned in any
# visible cell — this cell depends on kernel state.
groups = np.unique(labels_flat)
# plt.get_cmap replaces matplotlib.cm.get_cmap, which newer Matplotlib removed.
colors = plt.get_cmap('Set1').colors

for i, g in enumerate(groups):
    ind = np.where(labels_flat == g)
    # `color=` avoids a single RGB tuple being read as per-point values; the modulo wraps the palette.
    plt.scatter(x[ind,0], x[ind,1], color=colors[i % len(colors)])
plt.legend(groups)
plt.show()

In [None]:
# 50 umaps
# Scatter of the first two embedding dimensions, one color per group.
# NOTE(review): `labels_flat` is first assigned in a later cell and `x` is not assigned in any
# visible cell — this cell depends on kernel state.
groups = np.unique(labels_flat)
# plt.get_cmap replaces matplotlib.cm.get_cmap, which newer Matplotlib removed.
colors = plt.get_cmap('Set1').colors

for i, g in enumerate(groups):
    ind = np.where(labels_flat == g)
    # `color=` avoids a single RGB tuple being read as per-point values; the modulo wraps the palette.
    plt.scatter(x[ind,0], x[ind,1], color=colors[i % len(colors)])
plt.legend(groups)
plt.show()

In [None]:
# 50 umaps multi
# Scatter of the first two embedding dimensions, one color per multiplexed group.
# NOTE(review): `label_data` and `x` are not assigned in any visible cell — this cell depends on kernel state.
flat_labels = label_data.reshape(-1,)
groups = np.unique(flat_labels)
# plt.get_cmap replaces matplotlib.cm.get_cmap, which newer Matplotlib removed.
colors = plt.get_cmap('Set1').colors

for i, g in enumerate(groups):
    ind = np.where(flat_labels == g)
    # `color=` avoids a single RGB tuple being read as per-point values; the modulo wraps the palette.
    plt.scatter(x[ind,0], x[ind,1], color=colors[i % len(colors)])
plt.legend(groups)
plt.show()

# Models

In [None]:
from sklearn.model_selection import train_test_split

# Flatten the labels to 1-D, then hold out 20% of the samples as a stratified test set (fixed seed).
labels_flat = label_data.reshape(-1)
X_train, X_test, y_train, y_test = train_test_split(
    x,
    labels_flat,
    test_size=0.2,
    stratify=labels_flat,
    random_state=1,
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

In [None]:
from sklearn.neural_network import MLPClassifier

# MLP with one hidden layer of 50 ReLU units and a fixed seed, fitted on the training split.
mlp = MLPClassifier(
    hidden_layer_sizes=(50,),
    activation='relu',
    random_state=1,
)
mlp.fit(X_train, y_train)

In [None]:
# Inspect the layer count that scikit-learn recorded on the fitted MLP (`n_layers_` attribute).
mlp.n_layers_

In [None]:
# for i in range(5):
#     print(mlp.predict(X_test[[i]]), y_test[i])

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 20 trees, each at most 3 levels deep, with a fixed seed; fitted on the training split.
rfc = RandomForestClassifier(
    n_estimators=20,
    max_depth=3,
    random_state=1,
)
rfc.fit(X_train, y_train)

### Plots

In [None]:

    # for i, l in enumerate(unique_labels):
    #     print(l)
    #     disp = ConfusionMatrixDisplay(conf_matrix[i])
    #     disp.plot()
    #     plt.show()

In [None]:
# Confusion matrix for the random-forest classifier `rfc`.
# NOTE(review): plot_conf_matrix is only defined further down in this notebook; confirm the definition in
# scope accepts this second argument ('true' — presumably a normalisation mode).
plot_conf_matrix(rfc, 'true')

In [None]:
# Confusion matrix for `rfc` again (same call as the preceding cell).
# NOTE(review): plot_conf_matrix is only defined further down in this notebook; confirm the definition in
# scope accepts this second argument ('true' — presumably a normalisation mode).
plot_conf_matrix(rfc, 'true')

In [None]:
# Confusion matrix for the MLP classifier `mlp`.
# NOTE(review): plot_conf_matrix is only defined further down in this notebook; confirm the definition in
# scope accepts this second argument ('true' — presumably a normalisation mode).
plot_conf_matrix(mlp, 'true')

In [None]:
# Confusion matrix for `rfc` (this exact call also appears in earlier cells of this section).
# NOTE(review): plot_conf_matrix is only defined further down in this notebook; confirm the definition in
# scope accepts this second argument ('true' — presumably a normalisation mode).
plot_conf_matrix(rfc, 'true')

In [None]:
# Confusion matrix for `mlp` (this exact call also appears in an earlier cell of this section).
# NOTE(review): plot_conf_matrix is only defined further down in this notebook; confirm the definition in
# scope accepts this second argument ('true' — presumably a normalisation mode).
plot_conf_matrix(mlp, 'true')

### Utils

### Reports

In [None]:
def cv_report(get_clf, x, labels_flat):
    """
    Print a classification report for every fold of a 5-fold stratified cross validation.

    :param get_clf: zero-argument callable returning a new, unfitted classifier
    :param x: feature array, indexed with arrays of row indices
    :param labels_flat: label array aligned with the rows of x
    :return: None (all results are printed)
    """
    from sklearn.model_selection import StratifiedKFold
    from sklearn.metrics import classification_report

    splitter = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
    fold_no = 0
    for train_rows, test_rows in splitter.split(x, labels_flat):
        # Fold number and the first two indices of each side of the split
        print(fold_no)
        print(train_rows[:2], test_rows[:2])

        features_fit, targets_fit = x[train_rows], labels_flat[train_rows]
        features_eval, targets_eval = x[test_rows], labels_flat[test_rows]
        print(features_fit.shape, targets_fit.shape, features_eval.shape, targets_eval.shape)

        # A fresh classifier per fold, so nothing fitted carries over between folds
        model = get_clf()
        model.fit(features_fit, targets_fit)

        print(classification_report(targets_eval, model.predict(features_eval)))
        fold_no += 1
        
        

In [None]:
from sklearn.model_selection import StratifiedKFold

# Per-class, per-fold ROC curves for the MLP under 5-fold stratified cross validation.
# fit_and_plot_multiclass_cv_auc_roc_curves has no defaults for n_classes, class_names and colors,
# so they are derived from the labels here; calling it with only four arguments raises TypeError.
skf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
cv_class_names = np.unique(labels_flat)
fit_and_plot_multiclass_cv_auc_roc_curves(
    mlp, skf, x, labels_flat,
    n_classes=len(cv_class_names),
    class_names=cv_class_names,
    colors=plt.get_cmap('Set1', len(cv_class_names)).colors,
)

In [None]:
# Per-fold classification reports for an MLP with the same hyper-parameters as `mlp`;
# the lambda is a factory, so cv_report builds a new unfitted model for each fold.
cv_report(lambda : MLPClassifier(hidden_layer_sizes=(50,), activation='relu', random_state=1), x, labels_flat)

In [None]:
# from sklearn.metrics import classification_report
# from sklearn.model_selection import StratifiedKFold

# skf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
# for i, (train_index, test_index) in enumerate(skf.split(x, labels_flat)):
#     print(i)
#     print(train_index[:2], test_index[:2])
#     x_train, y_train = x[train_index], labels_flat[train_index]
#     x_test, y_test = x[test_index], labels_flat[test_index]

#     print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

#     mlp = MLPClassifier(hidden_layer_sizes=(50,), activation='relu', random_state=1)
#     mlp.fit(x_train, y_train)

#     predictions = mlp.predict(x_test)
#     print(classification_report(y_test, predictions))

### 2 UMAPS

In [None]:
# Confusion-matrix heatmap for `mlp`.
# NOTE(review): plot_conf_matrix is defined further down in this notebook, so on a fresh top-to-bottom run
# this cell depends on an earlier definition being in scope — confirm.
plot_conf_matrix(mlp)

In [None]:
# Confusion-matrix heatmap for `rfc`.
# NOTE(review): plot_conf_matrix is defined further down in this notebook, so on a fresh top-to-bottom run
# this cell depends on an earlier definition being in scope — confirm.
plot_conf_matrix(rfc)

### 100 UMAPS

In [None]:
# Confusion-matrix heatmap for `mlp`.
# NOTE(review): plot_conf_matrix is defined further down in this notebook, so on a fresh top-to-bottom run
# this cell depends on an earlier definition being in scope — confirm.
plot_conf_matrix(mlp)

In [None]:
# Confusion-matrix heatmap for `rfc`.
# NOTE(review): plot_conf_matrix is defined further down in this notebook, so on a fresh top-to-bottom run
# this cell depends on an earlier definition being in scope — confirm.
plot_conf_matrix(rfc)

# PLOTS old

In [None]:
# Confusion-matrix heatmap for `rfc`.
# NOTE(review): plot_conf_matrix is defined further down in this notebook, so on a fresh top-to-bottom run
# this cell depends on an earlier definition being in scope — confirm.
plot_conf_matrix(rfc)

In [None]:
# Confusion-matrix heatmap for `mlp`.
# NOTE(review): plot_conf_matrix is defined further down in this notebook, so on a fresh top-to-bottom run
# this cell depends on an earlier definition being in scope — confirm.
plot_conf_matrix(mlp)

In [None]:
# Confusion-matrix heatmap for `rfc` (same call as two cells earlier).
# NOTE(review): plot_conf_matrix is defined further down in this notebook, so on a fresh top-to-bottom run
# this cell depends on an earlier definition being in scope — confirm.
plot_conf_matrix(rfc)

In [None]:
# Confusion-matrix heatmap for `mlp` (same call as two cells earlier).
# NOTE(review): plot_conf_matrix is defined further down in this notebook, so on a fresh top-to-bottom run
# this cell depends on an earlier definition being in scope — confirm.
plot_conf_matrix(mlp)

In [None]:
# Confusion-matrix heatmap for `rfc` (this call also appears in earlier cells of this section).
# NOTE(review): plot_conf_matrix is defined further down in this notebook, so on a fresh top-to-bottom run
# this cell depends on an earlier definition being in scope — confirm.
plot_conf_matrix(rfc)

In [None]:
# def plot_conf_matrix(clf):
#     from sklearn.metrics import ConfusionMatrixDisplay, multilabel_confusion_matrix, confusion_matrix

#     unique_labels = np.unique(mlp.classes_)

#     predictions = clf.predict(X_test)
#     conf_matrix = confusion_matrix(y_test, predictions, labels=unique_labels)

#     import seaborn as sns

#     sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=unique_labels, yticklabels=unique_labels)
#     plt.show()
#     # for i, l in enumerate(unique_labels):
#     #     print(l)
#     #     disp = ConfusionMatrixDisplay(conf_matrix[i])
#     #     disp.plot()
#     #     plt.show()
    
# # plot_conf_matrix(rfc)

In [None]:
#50 umaps
def plot_conf_matrix(clf, normalize=None):
    """
    Show the confusion matrix of a fitted classifier on the held-out test split as a heatmap.

    Reads the notebook-level ``X_test`` and ``y_test``.

    :param clf: fitted classifier exposing ``classes_`` and ``predict``
    :param normalize: forwarded to ``sklearn.metrics.confusion_matrix``:
                      None for raw counts, or 'true' / 'pred' / 'all' for normalised values
    :return: None (the figure is shown)
    """
    from sklearn.metrics import confusion_matrix
    import seaborn as sns

    # Take the label order from the classifier being evaluated rather than from the global `mlp`.
    unique_labels = np.unique(clf.classes_)

    predictions = clf.predict(X_test)
    conf_matrix = confusion_matrix(y_test, predictions, labels=unique_labels, normalize=normalize)

    # Raw counts are integers; a normalised matrix holds fractions and needs a float format.
    fmt = 'd' if normalize is None else '.2f'
    sns.heatmap(conf_matrix, annot=True, fmt=fmt, cmap='Blues', xticklabels=unique_labels, yticklabels=unique_labels)
    plt.show()

plot_conf_matrix(rfc)

In [None]:
#Multiplex

def plot_conf_matrix(clf, normalize=None):
    """
    Show the confusion matrix of a fitted classifier on the held-out test split as a heatmap.

    Reads the notebook-level ``X_test`` and ``y_test``.

    :param clf: fitted classifier exposing ``classes_`` and ``predict``
    :param normalize: forwarded to ``sklearn.metrics.confusion_matrix``:
                      None for raw counts, or 'true' / 'pred' / 'all' for normalised values
    :return: None (the figure is shown)
    """
    from sklearn.metrics import confusion_matrix
    import seaborn as sns

    # Take the label order from the classifier being evaluated rather than from the global `mlp`.
    unique_labels = np.unique(clf.classes_)

    predictions = clf.predict(X_test)
    conf_matrix = confusion_matrix(y_test, predictions, labels=unique_labels, normalize=normalize)

    # Raw counts are integers; a normalised matrix holds fractions and needs a float format.
    fmt = 'd' if normalize is None else '.2f'
    sns.heatmap(conf_matrix, annot=True, fmt=fmt, cmap='Blues', xticklabels=unique_labels, yticklabels=unique_labels)
    plt.show()

plot_conf_matrix(rfc)

In [None]:
#multi 50umaps
# Confusion-matrix heatmap for `mlp`, using plot_conf_matrix from an earlier cell.
plot_conf_matrix(mlp)

In [None]:
# Confusion-matrix heatmap for `mlp` (same call as the preceding cell).
plot_conf_matrix(mlp)

In [None]:
#50umaps
# Confusion-matrix heatmap for `mlp`, using plot_conf_matrix from an earlier cell.
plot_conf_matrix(mlp)

In [None]:
from xgboost import XGBClassifier

# create model instance: 20 estimators, max depth 4, learning rate 0.1, 'multi:softmax' objective
# NOTE(review): unlike `mlp` and `rfc`, no random_state is passed here — confirm that is intended.
# NOTE(review): presumably y_train must already hold integer-encoded class ids for XGBClassifier — verify the label dtype.
bst = XGBClassifier(n_estimators=20, max_depth=4, learning_rate=0.1, objective='multi:softmax')
# fit model on the training split
bst.fit(X_train, y_train)
# # make predictions
# preds = bst.predict(X_test)