In [1]:
# imports

# Models
from sklearn.base import BaseEstimator, TransformerMixin

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier as lgbm
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# pipelines
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

# preprocessors
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif, SelectKBest, SelectFromModel, RFE
from sklearn.impute import SimpleImputer

# Samplers
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.over_sampling import SMOTE

# metrics and splitters
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_auc_score

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.model_selection import train_test_split

# utils
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.class_weight import compute_class_weight
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pickle
import sys
import matplotlib_venn as venn

# progress bar
from ipywidgets import IntProgress
from IPython.display import display

sys.path.append("../")

import utils_ML as uml
import AtlasAnalysisFunctions as AAF

In [2]:
# Load the expression matrix (one row per assay) and keep only the metadata rows
# that describe an assay present in that matrix.
data = pd.read_csv("../PEMatrix/norm_NSAF_data2.csv", index_col = "assay_id")
meta = pd.read_csv("../../Metadata/unified_metadata.csv")
# .copy() makes `meta` an independent frame, so adding the "Group" column below
# does not write into a slice of the original (avoids SettingWithCopyWarning).
meta = meta[meta.assay_id.isin(data.index)].copy()

groups = pd.read_csv("../../Metadata/group_cells_annotation.csv", sep =";", index_col="Unnamed: 0")

# Build the cell_line -> group lookup once instead of re-scanning `groups` for
# every metadata row. keep="first" mirrors the previous `.values[0]` behaviour
# when a cell line is annotated more than once.
cell_line_to_group = groups.drop_duplicates(subset="cell_line", keep="first").set_index("cell_line")["group"]
unknown_cell_lines = meta.loc[~meta.cell_line.isin(cell_line_to_group.index), "cell_line"].unique()
if len(unknown_cell_lines) > 0:
    # Fail loudly and name the offenders rather than an opaque IndexError.
    raise KeyError(f"No group annotation for cell line(s): {list(unknown_cell_lines)}")
meta["Group"] = meta.cell_line.map(cell_line_to_group)
meta = meta.set_index("assay_id")

# Sort both frames by assay id so that row i of `data` and row i of `meta`
# (and therefore `targets[i]`) refer to the same assay.
data = data.sort_index()
meta = meta.sort_index()
if not data.index.equals(meta.index):
    raise ValueError(
        "data and meta are not aligned on assay_id after sorting; "
        "targets would be paired with the wrong rows"
    )

# Integer-encode the group labels used as classification targets.
target_encoder = LabelEncoder()
targets = target_encoder.fit_transform(meta.Group)
# Labels in order of first appearance; this order is also what later cells pass
# as `labels=` to the scoring helper, so it is intentionally left unsorted.
unique_labels = pd.Series(targets).unique()
class_weights = compute_class_weight(class_weight='balanced', classes=unique_labels, y=targets)

# label -> balanced class weight
weights = dict(zip(unique_labels, class_weights))
print(weights)

{2: 2.6564102564102563, 9: 0.7194444444444444, 11: 1.3282051282051281, 3: 2.3022222222222224, 1: 0.3453333333333333, 7: 3.453333333333333, 5: 1.4388888888888889, 0: 0.4427350427350427, 8: 1.5014492753623188, 12: 1.4388888888888889, 10: 1.3282051282051281, 6: 0.5755555555555556, 13: 1.817543859649123, 14: 3.453333333333333, 4: 0.8222222222222222}


Which preprocessing gives the better classification performance?
- using the log-transformed data as is
- exponentiating the data (`np.exp2`)
- MinMax scaling vs. standard scaling

Benchmarking is done with 50% occurrence filtering and SMOTETomek resampling.

In [3]:
# Element-wise 2**x of the loaded matrix; this is the "exponentiated" variant
# benchmarked against the log-transformed one.
# NOTE(review): assumes `data` is log2-scaled - confirm against the file that produced it.
exp_data = data.pipe(np.exp2)

In [4]:
# Benchmark on the exponentiated data: MinMax vs. standard scaling, each followed by
# SMOTETomek resampling of the training fold, scored for every model in `models`.

# Fixed seed so the folds, the resampling and the random forest are reproducible,
# and so this cell scores exactly the same folds as the log-transformed benchmark
# (which uses the same seed), making the two comparisons paired.
RANDOM_STATE = 42
N_SPLITS = 10

splitter = StratifiedKFold(N_SPLITS, shuffle = True, random_state=RANDOM_STATE)
models = [LogisticRegression(max_iter=10000), SVC(), RandomForestClassifier(random_state=RANDOM_STATE)]

# One progress tick per (fold, model) pair.
f = IntProgress(min=0, max= N_SPLITS * len(models))
display(f)

for fold, (train, test) in enumerate(splitter.split(exp_data, targets), start=1):

    X_train = exp_data.iloc[train,:]
    Y_train = targets[train]
    X_test = exp_data.iloc[test,:]
    Y_test = targets[test]

    # Filter features, then fill missing values with 0.
    # NOTE(review): FilterByOccurence runs with its defaults here; the notebook text
    # says 50% occurrence filtering - confirm that is the default in utils_ML.
    preprocessor = Pipeline(steps=[
        ('filter', uml.FilterByOccurence()),
        ('imputer', SimpleImputer(strategy='constant', fill_value=0))
    ])

    minmax_scaler = MinMaxScaler()
    standard_scaler = StandardScaler()

    # Everything is fitted on the training fold only; the test fold is just transformed.
    preprocessor.fit(X_train)
    X_train_preprocessed = preprocessor.transform(X_train)
    X_test_preprocessed = preprocessor.transform(X_test)

    minmax_scaler.fit(X_train_preprocessed)
    standard_scaler.fit(X_train_preprocessed)

    # Minmax (resampling is applied to the training fold only)
    X_train_mm = minmax_scaler.transform(X_train_preprocessed)
    X_test_mm = minmax_scaler.transform(X_test_preprocessed)
    X_train_mm, Y_train_mm = SMOTETomek(random_state=RANDOM_STATE).fit_resample(X_train_mm, Y_train)

    # Standard scaler (resampling is applied to the training fold only)
    X_train_std = standard_scaler.transform(X_train_preprocessed)
    X_test_std = standard_scaler.transform(X_test_preprocessed)
    X_train_std, Y_train_std = SMOTETomek(random_state=RANDOM_STATE).fit_resample(X_train_std, Y_train)

    # (scaler name stored in the results, resampled train X, resampled train y, scaled test X)
    scaled_variants = [
        ("minmaxScaler", X_train_mm, Y_train_mm, X_test_mm),
        ("standardScaler", X_train_std, Y_train_std, X_test_std),
    ]

    for model in models:

        # The same estimator object is refitted for each variant; fit() starts from scratch.
        for scaler_name, X_fit, Y_fit, X_eval in scaled_variants:
            model.fit(X_fit, Y_fit)
            Y_pred = model.predict(X_eval)

            micro_f1, macro_f1, weighted_f1, cm = uml.scoring_functions(Y_pred=Y_pred, Y_test=Y_test, labels=unique_labels)
            results_df = pd.DataFrame({"model": [type(model).__name__], "fold": [fold], "micro_f1": [micro_f1],
                                       "macro_f1": [macro_f1], "weighted_f1": [weighted_f1] ,"cm": [cm], "scaler": [scaler_name],
                                       "oversampler": ["SMOTETomek"], 'data_type': ["exp_data"]})
            # NOTE(review): presumably appends to the "scaling_comparison" results, so
            # re-running this cell may add duplicate rows - confirm in utils_ML.
            uml.save_results(results_df, "scaling_comparison")

        f.value += 1

IntProgress(value=0, max=30)

In [5]:
# Benchmark on the log-transformed data: MinMax vs. standard scaling, each followed by
# SMOTETomek resampling of the training fold, scored for every model in `models`.

# Fixed seed so the folds, the resampling and the random forest are reproducible,
# and so this cell scores exactly the same folds as the exponentiated-data benchmark
# (which uses the same seed), making the two comparisons paired.
RANDOM_STATE = 42
N_SPLITS = 10

splitter = StratifiedKFold(N_SPLITS, shuffle = True, random_state=RANDOM_STATE)
models = [LogisticRegression(max_iter=10000), SVC(), RandomForestClassifier(random_state=RANDOM_STATE)]

# One progress tick per (fold, model) pair.
f = IntProgress(min=0, max= N_SPLITS * len(models))
display(f)

for fold, (train, test) in enumerate(splitter.split(data, targets), start=1):

    X_train = data.iloc[train,:]
    Y_train = targets[train]
    X_test = data.iloc[test,:]
    Y_test = targets[test]

    # Filter features, then impute with the project's LowestValueImputer.
    # NOTE(review): both uml transformers run with their defaults; the notebook text
    # says 50% occurrence filtering - confirm that is the default in utils_ML.
    preprocessor = Pipeline(steps=[
        ('filter', uml.FilterByOccurence()),
        ('imputer', uml.LowestValueImputer())
    ])

    minmax_scaler = MinMaxScaler()
    standard_scaler = StandardScaler()

    # Everything is fitted on the training fold only; the test fold is just transformed.
    preprocessor.fit(X_train)
    X_train_preprocessed = preprocessor.transform(X_train)
    X_test_preprocessed = preprocessor.transform(X_test)

    minmax_scaler.fit(X_train_preprocessed)
    standard_scaler.fit(X_train_preprocessed)

    # Minmax (resampling is applied to the training fold only)
    X_train_mm = minmax_scaler.transform(X_train_preprocessed)
    X_test_mm = minmax_scaler.transform(X_test_preprocessed)
    X_train_mm, Y_train_mm = SMOTETomek(random_state=RANDOM_STATE).fit_resample(X_train_mm, Y_train)

    # Standard scaler (resampling is applied to the training fold only)
    X_train_std = standard_scaler.transform(X_train_preprocessed)
    X_test_std = standard_scaler.transform(X_test_preprocessed)
    X_train_std, Y_train_std = SMOTETomek(random_state=RANDOM_STATE).fit_resample(X_train_std, Y_train)

    # (scaler name stored in the results, resampled train X, resampled train y, scaled test X)
    scaled_variants = [
        ("minmaxScaler", X_train_mm, Y_train_mm, X_test_mm),
        ("standardScaler", X_train_std, Y_train_std, X_test_std),
    ]

    for model in models:

        # The same estimator object is refitted for each variant; fit() starts from scratch.
        for scaler_name, X_fit, Y_fit, X_eval in scaled_variants:
            model.fit(X_fit, Y_fit)
            Y_pred = model.predict(X_eval)

            micro_f1, macro_f1, weighted_f1, cm = uml.scoring_functions(Y_pred=Y_pred, Y_test=Y_test, labels=unique_labels)
            results_df = pd.DataFrame({"model": [type(model).__name__], "fold": [fold], "micro_f1": [micro_f1],
                                       "macro_f1": [macro_f1], "weighted_f1": [weighted_f1] ,"cm": [cm], "scaler": [scaler_name],
                                       "oversampler": ["SMOTETomek"], "data_type": ["log-transformed"]})
            # NOTE(review): presumably appends to the "scaling_comparison" results, so
            # re-running this cell may add duplicate rows - confirm in utils_ML.
            uml.save_results(results_df, "scaling_comparison")

        f.value += 1

IntProgress(value=0, max=30)