StratifiedKFold vs Project split

In [11]:
# imports

# Models
from sklearn.base import BaseEstimator, TransformerMixin

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier as lgbm
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# pipelines
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

# preprocessors
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif, SelectKBest, SelectFromModel, RFE

# Samplers
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.over_sampling import SMOTE

# metrics and splitters
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_auc_score

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.model_selection import train_test_split

# utils
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.class_weight import compute_class_weight
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pickle
import sys
import matplotlib_venn as venn
import random

# progress bar
from ipywidgets import IntProgress
from IPython.display import display

sys.path.append("../")

import utils_ML as uml

In [12]:
# Expression matrix (rows = assays), ordered by assay id.
data = pd.read_csv("../PEMatrix/norm_NSAF_data2.csv", index_col="assay_id").sort_index()

# Metadata restricted to the assays that are present in the expression matrix.
meta = pd.read_csv("../../Metadata/unified_metadata.csv")
meta = meta[meta.assay_id.isin(data.index)]

# Annotate every assay with the group of its cell line (first match in the
# annotation table; an unknown cell line raises IndexError).
groups = pd.read_csv("../../Metadata/group_cells_annotation.csv", sep=";", index_col="Unnamed: 0")
meta["Group"] = [
    groups.loc[groups.cell_line == cell_line, "group"].values[0]
    for cell_line in meta.cell_line
]

# Same ordering as `data`, so that row i of `data` is described by row i of `meta`.
meta = meta.set_index("assay_id").sort_index()

# Integer-encode the group labels and derive balanced class weights.
target_encoder = LabelEncoder()
targets = target_encoder.fit_transform(meta.Group)
unique_labels = pd.unique(targets)  # labels in order of first appearance
class_weights = compute_class_weight(class_weight='balanced', classes=unique_labels, y=targets)

weights = dict(zip(unique_labels, class_weights))
print(weights)



{2: 2.6564102564102563, 9: 0.7194444444444444, 11: 1.3282051282051281, 3: 2.3022222222222224, 1: 0.3453333333333333, 7: 3.453333333333333, 5: 1.4388888888888889, 0: 0.4427350427350427, 8: 1.5014492753623188, 12: 1.4388888888888889, 10: 1.3282051282051281, 6: 0.5755555555555556, 13: 1.817543859649123, 14: 3.453333333333333, 4: 0.8222222222222222}


In [35]:
# NOTE: the metadata/dataset index must be positional (0 .. n_samples-1), because the returned split indices are consumed with iloc rather than loc.
class ProjectBasedSplit():
    """Cross-validation splitter that holds out whole projects (PXD accessions).

    Every split picks at most one project per class for the test set, so that
    samples of one project never end up on both sides of a split. A project is
    only held out when the remaining training data still contains every class
    with at least ``min_train_samples`` samples.

    The rows of the dataset handed to :meth:`split` must be in the same order
    as the rows of ``metadata``: the returned indices are positional.
    """

    def __init__(self, splits: int, metadata: pd.DataFrame, on: str = "Group",
                 max_test_projects: int = 5, min_train_samples: int = 4,
                 random_state=None):
        """Store the split configuration.

        splits: number of (train, test) pairs generated by ``split``.

        metadata: table used to generate the splits; it needs the column
            named by ``on`` and a ``PXD_accession`` column. Its index is
            replaced by a positional one.

        on: name of the column holding the class label.

        max_test_projects: upper bound on the number of held-out projects.

        min_train_samples: minimum number of training samples every class must
            keep (needed to oversample the class afterwards).

        random_state: optional seed for reproducible splits. When None, the
            global ``random`` module is used, so ``random.seed`` still applies.
        """
        self.splits = splits
        self.metadata = metadata.reset_index(drop=True)
        self.label = on
        self.label_indices = list(range(metadata[self.label].nunique()))
        self.max_test_projects = max_test_projects
        self.min_train_samples = min_train_samples
        self.random_state = random_state
        # Only a private generator is stored (never the module itself), which
        # keeps the splitter copyable and picklable.
        self._rng = random.Random(random_state) if random_state is not None else None
        # Held-out projects of every generated split, in generation order.
        # The list keeps growing when ``split`` is called more than once.
        self.dropped_pxds = []

    def split(self, dataset, metadata=None, groups=None):
        """Yield ``self.splits`` pairs of positional (train_index, test_index).

        ``metadata`` and ``groups`` are accepted for compatibility with the
        scikit-learn splitter signature and are not used: the metadata given
        to the constructor drives the split.
        """
        dataset = dataset.reset_index(drop=True)
        metadata = self.metadata.loc[self.metadata.index.isin(dataset.index), :]

        for _ in range(self.splits):
            train_index, test_index, dropped_pxds = self.train_test_project_split(
                dataset, metadata=metadata)
            # Recorded before yielding, so the consumer can already read the
            # projects of the split it is currently working on.
            self.dropped_pxds.append(dropped_pxds)

            yield train_index, test_index

    def train_test_project_split(self, dataset, metadata: pd.DataFrame = None, groups=None):
        """Draw one project-based split.

        Returns a tuple ``(train_index, test_index, chosen_projects)`` where
        both indices are numpy arrays of index labels and ``chosen_projects``
        lists the held-out PXD accessions. ``metadata`` defaults to the table
        given to the constructor; ``groups`` is unused.
        """
        if metadata is None:
            metadata = self.metadata
        rng = self._random_source()

        # One entry per class: the projects that contain samples of that class.
        projects_per_class = metadata.groupby(self.label)["PXD_accession"].unique()
        n_classes = len(projects_per_class)

        # Visit the classes in random order.
        order = list(range(n_classes))
        rng.shuffle(order)

        chosen = []
        for _, projects in projects_per_class.iloc[order].items():
            if len(chosen) >= self.max_test_projects:
                break
            # A project of this class is already held out.
            if any(pxd in chosen for pxd in projects):
                continue
            # The only project of a class can never be held out.
            if len(projects) < 2:
                continue

            candidate = rng.choice(list(projects))
            if self._training_set_is_valid(metadata, chosen + [candidate], n_classes):
                chosen.append(candidate)

        test_index = metadata.index[metadata["PXD_accession"].isin(chosen)].to_numpy()
        train_index = dataset.index[~dataset.index.isin(test_index)].to_numpy()

        return train_index, test_index, chosen

    def get_n_splits(self, x=None, y=None, groups=None):
        """Return the number of splits; all arguments are ignored."""
        return self.splits

    def _random_source(self):
        """Return the seeded generator, or the global ``random`` module."""
        return self._rng if self._rng is not None else random

    def _training_set_is_valid(self, metadata, held_out, n_classes):
        """Check that holding out ``held_out`` keeps every class trainable."""
        remaining = metadata.loc[~metadata["PXD_accession"].isin(held_out), self.label]
        counts = remaining.value_counts()
        # All classes must be in the training set ...
        if len(counts) != n_classes:
            return False
        # ... with enough samples to be oversampled.
        return bool((counts >= self.min_train_samples).all())

In [36]:
# Project-based cross-validation of a logistic regression for several
# occurrence-filter thresholds. One result row is stored per (fold, threshold).
N_SPLITS = 5
FILTER_PERCENTAGES = [.2, .4, .5, .6, .75, .9]

# One progress tick per (fold, threshold) combination.
f = IntProgress(min=0, max=N_SPLITS * len(FILTER_PERCENTAGES))
display(f)

splitter = ProjectBasedSplit(N_SPLITS, meta, on="Group")

for fold, (train, test) in enumerate(splitter.split(data, None), start=1):

    X_train, Y_train = data.iloc[train, :], targets[train]
    X_test, Y_test = data.iloc[test, :], targets[test]

    for filter_percentage in FILTER_PERCENTAGES:
        filtering = uml.FilterByOccurence(percentage=filter_percentage)
        imputer = uml.LowestValueImputer()
        scaler = MinMaxScaler()

        preprocessor = Pipeline(steps=[
            ('filtering', filtering),
            ('imputer', imputer),
            ('scaler', scaler),
        ])

        # The preprocessing is fitted on the training fold only and then
        # applied to both folds.
        preprocessor.fit(X_train)
        X_train_preprocessed = preprocessor.transform(X_train)
        X_test_preprocessed = preprocessor.transform(X_test)

        # Only the training fold is resampled.
        resampler = SMOTETomek(smote=SMOTE(k_neighbors=3))
        X_oversampled, Y_oversampled = resampler.fit_resample(X_train_preprocessed, Y_train)

        model = LogisticRegression(max_iter=10000)
        model.fit(X_oversampled, Y_oversampled)
        Y_pred = model.predict(X_test_preprocessed)

        micro_f1, macro_f1, weighted_f1, cm = uml.scoring_functions(
            Y_pred=Y_pred, Y_test=Y_test, labels=unique_labels)

        # Single-row frame describing this (fold, threshold) run.
        result_row = {
            "model": [type(model).__name__],
            "fold": [fold],
            "micro_f1": [micro_f1],
            "macro_f1": [macro_f1],
            "weighted_f1": [weighted_f1],
            "cm": [cm],
            "filter_type": ["global"],
            'projects': [splitter.dropped_pxds[fold - 1]],
            "filter_percentage": [filter_percentage],
            "proteins": [len(preprocessor.named_steps.filtering.filtered_proteins)],
        }
        results_df = pd.DataFrame(result_row)

        uml.save_results(results_df, "global_filtering_pxdsplit")

        f.value += 1

IntProgress(value=0, max=30)

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(tru