<a href="https://colab.research.google.com/github/Chansikan/do_not_split_small_sample/blob/main/3)Compare_CV_methods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Radiomics machine learning study with a small sample size: Single random training-test set split may lead to unreliable results — Comparing CV methods

In [2]:
# import necessary modules
import numpy as np
import pandas as pd
from random import choices
from tqdm import tqdm

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score,  KFold, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [4]:
# Meningioma grading cohort (low- vs. high-grade), used without undersampling.
df_MEN = pd.read_csv('MEN_cohort.csv')
y_MEN = df_MEN.loc[:, 'Label']
# Every column after the first is treated as a feature
# (presumably 'Label' is the first column -- TODO confirm).
X_MEN = df_MEN.iloc[:, 1:]

# Cohort summary. The mean of the label is reported as the high-grade share,
# which assumes 'Label' is coded 0 = low-grade / 1 = high-grade -- TODO confirm.
print('Low- vs. High-grade meningioma dataset: \n \
    No. of samples and No. of features: {0} \n \
    Proportions of Low-grade and High-grade: {1} and {2} \n \
    '.format(
        X_MEN.shape,
        round(1 - y_MEN.mean(), 2),
        round(y_MEN.mean(), 2),
    ))

Low- vs. High-grade meningioma dataset: 
     No. of samples and No. of features: (258, 186) 
     Proportions of Low-grade and High-grade: 0.63 and 0.37 
     


In [5]:
def get_compare_result(dataset, rs_list, params):
  """Compare four internal-validation schemes over several train/test splits.

  For every seed in ``rs_list`` the data are split 70/30 (stratified on the
  label). On the training part, a pipeline of StandardScaler ->
  SelectKBest(f_classif) -> L1-penalised logistic regression is tuned with
  ``GridSearchCV`` (scoring: ROC AUC) and its performance is estimated with
  each of the following methods:

  * ``'CV'``        - 5-fold cross validation (scores of the best grid point).
  * ``'CV_rep'``    - 5-fold cross validation repeated 10 times (50 splits).
  * ``'nested_CV'`` - 5-fold outer loop around the 5-fold grid search.
  * ``'BS'``        - 10 bootstrap resamples, each scored on its out-of-bag
    training samples.

  The model refitted on the whole training part is then scored on the
  held-out 30 %.

  Parameters
  ----------
  dataset : str
      Path of the CSV file. It must contain a ``'Label'`` column; every column
      after the first is used as a feature (presumably ``'Label'`` is the
      first column -- TODO confirm).
  rs_list : iterable of int
      Seeds for the train/test split. The same seed also drives the repeated
      CV splitter, the nested-CV splitters and the bootstrap resampling, so a
      run is reproducible.
  params : dict
      Parameter grid handed to ``GridSearchCV`` (keys use the pipeline step
      names ``fs`` and ``clf``).

  Returns
  -------
  pandas.DataFrame
      One row per (seed, method), indexed by the seed, with the columns
      ``Task``, ``Method``, ``Random_state``, ``CV_AUC``, ``CV_AUC_SD`` and
      ``Test_AUC`` (all AUC values rounded to 3 decimals).
  """
  methods = ['BS', 'CV', 'CV_rep', 'nested_CV']
  combs = [(rs, method) for rs in rs_list for method in methods]

  def best_split_scores(fitted_search):
    # Per-split validation AUCs of the selected grid point. n_splits_ covers
    # every split the search actually ran (5 for plain CV, 50 for repeated
    # CV), so no split is silently dropped.
    best = fitted_search.best_index_
    return [fitted_search.cv_results_['split{}_test_score'.format(i)][best]
            for i in range(fitted_search.n_splits_)]

  # The file does not change between iterations: read it once.
  df = pd.read_csv(dataset)
  X = df.iloc[:, 1:]
  y = df.loc[:, 'Label']

  records = []
  for (rs, method) in tqdm(combs):

    X_train, X_test, y_train, y_test = \
      train_test_split(X, y, test_size=0.3, shuffle=True, stratify=y, random_state=rs)

    pipe = Pipeline([('scaler', StandardScaler()),
                     ('fs', SelectKBest(f_classif)),
                     ('clf', LogisticRegression(penalty='l1', solver='liblinear',
                                                max_iter=10000, dual=False,
                                                random_state=0))])

    # 1) Cross validation without repetition
    if method == 'CV':
      search = GridSearchCV(pipe, param_grid=params, scoring='roc_auc', cv=5)
      search.fit(X_train, y_train)
      auc_list = best_split_scores(search)

    # 2) Cross validation with repetition
    elif method == 'CV_rep':
      # Seeded so that the 10 repetitions are the same on every run.
      rep_cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=rs)
      search = GridSearchCV(pipe, param_grid=params, scoring='roc_auc', cv=rep_cv)
      search.fit(X_train, y_train)
      # Summarise all 50 splits, not only those of the first repetition.
      auc_list = best_split_scores(search)

    # 3) nested cross validation
    elif method == 'nested_CV':
      inner_cv = KFold(n_splits=5, shuffle=True, random_state=rs)
      outer_cv = KFold(n_splits=5, shuffle=True, random_state=rs)

      search = GridSearchCV(pipe, param_grid=params, scoring='roc_auc', cv=inner_cv)
      # Fitted here because the test step below needs a fitted search;
      # cross_val_score works on clones and leaves this fit untouched.
      search.fit(X_train, y_train)
      auc_list = cross_val_score(search, X=X_train, y=y_train,
                                 scoring='roc_auc', cv=outer_cv)

    # 4) Splitting with bootstrap
    elif method == 'BS':
      rng = np.random.RandomState(rs)  # seeded resampling -> reproducible
      auc_list = []
      for _ in range(10):
        idx_selected = rng.choice(X_train.index, len(X_train), replace=True)
        # Out-of-bag rows: training samples never drawn in this resample.
        idx_not_selected = ~X_train.index.isin(idx_selected)
        X_tr = X_train.loc[idx_selected, :]
        y_tr = y_train.loc[idx_selected]
        X_val = X_train.loc[idx_not_selected, :]
        y_val = y_train.loc[idx_not_selected]

        search = GridSearchCV(pipe, param_grid=params, scoring='roc_auc', cv=5)
        search.fit(X_tr, y_tr)

        y_pred = search.predict_proba(X_val)
        auc_list.append(roc_auc_score(y_val, y_pred[:, 1]))

      # Refit on the full training part before scoring the held-out test set.
      search.fit(X_train, y_train)

    else:
      raise ValueError('Unknown method: {}'.format(method))

    AUC_mean = np.mean(auc_list)
    AUC_sd = np.std(auc_list)

    # Testing
    y_pred = search.predict_proba(X_test)
    Test_AUC = roc_auc_score(y_test, y_pred[:, 1])

    records.append({'Task': dataset, 'Method': method,
                    'Random_state': rs,
                    'CV_AUC': round(AUC_mean, 3),
                    'CV_AUC_SD': round(AUC_sd, 3),
                    'Test_AUC': round(Test_AUC, 3)})

  # Build the frame once (instead of concatenating inside the loop); rows keep
  # the seed as their index label, as before.
  result_df = pd.DataFrame(records, index=[rec['Random_state'] for rec in records])

  return result_df

In [6]:
# Two sets of train/test split seeds for the meningioma cohort
# (presumably selected in an earlier notebook -- TODO confirm).
RS_for_MEN_moderate = [229, 996, 69, 953, 450, 397, 52, 321, 694, 248]
RS_for_MEN_extreme = [999, 346, 518, 171, 983, 642, 573, 308, 690, 357]

# Grid searched for every method: number of features kept by the 'fs' step
# and the C value of the 'clf' step.
params = {
    'fs__k': range(20, 55, 5),
    'clf__C': [0.001, 0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10],
}

# Run the comparison for each seed set and store the result tables.
get_compare_result(
    dataset='MEN_cohort.csv', rs_list=RS_for_MEN_moderate, params=params
).to_csv('compare_methods_MEN_moderate.csv')
get_compare_result(
    dataset='MEN_cohort.csv', rs_list=RS_for_MEN_extreme, params=params
).to_csv('compare_methods_MEN_extreme.csv')

100%|██████████| 40/40 [09:59<00:00, 14.98s/it]
100%|██████████| 40/40 [10:42<00:00, 16.05s/it]


: 