# Radiomics machine learning study with a small sample size: Single random training-test set split may result in unreliable results

## 1) Create selection frequency table

In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

In [6]:
# Cohort abbreviations used for every variable below:
#   GvM -> 'GBM vs. Metastasis' dataset
#   MEN -> 'Low- vs. High-grade meningioma' dataset

# Features are every column after the first one; the target is the 'Label' column.
# NOTE(review): this presumes column 0 is not a feature (e.g. it is 'Label' itself
# or an ID) -- confirm against the CSV layout.
df_GvM = pd.read_csv('GvM_cohort.csv')
X_GvM, y_GvM = df_GvM.iloc[:, 1:], df_GvM['Label']

df_MEN = pd.read_csv('MEN_cohort.csv')
X_MEN, y_MEN = df_MEN.iloc[:, 1:], df_MEN['Label']

# Standardise the features of each cohort. The same scaler object is refit per
# cohort, so after this cell it holds the statistics of the MEN cohort only.
# NOTE(review): the scaler is fit on the full cohort, i.e. before get_freq()
# draws its train/test splits -- confirm this is intended.
scaler = StandardScaler()
X_GvM = pd.DataFrame(data=scaler.fit_transform(X_GvM, y_GvM), columns=X_GvM.columns)
X_MEN = pd.DataFrame(data=scaler.fit_transform(X_MEN, y_MEN), columns=X_MEN.columns)

# Mean of the label column, reported below as the proportion of the second
# class named in each summary line (Metastasis / High-grade).
pos_rate_GvM = np.mean(y_GvM)
pos_rate_MEN = np.mean(y_MEN)

print(('1) GBM vs. Metastasis dataset: \n '
       '    No. of samples and No. of features: {0} \n '
       '    Proportions of GBM and Metastasis: {1} and {2} \n '
       '    ').format(X_GvM.shape,
                      round(1 - pos_rate_GvM, 2), round(pos_rate_GvM, 2)))

print(('2) Low- vs. High-grade meningioma dataset: \n '
       '    No. of samples and No. of features: {0} \n '
       '    Proportions of Low-grade and High-grade: {1} and {2} \n '
       '    ').format(X_MEN.shape,
                      round(1 - pos_rate_MEN, 2), round(pos_rate_MEN, 2)))

1) GBM vs. Metastasis dataset: 
     No. of samples and No. of features: (167, 558) 
     Proportions of GBM and Metastasis: 0.65 and 0.35 
     
2) Low- vs. High-grade meningioma dataset: 
     No. of samples and No. of features: (258, 186) 
     Proportions of Low-grade and High-grade: 0.63 and 0.37 
     


In [27]:
# define function
def get_freq(X, y, n_rep):
    """Compute per-feature selection frequencies over repeated random splits.

    For each of ``n_rep`` stratified 70/30 train-test splits (seeded with
    0 .. n_rep - 1), four ``SelectFromModel`` selectors are fit on the
    training part only, and every feature a selector keeps gets one count
    for that selector.

    Parameters
    ----------
    X : 2-D feature table (rows = samples, columns = features)
    y : class labels, also used to stratify the splits
    n_rep : int
        Number of random splits; must be at least 1.

    Returns
    -------
    pandas.DataFrame
        One row per feature (in the column order of ``X``, default integer
        index) with columns 'SVC', 'RF', 'LASSO', 'ADA' holding the fraction
        of splits in which the feature was selected, plus 'mean', the average
        of those four columns.

    Raises
    ------
    ValueError
        If ``n_rep`` is smaller than 1 (the frequencies would otherwise be 0/0).
    """
    if n_rep < 1:
        raise ValueError('n_rep must be a positive integer, got {!r}'.format(n_rep))

    # 4 feature selectors based on linear SVC, random forest, LASSO, and adaptive boosting.
    # Factories are used so that every split gets freshly constructed, unfitted estimators.
    selector_factories = {
        'SVC': lambda: SelectFromModel(LinearSVC(max_iter=10000, dual=False, random_state=0)),
        'RF': lambda: SelectFromModel(RandomForestClassifier(random_state=0)),
        'LASSO': lambda: SelectFromModel(LogisticRegression(penalty='l1', solver='liblinear',
                                                            max_iter=10000, dual=False, random_state=0)),
        'ADA': lambda: SelectFromModel(AdaBoostClassifier(random_state=0)),
    }

    # running selection counts, one float vector per selector
    counts = {name: np.zeros(X.shape[1]) for name in selector_factories}

    for rs in tqdm(range(n_rep)):
        # only the training part is needed for feature selection; the test part is discarded
        X_train, _, y_train, _ = \
              train_test_split(X, y, test_size=0.3, stratify=y, shuffle=True, random_state=rs)

        # add one count to each of the selected features
        for name, make_selector in selector_factories.items():
            counts[name] += make_selector().fit(X_train, y_train).get_support()

    result_df = pd.DataFrame.from_dict(counts) / n_rep # to obtain the frequency
    result_df['mean'] = result_df.mean(axis=1) # add a column showing the averaged frequency across the models

    return result_df

# Build the selection frequency table of each cohort (1000 random splits each)
# and write it to a .csv file; GvM is processed and saved before MEN.
freq_GvM = get_freq(X_GvM, y_GvM, n_rep=1000)
freq_GvM.to_csv('FeatSel_GvM.csv')

freq_MEN = get_freq(X_MEN, y_MEN, n_rep=1000)
freq_MEN.to_csv('FeatSel_MEN.csv')


100%|██████████| 10/10 [00:04<00:00,  2.16it/s]
100%|██████████| 10/10 [00:02<00:00,  3.37it/s]


In [24]:
# `result_df` is local to get_freq() and is never bound at notebook level, so a
# bare reference fails after Restart & Run All. Reload the exported table instead;
# to_csv() wrote the integer feature index as the first CSV column.
# NOTE(review): the GvM table is assumed to be the one meant here (the saved
# output runs to row ~556, which fits the 558-feature cohort) -- confirm.
pd.read_csv('FeatSel_GvM.csv', index_col=0)

Unnamed: 0,SVC,RF,LASSO,ADA,average
0,0.0,0.0,0.0,0.0,0.000
1,0.0,0.3,0.0,0.0,0.075
2,0.2,0.0,0.0,0.0,0.050
3,0.0,0.0,0.0,0.0,0.000
4,0.1,0.3,0.0,0.0,0.100
...,...,...,...,...,...
553,0.3,0.1,0.0,0.0,0.100
554,0.2,0.1,0.0,0.1,0.100
555,0.6,0.5,0.0,0.0,0.275
556,0.0,0.1,0.0,0.0,0.025
