<a href="https://colab.research.google.com/github/TrousersOMG/comp189cw2/blob/main/vbm_cw_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predict schizophrenia from brain grey matter (classification)

In [None]:
# Mount Google Drive at /content/drive; the data paths used further down live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

## Import Packages

In [None]:
import numpy as np
import scipy as sci
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import os
%matplotlib inline

## Load Data: ROIs

In [None]:
# Folder containing the participants / ROI CSV files read by the loading cell.
# NOTE(review): `path` is rebound to the parent folder later in the notebook (before
# `problem.get_train_data`), so re-running the CSV-loading cell after that point reads from the wrong place.
path = '/content/drive/MyDrive/UCL/comp189/brain_anatomy_schizophrenia_UCL_2023/data'

In [None]:
# Load the data

# Author's note: the WM, GM and CSF values of the participants table are also
# present in the ROI table; age, diagnosis and sex are taken from the
# participants table, which is why they are copied across below.

# Training split: participants table and ROI table
participants_train = pd.read_csv(os.path.join(path,"train_participants.csv"))
rois_train = pd.read_csv(os.path.join(path,"train_rois.csv"))

# Test split: participants table and ROI table
participants_test = pd.read_csv(os.path.join(path,"test_participants.csv"))
rois_test = pd.read_csv(os.path.join(path,"test_rois.csv"))

# Columns to pull out of the participants table
add_features = ["sex","age","diagnosis","site"]


def _extend_rois(rois, participants):
    """Place the `add_features` columns of `participants` after the first two ROI-table columns.

    The pieces are joined column-wise on the row index, so both tables are
    assumed to list the subjects in the same order -- TODO confirm.
    NOTE(review): the ROI-table column at position 2 is left out (0:2, then 3:);
    confirm that this is intentional.
    """
    leading_cols = rois.iloc[:,0:2]
    remaining_cols = rois.iloc[:,3:]
    return pd.concat([leading_cols, participants[add_features], remaining_cols], axis=1)


# Same operation for train and test
rois_extend_train = _extend_rois(rois_train, participants_train)
rois_extend_test = _extend_rois(rois_test, participants_test)




In [None]:
# Give the extended tables shorter names and tag every row with its split.
# `assign` returns a new frame, so `rois_extend_train` / `rois_extend_test` are
# no longer modified through an alias.
train_data = rois_extend_train.assign(set='train')
test_data = rois_extend_test.assign(set='test')

# combine train and test data together
# ignore_index=True gives the stacked frame a fresh 0..n-1 index; without it the
# row labels of both inputs are kept and each label can occur twice, which
# breaks label-based alignment in later steps.
data = pd.concat([train_data, test_data], ignore_index=True)

# Investigate the Distribution of the Data

### Check whether there is any missing data

In [None]:
def check_df(df):
    """Print the shape of `df` and return a per-column missing-value summary.

    The returned DataFrame is indexed by the column names of `df` and has three
    columns: 'missing_number' (count of NaN entries), 'missing_rate' (that count
    divided by the number of rows, formatted as a percentage string with two
    decimals) and 'type' (the column dtype).
    """
    print('############### shape ###################')
    print(df.shape)

    print('############### summary of features ####################')
    n_missing = df.isna().sum()
    n_rows = df.shape[0]
    # Format the fractions as text up front, e.g. 0.5 -> '50.00%'
    rate_text = [format(rate, '.2%') for rate in (n_missing.to_numpy() / n_rows).tolist()]

    summary = pd.DataFrame(
        {'missing_number': n_missing, 'missing_rate': rate_text},
        index=df.columns,
    )
    summary['type'] = df.dtypes
    return summary


# Summarise missing values of the combined train+test table and show the table
check_list = check_df(data)
print(check_list)

# Number of columns that contain at least one missing value
has_missing = check_list['missing_number'] != 0
num = check_list.loc[has_missing, 'missing_number'].count()
print(f'There are {num} features has missing data.')

### Age

In [None]:
# Age: one violin per split
age_ax = sns.violinplot(data=data, x='set', y='age')
age_ax.set_title('Distribution of Age in Train and Test Data')

# Author's reading of the figure: the age distributions of the two sets look
# similar (centre and 0.25-0.75 interval), but the training set contains a
# higher proportion of young participants than the test set.

# Descriptive statistics of age per split, then per split and sex
age_by_set = data[['set','age']].groupby('set')
print(age_by_set.describe())
age_by_set_and_sex = data[['set','sex','age']].groupby(['set','sex'])
print(age_by_set_and_sex.describe())

# Author's conclusion: quantitatively there is no marked difference in age
# between the train and test sets, and the subgroups obtained by splitting
# further by sex have similar distributions as well.

### Sex

In [None]:
# Sex: counts per category over the combined table
sex_ax = sns.histplot(data=data, x='sex')
sex_ax.set_title('Histogram of Sex')

# Number of rows per (split, sex) pair
sex_counts = data.groupby(['set','sex']).size()
print(sex_counts)

# Author's note: the sex distribution is similar in the train and test sets,
# but the two sexes are imbalanced.

### Diagnosis

In [None]:
# Diagnosis: counts per class over the combined table
diagnosis_ax = sns.histplot(data=data, x='diagnosis')
diagnosis_ax.set_title('Histogram of diagnosis')

# Number of rows per (split, diagnosis) pair
diagnosis_counts = data.groupby(['set','diagnosis']).size()
print(diagnosis_counts)

# Author's note: overall, positive and negative samples are balanced in both
# the train and the test set.

# Investigate the Features that Could Be Related to Diagnosis

In [None]:
# Columns from 'l3thVen_GM_Vol' onwards are the ROI measurements. The slice
# also picks up helper columns appended to `data` afterwards ('set', plus
# 'PC1'/'PC2' once the PCA cell has run); they are dropped here so that
# re-running this cell never feeds earlier components back into the ROI matrix.
roi_block = data.loc[:,'l3thVen_GM_Vol':].drop(columns=['set','PC1','PC2'], errors='ignore')
corr_data = pd.concat([roi_block, data['diagnosis']], axis=1)

# change the str to num
# `map` yields a numeric column directly; `replace` on an object column relies
# on an implicit downcast that recent pandas versions deprecate.
# Labels missing from map_dic become NaN.
map_dic = {'control':0,'schizophrenia':1}

corr_data['diagnosis'] = corr_data['diagnosis'].map(map_dic)

In [None]:
# plot the heatmap
# fig = plt.figure(figsize=(100,120))

# sns.heatmap(np.abs(corr_data.corr()),cmap='Blues')

**TODO:** understand the theory behind this part (PCA followed by a one-way ANOVA).

In [None]:
# PCA on the ROI columns, followed by a one-way ANOVA of each component against diagnosis

from sklearn.decomposition import PCA
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms

# Feature matrix: every column of corr_data except the encoded label
rois = corr_data.drop(columns=['diagnosis'])

# Keep the first two principal components and store them as columns of `data`
PCs = PCA(n_components=2).fit_transform(rois)
data['PC1'] = PCs[:, 0]
data['PC2'] = PCs[:, 1]

sns.scatterplot(data=data, x="PC1", y="PC2", hue="diagnosis")

# Rough check of how each component relates to diagnosis: fit an OLS model per
# component and print its type-II ANOVA table.
for formula in ('PC1 ~ diagnosis', 'PC2 ~ diagnosis'):
    oneway = smf.ols(formula, data).fit()
    # print(oneway.summary())
    print(sm.stats.anova_lm(oneway, typ=2))


# Author's note: in the ANOVA tables both p-values were below 0.05, so the
# hypothesis that these two components are unrelated to diagnosis is rejected.

# Preparation

In [None]:
!pip install ramp-workflow

# import package
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.base import BaseEstimator
import sklearn.preprocessing as preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
import sklearn.metrics as metrics

# change the direction to import the .py file
import sys
sys.path.append('/content/drive/MyDrive/UCL/comp189/brain_anatomy_schizophrenia_UCL_2023')
import problem

from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin

In [None]:
# load data
# NOTE: this rebinds the global `path` to the project root (the earlier cell
# pointed it at the `data` sub-folder).
path = '/content/drive/MyDrive/UCL/comp189/brain_anatomy_schizophrenia_UCL_2023'
# Feature matrices and targets come from the project's `problem` module.
X_train, y_train = problem.get_train_data(path=path)
X_test, y_test = problem.get_test_data(path=path)


# Author's question: why can the files not be read unless the .ipynb sits in
# this folder? -- Because of how the read path is set inside the .py file.

In [None]:
# Sanity check on the feature layout: X_train must have 284 + 331695 = 331979
# columns. If it does not, AssertionError is raised and execution stops here.
expected_n_features = 284 + 331695
assert X_train.shape[1] == expected_n_features

### Build transformers that select the low-dimensional (ROI) and the high-dimensional (VBM) feature blocks

In [None]:
# this part only divided the data into low dimensional feature and high dimensional features

class ROIsFeatureExtractor(BaseEstimator, TransformerMixin):
    """Select only the first 284 columns of X (the ROI features).

    The transformer is stateless: `fit` learns nothing and `transform` is a
    plain column slice.
    """
    def fit(self, X, y=None):
        """Do nothing and return self.

        `y` is optional so that calls without a target, such as
        `fit_transform(X)`, work as well.
        """
        return self

    def transform(self, X):
        """Return `X[:, :284]`; X must support 2-D NumPy-style slicing."""
        return X[:, :284]

class VBMFeatureExtractor(BaseEstimator, TransformerMixin):
    """Select every column of X from index 284 onwards (the VBM features).

    The transformer is stateless: `fit` learns nothing and `transform` is a
    plain column slice.
    """
    def fit(self, X, y=None):
        """Do nothing and return self.

        `y` is optional so that calls without a target, such as
        `fit_transform(X)`, work as well.
        """
        return self

    def transform(self, X):
        """Return `X[:, 284:]`; X must support 2-D NumPy-style slicing."""
        return X[:, 284:]

# Machine Learning Part

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedGroupKFold, KFold, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA

### Split Strategy

In [None]:
N_FOLDS = 2  # number of folds for the KFold and StratifiedKFold helpers

def get_cv_KFold(X, y):
    """Return shuffled KFold splits of (X, y) as a list of (train_idx, test_idx) pairs.

    A list is returned instead of the raw generator from `split`: a generator
    can be iterated only once, so handing the same object to a second
    GridSearchCV would give that search no splits at all.
    """
    cv_train = KFold(n_splits=N_FOLDS, shuffle=True, random_state=0)
    return list(cv_train.split(X, y))

def get_cv_SGKFold(X, y):
    """Return StratifiedGroupKFold splits of (X, y), grouped by sex, as a list.

    The group labels are read from the global `rois_extend_train['sex']`, so the
    rows of X are assumed to be in the same order as that table -- TODO confirm.
    Samples sharing a group label never appear on both sides of a split.
    n_splits is fixed at 2 -- presumably because there are only two sex labels;
    verify before changing it.

    A list is returned instead of the raw generator from `split`, so the same
    splits can be passed to several GridSearchCV objects.
    """
    cv_train = StratifiedGroupKFold(n_splits=2, shuffle=True, random_state=0)
    group = np.array(rois_extend_train['sex'])
    return list(cv_train.split(X, y, groups=group))

def get_cv_SKFold(X, y):
    """Return shuffled StratifiedKFold splits of (X, y) as a list of (train_idx, test_idx) pairs.

    Stratification uses `y` only. StratifiedKFold does not use group labels, so
    none are passed (the previous `groups=` argument had no effect on the splits).

    A list is returned instead of the raw generator from `split`, so the same
    splits can be passed to several GridSearchCV objects.
    """
    cv_train = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=0)
    return list(cv_train.split(X, y))



# High Dimension: VBM

## Logistic Regression

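Three cross-validation schemes are compared: KFold, StratifiedKFold and StratifiedGroupKFold. *Stratified* means that every fold keeps approximately the same class ratio as the whole training set. *Group* means that samples sharing a group label are never split between the training part and the validation part of a fold (here the group label is `sex`).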

In [None]:
# mapping transform the str in y_train and y_test into int
def mapping(array):
    """Encode diagnosis labels as integers: 'control' -> 0, 'schizophrenia' -> 1.

    Parameters
    ----------
    array : array-like
        Labels as strings (object or unicode dtype), or values that are
        already numeric.

    Returns
    -------
    numpy.ndarray of int with the same shape as the input when the input holds
    string labels; otherwise the input object itself, unchanged. This keeps the
    function idempotent, so `y = mapping(y)` can safely be run more than once.

    Raises
    ------
    ValueError
        If a label other than 'control' / 'schizophrenia' is found.
    """
    map_dic = {'schizophrenia':1, 'control':0}

    labels = np.asarray(array)
    # 'O' = Python objects, 'U' = fixed-width unicode strings; any other dtype
    # is treated as already encoded.
    if labels.dtype.kind not in ('O', 'U'):
        return array

    flat = labels.ravel().tolist()
    unknown = {lab for lab in flat if lab not in map_dic}
    if unknown:
        raise ValueError('mapping: unknown diagnosis label(s): %s'
                         % ', '.join(sorted(repr(lab) for lab in unknown)))

    # A list comprehension rather than np.vectorize: it also handles empty input.
    arr = np.array([map_dic[lab] for lab in flat], dtype=int).reshape(labels.shape)
    return arr

In [None]:
# Encode both target vectors as integers; `mapping` returns non-object input
# unchanged, so re-running this cell is harmless.
y_train, y_test = mapping(y_train), mapping(y_test)

In [None]:
# the split strategy
# The splits are materialised with list(): an iterable given as `cv` is
# consumed by the search that receives it, and each of these is passed to more
# than one GridSearchCV below.
cv_k = list(get_cv_KFold(X_train, y_train))
cv_sk = list(get_cv_SKFold(X_train, y_train))
cv_sgk = list(get_cv_SGKFold(X_train, y_train))

pipelines: with and without standardscaler

In [None]:
# Two logistic-regression pipelines on the VBM columns; they differ only in the
# StandardScaler step of the second one.
lr_steps = [
    VBMFeatureExtractor(),
    LogisticRegression(solver='saga', max_iter=10000, random_state=0),
]
lr = make_pipeline(*lr_steps)

lrS_steps = [
    VBMFeatureExtractor(),
    StandardScaler(),
    LogisticRegression(solver='saga', max_iter=10000, random_state=0),
]
lrS = make_pipeline(*lrS_steps)


# Hyper-parameter grid; the 'logisticregression__' prefix addresses the
# LogisticRegression step of either pipeline.
lr_hp = {}
lr_hp['logisticregression__C'] = np.linspace(0.5, 5, num=9)  # 9 evenly spaced values from 0.5 to 5
lr_hp['logisticregression__penalty'] = ['l1','l2',None]

# Metrics recorded by the grid search: display name -> scikit-learn scorer name
scores = dict([
    ('accuracy', 'balanced_accuracy'),
    ('recall', 'recall'),
    ('roc-auc', 'roc_auc'),
])

train model without standard

In [None]:
# KFold splits, unscaled pipeline `lr`: grid search over `lr_hp`, recording
# every metric in `scores` and refitting the candidate with the best 'roc-auc'.
lr_k = GridSearchCV(
    estimator=lr, param_grid=lr_hp, cv=cv_k,
    scoring=scores, refit='roc-auc', return_train_score=True,
    n_jobs=-1, verbose=2,
)

# Run the search on the training data
lr_k.fit(X_train, y_train)

In [None]:
# Cross-validation summary of the finished search
print("Best parameters: ", lr_k.best_params_)
print("Best score: ", lr_k.best_score_)
print("Refit time: ", lr_k.refit_time_)

# Held-out evaluation with the refitted best pipeline
f_lr_k = lr_k.best_estimator_
y_pred_test = f_lr_k.predict(X_test)
# predict_proba has one column per class; column 1 is kept as the score of class 1
score_pred_test = f_lr_k.predict_proba(X_test)[:, 1]

# Test-set metrics
bacc_test = metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred_test)
auc_test = metrics.roc_auc_score(y_true=y_test, y_score=score_pred_test)
recall_test = metrics.recall_score(y_true=y_test, y_pred=y_pred_test)

print("# Test")
test_report = ('bACC=%.2f' % bacc_test, 'ROC-AUC=%.2f' % auc_test, 'RECALL=%.2f' % recall_test)
print(*test_report)


In [None]:
# StratifiedKFold splits, unscaled pipeline `lr`: grid search over `lr_hp`, recording
# every metric in `scores` and refitting the candidate with the best 'roc-auc'.
lr_sk = GridSearchCV(
    estimator=lr, param_grid=lr_hp, cv=cv_sk,
    scoring=scores, refit='roc-auc', return_train_score=True,
    n_jobs=-1, verbose=2,
)

# Run the search on the training data
lr_sk.fit(X_train, y_train)

In [None]:
# Cross-validation summary of the finished search
print("Best parameters: ", lr_sk.best_params_)
print("Best score: ", lr_sk.best_score_)
print("Refit time: ", lr_sk.refit_time_)

# Held-out evaluation with the refitted best pipeline
f_lr_sk = lr_sk.best_estimator_
y_pred_test = f_lr_sk.predict(X_test)
# predict_proba has one column per class; column 1 is kept as the score of class 1
score_pred_test = f_lr_sk.predict_proba(X_test)[:, 1]

# Test-set metrics
bacc_test = metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred_test)
auc_test = metrics.roc_auc_score(y_true=y_test, y_score=score_pred_test)
recall_test = metrics.recall_score(y_true=y_test, y_pred=y_pred_test)

print("# Test")
test_report = ('bACC=%.2f' % bacc_test, 'ROC-AUC=%.2f' % auc_test, 'RECALL=%.2f' % recall_test)
print(*test_report)

In [None]:
# StratifiedGroupKFold splits, unscaled pipeline `lr`: grid search over `lr_hp`, recording
# every metric in `scores` and refitting the candidate with the best 'roc-auc'.
lr_sgk = GridSearchCV(
    estimator=lr, param_grid=lr_hp, cv=cv_sgk,
    scoring=scores, refit='roc-auc', return_train_score=True,
    n_jobs=-1, verbose=2,
)

# Run the search on the training data
lr_sgk.fit(X_train, y_train)

In [None]:
# Cross-validation summary of the finished search
print("Best parameters: ", lr_sgk.best_params_)
print("Best score: ", lr_sgk.best_score_)
print("Refit time: ", lr_sgk.refit_time_)

# Held-out evaluation with the refitted best pipeline
f_lr_sgk = lr_sgk.best_estimator_
y_pred_test = f_lr_sgk.predict(X_test)
# predict_proba has one column per class; column 1 is kept as the score of class 1
score_pred_test = f_lr_sgk.predict_proba(X_test)[:, 1]

# Test-set metrics
bacc_test = metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred_test)
auc_test = metrics.roc_auc_score(y_true=y_test, y_score=score_pred_test)
recall_test = metrics.recall_score(y_true=y_test, y_pred=y_pred_test)

print("# Test")
test_report = ('bACC=%.2f' % bacc_test, 'ROC-AUC=%.2f' % auc_test, 'RECALL=%.2f' % recall_test)
print(*test_report)

train model with standard


In [None]:
# KFold splits, scaled pipeline `lrS`: grid search over `lr_hp`, recording
# every metric in `scores` and refitting the candidate with the best 'roc-auc'.
# NOTE: rebinds `lr_k`, replacing the search fitted on the unscaled pipeline.
lr_k = GridSearchCV(
    estimator=lrS, param_grid=lr_hp, cv=cv_k,
    scoring=scores, refit='roc-auc', return_train_score=True,
    n_jobs=-1, verbose=2,
)

# Run the search on the training data
lr_k.fit(X_train, y_train)

In [None]:
# Cross-validation summary of the finished search
print("Best parameters: ", lr_k.best_params_)
print("Best score: ", lr_k.best_score_)
print("Refit time: ", lr_k.refit_time_)

# Held-out evaluation with the refitted best pipeline
f_lr_k = lr_k.best_estimator_
y_pred_test = f_lr_k.predict(X_test)
# predict_proba has one column per class; column 1 is kept as the score of class 1
score_pred_test = f_lr_k.predict_proba(X_test)[:, 1]

# Test-set metrics
bacc_test = metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred_test)
auc_test = metrics.roc_auc_score(y_true=y_test, y_score=score_pred_test)
recall_test = metrics.recall_score(y_true=y_test, y_pred=y_pred_test)

print("# Test")
test_report = ('bACC=%.2f' % bacc_test, 'ROC-AUC=%.2f' % auc_test, 'RECALL=%.2f' % recall_test)
print(*test_report)


In [None]:
# StratifiedKFold splits, scaled pipeline `lrS`: grid search over `lr_hp`, recording
# every metric in `scores` and refitting the candidate with the best 'roc-auc'.
# NOTE: rebinds `lr_sk`, replacing the search fitted on the unscaled pipeline.
lr_sk = GridSearchCV(
    estimator=lrS, param_grid=lr_hp, cv=cv_sk,
    scoring=scores, refit='roc-auc', return_train_score=True,
    n_jobs=-1, verbose=2,
)

# Run the search on the training data
lr_sk.fit(X_train, y_train)

In [None]:
# Cross-validation summary of the finished search
print("Best parameters: ", lr_sk.best_params_)
print("Best score: ", lr_sk.best_score_)
print("Refit time: ", lr_sk.refit_time_)

# Held-out evaluation with the refitted best pipeline
f_lr_sk = lr_sk.best_estimator_
y_pred_test = f_lr_sk.predict(X_test)
# predict_proba has one column per class; column 1 is kept as the score of class 1
score_pred_test = f_lr_sk.predict_proba(X_test)[:, 1]

# Test-set metrics
bacc_test = metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred_test)
auc_test = metrics.roc_auc_score(y_true=y_test, y_score=score_pred_test)
recall_test = metrics.recall_score(y_true=y_test, y_pred=y_pred_test)

print("# Test")
test_report = ('bACC=%.2f' % bacc_test, 'ROC-AUC=%.2f' % auc_test, 'RECALL=%.2f' % recall_test)
print(*test_report)

In [None]:
# StratifiedGroupKFold splits, scaled pipeline `lrS`: grid search over `lr_hp`, recording
# every metric in `scores` and refitting the candidate with the best 'roc-auc'.
# NOTE: rebinds `lr_sgk`, replacing the search fitted on the unscaled pipeline.
lr_sgk = GridSearchCV(
    estimator=lrS, param_grid=lr_hp, cv=cv_sgk,
    scoring=scores, refit='roc-auc', return_train_score=True,
    n_jobs=-1, verbose=2,
)

# Run the search on the training data
lr_sgk.fit(X_train, y_train)

In [None]:
# Cross-validation summary of the finished search
print("Best parameters: ", lr_sgk.best_params_)
print("Best score: ", lr_sgk.best_score_)
print("Refit time: ", lr_sgk.refit_time_)

# Held-out evaluation with the refitted best pipeline
f_lr_sgk = lr_sgk.best_estimator_
y_pred_test = f_lr_sgk.predict(X_test)
# predict_proba has one column per class; column 1 is kept as the score of class 1
score_pred_test = f_lr_sgk.predict_proba(X_test)[:, 1]

# Test-set metrics
bacc_test = metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred_test)
auc_test = metrics.roc_auc_score(y_true=y_test, y_score=score_pred_test)
recall_test = metrics.recall_score(y_true=y_test, y_pred=y_pred_test)

print("# Test")
test_report = ('bACC=%.2f' % bacc_test, 'ROC-AUC=%.2f' % auc_test, 'RECALL=%.2f' % recall_test)
print(*test_report)

########################################################################
#########################################################################
########################################################################
########################################################################
########################################################################

## Random Forest

In [None]:
# the split strategy
# The splits are materialised with list(): an iterable given as `cv` is
# consumed by the search that receives it, and each of these is passed to more
# than one GridSearchCV below.
cv_k = list(get_cv_KFold(X_train, y_train))
cv_sk = list(get_cv_SKFold(X_train, y_train))
cv_sgk = list(get_cv_SGKFold(X_train, y_train))

pipelines: with and without standardscaler

In [None]:
# Two random-forest pipelines on the VBM columns; they differ only in the
# StandardScaler step of the second one.
rf_steps = [
    VBMFeatureExtractor(),
    RandomForestClassifier(random_state=0),
]
rf = make_pipeline(*rf_steps)

rfS_steps = [
    VBMFeatureExtractor(),
    StandardScaler(),
    RandomForestClassifier(random_state=0),
]
rfS = make_pipeline(*rfS_steps)


# Hyper-parameter grid; the 'randomforestclassifier__' prefix addresses the
# RandomForestClassifier step of either pipeline.
rf_hp = {}
rf_hp['randomforestclassifier__n_estimators'] = [10, 30, 50, 100]
rf_hp['randomforestclassifier__criterion'] = ['gini', 'entropy']
rf_hp['randomforestclassifier__max_depth'] = [10, 15, 20]
rf_hp['randomforestclassifier__max_features'] = [0.5, 0.7, 0.9]
rf_hp['randomforestclassifier__min_samples_leaf'] = [1, 2, 5]

# Metrics recorded by the grid search: display name -> scikit-learn scorer name
scores = dict([
    ('accuracy', 'balanced_accuracy'),
    ('recall', 'recall'),
    ('roc-auc', 'roc_auc'),
])

train model without standard

In [None]:
# KFold splits, unscaled pipeline `rf`: grid search over `rf_hp`, recording
# every metric in `scores` and refitting the candidate with the best 'roc-auc'.
rf_k = GridSearchCV(
    estimator=rf, param_grid=rf_hp, cv=cv_k,
    scoring=scores, refit='roc-auc', return_train_score=True,
    n_jobs=-1, verbose=2,
)

# Run the search on the training data
rf_k.fit(X_train, y_train)

In [None]:
# Cross-validation summary of the finished search
print("Best parameters: ", rf_k.best_params_)
print("Best score: ", rf_k.best_score_)
print("Refit time: ", rf_k.refit_time_)

# Held-out evaluation with the refitted best pipeline
f_rf_k = rf_k.best_estimator_
y_pred_test = f_rf_k.predict(X_test)
# predict_proba has one column per class; column 1 is kept as the score of class 1
score_pred_test = f_rf_k.predict_proba(X_test)[:, 1]

# Test-set metrics
bacc_test = metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred_test)
auc_test = metrics.roc_auc_score(y_true=y_test, y_score=score_pred_test)
recall_test = metrics.recall_score(y_true=y_test, y_pred=y_pred_test)

print("# Test")
test_report = ('bACC=%.2f' % bacc_test, 'ROC-AUC=%.2f' % auc_test, 'RECALL=%.2f' % recall_test)
print(*test_report)

In [None]:
# StratifiedKFold splits, unscaled pipeline `rf`: grid search over `rf_hp`, recording
# every metric in `scores` and refitting the candidate with the best 'roc-auc'.
rf_sk = GridSearchCV(
    estimator=rf, param_grid=rf_hp, cv=cv_sk,
    scoring=scores, refit='roc-auc', return_train_score=True,
    n_jobs=-1, verbose=2,
)

# Run the search on the training data
rf_sk.fit(X_train, y_train)

In [None]:
# Cross-validation summary of the finished search
print("Best parameters: ", rf_sk.best_params_)
print("Best score: ", rf_sk.best_score_)
print("Refit time: ", rf_sk.refit_time_)

# Held-out evaluation with the refitted best pipeline
f_rf_sk = rf_sk.best_estimator_
y_pred_test = f_rf_sk.predict(X_test)
# predict_proba has one column per class; column 1 is kept as the score of class 1
score_pred_test = f_rf_sk.predict_proba(X_test)[:, 1]

# Test-set metrics
bacc_test = metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred_test)
auc_test = metrics.roc_auc_score(y_true=y_test, y_score=score_pred_test)
recall_test = metrics.recall_score(y_true=y_test, y_pred=y_pred_test)

print("# Test")
test_report = ('bACC=%.2f' % bacc_test, 'ROC-AUC=%.2f' % auc_test, 'RECALL=%.2f' % recall_test)
print(*test_report)

In [None]:
# StratifiedGroupKFold splits, unscaled pipeline `rf`: grid search over `rf_hp`, recording
# every metric in `scores` and refitting the candidate with the best 'roc-auc'.
rf_sgk = GridSearchCV(
    estimator=rf, param_grid=rf_hp, cv=cv_sgk,
    scoring=scores, refit='roc-auc', return_train_score=True,
    n_jobs=-1, verbose=2,
)

# Run the search on the training data
rf_sgk.fit(X_train, y_train)

In [None]:
# Cross-validation summary of the finished search
print("Best parameters: ", rf_sgk.best_params_)
print("Best score: ", rf_sgk.best_score_)
print("Refit time: ", rf_sgk.refit_time_)

# Held-out evaluation with the refitted best pipeline
f_rf_sgk = rf_sgk.best_estimator_
y_pred_test = f_rf_sgk.predict(X_test)
# predict_proba has one column per class; column 1 is kept as the score of class 1
score_pred_test = f_rf_sgk.predict_proba(X_test)[:, 1]

# Test-set metrics
bacc_test = metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred_test)
auc_test = metrics.roc_auc_score(y_true=y_test, y_score=score_pred_test)
recall_test = metrics.recall_score(y_true=y_test, y_pred=y_pred_test)

print("# Test")
test_report = ('bACC=%.2f' % bacc_test, 'ROC-AUC=%.2f' % auc_test, 'RECALL=%.2f' % recall_test)
print(*test_report)

train model with standard

In [None]:
# KFold splits, scaled pipeline `rfS`: grid search over `rf_hp`, recording
# every metric in `scores` and refitting the candidate with the best 'roc-auc'.
# NOTE: rebinds `rf_k`, replacing the search fitted on the unscaled pipeline.
rf_k = GridSearchCV(
    estimator=rfS, param_grid=rf_hp, cv=cv_k,
    scoring=scores, refit='roc-auc', return_train_score=True,
    n_jobs=-1, verbose=2,
)

# Run the search on the training data
rf_k.fit(X_train, y_train)

In [None]:
# Cross-validation summary of the finished search
print("Best parameters: ", rf_k.best_params_)
print("Best score: ", rf_k.best_score_)
print("Refit time: ", rf_k.refit_time_)

# Held-out evaluation with the refitted best pipeline
f_rf_k = rf_k.best_estimator_
y_pred_test = f_rf_k.predict(X_test)
# predict_proba has one column per class; column 1 is kept as the score of class 1
score_pred_test = f_rf_k.predict_proba(X_test)[:, 1]

# Test-set metrics
bacc_test = metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred_test)
auc_test = metrics.roc_auc_score(y_true=y_test, y_score=score_pred_test)
recall_test = metrics.recall_score(y_true=y_test, y_pred=y_pred_test)

print("# Test")
test_report = ('bACC=%.2f' % bacc_test, 'ROC-AUC=%.2f' % auc_test, 'RECALL=%.2f' % recall_test)
print(*test_report)

In [None]:
# StratifiedKFold splits, scaled pipeline `rfS`: grid search over `rf_hp`, recording
# every metric in `scores` and refitting the candidate with the best 'roc-auc'.
# NOTE: rebinds `rf_sk`, replacing the search fitted on the unscaled pipeline.
rf_sk = GridSearchCV(
    estimator=rfS, param_grid=rf_hp, cv=cv_sk,
    scoring=scores, refit='roc-auc', return_train_score=True,
    n_jobs=-1, verbose=2,
)

# Run the search on the training data
rf_sk.fit(X_train, y_train)

In [None]:
# Cross-validation summary of the finished search
print("Best parameters: ", rf_sk.best_params_)
print("Best score: ", rf_sk.best_score_)
print("Refit time: ", rf_sk.refit_time_)

# Held-out evaluation with the refitted best pipeline
f_rf_sk = rf_sk.best_estimator_
y_pred_test = f_rf_sk.predict(X_test)
# predict_proba has one column per class; column 1 is kept as the score of class 1
score_pred_test = f_rf_sk.predict_proba(X_test)[:, 1]

# Test-set metrics
bacc_test = metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred_test)
auc_test = metrics.roc_auc_score(y_true=y_test, y_score=score_pred_test)
recall_test = metrics.recall_score(y_true=y_test, y_pred=y_pred_test)

print("# Test")
test_report = ('bACC=%.2f' % bacc_test, 'ROC-AUC=%.2f' % auc_test, 'RECALL=%.2f' % recall_test)
print(*test_report)

In [None]:
# StratifiedGroupKFold splits, scaled pipeline `rfS`: grid search over `rf_hp`, recording
# every metric in `scores` and refitting the candidate with the best 'roc-auc'.
# NOTE: rebinds `rf_sgk`, replacing the search fitted on the unscaled pipeline.
rf_sgk = GridSearchCV(
    estimator=rfS, param_grid=rf_hp, cv=cv_sgk,
    scoring=scores, refit='roc-auc', return_train_score=True,
    n_jobs=-1, verbose=2,
)

# Run the search on the training data
rf_sgk.fit(X_train, y_train)

In [None]:
# Cross-validation summary of the finished search
print("Best parameters: ", rf_sgk.best_params_)
print("Best score: ", rf_sgk.best_score_)
print("Refit time: ", rf_sgk.refit_time_)

# Held-out evaluation with the refitted best pipeline
f_rf_sgk = rf_sgk.best_estimator_
y_pred_test = f_rf_sgk.predict(X_test)
# predict_proba has one column per class; column 1 is kept as the score of class 1
score_pred_test = f_rf_sgk.predict_proba(X_test)[:, 1]

# Test-set metrics
bacc_test = metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred_test)
auc_test = metrics.roc_auc_score(y_true=y_test, y_score=score_pred_test)
recall_test = metrics.recall_score(y_true=y_test, y_pred=y_pred_test)

print("# Test")
test_report = ('bACC=%.2f' % bacc_test, 'ROC-AUC=%.2f' % auc_test, 'RECALL=%.2f' % recall_test)
print(*test_report)

## Non-Linear Model: Neural Network

In [None]:
# the split strategy
# The splits are materialised with list(): an iterable given as `cv` is
# consumed by the search that receives it, and each of these is passed to more
# than one GridSearchCV below.
cv_k = list(get_cv_KFold(X_train, y_train))
cv_sk = list(get_cv_SKFold(X_train, y_train))
cv_sgk = list(get_cv_SGKFold(X_train, y_train))

In [None]:
# Two MLP pipelines on the VBM columns. The second one adds a StandardScaler
# step and leaves the MLP solver at its default instead of naming 'adam'.
nn_steps = [
    VBMFeatureExtractor(),
    MLPClassifier(solver='adam', max_iter=500, verbose=2, random_state=0),
]
nn = make_pipeline(*nn_steps)

nnS_steps = [
    VBMFeatureExtractor(),
    StandardScaler(),
    MLPClassifier(max_iter=500, verbose=2, random_state=0),
]
nnS = make_pipeline(*nnS_steps)

# Hyper-parameter grid; the 'mlpclassifier__' prefix addresses the
# MLPClassifier step of either pipeline.
nn_hp = {}
nn_hp['mlpclassifier__hidden_layer_sizes'] = [(50,), (100,), (50, 50), (100, 100), (200, 150, 100, 50, 25)]
nn_hp['mlpclassifier__activation'] = ['logistic', 'tanh', 'relu']
nn_hp['mlpclassifier__alpha'] = [0.0001, 0.001, 0.01, 0.1]

# Metrics recorded by the grid search: display name -> scikit-learn scorer name
scores = dict([
    ('accuracy', 'balanced_accuracy'),
    ('recall', 'recall'),
    ('roc-auc', 'roc_auc'),
])

train without standardScaler

In [None]:
# KFold splits, unscaled pipeline `nn`: grid search over `nn_hp`, recording
# every metric in `scores` and refitting the candidate with the best 'roc-auc'.
nn_k = GridSearchCV(
    estimator=nn, param_grid=nn_hp, cv=cv_k,
    scoring=scores, refit='roc-auc', return_train_score=True,
    n_jobs=-1, verbose=2,
)

# Run the search on the training data
nn_k.fit(X_train, y_train)

In [None]:
# Cross-validation summary of the finished search
print("Best parameters: ", nn_k.best_params_)
print("Best score: ", nn_k.best_score_)
print("Refit time: ", nn_k.refit_time_)

# Held-out evaluation with the refitted best pipeline
f_nn_k = nn_k.best_estimator_
y_pred_test = f_nn_k.predict(X_test)
# predict_proba has one column per class; column 1 is kept as the score of class 1
score_pred_test = f_nn_k.predict_proba(X_test)[:, 1]

# Test-set metrics
bacc_test = metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred_test)
auc_test = metrics.roc_auc_score(y_true=y_test, y_score=score_pred_test)
recall_test = metrics.recall_score(y_true=y_test, y_pred=y_pred_test)

print("# Test")
test_report = ('bACC=%.2f' % bacc_test, 'ROC-AUC=%.2f' % auc_test, 'RECALL=%.2f' % recall_test)
print(*test_report)

In [None]:
# StratifiedKFold splits, unscaled pipeline `nn`: grid search over `nn_hp`, recording
# every metric in `scores` and refitting the candidate with the best 'roc-auc'.
nn_sk = GridSearchCV(
    estimator=nn, param_grid=nn_hp, cv=cv_sk,
    scoring=scores, refit='roc-auc', return_train_score=True,
    n_jobs=-1, verbose=2,
)

# Run the search on the training data
nn_sk.fit(X_train, y_train)

In [None]:
# Cross-validation summary of the finished search
print("Best parameters: ", nn_sk.best_params_)
print("Best score: ", nn_sk.best_score_)
print("Refit time: ", nn_sk.refit_time_)

# Held-out evaluation with the refitted best pipeline
f_nn_sk = nn_sk.best_estimator_
y_pred_test = f_nn_sk.predict(X_test)
# predict_proba has one column per class; column 1 is kept as the score of class 1
score_pred_test = f_nn_sk.predict_proba(X_test)[:, 1]

# Test-set metrics
bacc_test = metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred_test)
auc_test = metrics.roc_auc_score(y_true=y_test, y_score=score_pred_test)
recall_test = metrics.recall_score(y_true=y_test, y_pred=y_pred_test)

print("# Test")
test_report = ('bACC=%.2f' % bacc_test, 'ROC-AUC=%.2f' % auc_test, 'RECALL=%.2f' % recall_test)
print(*test_report)

In [None]:
# StratifiedGroupKFold splits, unscaled pipeline `nn`: grid search over `nn_hp`, recording
# every metric in `scores` and refitting the candidate with the best 'roc-auc'.
nn_sgk = GridSearchCV(
    estimator=nn, param_grid=nn_hp, cv=cv_sgk,
    scoring=scores, refit='roc-auc', return_train_score=True,
    n_jobs=-1, verbose=2,
)

# Run the search on the training data
nn_sgk.fit(X_train, y_train)

In [None]:
# Cross-validation summary of the finished search
print("Best parameters: ", nn_sgk.best_params_)
print("Best score: ", nn_sgk.best_score_)
print("Refit time: ", nn_sgk.refit_time_)

# Held-out evaluation with the refitted best pipeline
f_nn_sgk = nn_sgk.best_estimator_
y_pred_test = f_nn_sgk.predict(X_test)
# predict_proba has one column per class; column 1 is kept as the score of class 1
score_pred_test = f_nn_sgk.predict_proba(X_test)[:, 1]

# Test-set metrics
bacc_test = metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred_test)
auc_test = metrics.roc_auc_score(y_true=y_test, y_score=score_pred_test)
recall_test = metrics.recall_score(y_true=y_test, y_pred=y_pred_test)

print("# Test")
test_report = ('bACC=%.2f' % bacc_test, 'ROC-AUC=%.2f' % auc_test, 'RECALL=%.2f' % recall_test)
print(*test_report)

train with StandardScaler

In [None]:
# KFold splits, scaled pipeline `nnS`: grid search over `nn_hp`, recording
# every metric in `scores` and refitting the candidate with the best 'roc-auc'.
# NOTE: rebinds `nn_k`, replacing the search fitted on the unscaled pipeline.
nn_k = GridSearchCV(
    estimator=nnS, param_grid=nn_hp, cv=cv_k,
    scoring=scores, refit='roc-auc', return_train_score=True,
    n_jobs=-1, verbose=2,
)

# Run the search on the training data
nn_k.fit(X_train, y_train)

In [None]:
# Cross-validation summary of the finished search
print("Best parameters: ", nn_k.best_params_)
print("Best score: ", nn_k.best_score_)
print("Refit time: ", nn_k.refit_time_)

# Held-out evaluation with the refitted best pipeline
f_nn_k = nn_k.best_estimator_
y_pred_test = f_nn_k.predict(X_test)
# predict_proba has one column per class; column 1 is kept as the score of class 1
score_pred_test = f_nn_k.predict_proba(X_test)[:, 1]

# Test-set metrics
bacc_test = metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred_test)
auc_test = metrics.roc_auc_score(y_true=y_test, y_score=score_pred_test)
recall_test = metrics.recall_score(y_true=y_test, y_pred=y_pred_test)

print("# Test")
test_report = ('bACC=%.2f' % bacc_test, 'ROC-AUC=%.2f' % auc_test, 'RECALL=%.2f' % recall_test)
print(*test_report)

In [None]:
# StratifiedKFold splits, scaled pipeline `nnS`: grid search over `nn_hp`, recording
# every metric in `scores` and refitting the candidate with the best 'roc-auc'.
# NOTE: rebinds `nn_sk`, replacing the search fitted on the unscaled pipeline.
nn_sk = GridSearchCV(
    estimator=nnS, param_grid=nn_hp, cv=cv_sk,
    scoring=scores, refit='roc-auc', return_train_score=True,
    n_jobs=-1, verbose=2,
)

# Run the search on the training data
nn_sk.fit(X_train, y_train)

In [None]:
# Cross-validation summary of the finished search
print("Best parameters: ", nn_sk.best_params_)
print("Best score: ", nn_sk.best_score_)
print("Refit time: ", nn_sk.refit_time_)

# Held-out evaluation with the refitted best pipeline
f_nn_sk = nn_sk.best_estimator_
y_pred_test = f_nn_sk.predict(X_test)
# predict_proba has one column per class; column 1 is kept as the score of class 1
score_pred_test = f_nn_sk.predict_proba(X_test)[:, 1]

# Test-set metrics
bacc_test = metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred_test)
auc_test = metrics.roc_auc_score(y_true=y_test, y_score=score_pred_test)
recall_test = metrics.recall_score(y_true=y_test, y_pred=y_pred_test)

print("# Test")
test_report = ('bACC=%.2f' % bacc_test, 'ROC-AUC=%.2f' % auc_test, 'RECALL=%.2f' % recall_test)
print(*test_report)

In [None]:
# StratifiedGroupKFold splits, scaled pipeline `nnS`: grid search over `nn_hp`, recording
# every metric in `scores` and refitting the candidate with the best 'roc-auc'.
# NOTE: rebinds `nn_sgk`, replacing the search fitted on the unscaled pipeline.
nn_sgk = GridSearchCV(
    estimator=nnS, param_grid=nn_hp, cv=cv_sgk,
    scoring=scores, refit='roc-auc', return_train_score=True,
    n_jobs=-1, verbose=2,
)

# Run the search on the training data
nn_sgk.fit(X_train, y_train)

In [None]:
# Cross-validation summary of the finished search
print("Best parameters: ", nn_sgk.best_params_)
print("Best score: ", nn_sgk.best_score_)
print("Refit time: ", nn_sgk.refit_time_)

# Held-out evaluation with the refitted best pipeline
f_nn_sgk = nn_sgk.best_estimator_
y_pred_test = f_nn_sgk.predict(X_test)
# predict_proba has one column per class; column 1 is kept as the score of class 1
score_pred_test = f_nn_sgk.predict_proba(X_test)[:, 1]

# Test-set metrics
bacc_test = metrics.balanced_accuracy_score(y_true=y_test, y_pred=y_pred_test)
auc_test = metrics.roc_auc_score(y_true=y_test, y_score=score_pred_test)
recall_test = metrics.recall_score(y_true=y_test, y_pred=y_pred_test)

print("# Test")
test_report = ('bACC=%.2f' % bacc_test, 'ROC-AUC=%.2f' % auc_test, 'RECALL=%.2f' % recall_test)
print(*test_report)