In [2]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector as selector
from sklearn.compose import ColumnTransformer


from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.experimental import enable_hist_gradient_boosting  
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression



from imblearn.ensemble import BalancedBaggingClassifier, EasyEnsembleClassifier, BalancedRandomForestClassifier, EasyEnsembleClassifier
from imblearn.under_sampling import RandomUnderSampler # to check again how to use this in a pipeline 


from sklearn.svm import SVC
from sklearn import linear_model

from xgboost import XGBClassifier


In [3]:
df = pd.read_excel('Cleaned_Dataframe.xlsx')
df.set_index('Sample',inplace=True)

df_cancer = df.loc[df['Status'] == 'Cancer']
df_control = df.loc[df['Status'] == 'Control']

#randomly seelct 538 samples from the cancer population to create an equal sample size 
df_cancer_small = df_cancer.sample(n=538, random_state = 100)

df1 = pd.concat([df_cancer_small, df_control])
df1                    

Unnamed: 0_level_0,GP1,GP2,GP3,GP4,GP5,GP6,GP7,GP8,GP9,GP10,...,GP18,GP19,GP20,GP21,GP22,GP23,GP24,Age at sample,Gender,Status
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CRC_4283,0.13,0.29,0.46,31.89,0.41,8.58,0.38,16.14,9.10,6.73,...,4.89,2.26,0.23,1.30,0.27,0.99,1.83,70.353425,M,Cancer
CRC_3775,0.11,0.30,0.17,13.98,0.22,4.17,0.23,17.40,5.24,5.98,...,21.64,1.55,0.51,1.46,0.46,2.11,1.49,60.936986,F,Cancer
CRC_3239,0.09,0.32,0.34,31.72,0.26,6.18,0.27,20.54,8.71,5.12,...,6.29,1.85,0.30,0.70,0.08,1.26,1.61,66.295890,M,Cancer
CRC_3254,0.16,0.54,0.39,26.21,0.36,4.36,0.54,20.64,9.78,3.94,...,7.43,2.01,0.54,1.10,0.15,2.17,2.31,64.410959,M,Cancer
CRC_7499,0.31,0.74,0.43,29.89,0.60,7.97,0.48,18.22,10.23,5.34,...,5.99,1.69,0.46,0.75,0.10,0.82,1.35,58.142466,M,Cancer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CRC_9725,0.13,0.92,0.68,20.33,0.63,5.62,1.28,18.64,12.78,5.36,...,7.14,1.72,0.22,0.93,0.11,1.05,1.27,51.380822,M,Control
CRC_9763,0.04,0.54,0.42,21.59,0.35,6.19,0.76,18.78,7.92,5.31,...,9.78,1.61,0.33,0.42,0.14,0.53,0.87,47.915068,F,Control
CRC_9765,0.13,1.10,0.83,17.04,0.56,5.39,1.60,20.71,10.02,6.81,...,8.63,2.03,0.43,0.46,0.29,1.06,1.72,47.479452,M,Control
CRC_9784,0.08,0.80,0.32,22.14,0.35,4.67,0.93,18.52,15.42,4.29,...,6.72,1.45,0.34,0.65,0.09,0.97,1.05,50.323288,M,Control


In [4]:
#chnging type of data to 'category' from 'object'
df1.Gender = df1.Gender.astype('category')
df1.Status = df1.Status.astype('category')

#separate cancer markers and input data
df1_outputs= df1['Status']
df1_inputs = df1.drop(['Status', 'Age at sample'],axis=1)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(df1_inputs, df1_outputs, random_state=100, test_size=0.3)

In [6]:
def grid_function(model, parameters, X_train, y_train):
    
    num_transformer = StandardScaler()
    cat_transformer = OneHotEncoder(drop='if_binary', handle_unknown='error')
    
    preprocessor = ColumnTransformer(transformers=[
        ('num', num_transformer, selector(dtype_exclude="category")),
        ('cat', cat_transformer, selector(dtype_include="category"))])
    
    
    pipeline = Pipeline(steps=[('preprosessor', preprocessor), ('algorithm', model)])
    
    search = GridSearchCV(pipeline, parameters, cv=StratifiedKFold(5), n_jobs=-1)
    
    search.fit(X_train, y_train)

    best_model = search.best_estimator_

    return(best_model, search)

In [7]:
def pred_function(model, tune_parameters, X_train, y_train, X_test, y_test):
    
    num_transformer = StandardScaler()
    cat_transformer = OneHotEncoder(drop='if_binary', handle_unknown='error')
    
    preprocessor = ColumnTransformer(transformers=[
        ('num', num_transformer, selector(dtype_exclude="category")),
        ('cat', cat_transformer, selector(dtype_include="category"))])

    X_train_sc = preprocessor.fit_transform(X_train)
    X_test_sc = preprocessor.transform(X_test)
    
    #Finding the best parameters 
    best_model, search = grid_function(model, tune_parameters, X_train, y_train)
    print (best_model._final_estimator)
    
    #Make prediction using the best model
    best_model._final_estimator.fit(X_train_sc, y_train)
    y_pred = best_model._final_estimator.predict(X_test_sc)

    #Print test performance of the model
    print()
    print('Model Performance')
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    

#### Random Forest Classifier

In [8]:
#Define parameters
rf_tune = { 
    'algorithm__n_estimators': [100,200, 300, 400, 500, 1000],
    'algorithm__max_depth' : [4,5,6,7,8,9,10],
    'algorithm__bootstrap': [True]
}

rf = RandomForestClassifier(random_state=0)

In [9]:
#To extract feature importance scores 
best_model_rf, search_rf = grid_function(rf, rf_tune, X_train, y_train)
rf_ranking = pd.DataFrame(best_model_rf._final_estimator.feature_importances_, index=X_train.columns)
rf_ranking.columns = ['Importance']
rf_ranking

Unnamed: 0,Importance
GP1,0.028957
GP2,0.030563
GP3,0.03793
GP4,0.081426
GP5,0.038313
GP6,0.043234
GP7,0.029066
GP8,0.031637
GP9,0.042898
GP10,0.028672


In [10]:
pred_function(rf, rf_tune, X_train, y_train, X_test, y_test)

RandomForestClassifier(max_depth=10, n_estimators=400, random_state=0)

Model Performance
              precision    recall  f1-score   support

      Cancer       0.76      0.64      0.69       170
     Control       0.66      0.77      0.71       153

    accuracy                           0.70       323
   macro avg       0.71      0.70      0.70       323
weighted avg       0.71      0.70      0.70       323

[[108  62]
 [ 35 118]]


#### Linear SVM

In [11]:
#Define paramters
svm_tune = { 
    'algorithm__kernel': ['linear'], 
    'algorithm__degree' : [2,3,4],
    'algorithm__C':[0, 1.0],
}

svm = SVC(random_state=0)

In [12]:
pred_function(svm, svm_tune, X_train, y_train, X_test, y_test)



SVC(degree=2, kernel='linear', random_state=0)

Model Performance
              precision    recall  f1-score   support

      Cancer       0.78      0.67      0.72       170
     Control       0.68      0.79      0.73       153

    accuracy                           0.73       323
   macro avg       0.73      0.73      0.73       323
weighted avg       0.73      0.73      0.73       323

[[114  56]
 [ 32 121]]


In [13]:
#To extract feature importance scores 
best_model_svm, search_svm = grid_function(svm, svm_tune, X_train, y_train)

svm_ranking_table = pd.DataFrame(best_model_svm._final_estimator.coef_[0], index=X_train.columns)
svm_ranking_table.columns = ['Importance']
svm_ranking_table



Unnamed: 0,Importance
GP1,0.464032
GP2,-0.164333
GP3,0.107408
GP4,-0.585973
GP5,-0.172258
GP6,1.086801
GP7,-0.77155
GP8,0.577517
GP9,0.724658
GP10,-1.004245


### XGB

In [14]:
#Define parameters
xgb_tune = { 
    'algorithm__eta': [0.01, 0.05, 0.1, 0.3, 0.5, 1], #Step size shrinkage used in update to prevents overfitting
    'algorithm__max_depth' : [4,5,6,7,8,9,10],
}

xgb = XGBClassifier(random_state=0)

In [15]:
#To extract feature importance scores 
best_model_xgb, search_xgb = grid_function(xgb, xgb_tune, X_train, y_train)
xgb_ranking = pd.DataFrame(best_model_xgb._final_estimator.feature_importances_, index=X_train.columns)
xgb_ranking.columns = ['Importance']
xgb_ranking





Unnamed: 0,Importance
GP1,0.032205
GP2,0.036978
GP3,0.026073
GP4,0.034562
GP5,0.039321
GP6,0.025132
GP7,0.032833
GP8,0.027767
GP9,0.040203
GP10,0.036619


In [16]:
pred_function(xgb, xgb_tune, X_train, y_train, X_test, y_test)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=0.1, gamma=0,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.100000001, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

Model Performance
              precision    recall  f1-score   support

      Cancer       0.75      0.66      0.70       170
     Control       0.66      0.75      0.71       153

    accuracy                           0.70       323
   macro avg       0.71      0.71      0.70       323
weighted avg       0.71      0.70      0.70       323

[[112  58]
 [ 38 115]]


### Stacked estimator

In [17]:
best_model_svm._final_estimator

SVC(degree=2, kernel='linear', random_state=0)

In [18]:
best_model_rf._final_estimator

RandomForestClassifier(max_depth=10, n_estimators=400, random_state=0)

In [19]:
def pred_evaluate(best_model_rf, best_model_svm, X_train, y_train, X_test, y_test):
    
    num_transformer = StandardScaler()
    cat_transformer = OneHotEncoder(drop='if_binary', handle_unknown='error')
    
    preprocessor = ColumnTransformer(transformers=[
        ('num', num_transformer, selector(dtype_exclude="category")),
        ('cat', cat_transformer, selector(dtype_include="category"))])

    X_train_sc = preprocessor.fit_transform(X_train)
    X_test_sc = preprocessor.transform(X_test)
    
    estimators = [('rf', best_model_rf._final_estimator),
                 ('svm', best_model_svm._final_estimator),
                 ('xgb', best_model_xgb._final_estimator)]
    
    sc = StackingClassifier(estimators = estimators, 
                           final_estimator = LogisticRegression())
    
    sc.fit(X_train_sc, y_train)
    
    y_pred = sc.predict(X_test_sc)
    
    #Print test performance of the model
    print()
    print('Model Performance')
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

In [20]:
pred_evaluate(best_model_rf, best_model_svm, X_train, y_train, X_test, y_test)
























Model Performance
              precision    recall  f1-score   support

      Cancer       0.77      0.68      0.72       170
     Control       0.68      0.77      0.72       153

    accuracy                           0.72       323
   macro avg       0.72      0.72      0.72       323
weighted avg       0.73      0.72      0.72       323

[[115  55]
 [ 35 118]]
