In [51]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector as selector
from sklearn.compose import ColumnTransformer


from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.experimental import enable_hist_gradient_boosting  
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression



from imblearn.ensemble import BalancedBaggingClassifier, EasyEnsembleClassifier, BalancedRandomForestClassifier, EasyEnsembleClassifier
from imblearn.under_sampling import RandomUnderSampler # to check again how to use this in a pipeline 


from sklearn.svm import SVC
from sklearn import linear_model

from xgboost import XGBClassifier


In [41]:
df = pd.read_excel('Cleaned_Dataframe.xlsx')
df.set_index('Sample',inplace=True)

df_cancer = df.loc[df['Status'] == 'Cancer']
df_control = df.loc[df['Status'] == 'Control']

#randomly seelct 538 samples from the cancer population to create an equal sample size
df_cancer_small = df_cancer.sample(n=538)

df1 = pd.concat([df_cancer_small, df_control])
df1                    

Unnamed: 0_level_0,GP1,GP2,GP3,GP4,GP5,GP6,GP7,GP8,GP9,GP10,...,GP18,GP19,GP20,GP21,GP22,GP23,GP24,Age at sample,Gender,Status
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CRC_2894,0.96,0.73,0.66,26.79,0.42,5.78,0.75,18.76,12.22,5.27,...,6.09,1.59,0.39,0.86,0.10,0.98,1.02,55.142466,M,Cancer
CRC_4054,0.27,0.49,0.67,28.69,0.57,4.84,0.42,16.55,11.04,3.94,...,6.70,2.25,0.68,1.29,0.24,1.99,2.15,69.594521,M,Cancer
CRC_2686,0.34,1.04,0.73,21.36,0.36,4.08,0.58,19.37,11.84,4.51,...,8.67,1.92,0.38,1.89,0.29,1.52,1.44,44.824658,F,Cancer
CRC_3848,0.20,1.35,0.49,26.83,0.51,8.06,1.28,16.01,10.48,5.45,...,5.80,1.72,0.40,1.33,0.44,1.11,1.62,74.728767,F,Cancer
CRC_7633,0.14,0.76,0.74,26.87,0.64,9.05,0.95,16.90,9.34,6.68,...,6.18,1.68,0.77,0.93,0.17,0.69,1.00,54.876712,M,Cancer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CRC_9725,0.13,0.92,0.68,20.33,0.63,5.62,1.28,18.64,12.78,5.36,...,7.14,1.72,0.22,0.93,0.11,1.05,1.27,51.380822,M,Control
CRC_9763,0.04,0.54,0.42,21.59,0.35,6.19,0.76,18.78,7.92,5.31,...,9.78,1.61,0.33,0.42,0.14,0.53,0.87,47.915068,F,Control
CRC_9765,0.13,1.10,0.83,17.04,0.56,5.39,1.60,20.71,10.02,6.81,...,8.63,2.03,0.43,0.46,0.29,1.06,1.72,47.479452,M,Control
CRC_9784,0.08,0.80,0.32,22.14,0.35,4.67,0.93,18.52,15.42,4.29,...,6.72,1.45,0.34,0.65,0.09,0.97,1.05,50.323288,M,Control


In [42]:
#chnging type of data to 'category' from 'object'
df1.Gender = df1.Gender.astype('category')
df1.Status = df1.Status.astype('category')

#separate cancer markers and input data
df1_outputs= df1['Status']
df1_inputs = df1.drop(['Status', 'Age at sample'],axis=1)

In [43]:
X_train, X_test, y_train, y_test = train_test_split(df1_inputs, df1_outputs, random_state=100, test_size=0.3)

In [44]:
def grid_function(model, parameters, X_train, y_train):
    
    num_transformer = StandardScaler()
    cat_transformer = OneHotEncoder(drop='if_binary', handle_unknown='error')
    
    preprocessor = ColumnTransformer(transformers=[
        ('num', num_transformer, selector(dtype_exclude="category")),
        ('cat', cat_transformer, selector(dtype_include="category"))])
    
    
    pipeline = Pipeline(steps=[('preprosessor', preprocessor), ('algorithm', model)])
                        #RandomUnderSampler(random_state = 42))
    
    search = GridSearchCV(pipeline, parameters, cv=StratifiedKFold(5), n_jobs=-1)
    
    search.fit(X_train, y_train)

    best_model = search.best_estimator_

    return(best_model, search)

In [65]:
def pred_function(model, tune_parameters, X_train, y_train, X_test, y_test):
    
    num_transformer = StandardScaler()
    cat_transformer = OneHotEncoder(drop='if_binary', handle_unknown='error')
    
    preprocessor = ColumnTransformer(transformers=[
        ('num', num_transformer, selector(dtype_exclude="category")),
        ('cat', cat_transformer, selector(dtype_include="category"))])

    X_train_sc = preprocessor.fit_transform(X_train)
    X_test_sc = preprocessor.transform(X_test)
    
    #Finding the best parameters 
    best_model, search = grid_function(model, tune_parameters, X_train, y_train)
    print (best_model._final_estimator)
    
    #Make prediction using the best model
    best_model._final_estimator.fit(X_train_sc, y_train)
    y_pred = best_model._final_estimator.predict(X_test_sc)

    #Print test performance of the model
    print()
    print('Model Performance')
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    

#### Random Forest Classifier

In [28]:
#Define parameters
rf_tune = { 
    'algorithm__n_estimators': [100,200, 300, 400, 500, 1000],
    'algorithm__max_depth' : [4,5,6,7,8,9,10],
    'algorithm__bootstrap': [True]
}

rf = RandomForestClassifier(random_state=0)

In [15]:
#To extract feature importance scores 
best_model_rf, search_rf = grid_function(rf, rf_tune, X_train, y_train)
rf_ranking = pd.DataFrame(best_model_rf._final_estimator.feature_importances_, index=X_train.columns)
rf_ranking.columns = ['Importance']
rf_ranking

Unnamed: 0,Importance
GP1,0.031323
GP2,0.028697
GP3,0.031646
GP4,0.076973
GP5,0.04004
GP6,0.040274
GP7,0.030196
GP8,0.030665
GP9,0.048731
GP10,0.028669


In [66]:
pred_function(rf, rf_tune, X_train, y_train, X_test, y_test)

RandomForestClassifier(max_depth=7, random_state=0)

Model Performance
              precision    recall  f1-score   support

      Cancer       0.74      0.63      0.68       170
     Control       0.65      0.76      0.70       153

    accuracy                           0.69       323
   macro avg       0.70      0.69      0.69       323
weighted avg       0.70      0.69      0.69       323

[[107  63]
 [ 37 116]]


#### Linear SVM

In [47]:
#Define paramters
svm_tune = { 
    'algorithm__kernel': ['linear'], 
    'algorithm__degree' : [2,3,4],
    'algorithm__C':[0, 1.0],
}

svm = SVC(random_state=0)

In [67]:
pred_function(svm, svm_tune, X_train, y_train, X_test, y_test)

SVC(degree=2, kernel='linear', random_state=0)

Model Performance
              precision    recall  f1-score   support

      Cancer       0.77      0.69      0.73       170
     Control       0.69      0.77      0.73       153

    accuracy                           0.73       323
   macro avg       0.73      0.73      0.73       323
weighted avg       0.73      0.73      0.73       323

[[118  52]
 [ 35 118]]




In [19]:
#To extract feature importance scores 
best_model_svm, search_svm = grid_function(svm, svm_tune, X_train, y_train)

svm_ranking = best_model_svm._final_estimator.coef_[0]

svm_ranking_table = pd.DataFrame(svm_ranking, index=X_train.columns)
svm_ranking_table.columns = ['Importance']
svm_ranking_table



Unnamed: 0,Importance
GP1,0.624631
GP2,-0.342848
GP3,0.143209
GP4,-0.447163
GP5,-0.212552
GP6,0.771183
GP7,-0.806407
GP8,0.353247
GP9,0.482703
GP10,-0.804781


### XGB

In [20]:
#Define parameters
xgb_tune = { 
    'algorithm__eta': [0.01, 0.05, 0.1, 0.3, 0.5, 1], #Step size shrinkage used in update to prevents overfitting
    'algorithm__max_depth' : [4,5,6,7,8,9,10],
}

xgb = XGBClassifier(random_state=0)

In [21]:
#To extract feature importance scores 
best_model_xgb, search_xgb = grid_function(xgb, xgb_tune, X_train, y_train)
xgb_ranking = pd.DataFrame(best_model_xgb._final_estimator.feature_importances_, index=X_train.columns)
xgb_ranking.columns = ['Importance']
xgb_ranking





Unnamed: 0,Importance
GP1,0.037186
GP2,0.023617
GP3,0.028669
GP4,0.038457
GP5,0.031117
GP6,0.022952
GP7,0.035371
GP8,0.031611
GP9,0.043042
GP10,0.022348


In [68]:
pred_function(xgb, xgb_tune, X_train, y_train, X_test, y_test)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=0.3, gamma=0,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

Model Performance
              precision    recall  f1-score   support

      Cancer       0.72      0.68      0.70       170
     Control       0.66      0.71      0.69       153

    accuracy                           0.69       323
   macro avg       0.69      0.69      0.69       323
weighted avg       0.70      0.69      0.69       323

[[115  55]
 [ 44 109]]


### Stacked estimator

In [49]:
best_model_svm._final_estimator

SVC(degree=2, kernel='linear', random_state=0)

In [50]:
best_model_rf._final_estimator

RandomForestClassifier(max_depth=10, random_state=0)

In [62]:
def pred_evaluate(best_model_rf, best_model_svm, X_train, y_train, X_test, y_test):
    
    num_transformer = StandardScaler()
    cat_transformer = OneHotEncoder(drop='if_binary', handle_unknown='error')
    
    preprocessor = ColumnTransformer(transformers=[
        ('num', num_transformer, selector(dtype_exclude="category")),
        ('cat', cat_transformer, selector(dtype_include="category"))])

    X_train_sc = preprocessor.fit_transform(X_train)
    X_test_sc = preprocessor.transform(X_test)
    
    estimators = [('rf', best_model_rf._final_estimator),
                 ('svm', best_model_svm._final_estimator),
                 ('xgb', best_model_xgb._final_estimator)]
    
    sc = StackingClassifier(estimators = estimators, 
                           final_estimator = LogisticRegression())
    
    sc.fit(X_train_sc, y_train)
    
    y_pred = sc.predict(X_test_sc)
    
    #Print test performance of the model
    print()
    print('Model Performance')
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

In [63]:
pred_evaluate(best_model_rf, best_model_svm, X_train, y_train, X_test, y_test)
















Model Performance
              precision    recall  f1-score   support

      Cancer       0.76      0.70      0.73       170
     Control       0.69      0.76      0.72       153

    accuracy                           0.73       323
   macro avg       0.73      0.73      0.73       323
weighted avg       0.73      0.73      0.73       323

[[119  51]
 [ 37 116]]
