In [2]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector as selector
from sklearn.compose import ColumnTransformer


from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.experimental import enable_hist_gradient_boosting  
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression



from imblearn.ensemble import BalancedBaggingClassifier, EasyEnsembleClassifier, BalancedRandomForestClassifier, EasyEnsembleClassifier
from imblearn.under_sampling import RandomUnderSampler # to check again how to use this in a pipeline 


from sklearn.svm import SVC
from sklearn import linear_model

from xgboost import XGBClassifier


In [4]:
df1 = pd.read_excel('Cleaned_Dataframe_1298_datapoints.xlsx')
df1.set_index('Sample',inplace=True)

df1

Unnamed: 0_level_0,SOCCS.ID,GP1,GP2,GP3,GP4,GP5,GP6,GP7,GP8,GP9,...,GP18,GP19,GP20,GP21,GP22,GP23,GP24,Gender,Age at sample,Status
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CRC_4468,324640002,0.17,0.37,0.52,15.35,0.27,3.74,0.45,22.06,9.67,...,11.50,1.54,0.48,0.50,0.07,0.95,0.96,F,20.747945,Control
CRC_8680,236220199,0.17,0.26,0.26,8.81,0.18,3.20,0.47,18.93,7.31,...,13.86,2.57,0.70,1.28,0.28,1.62,1.86,F,22.413699,Control
CRC_8879,381640099,0.13,0.48,0.30,20.42,0.25,4.16,0.45,17.99,8.82,...,12.89,1.79,0.34,0.86,0.11,2.36,2.01,M,27.789041,Control
CRC_8260,406120111,0.19,0.18,0.31,12.42,0.35,2.61,0.62,22.48,10.54,...,12.34,2.01,0.52,0.65,0.09,1.83,1.71,F,31.410959,Control
CRC_8292,382140007,0.18,0.39,0.66,15.38,0.46,4.80,0.36,14.80,9.40,...,12.45,2.15,1.13,1.46,0.37,2.26,2.17,M,66.624658,Control
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CRC_7434,237000023,0.16,1.33,0.42,22.45,0.51,6.03,1.36,17.55,10.24,...,6.89,2.40,0.80,1.75,0.22,1.98,2.39,F,61.016438,Cancer
CRC_7051,238500001,0.26,1.64,0.71,23.56,0.39,8.11,0.97,17.64,8.48,...,7.16,2.44,0.33,0.93,0.28,1.30,2.77,F,60.819178,Cancer
CRC_7397,252000007,0.40,0.57,0.67,42.35,0.36,6.18,0.50,13.80,8.33,...,5.71,1.17,0.41,0.90,0.11,1.29,1.35,M,61.221918,Cancer
CRC_7981,257000022,0.09,0.38,0.37,22.65,0.29,6.48,0.43,18.47,9.29,...,9.13,2.45,0.30,0.75,0.10,1.71,2.83,F,61.142466,Cancer


In [5]:
#chnging type of data to 'category' from 'object'
df1.Gender = df1.Gender.astype('category')
df1.Status = df1.Status.astype('category')

#separate cancer markers and input data
df1_outputs= df1['Status']
df1_inputs = df1.drop('Status',axis=1)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df1_inputs, df1_outputs, random_state=100, test_size=0.3)

In [7]:
def grid_function(model, parameters, X_train, y_train):
    
    num_transformer = StandardScaler()
    cat_transformer = OneHotEncoder(drop='if_binary', handle_unknown='error')
    
    preprocessor = ColumnTransformer(transformers=[
        ('num', num_transformer, selector(dtype_exclude="category")),
        ('cat', cat_transformer, selector(dtype_include="category"))])
    
    
    pipeline = Pipeline(steps=[('preprosessor', preprocessor), ('algorithm', model)])
                        #RandomUnderSampler(random_state = 42))
    
    search = GridSearchCV(pipeline, parameters, cv=StratifiedKFold(5), n_jobs=-1)
    
    search.fit(X_train, y_train)

    best_model = search.best_estimator_

    return(best_model, search)

In [8]:
def pred_function(model, tune_parameters, X_train, y_train, X_test, y_test):
    
    num_transformer = StandardScaler()
    cat_transformer = OneHotEncoder(drop='if_binary', handle_unknown='error')
    
    preprocessor = ColumnTransformer(transformers=[
        ('num', num_transformer, selector(dtype_exclude="category")),
        ('cat', cat_transformer, selector(dtype_include="category"))])

    X_train_sc = preprocessor.fit_transform(X_train)
    X_test_sc = preprocessor.transform(X_test)
    
    #Finding the best parameters 
    best_model, search = grid_function(model, tune_parameters, X_train, y_train)
    print (best_model._final_estimator)
    
    #Make prediction using the best model
    best_model._final_estimator.fit(X_train_sc, y_train)
    y_pred = best_model._final_estimator.predict(X_test_sc)

    #Print test performance of the model
    print()
    print('Model Performance')
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    

#### Random Forest Classifier

In [9]:
#Define parameters
rf_tune = { 
    'algorithm__n_estimators': [100,200, 300, 400, 500, 1000],
    'algorithm__max_depth' : [4,5,6,7,8,9,10],
    'algorithm__bootstrap': [True]
}

rf = RandomForestClassifier(random_state=0)

In [10]:
#To extract feature importance scores 
best_model_rf, search_rf = grid_function(rf, rf_tune, X_train, y_train)
rf_ranking = pd.DataFrame(best_model_rf._final_estimator.feature_importances_, index=X_train.columns)
rf_ranking.columns = ['Importance']
rf_ranking

Unnamed: 0,Importance
SOCCS.ID,0.082086
GP1,0.027606
GP2,0.027919
GP3,0.03551
GP4,0.052757
GP5,0.027502
GP6,0.033807
GP7,0.029507
GP8,0.033168
GP9,0.040881


In [11]:
pred_function(rf, rf_tune, X_train, y_train, X_test, y_test)

RandomForestClassifier(max_depth=10, random_state=0)

Model Performance
              precision    recall  f1-score   support

      Cancer       0.69      0.79      0.74       218
     Control       0.67      0.54      0.60       172

    accuracy                           0.68       390
   macro avg       0.68      0.67      0.67       390
weighted avg       0.68      0.68      0.68       390

[[173  45]
 [ 79  93]]


#### Linear SVM

In [12]:
#Define paramters
svm_tune = { 
    'algorithm__kernel': ['linear'], 
    'algorithm__degree' : [2,3,4],
    'algorithm__C':[0, 1.0],
}

svm = SVC(random_state=0)

In [13]:
pred_function(svm, svm_tune, X_train, y_train, X_test, y_test)

SVC(degree=2, kernel='linear', random_state=0)

Model Performance
              precision    recall  f1-score   support

      Cancer       0.68      0.82      0.74       218
     Control       0.69      0.51      0.58       172

    accuracy                           0.68       390
   macro avg       0.68      0.66      0.66       390
weighted avg       0.68      0.68      0.67       390

[[178  40]
 [ 85  87]]




In [14]:
#To extract feature importance scores 
best_model_svm, search_svm = grid_function(svm, svm_tune, X_train, y_train)

svm_ranking_table = pd.DataFrame(best_model_svm._final_estimator.coef_[0], index=X_train.columns)
svm_ranking_table.columns = ['Importance']
svm_ranking_table



Unnamed: 0,Importance
SOCCS.ID,0.220913
GP1,0.14831
GP2,-0.190882
GP3,0.215659
GP4,-0.475363
GP5,0.093943
GP6,0.990919
GP7,-0.47461
GP8,0.343085
GP9,0.568964


### XGB

In [15]:
#Define parameters
xgb_tune = { 
    'algorithm__eta': [0.01, 0.05, 0.1, 0.3, 0.5, 1], #Step size shrinkage used in update to prevents overfitting
    'algorithm__max_depth' : [4,5,6,7,8,9,10],
}

xgb = XGBClassifier(random_state=0)

In [16]:
#To extract feature importance scores 
best_model_xgb, search_xgb = grid_function(xgb, xgb_tune, X_train, y_train)
xgb_ranking = pd.DataFrame(best_model_xgb._final_estimator.feature_importances_, index=X_train.columns)
xgb_ranking.columns = ['Importance']
xgb_ranking





Unnamed: 0,Importance
SOCCS.ID,0.067565
GP1,0.045216
GP2,0.021472
GP3,0.041809
GP4,0.035936
GP5,0.024149
GP6,0.027962
GP7,0.024429
GP8,0.032448
GP9,0.034861


In [17]:
pred_function(xgb, xgb_tune, X_train, y_train, X_test, y_test)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=0.1, gamma=0,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.100000001, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

Model Performance
              precision    recall  f1-score   support

      Cancer       0.72      0.82      0.77       218
     Control       0.72      0.59      0.65       172

    accuracy                           0.72       390
   macro avg       0.72      0.71      0.71       390
weighted avg       0.72      0.72      0.72       390

[[179  39]
 [ 70 102]]


### Stacked estimator

In [18]:
best_model_svm._final_estimator

SVC(degree=2, kernel='linear', random_state=0)

In [19]:
best_model_rf._final_estimator

RandomForestClassifier(max_depth=10, random_state=0)

In [20]:
def pred_evaluate(df, best_model_rf, best_model_svm, X_train, y_train, X_test, y_test):
    
    num_transformer = StandardScaler()
    cat_transformer = OneHotEncoder(drop='if_binary', handle_unknown='error')
    
    preprocessor = ColumnTransformer(transformers=[
        ('num', num_transformer, selector(dtype_exclude="category")),
        ('cat', cat_transformer, selector(dtype_include="category"))])

    X_train_sc = preprocessor.fit_transform(X_train)
    X_test_sc = preprocessor.transform(X_test)
    
    estimators = [('rf', best_model_rf._final_estimator),
                 ('svm', best_model_svm._final_estimator),
                 ('xgb', best_model_xgb._final_estimator)]
    
    sc = StackingClassifier(estimators = estimators, 
                           final_estimator = LogisticRegression())
    
    sc.fit(X_train_sc, y_train)
    
    y_pred = sc.predict(X_test_sc)
    
    #Trying to figure out extract the prediction into a table
    y_pred_df = pd.DataFrame(data = y_pred, columns = ['Prediction']
                             , index = X_test.index.copy())
    
    df_out = pd.merge(df1, y_pred_df, how = 'left', 
                      left_index = True, right_index = True)
    
    #Print test performance of the model
    print()
    print('Model Performance')
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    return df_out

In [21]:
pred_evaluate(df1, best_model_rf, best_model_svm, X_train, y_train, X_test, y_test)
















Model Performance
              precision    recall  f1-score   support

      Cancer       0.71      0.82      0.76       218
     Control       0.71      0.57      0.63       172

    accuracy                           0.71       390
   macro avg       0.71      0.69      0.69       390
weighted avg       0.71      0.71      0.70       390

[[178  40]
 [ 74  98]]


Unnamed: 0_level_0,SOCCS.ID,GP1,GP2,GP3,GP4,GP5,GP6,GP7,GP8,GP9,...,GP19,GP20,GP21,GP22,GP23,GP24,Gender,Age at sample,Status,Prediction
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CRC_4468,324640002,0.17,0.37,0.52,15.35,0.27,3.74,0.45,22.06,9.67,...,1.54,0.48,0.50,0.07,0.95,0.96,F,20.747945,Control,
CRC_8680,236220199,0.17,0.26,0.26,8.81,0.18,3.20,0.47,18.93,7.31,...,2.57,0.70,1.28,0.28,1.62,1.86,F,22.413699,Control,
CRC_8879,381640099,0.13,0.48,0.30,20.42,0.25,4.16,0.45,17.99,8.82,...,1.79,0.34,0.86,0.11,2.36,2.01,M,27.789041,Control,
CRC_8260,406120111,0.19,0.18,0.31,12.42,0.35,2.61,0.62,22.48,10.54,...,2.01,0.52,0.65,0.09,1.83,1.71,F,31.410959,Control,
CRC_8292,382140007,0.18,0.39,0.66,15.38,0.46,4.80,0.36,14.80,9.40,...,2.15,1.13,1.46,0.37,2.26,2.17,M,66.624658,Control,Control
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CRC_7434,237000023,0.16,1.33,0.42,22.45,0.51,6.03,1.36,17.55,10.24,...,2.40,0.80,1.75,0.22,1.98,2.39,F,61.016438,Cancer,Cancer
CRC_7051,238500001,0.26,1.64,0.71,23.56,0.39,8.11,0.97,17.64,8.48,...,2.44,0.33,0.93,0.28,1.30,2.77,F,60.819178,Cancer,
CRC_7397,252000007,0.40,0.57,0.67,42.35,0.36,6.18,0.50,13.80,8.33,...,1.17,0.41,0.90,0.11,1.29,1.35,M,61.221918,Cancer,
CRC_7981,257000022,0.09,0.38,0.37,22.65,0.29,6.48,0.43,18.47,9.29,...,2.45,0.30,0.75,0.10,1.71,2.83,F,61.142466,Cancer,
