In [104]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector as selector
from sklearn.compose import ColumnTransformer


from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.experimental import enable_hist_gradient_boosting  
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier


from imblearn.ensemble import BalancedBaggingClassifier, EasyEnsembleClassifier, BalancedRandomForestClassifier, EasyEnsembleClassifier
from imblearn.under_sampling import RandomUnderSampler # to check again how to use this in a pipeline 


from sklearn.svm import SVC
from sklearn import linear_model

from xgboost import XGBClassifier


In [72]:
df = pd.read_excel('Cleaned_Dataframe.xlsx')
df.set_index('Sample',inplace=True)

#chnging type of data to 'category' from 'object'
df.Gender = df.Gender.astype('category')
df.Status = df.Status.astype('category')

#separate cancer markers and input data
df_outputs= df['Status']
df_inputs = df.drop('Status',axis=1)

In [73]:
X_train, X_test, y_train, y_test = train_test_split(df_inputs, df_outputs, random_state=100, stratify=df_outputs, test_size=0.3)\

In [74]:
def grid_function(model, parameters, X_train, y_train):
    
    num_transformer = StandardScaler()
    cat_transformer = OneHotEncoder(drop='if_binary', handle_unknown='error')
    
    preprocessor = ColumnTransformer(transformers=[
        ('num', num_transformer, selector(dtype_exclude="category")),
        ('cat', cat_transformer, selector(dtype_include="category"))])
    
    
    pipeline = Pipeline(steps=[('preprosessor', preprocessor), ('algorithm', model)])
                        #RandomUnderSampler(random_state = 42))
    
    search = GridSearchCV(pipeline, parameters, cv=StratifiedKFold(5), n_jobs=-1)
    
    search.fit(X_train, y_train)

    best_model = search.best_estimator_

    return(best_model, search)

In [75]:
def pred_function(best_model, X_train, y_train, X_test, y_test):
    
    num_transformer = StandardScaler()
    cat_transformer = OneHotEncoder(drop='if_binary', handle_unknown='error')
    
    preprocessor = ColumnTransformer(transformers=[
        ('num', num_transformer, selector(dtype_exclude="category")),
        ('cat', cat_transformer, selector(dtype_include="category"))])

    X_train_sc = preprocessor.fit_transform(X_train)
    X_test_sc = preprocessor.transform(X_test)
    
    best_model._final_estimator.fit(X_train_sc, y_train)
    
    y_pred = best_model._final_estimator.predict(X_test_sc)
    
    score = accuracy_score(y_test, y_pred)
    
    #incorporate confusion matrix
    
    return(score)

In [76]:
def evaluation(model, tune_parameters, X_train, y_train, X_test, y_test):
    
    #Finding the best parameters 
    best_model, search = grid_function(model, tune_parameters, X_train, y_train)
    print (best_model._final_estimator)
   
    #Calculate the labels for the test set
    best_model_predictions = best_model.predict(X_test)
    
    #Print test performance of the model
    print()
    print('Model Performance')
    print(classification_report(y_test, best_model_predictions))
    print(confusion_matrix(y_test, best_model_predictions))
    
    #print ('The score in CV for the best estimator:', search.best_score_)
    #print ('The score in testing for the best estimator:', pred_function(best_model, X_train, y_train, X_test, y_test))
    #print ('Accurary Score on testing set:', accuracy_score(best_model.predict(X_test),y_test))

#### Random Forest Classifier

In [86]:
#Define parameters
rf_tune = { 
    'algorithm__n_estimators': [100,200, 300, 400, 500, 1000],
    'algorithm__max_depth' : [4,5,6,7,8,9,10],
    'algorithm__bootstrap': [True]
}

rf = RandomForestClassifier(random_state=0)

In [87]:
#To extract feature importance scores 
best_model_rf, search_rf = grid_function(rf, rf_tune, X_train, y_train)
rf_ranking = pd.DataFrame(best_model_rf._final_estimator.feature_importances_, index=X_train.columns)
rf_ranking.columns = ['Importance']
rf_ranking

Unnamed: 0,Importance
GP1,0.016178
GP2,0.020204
GP3,0.031298
GP4,0.069531
GP5,0.018813
GP6,0.041145
GP7,0.018878
GP8,0.024805
GP9,0.03173
GP10,0.021885


In [89]:
evaluation(rf, rf_tune, X_train, y_train, X_test, y_test)

RandomForestClassifier(max_depth=7, n_estimators=300, random_state=0)

Model Performance
              precision    recall  f1-score   support

      Cancer       0.82      0.88      0.85       426
     Control       0.61      0.48      0.54       162

    accuracy                           0.77       588
   macro avg       0.71      0.68      0.69       588
weighted avg       0.76      0.77      0.76       588

[[376  50]
 [ 84  78]]


#### Linear SVM

In [62]:
#Define paramters
svm_tune = { 
    'algorithm__kernel': ['linear'], 
    'algorithm__degree' : [2,3,4],
    'algorithm__C':[0, 1.0],
}

svm = SVC(random_state=0)

In [63]:
evaluation(svm, svm_tune, X_train, y_train, X_test, y_test)



SVC(degree=2, kernel='linear', random_state=0)

Model Performance
              precision    recall  f1-score   support

      Cancer       0.81      0.87      0.84       426
     Control       0.58      0.47      0.52       162

    accuracy                           0.76       588
   macro avg       0.70      0.67      0.68       588
weighted avg       0.75      0.76      0.75       588

[[371  55]
 [ 86  76]]


In [85]:
#To extract feature importance scores 
best_model_svm, search_svm = grid_function(svm, svm_tune, X_train, y_train)

svm_ranking = best_model_svm._final_estimator.coef_[0]

svm_ranking_table = pd.DataFrame(svm_ranking, index=X_train.columns)
svm_ranking_table.columns = ['Importance']
svm_ranking_table



Unnamed: 0,Importance
GP1,0.334059
GP2,-0.121222
GP3,-0.105627
GP4,-0.368241
GP5,-0.153284
GP6,0.59834
GP7,-0.404958
GP8,0.143984
GP9,0.587813
GP10,-0.347148


### XGB

In [26]:
#Define parameters
xgb_tune = { 
    'algorithm__eta': [0.01, 0.05, 0.1, 0.3, 0.5, 1], #Step size shrinkage used in update to prevents overfitting
    'algorithm__max_depth' : [4,5,6,7,8,9,10],
}

xgb = XGBClassifier(random_state=0)

In [27]:
#To extract feature importance scores 
best_model_xgb, search_xgb = grid_function(xgb, xgb_tune, X_train, y_train)
xgb_ranking = pd.DataFrame(best_model_xgb._final_estimator.feature_importances_, index=X_train.columns)
xgb_ranking.columns = ['Importance']
xgb_ranking





Unnamed: 0,Importance
GP1,0.026793
GP2,0.023544
GP3,0.03879
GP4,0.047205
GP5,0.023087
GP6,0.021347
GP7,0.022705
GP8,0.025787
GP9,0.023368
GP10,0.02293


In [28]:
evaluation(xgb, xgb_tune, X_train, y_train, X_test, y_test)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=0.05, gamma=0,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.0500000007, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

Model Performance
              precision    recall  f1-score   support

      Cancer       0.83      0.85      0.84       426
     Control       0.58      0.54      0.56       162

    accuracy                           0.77       588
   macro avg       0.71      0.70      0.70       588
weighted avg       0.76      0.77      0.76       588

[[363  63]
 [ 74  88]]


## To solve data imbalance

#### Balanced Bagging Classifier 

In [102]:
#Define parameters
bbc_tune = { 
    'algorithm__n_estimators': [100,200, 300, 400, 500, 1000],
    'algorithm__bootstrap': [True, False]
    #'algorithm__base_estimator':['HistGradientBoostingClassifier', 'DeicisionTreeClassifier']
}

bbc = BalancedBaggingClassifier(random_state=0)

In [103]:
evaluation(bbc, bbc_tune, X_train, y_train, X_test, y_test)

BalancedBaggingClassifier(n_estimators=400, random_state=0)

Model Performance
              precision    recall  f1-score   support

      Cancer       0.89      0.73      0.80       426
     Control       0.52      0.76      0.61       162

    accuracy                           0.74       588
   macro avg       0.70      0.74      0.71       588
weighted avg       0.79      0.74      0.75       588

[[311 115]
 [ 39 123]]


#### Balanced Random Forest Classifier 

In [35]:
#Define parameters
brf_tune = { 
    'algorithm__n_estimators': [100,200, 300, 400, 500, 1000],
    'algorithm__bootstrap': [True, False],
}

brf = BalancedRandomForestClassifier(random_state=0)

In [36]:
evaluation(brf, brf_tune, X_train, y_train, X_test, y_test)

BalancedRandomForestClassifier(bootstrap=False, random_state=0)

Model Performance
              precision    recall  f1-score   support

      Cancer       0.93      0.70      0.80       426
     Control       0.52      0.86      0.65       162

    accuracy                           0.74       588
   macro avg       0.72      0.78      0.72       588
weighted avg       0.82      0.74      0.75       588

[[297 129]
 [ 23 139]]


#### Easy Ensemble Classifier

In [111]:
#Define parameters
eec_tune = { 
    'algorithm__n_estimators': [100,200, 300]
}

eec = EasyEnsembleClassifier(random_state=0)

In [112]:
evaluation(eec, eec_tune, X_train, y_train, X_test, y_test)

EasyEnsembleClassifier(n_estimators=300, random_state=0)

Model Performance
              precision    recall  f1-score   support

      Cancer       0.93      0.67      0.78       426
     Control       0.50      0.88      0.64       162

    accuracy                           0.73       588
   macro avg       0.72      0.77      0.71       588
weighted avg       0.82      0.73      0.74       588

[[286 140]
 [ 20 142]]


#### Random Forrest Classifier with adjusted class weight 

In [107]:
#Define parameters
rf_balanced_tune = { 
    'algorithm__n_estimators': [100,200, 300, 400, 500, 1000],
    'algorithm__max_depth' : [4,5,6,7,8,9,10],
    'algorithm__bootstrap': [True]
}

rf_balanced = RandomForestClassifier(random_state=0, class_weight = 'balanced')
rf_subsample_balanced = RandomForestClassifier(random_state=0, class_weight = 'balanced_subsample')

In [108]:
evaluation(rf_balanced, rf_balanced_tune, X_train, y_train, X_test, y_test)

RandomForestClassifier(class_weight='balanced', max_depth=10, n_estimators=1000,
                       random_state=0)

Model Performance
              precision    recall  f1-score   support

      Cancer       0.84      0.83      0.83       426
     Control       0.56      0.58      0.57       162

    accuracy                           0.76       588
   macro avg       0.70      0.70      0.70       588
weighted avg       0.76      0.76      0.76       588

[[353  73]
 [ 68  94]]


In [109]:
evaluation(rf_subsample_balanced, rf_balanced_tune, X_train, y_train, X_test, y_test)

RandomForestClassifier(class_weight='balanced_subsample', max_depth=10,
                       random_state=0)

Model Performance
              precision    recall  f1-score   support

      Cancer       0.85      0.83      0.84       426
     Control       0.58      0.62      0.60       162

    accuracy                           0.77       588
   macro avg       0.72      0.72      0.72       588
weighted avg       0.78      0.77      0.77       588

[[354  72]
 [ 62 100]]
