In [108]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector as selector
from sklearn.compose import ColumnTransformer

from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import linear_model

In [59]:
df = pd.read_excel('Cleaned_Dataframe.xlsx')
df.set_index('Sample',inplace=True)

#chnging type of data to 'category' from 'object'
df.Gender = df.Gender.astype('category')
df.Status = df.Status.astype('category')

#separate cancer markers and input data
df_outputs= df['Status']
df_inputs = df.drop('Status',axis=1)

In [60]:
X_train, X_test, y_train, y_test = train_test_split(df_inputs, df_outputs, random_state=100, stratify=df_outputs, test_size=0.3)\

In [61]:
def grid_function(model, parameters, X_train, y_train):
    
    num_transformer = StandardScaler()
    cat_transformer = OneHotEncoder(drop='if_binary', handle_unknown='error')
    
    preprocessor = ColumnTransformer(transformers=[
        ('num', num_transformer, selector(dtype_exclude="category")),
        ('cat', cat_transformer, selector(dtype_include="category"))])
    
    pipeline = Pipeline(steps=[('preprosessor', preprocessor),
                               ('algorithm', model)])
    
    search = GridSearchCV(pipeline, parameters, cv=StratifiedKFold(5), n_jobs=-1)
    
    search.fit(X_train, y_train)

    best_model = search.best_estimator_

    return(best_model, search)

In [122]:
def pred_function(best_model, X_train, y_train, X_test, y_test):
    
    num_transformer = StandardScaler()
    cat_transformer = OneHotEncoder(drop='if_binary', handle_unknown='error')
    
    preprocessor = ColumnTransformer(transformers=[
        ('num', num_transformer, selector(dtype_exclude="category")),
        ('cat', cat_transformer, selector(dtype_include="category"))])

    X_train_sc = preprocessor.fit_transform(X_train)
    X_test_sc = preprocessor.transform(X_test)
    
    best_model._final_estimator.fit(X_train_sc, y_train)
    
    y_pred = best_model._final_estimator.predict(X_test_sc)
    
    score = accuracy_score(y_test, y_pred)
    
    #incorporate confusion matrix
    
    return(score)

In [126]:
def output(model, tune_parameters, X_train, y_train):
    
    best_model, search = grid_function(model, tune_parameters, X_train, y_train)
    
    print (best_model._final_estimator) 
    print ('The score in CV for the best estimator:', search.best_score_)
    print ('The score in testing for the best estimator:', pred_function(best_model, X_train, y_train, X_test, y_test))
    #print ('Accurary Score on testing set:', accuracy_score(best_model.predict(X_test),y_test))

#### Random Forest Classifier

In [124]:
#define parameters
rf_tune = { 
    'algorithm__n_estimators': [200, 500],
    'algorithm__max_depth' : [4,5,6,7,8]
}

rf = RandomForestClassifier(random_state=0)

In [127]:
output(rf, rf_tune, X_train, y_train)

RandomForestClassifier(max_depth=5, n_estimators=500, random_state=0)
The score in CV for the best estimator: 0.7841141340411413
The score in testing for the best estimator: 0.7670068027210885
Accurary Score on testing set: 0.7670068027210885


In [119]:
best_model_rf, search_rf = grid_function(rf, rf_tune, X_train, y_train)
rf_ranking = pd.DataFrame(best_model_rf._final_estimator.feature_importances_, index=X_val.columns)
rf_ranking.columns = ['Importance']
rf_ranking

Unnamed: 0,Importance
GP1,0.015351
GP2,0.014911
GP3,0.027179
GP4,0.085093
GP5,0.013145
GP6,0.038484
GP7,0.01239
GP8,0.015803
GP9,0.030703
GP10,0.014478


#### SVM

In [120]:
#define paramters
svm_tune = { 
    'algorithm__kernel': ['linear', 'rbf','poly','sigmoid'],
    'algorithm__degree' : [2,3,4],
    'algorithm__C':[0, 1.0],
}

svm = SVC(random_state=0)

In [121]:
output(svm, svm_tune, X_train, y_train)

SVC(degree=2, random_state=0)
The score in CV for the best estimator: 0.7680769741207698
The score in testing for the best estimator: 0.7687074829931972
Accurary Score on testing set: 0.7687074829931972
