#load relevant libraries
from hyperopt import Trials, STATUS_OK, tpe, fmin, hp, SparkTrials
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, LabelBinarizer, MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from keras.utils import to_categorical 

In [2]:
#read test split file, use sample names as labels and check it is importing correctly
train_inputs = pd.read_excel('Train_test_split.xlsx', sheet_name='X_train')
train_inputs.set_index('Name',inplace=True)

train_outputs = pd.read_excel('Train_test_split.xlsx', sheet_name='Y_train')
train_outputs.set_index('Name',inplace=True)

test_inputs = pd.read_excel('Train_test_split.xlsx', sheet_name='X_test')
test_inputs.set_index('Name',inplace=True)

test_outputs = pd.read_excel('Train_test_split.xlsx', sheet_name='Y_test')
test_outputs.set_index('Name',inplace=True)

train_inputs.head()

Unnamed: 0_level_0,GP1,GP2,GP3,GP4,GP5,GP6,GP7,GP8,GP9,GP10,...,GP15,GP16,GP17,GP18,GP19,GP20,GP21,GP22,GP23,GP24
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cancer 601,0.200806,0.622143,0.434054,21.974659,0.347204,4.403482,0.375938,16.208376,7.776824,7.973101,...,1.782842,4.102392,0.993213,7.565349,1.884447,0.231676,1.073694,0.203141,2.271263,1.607701
Control 511,0.439251,0.517359,0.235905,16.237487,0.379311,3.290237,0.724747,16.236001,8.38847,5.629089,...,1.012021,4.43415,1.052674,10.481091,1.588289,0.308982,0.797531,0.237942,1.857701,1.574524
Control 952,0.016163,0.04559,0.277896,18.01311,0.437176,5.428121,0.368711,18.861123,11.363616,3.958209,...,2.196522,3.559293,0.981267,6.822181,1.999011,0.702018,0.796898,0.059681,0.304341,2.0797
Cancer 554,0.117568,0.335593,0.596205,26.876041,0.056025,7.50204,0.479525,19.507167,9.518051,6.20504,...,2.040528,2.799585,0.720523,10.37054,1.891216,0.163211,0.848956,0.166757,1.707862,2.65364
Control 941,0.197376,0.609891,0.26049,13.922251,0.351637,4.844553,0.395345,20.647958,10.586655,7.66481,...,1.893717,2.393989,0.802805,5.036682,1.73148,0.5834,0.892605,-0.008055,0.582438,0.469921


In [3]:
#read full dataset from excel for fitting the data scaler
df = pd.read_excel('Colorectal Generated Data_New.xlsx')
df.set_index('Name',inplace=True)

df_inputs = df.drop('Marker',axis=1)

In [4]:
#fit scaler to numerical columns of full dataset and transform train and test sets
cs = MinMaxScaler()
cs.fit(df_inputs.select_dtypes(np.number))
train_inputs = cs.transform(train_inputs.select_dtypes(np.number))
test_inputs = cs.transform(test_inputs.select_dtypes(np.number))

test_inputs

array([[0.25886863, 0.37401868, 0.3617394 , ..., 0.48098811, 0.63407148,
        0.35721597],
       [0.67897333, 0.41252047, 1.        , ..., 0.42888629, 0.57768081,
        0.79525928],
       [0.33636708, 0.45246295, 0.48906978, ..., 0.62058621, 0.63351437,
        0.43909822],
       ...,
       [0.78819945, 0.78819945, 0.2192656 , ..., 0.55677081, 0.38918417,
        0.43422512],
       [0.44312648, 0.36648563, 0.35853441, ..., 0.4071722 , 0.34343011,
        0.45739059],
       [0.70108945, 0.46858461, 0.51384782, ..., 0.39492111, 0.60615352,
        0.54581738]])

In [5]:
#encode categorical outputs
train_outputs= to_categorical(train_outputs)
test_outputs= to_categorical(test_outputs)

test_outputs

array([[1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.

In [6]:
#define optimisation search space
space = hp.choice('classifier_type', [
    {
        'type': 'naive_bayes',
    },
    {
      
        'type': 'svm',
        'C': hp.lognormal('C', 0, 1.0),
        'kernel': hp.choice('kernel',['linear', 'poly', 'rbf']),
        'degree':hp.choice('degree',[2,3,4])
    },
])

In [7]:
#define optimisation model
best_score=1.0

def objective(space):
    classifier_type = space['type']
    del space['type']
    
    global best_score
    global best_model
    
    if classifier_type == 'naive_bayes':
        model = BernoulliNB(**space)
    elif classifier_type == 'svm':
        model = SVC(**space)
    else:
        return 0

    score = cross_val_score(model, train_inputs, np.argmax(train_outputs, axis=1), cv=StratifiedKFold(n_splits=5, shuffle=True), scoring='f1', verbose=False).mean() 
    # Careful here (score). The objective function will be  minimized, thus somme treatment on your score might be needed.
    score=-score
    
    if (score < best_score):
        best_score=score
        best_model= model
    
    return score

In [8]:
#optimisation search function
best = fmin(objective, 
            space = space, 
            algo = tpe.suggest, 
            max_evals = 50,
            trials = Trials())


100%|███████████████████████████████████████████████| 50/50 [01:14<00:00,  1.48s/trial, best loss: -0.7866604587458852]


In [9]:
#print best choices
print(best)

{'C': 0.5292367418501142, 'classifier_type': 1, 'degree': 0, 'kernel': 1}


In [10]:
#print best model
best_model

SVC(C=0.5292367418501142, degree=2, kernel='poly')

In [11]:
#fit best model to training set for test metrics evaluation
best_model.fit(train_inputs, np.argmax(train_outputs, axis=1))

SVC(C=0.5292367418501142, degree=2, kernel='poly')

In [12]:
#predict test labels with model
y_pred = best_model.predict(test_inputs)

In [13]:
#get performance metrics
print(confusion_matrix(np.argmax(test_outputs,axis=1),y_pred))
print(classification_report(np.argmax(test_outputs,axis=1),y_pred))

[[165  46]
 [ 33 166]]
              precision    recall  f1-score   support

           0       0.83      0.78      0.81       211
           1       0.78      0.83      0.81       199

    accuracy                           0.81       410
   macro avg       0.81      0.81      0.81       410
weighted avg       0.81      0.81      0.81       410

