In [1]:
import pandas as pd
import numpy as np
import math
import itertools
#import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.model_selection import train_test_split


from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn import metrics

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from numpy.random import seed
seed(1)

In [59]:
def single_model(trainX, trainy, testX, testy, model):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    trainX, trainy
        Features and labels handed to ``model.fit``.
    testX, testy
        Features to predict on and the matching true labels.
    model
        Estimator exposing ``fit`` and ``predict`` plus either
        ``predict_proba`` or ``decision_function``. It is fitted in place.

    Returns
    -------
    The value returned by ``measurements(testy, predictions, scores)``.

    Raises
    ------
    AttributeError
        If ``model`` offers neither ``predict_proba`` nor ``decision_function``.
    """
    model.fit(trainX, trainy)
    y_predictions = model.predict(testX)

    if hasattr(model, "predict_proba"):
        # Column 1 of the probability matrix is used as the ranking score
        # (presumably the positive class -- confirm against model.classes_).
        y_scores = model.predict_proba(testX)[:, 1]
    else:
        # Estimators without probability output (e.g. SVC(probability=False))
        # still provide a continuous score that a ROC AUC can be computed from.
        y_scores = model.decision_function(testX)

    return measurements(testy, y_predictions, y_scores)

In [61]:
def measurements(y_test, y_pred, y_pred_prob):
    """Compute the six evaluation metrics used throughout this notebook.

    Parameters
    ----------
    y_test
        True labels.
    y_pred
        Predicted labels.
    y_pred_prob
        Continuous scores passed to ``roc_auc_score``.

    Returns
    -------
    list
        ``[sensitivity, specificity, precision, accuracy, mcc, auc]``,
        in exactly that order.
    """
    accuracy = metrics.accuracy_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)

    # The flattened confusion matrix is unpacked into four cells, so this
    # only works when the matrix is 2x2 (both classes present).
    true_neg, false_pos, _false_neg, _true_pos = confusion_matrix(y_test, y_pred).ravel()
    true_negative_rate = true_neg / (true_neg + false_pos)

    positive_predictive_value = metrics.precision_score(y_test, y_pred)
    matthews = metrics.matthews_corrcoef(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_prob)

    return [recall, true_negative_rate, positive_predictive_value, accuracy, matthews, roc_auc]

In [64]:
def print_result(model_name, purpose, result):
    """Print a bold header line followed by six metrics, three decimals each.

    ``result`` is read by position: index 0 sensitivity, 1 specificity,
    2 precision, 3 accuracy, 4 MCC, 5 AUC. The metrics are printed in the
    order accuracy, AUC, sensitivity, specificity, precision, MCC.
    """
    header = '\033[1mOptimized {} model {} performance: \033[0m'.format(model_name, purpose)
    print(header)

    # (line template, index into ``result``), listed in display order.
    layout = (
        ("Accuracy:    {0:.3f}", 3),
        ("AUC:         {0:.3f}", 5),
        ("Sensitivity: {0:.3f}", 0),
        ("Specificity: {0:.3f}", 1),
        ("Precision:   {0:.3f}", 2),
        ("MCC:         {0:.3f}", 4),
    )
    for template, position in layout:
        print(template.format(result[position]))

### Import dataset
#### `data` is used for training and validation
#### `test` is held out and used only for final testing

In [24]:
# Folder that holds the input CSV files. This is a machine-specific absolute
# path: point DATA_DIR at your own copy of the data before running the notebook.
DATA_DIR = r'''C:\Users\Ting.Li\Documents\2019\projects\L1000\data\github'''

# `data` supplies the train/validation splits below; `test` is only scored
# against by the final optimized models.
data = pd.read_csv(DATA_DIR + '\\' + 'data.csv', low_memory=False)
test = pd.read_csv(DATA_DIR + '\\' + 'testing.csv', low_memory=False)

In [26]:
# Feature matrix: every column from position 3 onward, as a NumPy array.
# NOTE(review): confirm that the 'DILIst.1' label column is one of the first
# three columns -- if it sits at position 3 or later it is also part of X.
X = data.iloc[:, 3:].values
# Label vector taken from the 'DILIst.1' column.
y = data['DILIst.1'].values

### Parameter tuning 

In [None]:
# Column layout shared by every tuning-results table: the split identifier,
# the model label, then the six metrics.
col_names = [
    'Group', 'model',
    'sensitivity', 'specificity', 'precision',
    'acc', 'mcc', 'auc',
]

In [None]:
### KNN tuning: 5 values of k, each scored on 100 stratified 80/20 splits of `data`.
# Metric rows are collected in a plain list and the DataFrame is built once at
# the end; enlarging a frame with `.loc[len(df)] = ...` copies it on every row.
metric_rows = []

for k in [3, 5, 7, 9, 11]:
    model = KNeighborsClassifier(n_neighbors=k)
    for j in range(100):
        print(k, j)
        # random_state=j: split number j is the same for every value of k.
        X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=j)
        result = single_model(trainX=X_train, trainy=y_train, testX=X_test, testy=y_test, model=model)
        # Row layout follows col_names: split id, model label, six metrics.
        metric_rows.append([str(j), 'knn_k' + str(k)] + list(result[:6]))

### dataframe holding the held-out-split metrics of every (k, split) pair
training_metrics = pd.DataFrame(metric_rows, columns=col_names)

training_metrics.to_csv(r'''C:\Users\Ting.Li\Documents\2019\projects\L1000\data\github\knn_training_metrics_all.csv''')

In [None]:
### SVM tuning: 2 kernels x 5 C values x 4 gammas = 40 settings,
### each scored on 100 stratified 80/20 splits of `data`.
# `itertools` comes from the import cell at the top of the notebook.
kernels = ['poly', 'rbf']
Cs = [0.01, 0.1, 1, 10, 100]
gammas = [0.1, 0.01, 0.001, 0.0001]
paras = list(itertools.product(kernels, Cs, gammas))

# Metric rows are collected in a plain list and the DataFrame is built once at
# the end; enlarging a frame with `.loc[len(df)] = ...` copies it on every row.
metric_rows = []

for i, (kernel, C, gamma) in enumerate(paras):
    model = SVC(C=C, kernel=kernel, gamma=gamma, probability=True)
    # The label depends only on the parameter setting, so build it once per setting.
    name = 'svm_paras_{}_kernel_{}_C_{}_gamma_{}'.format(i, kernel, C, gamma)
    for j in range(100):
        print(i, j)
        # random_state=j: split number j is the same for every parameter setting.
        X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=j)
        result = single_model(trainX=X_train, trainy=y_train, testX=X_test, testy=y_test, model=model)
        # Row layout follows col_names: split id, model label, six metrics.
        metric_rows.append([str(j), name] + list(result[:6]))

### dataframe holding the held-out-split metrics of every (setting, split) pair
training_metrics = pd.DataFrame(metric_rows, columns=col_names)

training_metrics.to_csv(r'''C:\Users\Ting.Li\Documents\2019\projects\L1000\data\github\svm_training_metrics_all.csv''')

In [None]:
### Random-forest tuning: 5 x 4 x 3 x 3 = 180 settings,
### each scored on 100 stratified 80/20 splits of `data` (18,000 fits).
n_estimators = [100, 200, 300, 400, 500]
max_depth = [8, 10, 12, None]  # None = no depth limit
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]

paras = list(itertools.product(n_estimators, max_depth, min_samples_split, min_samples_leaf))

# Metric rows are collected in a plain list and the DataFrame is built once at
# the end; enlarging a frame with `.loc[len(df)] = ...` copies it on every row,
# which is slow for the 18,000 rows produced here.
metric_rows = []

for i, (n_estimator, depth, samples_split, samples_leaf) in enumerate(paras):
    model = RandomForestClassifier(n_estimators=n_estimator, max_depth=depth, min_samples_split=samples_split, min_samples_leaf=samples_leaf, random_state=7)
    # The label depends only on the parameter setting, so build it once per setting.
    # The double underscore after 'rf' is kept so labels match earlier result files.
    name = 'rf__paras_{}_n_{}_depth_{}_split_{}_leaf_{}'.format(i, n_estimator, depth, samples_split, samples_leaf)

    for j in range(100):
        print(i, j)
        # random_state=j: split number j is the same for every parameter setting.
        X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=j)
        result = single_model(trainX=X_train, trainy=y_train, testX=X_test, testy=y_test, model=model)
        # Row layout follows col_names: split id, model label, six metrics.
        metric_rows.append([str(j), name] + list(result[:6]))

### dataframe holding the held-out-split metrics of every (setting, split) pair
training_metrics = pd.DataFrame(metric_rows, columns=col_names)

training_metrics.to_csv(r'''C:\Users\Ting.Li\Documents\2019\projects\L1000\data\github\rf_training_metrics_all.csv''')

### Optimized model

In [67]:
### KNN with k = 3
optimized_knn = KNeighborsClassifier(n_neighbors=3)
# NOTE(review): random_state=51 looks hand-picked; confirm how this split was chosen.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=51
)

### Fit on the 80% split and score on the 20% held out from `data`
### (printed under the label 'training')
print_result(
    'KNN', 'training',
    single_model(trainX=X_train, trainy=y_train, testX=X_test, testy=y_test, model=optimized_knn),
)

### Refit on the same 80% split and score on the separate `test` file
print_result(
    'KNN', 'testing',
    single_model(
        trainX=X_train, trainy=y_train,
        testX=test.iloc[:, 3:].values, testy=test['DILIst.1'].values,
        model=optimized_knn,
    ),
)

[1mOptimized KNN model training performance: [0m
Accuracy:    0.735
AUC:         0.762
Sensitivity: 0.834
Specificity: 0.591
Precision:   0.750
MCC:         0.441
[1mOptimized KNN model testing performance: [0m
Accuracy:    0.721
AUC:         0.764
Sensitivity: 0.821
Specificity: 0.574
Precision:   0.739
MCC:         0.409


In [68]:
### SVM with an RBF kernel, C = 10, gamma = 0.0001
optimized_svm = SVC(C=10, kernel='rbf', gamma=0.0001, probability=True)
# NOTE(review): random_state=78 looks hand-picked; confirm how this split was chosen.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=78
)

### Fit on the 80% split and score on the 20% held out from `data`
### (printed under the label 'training')
print_result(
    'SVM', 'training',
    single_model(trainX=X_train, trainy=y_train, testX=X_test, testy=y_test, model=optimized_svm),
)

### Refit on the same 80% split and score on the separate `test` file
print_result(
    'SVM', 'testing',
    single_model(
        trainX=X_train, trainy=y_train,
        testX=test.iloc[:, 3:].values, testy=test['DILIst.1'].values,
        model=optimized_svm,
    ),
)

[1mOptimized SVM model training performance: [0m
Accuracy:    0.753
AUC:         0.778
Sensitivity: 0.856
Specificity: 0.602
Precision:   0.759
MCC:         0.478
[1mOptimized SVM model testing performance: [0m
Accuracy:    0.743
AUC:         0.777
Sensitivity: 0.888
Specificity: 0.529
Precision:   0.735
MCC:         0.455


In [69]:
### Random forest: 500 trees, no depth limit, min_samples_split = 5, min_samples_leaf = 1
optimized_rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=1,
    random_state=7,
)
# NOTE(review): random_state=9 looks hand-picked; confirm how this split was chosen.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=9
)

### Fit the RF on the 80% split and score on the 20% held out from `data`
### (printed under the label 'training')
print_result(
    'RF', 'training',
    single_model(trainX=X_train, trainy=y_train, testX=X_test, testy=y_test, model=optimized_rf),
)

### Refit the RF on the same 80% split and score on the separate `test` file
print_result(
    'RF', 'testing',
    single_model(
        trainX=X_train, trainy=y_train,
        testX=test.iloc[:, 3:].values, testy=test['DILIst.1'].values,
        model=optimized_rf,
    ),
)

[1mOptimized RF model training performance: [0m
Accuracy:    0.774
AUC:         0.771
Sensitivity: 0.977
Specificity: 0.476
Precision:   0.732
MCC:         0.549
[1mOptimized RF model testing performance: [0m
Accuracy:    0.752
AUC:         0.747
Sensitivity: 0.975
Specificity: 0.424
Precision:   0.713
MCC:         0.502
