In [1]:
import itertools
import math
import os

import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from numpy.random import seed
seed(1)

In [2]:
def single_model(trainX, trainy, testX, testy, model):
    """Fit `model` on the training arrays and score it on the test arrays.

    The estimator is (re)fitted in place on every call, then its hard
    predictions and its second `predict_proba` column are passed to
    `measurements`, whose list of metrics is returned unchanged.
    """
    model.fit(trainX, trainy)
    predicted_labels = model.predict(testX)
    # Keep only column 1 of the probability matrix (the score handed to the AUC).
    second_class_scores = model.predict_proba(testX)[:, 1]
    return measurements(testy, predicted_labels, second_class_scores)

In [3]:
def measurements(y_test, y_pred, y_pred_prob):
    """Compute eight classification metrics for one set of predictions.

    Parameters:
        y_test: true labels.
        y_pred: predicted labels.
        y_pred_prob: continuous scores used only for the ROC AUC.

    Returns:
        A list in the fixed order
        [auc, sensitivity, specificity, acc, f1, mcc, precision, npv].

    The confusion matrix is unpacked into exactly four cells, so the labels
    are expected to be binary.
    """
    accuracy = metrics.accuracy_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)

    # Flattened confusion matrix, unpacked as TN, FP, FN, TP.
    true_neg, false_pos, false_neg, _true_pos = confusion_matrix(y_test, y_pred).ravel()
    true_negative_rate = true_neg / (true_neg + false_pos)

    positive_predictive_value = metrics.precision_score(y_test, y_pred)
    negative_predictive_value = true_neg / (true_neg + false_neg)

    matthews = metrics.matthews_corrcoef(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_prob)
    f1_value = metrics.f1_score(y_test, y_pred)

    return [
        roc_auc,
        recall,
        true_negative_rate,
        accuracy,
        f1_value,
        matthews,
        positive_predictive_value,
        negative_predictive_value,
    ]

In [4]:
def print_result(model_name, purpose, result):
    """Print a bold header and then eight metrics, one per line, to stdout.

    `result` is read positionally (indices 0 to 7) in the order
    AUC, sensitivity, specificity, accuracy, F1, MCC, PPV, NPV.
    Every value is formatted with three decimals.
    """
    header = '\033[1mOptimized {} model {} performance: \033[0m'
    print(header.format(model_name, purpose))

    # One template per metric; the position in this tuple is the index into `result`.
    row_templates = (
        "AUC:         {0:.3f}",
        "Sensitivity: {0:.3f}",
        "Specificity: {0:.3f}",
        "Accuracy:    {0:.3f}",
        "F1:          {0:.3f}",
        "MCC:         {0:.3f}",
        "PPV:         {0:.3f}",
        "NPV:         {0:.3f}",
    )
    for position, template in enumerate(row_templates):
        print(template.format(result[position]))

### Import dataset
#### `data` is used for training and validation
#### `test` is used only for testing

In [5]:
# Directory that holds data.csv and testing.csv. Set the L1000_DATA_DIR
# environment variable to run the notebook on another machine; when it is
# unset, the original location is used.
DATA_DIR = os.environ.get('L1000_DATA_DIR', r'''C:\Users\Ting.Li\Documents\2019\projects\L1000\data\github''')

# `data` is split into training/validation folds below; `test` is kept apart for the final check.
data = pd.read_csv(os.path.join(DATA_DIR, 'data.csv'), low_memory=False)
test = pd.read_csv(os.path.join(DATA_DIR, 'testing.csv'), low_memory=False)

In [6]:
# Preview the first five rows of the training/validation frame.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,sig_id,DILIst.1,5720,466,6009,2309,387,3553,427,...,9738,6793,7358,58472,50865,23200,51293,10962,10153,874
0,2433,DOS039_A549_24H:BRD-K81418486:0.1,0,-1.4398,-0.00845,-2.1229,2.63815,2.04815,1.41105,-0.09235,...,0.62255,1.0562,-0.14135,0.7882,-0.11335,2.35545,0.2334,-0.594,-1.06115,-1.31655
1,5536,CPC019_HT29_6H:BRD-K81418486:10,0,1.547422,-7.163943,-1.581482,1.650699,-1.088651,0.633513,2.663577,...,4.4186,-1.834802,-0.223164,-0.255458,5.733514,0.381488,-0.328168,1.290495,0.04982,0.984403
2,1735,CPC006_NCIH1694_6H:BRD-K81418486:10,0,1.48395,-1.1266,-1.8934,0.12675,0.05735,-0.5056,0.9811,...,-0.1636,0.3601,1.28305,-0.25085,0.5431,1.08195,-1.38025,-3.2754,-0.53095,-0.97735
3,5476,CPC002_HA1E_24H:BRD-A76528577-065-01-2:10,1,-5.01455,-3.31925,0.17835,-4.71535,-0.7833,6.01625,-4.4898,...,-4.1861,3.1962,-5.17785,0.70785,-0.4577,1.12585,-5.50515,-1.54205,4.93105,-2.72
4,4880,CPC011_VCAP_6H:BRD-K55696337-003-16-0:10,0,-2.6307,0.6382,-0.7925,-2.0211,-2.1888,2.0562,0.7165,...,-0.4546,5.7552,-0.1787,0.8583,0.7385,0.7271,-1.6301,0.6459,-1.4513,-2.5655


In [7]:
# Preview the first five rows of the held-out test frame.
test.head(n=5)

Unnamed: 0.1,Unnamed: 0,sig_id,DILIst.1,5720,466,6009,2309,387,3553,427,...,9738,6793,7358,58472,50865,23200,51293,10962,10153,874
0,939,CPC004_VCAP_24H:BRD-A26384407-001-15-2:10,0,0.348344,0.425933,0.370705,-0.399295,-0.390995,-0.104098,0.514108,...,1.521188,-0.264948,0.251738,-0.882381,-0.001716,-2.107069,0.302459,-0.540988,-0.134683,0.666495
1,1179,CPC020_HA1E_6H:BRD-K68132782-001-01-7:10,1,-0.0783,-0.2213,0.013667,1.0008,0.099267,0.7304,-0.915333,...,0.012133,-0.064233,-0.729133,0.362333,0.777433,0.3324,0.022433,0.558833,0.862133,0.248533
2,4445,CPC006_NOMO1_6H:BRD-A47829399-001-01-1:10,1,0.1057,1.2143,-2.7672,-0.7878,-0.1128,-0.2807,-0.7268,...,-0.6959,0.1611,1.3605,0.7966,-1.4483,-1.2354,0.6755,3.9723,2.7938,-0.6683
3,3449,DOS045_A549_24H:BRD-K81418486:10,0,0.137677,-0.818573,0.104566,-0.648945,3.305253,0.974244,1.617118,...,0.402898,2.396454,-5.932832,-2.11632,0.685945,1.999205,0.054139,-1.864651,0.351202,-0.520308
4,4507,HDAC002_PC3_24H:BRD-K81418486-001-10-3:5,0,0.8006,-3.8433,-2.7344,-0.9009,-2.8792,-2.1548,0.2673,...,1.0941,1.9964,0.1018,-0.8091,0.6039,1.0129,-1.7846,-3.3053,-0.6386,-8.8717


In [8]:
# Class balance of the DILIst.1 label in the training/validation frame.
data.loc[:, 'DILIst.1'].value_counts()

1    2854
0    1946
Name: DILIst.1, dtype: int64

In [9]:
# Class balance of the DILIst.1 label in the held-out test frame.
test.loc[:, 'DILIst.1'].value_counts()

1    714
0    486
Name: DILIst.1, dtype: int64

In [10]:
# Label vector: the DILIst.1 column.
y = data['DILIst.1'].values
# Feature matrix: every column from position 3 onwards.
# NOTE(review): the slice is positional - confirm that exactly three non-feature
# columns precede the features and that DILIst.1 itself is not at position 3 or later.
X = data.iloc[:, 3:].values

### Parameter tuning 

In [None]:
# Column layout of every tuning-metrics table: split id, model label, then eight metrics.
col_names = [
    'Group',
    'model',
    'auc',
    'sensitivity',
    'specificity',
    'acc',
    'f1',
    'mcc',
    'precision',
    'npv',
]

In [None]:
### Collect the KNN tuning metrics: one row per (k, split seed) pair.
### Rows are gathered in a list and turned into a dataframe once at the end.
metric_rows = []

for k in [3, 5, 7, 9, 11]:
    model = KNeighborsClassifier(n_neighbors=k)
    for j in range(100):
        print(k, j)
        # A fresh stratified 80/20 split for every seed j.
        X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=j)
        result = single_model(trainX=X_train, trainy=y_train, testX=X_test, testy=y_test, model=model)
        # Store all eight metrics: col_names has ten columns (Group, model + 8 metrics),
        # so a row that stops at result[5] cannot be written into the table.
        metric_rows.append([str(j), 'knn_k' + str(k)] + list(result))

### define a dataframe to save the training performance
training_metrics = pd.DataFrame(metric_rows, columns=col_names)

training_metrics.to_csv(r'''C:\Users\Ting.Li\Documents\2019\projects\L1000\data\github\knn_training_metrics_all.csv''')

In [None]:
### Collect the SVM tuning metrics: one row per (parameter combination, split seed) pair.
### Rows are gathered in a list and turned into a dataframe once at the end.
kernels = ['poly', 'rbf']
Cs = [0.01, 0.1, 1, 10, 100]
gammas = [0.1, 0.01, 0.001, 0.0001]
# Full grid of (kernel, C, gamma) combinations; itertools is imported in the first cell.
paras = list(itertools.product(kernels, Cs, gammas))

metric_rows = []

for i, (kernel, C, gamma) in enumerate(paras):
    # probability=True is needed because single_model calls predict_proba.
    model = SVC(C=C, kernel=kernel, gamma=gamma, probability=True)
    # The label depends only on the parameter combination, so build it once per combination.
    name = 'svm_paras_' + str(i) + '_kernel_' + kernel + '_C_' + str(C) + '_gamma_' + str(gamma)
    for j in range(100):
        print(i, j)
        # A fresh stratified 80/20 split for every seed j.
        X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=j)
        result = single_model(trainX=X_train, trainy=y_train, testX=X_test, testy=y_test, model=model)
        # Store all eight metrics: col_names has ten columns (Group, model + 8 metrics),
        # so a row that stops at result[5] cannot be written into the table.
        metric_rows.append([str(j), name] + list(result))

### define a dataframe to save the training performance
training_metrics = pd.DataFrame(metric_rows, columns=col_names)

training_metrics.to_csv(r'''C:\Users\Ting.Li\Documents\2019\projects\L1000\data\github\svm_training_metrics_all.csv''')

In [None]:
### Collect the random-forest tuning metrics: one row per (parameter combination, split seed) pair.
### Rows are gathered in a list and turned into a dataframe once at the end.
n_estimators = [100, 200, 300, 400, 500]
max_depth = [8, 10, 12, None]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]

# Full grid of (n_estimators, max_depth, min_samples_split, min_samples_leaf) combinations.
paras = list(itertools.product(n_estimators, max_depth, min_samples_split, min_samples_leaf))

metric_rows = []

for i, (n_estimator, depth, samples_split, samples_leaf) in enumerate(paras):
    model = RandomForestClassifier(n_estimators=n_estimator, max_depth=depth, min_samples_split=samples_split, min_samples_leaf=samples_leaf, random_state=7)
    # The label depends only on the parameter combination, so build it once per combination.
    # The double underscore after 'rf' is kept so the generated labels stay unchanged.
    name = 'rf__paras_' + str(i) + '_n_' + str(n_estimator) + '_depth_' + str(depth) + '_split_' + str(samples_split) + '_leaf_' + str(samples_leaf)

    for j in range(100):
        print(i, j)
        # A fresh stratified 80/20 split for every seed j.
        X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=j)
        result = single_model(trainX=X_train, trainy=y_train, testX=X_test, testy=y_test, model=model)
        # Store all eight metrics: col_names has ten columns (Group, model + 8 metrics),
        # so a row that stops at result[5] cannot be written into the table.
        metric_rows.append([str(j), name] + list(result))

### define a dataframe to save the training performance
training_metrics = pd.DataFrame(metric_rows, columns=col_names)

training_metrics.to_csv(r'''C:\Users\Ting.Li\Documents\2019\projects\L1000\data\github\rf_training_metrics_all.csv''')

### Optimized model

In [11]:
### KNN with the selected setting (n_neighbors=3)
optimized_knn = KNeighborsClassifier(n_neighbors=3)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=51)
### Optimized KNN model scored on the 20% split held out from `data` (printed as 'training')
print_result(
    'KNN', 'training',
    single_model(trainX=X_train, trainy=y_train, testX=X_test, testy=y_test, model=optimized_knn))
### Optimized KNN model scored on the separate `test` frame (single_model fits on X_train again first)
print_result(
    'KNN', 'testing',
    single_model(trainX=X_train, trainy=y_train,
                 testX=test.iloc[:, 3:].values, testy=test['DILIst.1'].values,
                 model=optimized_knn))

[1mOptimized KNN model training performance: [0m
AUC:         0.762
Sensitivity: 0.834
Specificity: 0.591
Accuracy:    0.735
F1:          0.789
MCC:         0.441
PPV:         0.750
NPV:         0.708
[1mOptimized KNN model testing performance: [0m
AUC:         0.764
Sensitivity: 0.821
Specificity: 0.574
Accuracy:    0.721
F1:          0.778
MCC:         0.409
PPV:         0.739
NPV:         0.686


In [12]:
### SVM
optimized_svm = SVC(C=10, kernel='rbf', gamma=0.0001, probability=True)
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y,test_size=0.2, random_state=78)
### Optimized SVM model training performance
print_result('SVM', 'training', single_model(X_train, y_train, X_test, y_test, optimized_svm))
### Optimized SVM model testing performance
print_result('SVM', 'testing', single_model(X_train, y_train, test.iloc[:, 3:].values, test.loc[:,'DILIst.1'].values, optimized_svm))

[1mOptimized SVM model training performance: [0m
AUC:         0.778
Sensitivity: 0.856
Specificity: 0.602
Accuracy:    0.753
F1:          0.805
MCC:         0.478
PPV:         0.759
NPV:         0.741
[1mOptimized SVM model testing performance: [0m
AUC:         0.777
Sensitivity: 0.888
Specificity: 0.529
Accuracy:    0.743
F1:          0.804
MCC:         0.455
PPV:         0.735
NPV:         0.763


In [13]:
### RF
optimized_rf = RandomForestClassifier(n_estimators=500, max_depth=None, min_samples_split=5,min_samples_leaf=1, random_state=7)
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y,test_size=0.2, random_state=9)
### Optimized KNN model training performance
print_result('RF', 'training', single_model(X_train, y_train, X_test, y_test, optimized_rf))
### Optimized KNN model testing performance
print_result('RF', 'testing', single_model(X_train, y_train, test.iloc[:, 3:].values, test.loc[:,'DILIst.1'].values, optimized_rf))

[1mOptimized RF model training performance: [0m
AUC:         0.771
Sensitivity: 0.977
Specificity: 0.476
Accuracy:    0.774
F1:          0.837
MCC:         0.549
PPV:         0.732
NPV:         0.934
[1mOptimized RF model testing performance: [0m
AUC:         0.747
Sensitivity: 0.975
Specificity: 0.424
Accuracy:    0.752
F1:          0.824
MCC:         0.502
PPV:         0.713
NPV:         0.920
