In [41]:
# imports

import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn import model_selection, metrics, utils
from imblearn.over_sampling import SMOTE
import csv

In [48]:
# Util functions

def classifier_estimation(classifier, x_test, y_test):
    """Print the class balance of a test set and a per-class classification report.

    Parameters
    ----------
    classifier : object
        Fitted estimator exposing ``predict(x_test)``.
    x_test : sequence
        Feature rows handed to ``classifier.predict``.
    y_test : sequence of int
        True labels; entries equal to 1 are counted as positives and entries
        equal to 0 as negatives.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    # Compare by value, not identity: ``label is 1`` is never true for numpy
    # integers and is only true for plain ints by an interpreter coincidence.
    n_positives = sum(1 for label in y_test if label == 1)
    n_negatives = sum(1 for label in y_test if label == 0)

    # A bare ``print`` is a no-op expression in Python 3; it has to be called
    # to emit the blank separator line.
    print()
    print('Test set has ', n_positives, 'positives and ', n_negatives, ' negatives')
    print()
    print('Best classifier score')
    print()
    print(metrics.classification_report(y_test, classifier.predict(x_test), target_names=['non-binding', 'binding']))
    
    
def calculate_performance(classifier, x_test, y_test):
    """Return per-class precision, recall and F1 of ``classifier`` on a test set.

    Parameters
    ----------
    classifier : object
        Fitted estimator exposing ``predict(x_test)``.
    x_test : sequence
        Feature rows to predict.
    y_test : sequence
        True labels for ``x_test``.

    Returns
    -------
    list of float
        ``[precision[0], precision[1], recall[0], recall[1], f1[0], f1[1]]``,
        each rounded to three decimals, where the index is the position in the
        per-class array returned by the metric called with ``average=None``.

    Notes
    -----
    Positions 0 and 1 of every metric array are read, so both must exist
    (presumably this requires both classes to occur in the sample; confirm
    against the scikit-learn metric documentation).
    """
    # Predict once and reuse the result: the original predicted three times,
    # which matters because bootstrapping calls this function in a loop.
    predicted = classifier.predict(x_test)

    per_class_scores = (
        metrics.precision_score(y_test, predicted, average=None),
        metrics.recall_score(y_test, predicted, average=None),
        metrics.f1_score(y_test, predicted, average=None),
    )
    # Flatten metric-major, class-minor to keep the established column order.
    return [round(scores[class_index], 3)
            for scores in per_class_scores
            for class_index in (0, 1)]


def bootstrapping(classifier, x_test, y_test, n_iterations=999, random_state=None):
    """Score ``classifier`` on repeated random half-size subsets of a test set.

    Each iteration draws ``round(len(x_test) / 2)`` samples *without*
    replacement (``replace=False``), so despite the name this is repeated
    subsampling rather than a classical bootstrap.

    Parameters
    ----------
    classifier : object
        Fitted estimator, forwarded to ``calculate_performance``.
    x_test : sequence
        Feature rows.
    y_test : sequence
        True labels aligned with ``x_test``.
    n_iterations : int, optional
        Number of subsets to draw. Defaults to 999, the value that was
        previously hard-coded.
    random_state : int, RandomState instance or None, optional
        Seed or generator for the resampling. ``None`` (default) keeps the
        previous unseeded behaviour; pass an int for reproducible results.

    Returns
    -------
    list of str
        One entry per iteration: the values returned by
        ``calculate_performance`` joined with tab characters.
    """
    size = int(round(len(x_test)/2))
    # One generator shared by all iterations: passing the same int seed to
    # every resample call would draw the identical subset each time.
    rng = utils.check_random_state(random_state)
    performances = []

    for _ in range(n_iterations):
        part_x, part_y = utils.resample(x_test, y_test, replace=False, n_samples=size, random_state=rng)
        scores = calculate_performance(classifier, part_x, part_y)
        performances.append("\t".join(str(score) for score in scores))

    return performances

In [43]:
# general specifications

# Directory prefix (including the trailing slash) that is concatenated in
# front of the model input files and the performance output files.
path = "data/"
# Model identifier embedded in those input and output file names.
model = "minimal_model_sw"

In [44]:
# import data

def read_tsv_rows(file_path):
    """Return all rows of the tab-separated file at ``file_path``.

    Each row is a list of strings, exactly as produced by ``csv.reader``.
    """
    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation recommends for files handed to csv.reader.
    with open(file_path, newline='') as handle:
        return list(csv.reader(handle, delimiter="\t"))


# One positives and one negatives file per data source (pdidb and enzyme),
# all named after the configured model.
positives_dna = read_tsv_rows(path+model+'_pdidb_positives.txt')
negatives_dna = read_tsv_rows(path+model+'_pdidb_negatives.txt')
positives_enzyme = read_tsv_rows(path+model+'_enzyme_positives.txt')
negatives_enzyme = read_tsv_rows(path+model+'_enzyme_negatives.txt')

In [23]:
# pre-process data

# remove elements representing distances //TODO

# Labels follow the concatenation order used below: all positives (1) first,
# then all negatives (0).
labels_dna = [1]*len(positives_dna)+[0]*len(negatives_dna)
labels_enzyme = [1]*len(positives_enzyme)+[0]*len(negatives_enzyme)

tmp_dna = positives_dna + negatives_dna
tmp_enzyme = positives_enzyme + negatives_enzyme

# Every field was read as text; turn each row into a list of floats.
dna = [[float(field) for field in record] for record in tmp_dna]
enzyme = [[float(field) for field in record] for record in tmp_enzyme]

In [24]:
# train and test datasets

# Split each data source on its own (20 % held out, fixed seed) and only then
# combine them, so per-source test sets stay available for evaluation.
dna_x_train, dna_x_test, dna_y_train, dna_y_test = model_selection.train_test_split(
    dna, labels_dna, test_size=0.2, random_state=6)
enzyme_x_train, enzyme_x_test, enzyme_y_train, enzyme_y_test = model_selection.train_test_split(
    enzyme, labels_enzyme, test_size=0.2, random_state=6)

x_train = dna_x_train + enzyme_x_train
y_train = dna_y_train + enzyme_y_train
x_test = dna_x_test + enzyme_x_test
y_test = dna_y_test + enzyme_y_test

# Oversample the training portion only; the test sets are left untouched.
sm = SMOTE(random_state=42)
# imbalanced-learn renamed ``fit_sample`` to ``fit_resample`` and later removed
# the old name. Prefer the new one and fall back for old installations.
smote_resample = getattr(sm, 'fit_resample', None) or sm.fit_sample
x_train, y_train = smote_resample(x_train, y_train)

# NOTE(review): the folds below are cut from already-oversampled data, so
# synthetic samples can end up in validation folds next to the real points
# they were derived from -- consider resampling inside each fold instead.
kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=3)

In [25]:
# training the neural network

# The grid holds a single value per parameter, so the search cross-validates
# exactly one configuration on the folds defined by ``kf``.
parameters = {
    'alpha': [0.0001],
    'random_state': [1],
}
grid = model_selection.GridSearchCV(estimator=MLPClassifier(), param_grid=parameters, cv=kf)
# fit() returns the search object itself, so the best estimator can be read
# straight off the call.
classifier = grid.fit(x_train, y_train).best_estimator_

In [26]:
# get information about the best classifier

# Printing the estimator lists the hyper-parameter values it was built with.
print(classifier)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)


In [40]:
# testing

# Report on each source's held-out split first, then on both combined.
evaluation_sets = (
    (enzyme_x_test, enzyme_y_test),
    (dna_x_test, dna_y_test),
    (x_test, y_test),
)
for eval_features, eval_labels in evaluation_sets:
    classifier_estimation(classifier, eval_features, eval_labels)

# Mean accuracy on the (oversampled) training data versus the combined test set.
for caption, (score_features, score_labels) in (
        ("Training set score: %f", (x_train, y_train)),
        ("Test set score: %f", (x_test, y_test))):
    print(caption % classifier.score(score_features, score_labels))

# Fitted-network diagnostics: layer count and number of training iterations.
print(classifier.n_layers_)
print(classifier.n_iter_)

{0: 12298, 1: 3335}
Test set has  1335 positives and  11615  negatives
Best classifier score
             precision    recall  f1-score   support

non-binding       0.92      0.82      0.86     11615
    binding       0.19      0.38      0.26      1335

avg / total       0.84      0.77      0.80     12950

Test set has  363 positives and  2320  negatives
Best classifier score
             precision    recall  f1-score   support

non-binding       0.90      0.78      0.83      2320
    binding       0.24      0.46      0.32       363

avg / total       0.81      0.73      0.77      2683

Test set has  1698 positives and  13935  negatives
Best classifier score
             precision    recall  f1-score   support

non-binding       0.92      0.81      0.86     13935
    binding       0.20      0.40      0.27      1698

avg / total       0.84      0.76      0.80     15633



[0.91689705643194019, 0.20269865067466267, 0.80918550412630064, 0.39811542991755006, 0.85968055502611207, 0.26862706139479436]
Training set score: 0.800120
Test set score: 0.764537
3
128


In [49]:
# bootstrapping to get performance distribution

performances = bootstrapping(classifier, x_test, y_test)
performances_dna = bootstrapping(classifier, dna_x_test, dna_y_test)
performances_enzyme = bootstrapping(classifier, enzyme_x_test, enzyme_y_test)

# One result file per evaluated test set; only the file name ending differs.
result_files = (
    ('.txt', performances),
    ('_pdidb.txt', performances_dna),
    ('_enzyme.txt', performances_enzyme),
)
for ending, result_lines in result_files:
    with open(path+'performance_'+model+ending, "w") as o:
        o.write("\n".join(result_lines))

In [5]:
# import random data

# Read the four tab-separated files of the random control set; every row is
# kept as a list of strings at this point.
with open('data/rnd_more_minimal_model_pdidb_positives.txt') as p_dna:
    positives_dna_rnd = list(csv.reader(p_dna, delimiter="\t"))

with open('data/rnd_more_minimal_model_pdidb_negatives.txt') as n_dna:
    negatives_dna_rnd = list(csv.reader(n_dna, delimiter="\t"))

with open('data/rnd_more_minimal_model_enzyme_positives.txt') as p_enzyme:
    positives_enzyme_rnd = list(csv.reader(p_enzyme, delimiter="\t"))

with open('data/rnd_more_minimal_model_enzyme_negatives.txt') as n_enzyme:
    negatives_enzyme_rnd = list(csv.reader(n_enzyme, delimiter="\t"))

# Disabled variant that would drop the first and the last field of every row:
#positives_dna_rnd = [x[1:len(positives_dna_rnd[0])-1] for x in positives_dna_rnd]
#negatives_dna_rnd = [x[1:len(negatives_dna_rnd[0])-1] for x in negatives_dna_rnd]
#positives_enzyme_rnd = [x[1:len(positives_enzyme_rnd[0])-1] for x in positives_enzyme_rnd]
#negatives_enzyme_rnd = [x[1:len(negatives_enzyme_rnd[0])-1] for x in negatives_enzyme_rnd]

# Labels mirror the concatenation order: positives (1) first, negatives (0) after.
labels_dna_rnd = [1]*len(positives_dna_rnd)+[0]*len(negatives_dna_rnd)
labels_enzyme_rnd = [1]*len(positives_enzyme_rnd)+[0]*len(negatives_enzyme_rnd)

tmp_dna_rnd = positives_dna_rnd + negatives_dna_rnd
tmp_enzyme_rnd = positives_enzyme_rnd + negatives_enzyme_rnd

# Cast every text field to float.
dna_rnd = [[float(field) for field in record] for record in tmp_dna_rnd]
enzyme_rnd = [[float(field) for field in record] for record in tmp_enzyme_rnd]

# Complete random set: both sources together.
data_rnd = dna_rnd + enzyme_rnd
labels_rnd = labels_dna_rnd + labels_enzyme_rnd

# Same split settings as for the real data (20 % held out, seed 6).
dna_x_train_rnd, dna_x_test_rnd, dna_y_train_rnd, dna_y_test_rnd = model_selection.train_test_split(
    dna_rnd, labels_dna_rnd, test_size=0.2, random_state=6)
enzyme_x_train_rnd, enzyme_x_test_rnd, enzyme_y_train_rnd, enzyme_y_test_rnd = model_selection.train_test_split(
    enzyme_rnd, labels_enzyme_rnd, test_size=0.2, random_state=6)

x_test_rnd = dna_x_test_rnd + enzyme_x_test_rnd
y_test_rnd = dna_y_test_rnd + enzyme_y_test_rnd

In [53]:
#comparison to random

# How often each class is predicted over the complete random data set.
prediction = classifier.predict(data_rnd)
unique, counts = np.unique(prediction, return_counts=True)
print({predicted_class: occurrences for predicted_class, occurrences in zip(unique, counts)})

# Per-source reports first, then the combined random set.
random_sets = (
    (enzyme_rnd, labels_enzyme_rnd),
    (dna_rnd, labels_dna_rnd),
    (data_rnd, labels_rnd),
)
for rnd_features, rnd_labels in random_sets:
    classifier_estimation(classifier, rnd_features, rnd_labels)

rnd_accuracy = classifier.score(data_rnd, labels_rnd)
print("Test set score: %f" % rnd_accuracy)

{0: 44187, 1: 33978}
Test set has  6581 positives and  58169  negatives
Best classifier score
             precision    recall  f1-score   support

non-binding       0.90      0.57      0.70     58169
    binding       0.11      0.46      0.17      6581

avg / total       0.82      0.56      0.64     64750

Test set has  1678 positives and  11737  negatives
Best classifier score
             precision    recall  f1-score   support

non-binding       0.88      0.57      0.69     11737
    binding       0.14      0.47      0.21      1678

avg / total       0.79      0.56      0.63     13415

Test set has  8259 positives and  69906  negatives
Best classifier score


             precision    recall  f1-score   support

non-binding       0.90      0.57      0.70     69906
    binding       0.11      0.46      0.18      8259

avg / total       0.82      0.56      0.64     78165

Test set score: 0.556643
