In [1]:
# imports

import numpy as np
import functions as f
from sklearn.neural_network import MLPClassifier
from sklearn import model_selection
from imblearn.over_sampling import SMOTE
import csv

In [24]:
# general specifications
# These fragments are joined with plain string concatenation further down
# (e.g. path+model_path+model+'_pdidb_positives.txt'), so every directory
# fragment has to keep its trailing slash.

# Root directory for everything this notebook reads and writes.
path = "data/"
# Sub-directory of `path` that holds the feature files loaded below.
model_path = "combined_model/"
# Sub-directory of `path` that receives the performance report.
performance_path = "performance/"
# File-name prefix shared by the four feature files and the performance report.
model = "mm3_avg_lr"

In [25]:
# import data

def _read_tsv(file_name):
    """Read a tab-separated file and return its rows as lists of string fields.

    Blank lines are skipped: csv.reader yields an empty list for them, and an
    empty feature row would make the feature matrix ragged further down.
    """
    with open(file_name) as handle:
        return [row for row in csv.reader(handle, delimiter="\t") if row]


# All four feature files live in the same directory and share the model prefix;
# only the suffix (data source + class) differs.
_feature_prefix = path+model_path+model

positives_dna = _read_tsv(_feature_prefix+'_pdidb_positives.txt')
negatives_dna = _read_tsv(_feature_prefix+'_pdidb_negatives.txt')
positives_enzyme = _read_tsv(_feature_prefix+'_enzyme_positives.txt')
negatives_enzyme = _read_tsv(_feature_prefix+'_enzyme_negatives.txt')

In [26]:
# pre-process data

# Positives are placed before negatives; the label vectors below are built in
# the same order so that row k of each data set matches label k.
tmp_dna = positives_dna + negatives_dna
tmp_enzyme = positives_enzyme + negatives_enzyme

labels_dna = [1] * len(positives_dna) + [0] * len(negatives_dna)
labels_enzyme = [1] * len(positives_enzyme) + [0] * len(negatives_enzyme)

# The csv reader delivers every field as a string; turn each row into floats.
dna = [list(map(float, fields)) for fields in tmp_dna]
enzyme = [list(map(float, fields)) for fields in tmp_enzyme]

In [27]:
# train and test datasets

# Each data source is split 80/20 on its own, so the per-source test sets
# (dna_*_test, enzyme_*_test) stay available for separate evaluation.
dna_x_train, dna_x_test, dna_y_train, dna_y_test = model_selection.train_test_split(
    dna, labels_dna, test_size=0.2, random_state=6)
enzyme_x_train, enzyme_x_test, enzyme_y_train, enzyme_y_test = model_selection.train_test_split(
    enzyme, labels_enzyme, test_size=0.2, random_state=6)

# Pool both sources (plain list concatenation) for the combined model.
tmp_x_train = dna_x_train + enzyme_x_train
tmp_y_train = dna_y_train + enzyme_y_train
x_test = dna_x_test + enzyme_x_test
y_test = dna_y_test + enzyme_y_test

# Hold out another 20% of the pooled training data as a validation set.
x_train, x_cross, y_train, y_cross = model_selection.train_test_split(
    tmp_x_train, tmp_y_train, test_size=0.2, random_state=6)

# Oversample only the training portion; the validation and test sets keep
# their original class distribution.
sm = SMOTE(random_state=42)
# imbalanced-learn renamed fit_sample to fit_resample and later dropped the old
# name, so prefer the new method and fall back only on old installations.
_resample = sm.fit_resample if hasattr(sm, "fit_resample") else sm.fit_sample
x_train, y_train = _resample(x_train, y_train)

#kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=3)

In [28]:
# training the neural network

# Disabled grid search over the hidden-layer width (100 / 200 / 300 units).
# It needs the `kf` splitter, which is itself commented out in the data-set
# cell, so both have to be re-enabled together.
#parameters = {'alpha': [0.0001], 'random_state': [1], 'hidden_layer_sizes': [(100,),(200,),(300,)]}
#grid = model_selection.GridSearchCV(MLPClassifier(), parameters, cv=kf)
#grid.fit(x_train,y_train)
#classifier = grid.best_estimator_

# Train one network directly: a single hidden layer of 200 units, seeded with
# random_state=1. x_train / y_train are the SMOTE-resampled training data.
classifier = MLPClassifier(hidden_layer_sizes=(200,), alpha=0.0001, random_state=1)
classifier.fit(x_train,y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(200,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [29]:
# get information about the best classifier

# print(classifier)
# print(len(classifier.coefs_))
# Number of rows in the first weight matrix (presumably the number of input
# features per sample -- confirm against the MLPClassifier docs).
print(len(classifier.coefs_[0]))
# print(len(classifier.coefs_[1]))
# print(classifier.n_layers_)
# Value of the fitted model's n_iter_ attribute.
print(classifier.n_iter_)

# Class distribution of the validation labels: {label: count}.
unique, counts = np.unique(y_cross, return_counts=True)
print(dict(zip(unique, counts)))

24
132
{0: 11154, 1: 1353}


In [30]:
# validation using cross-training

# Run the project's bootstrapping helper on the held-out validation split.
performances = f.bootstrapping(classifier, x_cross, y_cross)
# performances_dna = bootstrapping(classifier,dna_x_test,dna_y_test)
# performances_enzyme = bootstrapping(classifier,enzyme_x_test,enzyme_y_test)

# One entry of `performances` per line in the report file.
performance_file = path+performance_path+'performance_'+model+'.txt'
with open(performance_file, "w") as report_file:
    report_file.write("\n".join(performances))

# with open(path+'performance_'+model+'_pdidb.txt',"w") as o:
#    o.write("\n".join(performances_dna))

# with open(path+'performance_'+model+'_enzyme.txt',"w") as o:
#    o.write("\n".join(performances_enzyme))

In [50]:
# testing

# prediction = classifier.predict(enzyme_x_test)
# prediction = classifier.predict(x_test)
# unique, counts = np.unique(prediction, return_counts=True)
# print(dict(zip(unique, counts)))

# Report on the enzyme test set, the DNA test set and the pooled test set,
# in that order.
evaluation_sets = (
    (enzyme_x_test, enzyme_y_test),
    (dna_x_test, dna_y_test),
    (x_test, y_test),
)
for eval_x, eval_y in evaluation_sets:
    f.classifier_estimation(classifier, eval_x, eval_y)

# x_train / y_train are the SMOTE-resampled data at this point, so the
# training score is measured on the oversampled set, not the original one.
train_score = classifier.score(x_train, y_train)
test_score = classifier.score(x_test, y_test)
print("Training set score: %f" % train_score)
print("Test set score: %f" % test_score)
print(classifier.n_layers_)
print(classifier.n_iter_)

Test set has  1335 positives and  11615  negatives
Best classifier score
             precision    recall  f1-score   support

non-binding       0.93      0.78      0.85     11615
    binding       0.20      0.48      0.29      1335

avg / total       0.85      0.75      0.79     12950

Test set has  363 positives and  2320  negatives
Best classifier score
             precision    recall  f1-score   support

non-binding       0.91      0.76      0.83      2320
    binding       0.26      0.53      0.35       363

avg / total       0.82      0.73      0.76      2683

Test set has  1698 positives and  13935  negatives
Best classifier score
             precision    recall  f1-score   support

non-binding       0.93      0.78      0.85     13935
    binding       0.21      0.49      0.30      1698

avg / total       0.85      0.75      0.79     15633



Training set score: 0.672116
Test set score: 0.746434
3
89
