In [1]:
# imports

import matplotlib.pyplot as plt
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn import model_selection, metrics, utils
from imblearn.over_sampling import SMOTE
import csv

In [13]:
# general specifications
# Input files are located by plain string concatenation:
#   path + model_path + model + "<suffix>.txt"

model = "mm3_5"                 # file-name prefix shared by all input files of this run
model_path = "combined_model/"  # sub-directory (below `path`) that holds those files
path = "data/"                  # root data directory; keep the trailing slash

In [14]:
# import data

def read_tsv_rows(file_path):
    """Read a tab-separated text file into a list of rows.

    Parameters
    ----------
    file_path : str
        Location of the tab-delimited file to read.

    Returns
    -------
    list of list of str
        One inner list per record, in file order. Fields are returned as
        the raw strings produced by ``csv.reader``; no numeric parsing is
        done here.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. it does not exist).
    """
    # newline="" is what the csv module documentation asks for when a file
    # object is handed to csv.reader, so the reader deals with line endings.
    with open(file_path, newline="") as handle:
        return list(csv.reader(handle, delimiter="\t"))


# All four inputs share one prefix and differ only in their suffix.
data_prefix = path + model_path + model

positives_dna = read_tsv_rows(data_prefix + '_pdidb_positives.txt')
negatives_dna = read_tsv_rows(data_prefix + '_pdidb_negatives.txt')
positives_enzyme = read_tsv_rows(data_prefix + '_enzyme_positives.txt')
negatives_enzyme = read_tsv_rows(data_prefix + '_enzyme_negatives.txt')

In [15]:
# pre-process data

# Positives are stacked ahead of negatives so rows line up with the labels.
tmp_dna = positives_dna + negatives_dna
tmp_enzyme = positives_enzyme + negatives_enzyme

# Label 1 for every positive row followed by label 0 for every negative row.
labels_dna = [1 for _ in positives_dna] + [0 for _ in negatives_dna]
labels_enzyme = [1 for _ in positives_enzyme] + [0 for _ in negatives_enzyme]

# Convert every field of every row from its string form to a float.
dna = [list(map(float, fields)) for fields in tmp_dna]
enzyme = [list(map(float, fields)) for fields in tmp_enzyme]

In [16]:
# split each data set into 80 % train / 20 % test, then pool and balance the training part

dna_x_train, dna_x_test, dna_y_train, dna_y_test = model_selection.train_test_split(
    dna, labels_dna, test_size=0.2, random_state=6)
enzyme_x_train, enzyme_x_test, enzyme_y_train, enzyme_y_test = model_selection.train_test_split(
    enzyme, labels_enzyme, test_size=0.2, random_state=6)

# Only the training portions are pooled; the two test splits stay separate.
x_train = dna_x_train + enzyme_x_train
y_train = dna_y_train + enzyme_y_train

# Oversample with SMOTE so that the pooled training set is class-balanced.
# NOTE(review): resampling happens before the cross-validation in the next
# cell, so synthetic samples may be interpolated from points that end up in
# the validation fold -- CV scores may be optimistic; confirm this is intended.
sm = SMOTE(random_state=42)

# Current imbalanced-learn releases expose the resampling call as
# `fit_resample` and no longer provide `fit_sample`; fall back to the old
# name only when running against a release that predates the rename.
resample = getattr(sm, "fit_resample", None) or sm.fit_sample
x_train, y_train = resample(x_train, y_train)

In [17]:
# calculate training and validation scores

# Candidate architectures: a single hidden layer with 10 ... 300 units.
param_range = ((10,),(50,),(100,),(200,),(300,))

# The classifier is seeded so that weight initialisation and batch shuffling
# are repeatable; without it the curve changes on every kernel restart even
# though the split and SMOTE above are already seeded.
train_scores, test_scores = model_selection.validation_curve(
    MLPClassifier(random_state=42), x_train, y_train,
    param_name="hidden_layer_sizes", param_range=param_range,
    cv=2, scoring="accuracy", n_jobs=1)

# validation_curve returns one row per parameter value and one column per CV
# fold, so axis=1 aggregates across folds.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)



In [19]:
# plot figure

# Explicit figure/axes pair instead of the implicit pyplot state machine.
fig, ax = plt.subplots()

# Mean accuracy per hidden-layer size: training folds vs. held-out folds.
lw = 2
ax.plot(param_range, train_scores_mean, color="red", lw=lw,
        label="Training score")
ax.plot(param_range, test_scores_mean, color="navy", lw=lw,
        label="Cross-validation score")

ax.set_title("Validation Curve for MM3 (5)")
ax.set_xlabel("Hidden units")
ax.set_ylabel("Accuracy")
ax.set_ylim(0.6, 1.0)  # fixed y-range; scores below 0.6 fall outside the view
ax.legend(loc="best")

# Write the figure to disk (the target directory must already exist).
fig.savefig("data/plots/mm3_5_hidden_units.png")