In [1]:
# imports

import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn import model_selection, metrics
from imblearn.over_sampling import SMOTE
import csv
import random

In [2]:
# general specifications: locations of the input tables and of the saved figures

# root data folder and the sub-folder appended to it when building file names
path, model_path = "data/", "combined_model/"
# model identifier; used as prefix of the data file names and of the plot file names
model = "mm3_5"
# folder the figures are written to
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere
plot_path = "D:/Dropbox/masterthesis/thesis/plots/machine_learning/nn_analysis/"

In [3]:
# import data

def _read_rows(file_name):
    """Return all rows of a tab-separated file as lists of strings.

    file_name is appended to path + model_path to build the location that
    is opened. The fields are not converted here; they stay strings.
    """
    with open(path + model_path + file_name) as handle:
        return list(csv.reader(handle, delimiter="\t"))


# one table per data set (pdidb / enzyme) and per class (positives / negatives)
positives_dna = _read_rows(model + '_pdidb_positives.txt')
negatives_dna = _read_rows(model + '_pdidb_negatives.txt')
positives_enzyme = _read_rows(model + '_enzyme_positives.txt')
negatives_enzyme = _read_rows(model + '_enzyme_negatives.txt')

In [4]:
# pre-process data

# positives are placed in front of the negatives; the label lists below
# are built in the same order so that row k and label k belong together
tmp_dna = positives_dna + negatives_dna
tmp_enzyme = positives_enzyme + negatives_enzyme

# label 1 for every positive row, label 0 for every negative row
labels_dna = [1 for _ in positives_dna] + [0 for _ in negatives_dna]
labels_enzyme = [1 for _ in positives_enzyme] + [0 for _ in negatives_enzyme]

# the rows were read as strings; turn every field into a float
dna = [[float(field) for field in record] for record in tmp_dna]
enzyme = [[float(field) for field in record] for record in tmp_enzyme]

In [5]:
# split data and balance the training set

# hold out 20% of each data set separately as test data
dna_x_train, dna_x_test, dna_y_train, dna_y_test = model_selection.train_test_split(
    dna, labels_dna, test_size=0.2, random_state=6)
enzyme_x_train, enzyme_x_test, enzyme_y_train, enzyme_y_test = model_selection.train_test_split(
    enzyme, labels_enzyme, test_size=0.2, random_state=6)

# pool the training portions of both data sets
tmp_x_train = dna_x_train + enzyme_x_train
tmp_y_train = dna_y_train + enzyme_y_train

# split the pooled training data once more: 20% becomes the validation set (x_cross / y_cross)
x_train, x_cross, y_train, y_cross = model_selection.train_test_split(
    tmp_x_train, tmp_y_train, test_size=0.2, random_state=6)

# oversample the training part only; the validation split above is left untouched
sm = SMOTE(random_state=42)
# imbalanced-learn renamed fit_sample to fit_resample and newer releases no
# longer provide the old name; use whichever one the installed version offers
_smote_resample = getattr(sm, "fit_resample", None) or sm.fit_sample
x_train, y_train = _smote_resample(x_train, y_train)

In [38]:
# generate random labels

# WARNING: this shuffles the validation labels IN PLACE. Every cell that is run
# afterwards and scores against y_cross compares with randomised labels, and
# running this cell again permutes the already shuffled list once more.
# A private, seeded generator is used so that the permutation is reproducible
# instead of depending on the state of the global random module.

# random.shuffle(y_train)
random.Random(6).shuffle(y_cross)

print(y_train)

[0 0 0 ..., 1 1 1]


In [6]:
# calculate training and validation scores (accuracy) as a function of the
# number of training iterations, one figure per hidden layer configuration

# hidden_layers = ((10,), (50,), (100,), (200,), (300,), (500,))
# NOTE(review): the recorded output of this cell ends in a MemoryError once
# the (700,) configuration is reached
hidden_layers = ((1,), (700,))
iterations = (20, 40, 60, 80, 100, 120, 140, 160, 300, 500)

for layers in hidden_layers:
    print(layers)

    train_scores = []
    test_scores = []

    # n_iterations instead of `iter`, which would shadow the builtin
    for n_iterations in iterations:
        print(n_iterations)
        # tol is negative, presumably so that the tolerance based stopping rule
        # never fires and training runs for the full max_iter iterations (the
        # recorded n_iter_ values equal max_iter) -- TODO confirm.
        # random_state fixes weight initialisation and shuffling so that the
        # curve is reproducible between runs.
        classifier = MLPClassifier(hidden_layer_sizes=layers, max_iter=n_iterations,
                                   tol=-100, random_state=6)
        classifier.fit(x_train, y_train)

        # mean accuracy on the (oversampled) training set and on the validation set
        train_score = classifier.score(x_train, y_train)
        test_score = classifier.score(x_cross, y_cross)

        train_scores.append(train_score)
        test_scores.append(test_score)

        print(train_score)
        print(test_score)

        print(classifier.n_iter_)

    # plot figure

    fig, ax = plt.subplots()

    ax.set_title("Validation Curve for MM3 (5) " + str(layers))
    ax.set_xlabel("Iterations")
    ax.set_ylabel("Accuracy")
    ax.set_ylim(0.6, 1.0)
    ax.plot(iterations, train_scores, label="Training score",
            color="red", lw=2)
    ax.plot(iterations, test_scores, label="Cross-validation score",
            color="navy", lw=2)
    ax.legend(loc="best")

    fig.savefig(plot_path+model+"_" + str(layers) + ".png")
    # close only the figure created above, so that figures accumulate neither
    # here nor get closed by accident elsewhere
    plt.close(fig)

(1,)
20




0.657774951469
0.72423442872
20
40


0.500446259232
0.889821699848
40
60


0.66070910592
0.723594786919
60
80


0.662416047482
0.70784360758
80
100


0.635551241716
0.488206604302
100
120


0.662460673405
0.699928040297
120
140


0.664937412143
0.673622771248
140
160


0.661992101212
0.717198368913
160
300


0.664033737198
0.687854801311
300
500


0.663699042774
0.704165667226
500
(700,)
20


MemoryError: 

In [18]:
# calculate training and validation scores (accuracy) for several hidden
# layer sizes; max_iter and tol are left at the MLPClassifier defaults

hidden_layers = ((10,), (50,), (100,), (200,), (300,), (500,))

# one entry per element of hidden_layers, in the same order
train_scores = []
test_scores = []

for layers in hidden_layers:
    print(layers)

    # random_state fixes weight initialisation and shuffling, so the scores
    # are reproducible between runs
    classifier = MLPClassifier(hidden_layer_sizes=layers, random_state=6)
    classifier.fit(x_train, y_train)

    train_score = classifier.score(x_train, y_train)
    test_score = classifier.score(x_cross, y_cross)

    train_scores.append(train_score)
    test_scores.append(test_score)

    # number of iterations the solver actually ran
    print(classifier.n_iter_)

(10,)


49
(50,)


116
(100,)


91
(200,)


110
(300,)


97
(500,)


85


In [19]:
# plot figure: accuracy against the number of hidden units

# x positions of the curve; must have as many entries as train_scores / test_scores
layers_plot = (10,50,100,200,300,500)

fig = plt.figure()

plt.title("Validation Curve for MM3 (5)")
plt.xlabel("Hidden units")
plt.ylabel("Accuracy")
plt.ylim(0.6, 1.0)
lw = 2
# training curve first, validation curve second
for curve_values, curve_label, curve_colour in (
        (train_scores, "Training score", "red"),
        (test_scores, "Cross-validation score", "navy")):
    plt.plot(layers_plot, curve_values, label=curve_label,
             color=curve_colour, lw=lw)
plt.legend(loc="best")

fig.savefig(plot_path+model+"_" + "hidden_units" + ".png")

In [7]:
# calculate training and validation F1 scores as a function of the number
# of training iterations, one figure per hidden layer configuration

hidden_layers = ((10,), (50,), (100,), (200,), (300,), (500,), (700,))
# hidden_layers = ((10,),(50,),(100,))
iterations = (20, 40, 60, 80, 100, 120, 140, 160, 300, 500)

for layers in hidden_layers:
    print(layers)

    train_scores = []
    test_scores = []

    # n_iterations instead of `iter`, which would shadow the builtin
    for n_iterations in iterations:
        print(n_iterations)
        # tol is negative, presumably so that the tolerance based stopping rule
        # never fires and training runs for the full max_iter iterations -- TODO confirm.
        # random_state fixes weight initialisation and shuffling so that the
        # curve is reproducible between runs.
        classifier = MLPClassifier(hidden_layer_sizes=layers, max_iter=n_iterations,
                                   tol=-100, random_state=6)
        classifier.fit(x_train, y_train)

        # average=None yields one F1 value per class; index 1 selects the
        # second class, i.e. label 1 when the labels are 0 and 1
        f1_train = metrics.f1_score(y_train, classifier.predict(x_train), average=None)[1]
        f1_test = metrics.f1_score(y_cross, classifier.predict(x_cross), average=None)[1]

        train_scores.append(f1_train)
        test_scores.append(f1_test)

        print(classifier.n_iter_)

    # plot figure

    fig, ax = plt.subplots()

    # same title format as the accuracy figures (the original lacked the
    # separator between "MM3" and the layer tuple)
    ax.set_title("Validation Curve for MM3 (5) " + str(layers))
    ax.set_xlabel("Iterations")
    ax.set_ylabel("F1 score")
    ax.set_ylim(0.0, 1.0)
    ax.plot(iterations, train_scores, label="Training score",
            color="red", lw=2)
    ax.plot(iterations, test_scores, label="Cross-validation score",
            color="navy", lw=2)
    ax.legend(loc="best")

    fig.savefig(plot_path+model+"_" + str(layers) + "_f1score.png")
    # close only the figure created above
    plt.close(fig)

(10,)
20




20
40


40
60


60
80


80
100


100
120


120
140


140
160


160
300


300


(50,)
20


20
40


40
60


60
80


80
100


100
120


120
140


140
160


160
300


300
(100,)
20


20
40


40
60


60
80


80
100


100
120


120
140


140
160


160
300


300


(200,)
20


20
40


40
60


60
80


80
100


100
120


120
140


140
160


160
300


300
(300,)
20


20
40


40
60


60
80


80
100
