In [1]:
import random
from imblearn.over_sampling import SMOTE
from sklearn.neural_network import MLPClassifier
from sklearn import metrics, utils
import numpy as np
from math import sqrt
import csv
import matplotlib.pyplot as plt

In [2]:
def import_data(ids, cols_to_remove):
    """Load the positive and negative feature rows for a list of sample ids.

    For every id the tab-separated files ``data/single_files/<id>.pos`` and
    ``data/single_files/<id>.neg`` are read. Columns whose 0-based index is
    listed in ``cols_to_remove`` are dropped and every remaining cell is
    converted to ``float``.

    Parameters
    ----------
    ids : iterable of str
        Sample identifiers; each one must have a ``.pos`` and a ``.neg`` file.
    cols_to_remove : iterable of int
        0-based column indices to drop from every row.

    Returns
    -------
    data : list of list of float
        All positive rows (in id order) followed by all negative rows.
    labels : list of int
        ``1`` for every positive row and ``0`` for every negative row,
        aligned with ``data``.

    Raises
    ------
    OSError
        If one of the expected files cannot be opened.
    ValueError
        If a kept cell cannot be converted to ``float``.
    """
    # A set makes the per-cell membership test independent of how many
    # columns are removed.
    skip = set(cols_to_remove)

    def read_rows(path):
        """Return the filtered float rows of one tab-separated file."""
        rows = []
        with open(path) as handle:
            for raw in csv.reader(handle, delimiter="\t"):
                if not raw:
                    # csv.reader yields [] for a blank line; keeping it would
                    # add an empty feature vector and make the matrix ragged.
                    continue
                rows.append([float(cell) for col, cell in enumerate(raw) if col not in skip])
        return rows

    positives = []
    negatives = []

    for sample_id in ids:
        positives.extend(read_rows("data/single_files/" + sample_id + ".pos"))
        negatives.extend(read_rows("data/single_files/" + sample_id + ".neg"))

    data = positives + negatives
    labels = [1] * len(positives) + [0] * len(negatives)

    return data, labels


def calculate_performance(classifier, x_test, y_test):
    """Compute per-class precision, recall and F1 plus overall accuracy.

    Parameters
    ----------
    classifier : fitted estimator
        Must provide ``predict(x)``. If it exposes ``classes_``, those labels
        are passed to the metric functions so the per-class arrays always
        have one entry per known class.
    x_test : array-like
        Feature rows to classify.
    y_test : array-like
        True labels aligned with ``x_test``.

    Returns
    -------
    list
        ``[precision[0], precision[1], recall[0], recall[1], f1[0], f1[1],
        accuracy]``, each rounded to three decimals.
    """
    # Predict once and reuse the result; the prediction is the same input
    # for all four metrics.
    y_pred = classifier.predict(x_test)

    # Without an explicit label list the metrics only report classes that
    # occur in y_test/y_pred, so a sample missing one class would yield a
    # length-1 array and make the [1] lookups below fail.
    labels = getattr(classifier, "classes_", None)

    prec = metrics.precision_score(y_test, y_pred, labels=labels, average=None)
    sensi = metrics.recall_score(y_test, y_pred, labels=labels, average=None)
    f1 = metrics.f1_score(y_test, y_pred, labels=labels, average=None)
    acc = metrics.accuracy_score(y_test, y_pred)

    performance = [round(prec[0], 3), round(prec[1], 3), round(sensi[0], 3), round(sensi[1], 3), round(f1[0], 3),
                   round(f1[1], 3), round(acc, 3)]
    return performance


def bootstrapping(classifier, x_test, y_test, n_iterations=999, random_state=None):
    """Evaluate a classifier on repeated bootstrap resamples of a data set.

    Each replicate draws, with replacement, a sample half the size of
    ``x_test`` (rounded) and scores the classifier on it with
    ``calculate_performance``.

    Parameters
    ----------
    classifier : fitted estimator
        Passed unchanged to ``calculate_performance``.
    x_test : array-like
        Feature rows to resample from.
    y_test : array-like
        Labels aligned with ``x_test``.
    n_iterations : int, optional
        Number of bootstrap replicates (default 999).
    random_state : int, RandomState instance or None, optional
        Seed or generator for the resampling. ``None`` (default) draws from
        numpy's global random state.

    Returns
    -------
    list of list
        One performance row per replicate, in the order produced.
    """
    size = int(round(len(x_test) / 2))

    # Resolve the seed once so successive replicates advance a single random
    # stream; passing the same integer to every resample call would produce
    # identical samples.
    rng = utils.check_random_state(random_state)

    performances = []
    for _ in range(n_iterations):
        part_x, part_y = utils.resample(x_test, y_test, replace=True, n_samples=size, random_state=rng)
        performances.append(calculate_performance(classifier, part_x, part_y))

    return performances


def get_average_model(data,ranges):
    """Collapse groups of adjacent columns of every row into their mean.

    Parameters
    ----------
    data : sequence of sequences of numbers
        Input rows.
    ranges : sequence of two-item sequences
        Each entry holds the first and the last (inclusive) column index of
        one group.

    Returns
    -------
    list of list
        For every input row, one averaged value per entry of ``ranges``.
    """
    def window_mean(features, bounds):
        # The upper bound is inclusive, hence the +1 on the slice end.
        window = features[bounds[0]:bounds[1] + 1]
        return sum(window) / len(window)

    return [
        [window_mean(features, bounds) for bounds in ranges]
        for features in data
    ]

In [3]:
# generate id lists for train, cross-train and test splits

# Fixed seed so the test / training / cross-training assignment is the same
# on every run of this cell.
SPLIT_SEED = 42
N_TEST_DNA = 5       # DNA ids held out for the test set
N_TEST_ENZYME = 36   # enzyme ids held out for the test set
N_SPLITS = 5         # number of folds built from the remaining ids


def _read_ids(path):
    """Return the stripped, non-empty lines of the id file at ``path``."""
    with open(path) as f:
        stripped = (line.strip() for line in f)
        # Blank lines would otherwise become an empty id and later point at
        # a non-existent data file.
        return [entry for entry in stripped if entry]


ids_dna = _read_ids("data/ids_dna.txt")
ids_enzyme = _read_ids("data/ids_enzyme.txt")
ids_more_data = _read_ids("data/ids_more_data.txt")

# Private generator: reproducible without touching the global random state.
split_rng = random.Random(SPLIT_SEED)

random_indices_dna = split_rng.sample(range(0, len(ids_dna)), N_TEST_DNA)
random_indices_enzyme = split_rng.sample(range(0, len(ids_enzyme)), N_TEST_ENZYME)

test_dna = [ids_dna[x] for x in random_indices_dna]
test_enzyme = [ids_enzyme[x] for x in random_indices_enzyme]

# Remove the held-out ids from the pools used for training / cross-training.
held_out_dna = set(random_indices_dna)
held_out_enzyme = set(random_indices_enzyme)
ids_dna = [i for j, i in enumerate(ids_dna) if j not in held_out_dna]
ids_enzyme = [i for j, i in enumerate(ids_enzyme) if j not in held_out_enzyme]

ids = ids_dna + ids_enzyme
random_indices = split_rng.sample(range(0, len(ids)), len(ids))

# Cut the shuffled indices into N_SPLITS contiguous folds that cover every
# index exactly once. Slice ends are exclusive, so each fold starts where the
# previous one stopped; the first `extra` folds get one additional element.
base, extra = divmod(len(random_indices), N_SPLITS)
split_indices = []
start = 0
for k in range(N_SPLITS):
    stop = start + base + (1 if k < extra else 0)
    split_indices.append(random_indices[start:stop])
    start = stop

split1_indices, split2_indices, split3_indices, split4_indices, split5_indices = split_indices

test_ids = test_dna + test_enzyme
split1_ids = [ids[x] for x in split1_indices]
split2_ids = [ids[x] for x in split2_indices]
split3_ids = [ids[x] for x in split3_indices]
split4_ids = [ids[x] for x in split4_indices]
split5_ids = [ids[x] for x in split5_indices]

In [4]:
# Persist the id lists of every partition, one id per line.
with open("data/test_set_ids.txt","w") as test_file:
    test_file.write("\n".join(test_ids))

# Splits 1-4 form the training set; each split's ids are joined by newlines
# and the four resulting chunks are again separated by a single newline.
training_splits = (split1_ids, split2_ids, split3_ids, split4_ids)
with open("data/training_set_ids.txt","w") as training_file:
    training_file.write("\n".join("\n".join(split) for split in training_splits))

# Split 5 is the cross-training set.
with open("data/cross_train_set_ids.txt","w") as cross_file:
    cross_file.write("\n".join(split5_ids))

In [59]:
# import data needed for the current model

model = "mm3_5"
cols_to_remove = [40, 41]

# Column groups for the optional averaged-feature variant (currently disabled):
# ranges = [[0,0], [1,2], [3,4], [5,5], [6,7], [8,9], [10,10], [11,12], [13,14], [15,15], [16,17], [18,19], 
#          [20,20], [21,22], [23,24], [25,25], [26,27], [28,29], [30,30], [31,32], [33,34], [35,35], [36,37], [38,39]]

# Load every partition with the same column filter; the order of the targets
# matches the order of the id lists below.
(
    (x_test, y_test),
    (x_split1, y_split1),
    (x_split2, y_split2),
    (x_split3, y_split3),
    (x_split4, y_split4),
    (x_split5, y_split5),
    (x_more, y_more),
) = [
    import_data(id_list, cols_to_remove)
    for id_list in (test_ids, split1_ids, split2_ids, split3_ids, split4_ids, split5_ids, ids_more_data)
]

# To switch to the averaged-feature variant, enable `ranges` above and:
# x_test = get_average_model(x_test,ranges)
# x_split1 = get_average_model(x_split1,ranges)
# x_split2 = get_average_model(x_split2,ranges)
# x_split3 = get_average_model(x_split3,ranges)
# x_split4 = get_average_model(x_split4,ranges)
# x_split5 = get_average_model(x_split5,ranges)


# Number of feature columns left after filtering.
print(len(x_test[0]))

40


In [60]:
# train neural network and analyse performance

# Splits 1-4 are used for training, split 5 is kept aside for evaluation.
x_train = x_split1 + x_split2 + x_split3 + x_split4 #+ x_more
y_train = y_split1 + y_split2 + y_split3 + y_split4 #+ y_more
x_cross = x_split5
y_cross = y_split5

# Oversample the training data only; the evaluation split stays untouched.
sm = SMOTE(random_state=42)
# Newer imbalanced-learn releases expose the resampling call as
# `fit_resample` and no longer provide `fit_sample`; prefer the new name and
# fall back to the old one so the cell runs on either version.
smote_resample = getattr(sm, "fit_resample", None) or sm.fit_sample
x_train, y_train = smote_resample(x_train, y_train)

classifier = MLPClassifier(hidden_layer_sizes=(200,), alpha=0.0001, random_state=1,tol=0.0000001)
classifier.fit(x_train,y_train)

# Number of training iterations the solver actually ran.
print(classifier.n_iter_)

122


In [62]:
# Sweep the decision threshold on the positive-class probability and record
# precision and coverage (recall) of the positive class at every threshold.
proba = classifier.predict_proba(x_cross)

cutoffs = range(10000)
precision = []
coverage = []

# Second column of predict_proba = probability of the class at index 1.
positive_scores = [el[1] for el in proba]

for cut in cutoffs:
    float_cut = cut / 10000

    # Label a sample positive when its score reaches the current threshold.
    prediction = [1 if score >= float_cut else 0 for score in positive_scores]

    prec = metrics.precision_score(y_cross, prediction, average=None)[1]
    cov = metrics.recall_score(y_cross, prediction, average=None)[1]

    precision.append(prec)
    coverage.append(cov)

  'precision', 'predicted', average, warn_for)


In [63]:
# plot figure

# Sanity output: class order of the classifier and the values left over from
# the last threshold of the sweep.
print(classifier.classes_)
print(prec)
print(cov)
print(float_cut)

# Sort the (coverage, precision) pairs by coverage so the curve is drawn
# left to right.
new_coverage, new_precision = zip(*sorted(zip(coverage, precision)))

fig, ax = plt.subplots()

ax.set_title("Precision-Coverage-Curve for " + model)
ax.set_xlabel("Coverage")
ax.set_ylabel("Precision")
ax.set_ylim(0.0, 1.0)
ax.plot(new_coverage, new_precision, color="navy")
# Single reference marker. The colour is given once via `color`; the former
# 'or' format string specified red a second time.
# NOTE(review): the meaning of the point (0.33, 0.2) is not documented here -
# presumably a comparison method's coverage/precision; confirm.
ax.plot(0.33, 0.2, "o", color="red")
# ax.plot(0.337, 0.239, "o", color="green")

fig.savefig("D:/Dropbox/masterthesis/thesis/plots/machine_learning/prec-cov-curves/"+model+"_prec_cov_curve.png")
plt.close("all")

[0 1]
0.0
0.0
0.9999


In [61]:
# calculate performance

# Length of the first weight matrix of the network.
print(len(classifier.coefs_[0]))

# One row per bootstrap replicate, one column per performance measure.
performances = bootstrapping(classifier, x_cross, y_cross)
performances = np.array(performances)

#print(performances)

mean_performances = np.mean(performances,axis=0)
sd_performances = np.std(performances,axis=0)
# Divide by the square root of the actual number of replicates instead of a
# hard-coded 999, so this stays correct if the replicate count changes.
# NOTE(review): the spread of bootstrap replicates is commonly used directly
# as the standard-error estimate; confirm the extra 1/sqrt(n) is intended.
sd_performances = sd_performances/sqrt(len(performances))

40


In [62]:
# Append one tab-separated line per result file: the model name followed by
# the values; the mean file additionally gets the iteration count and the
# length of the first weight matrix.
with open("data/performance/performance_mean.txt","a") as mean_file:
    mean_fields = [model]
    mean_fields.extend(str(value) for value in mean_performances)
    mean_fields.append(str(classifier.n_iter_))
    mean_fields.append(str(len(classifier.coefs_[0])))
    mean_file.write("\t".join(mean_fields) + "\n")

with open("data/performance/performance_sd.txt","a") as sd_file:
    sd_fields = [model]
    sd_fields.extend(str(value) for value in sd_performances)
    sd_file.write("\t".join(sd_fields) + "\n")

In [None]:
# test for overtraining and different hidden units

# For every hidden-layer size, train fresh networks with increasing iteration
# limits and compare accuracy on the training data and on the held-out split.

# hidden_layers = ((10,), (50,), (100,), (200,), (300,), (500,))
hidden_layers = ((1,), (700,))
iterations = (20, 40, 60, 80, 100, 120, 140, 160, 300, 500)

for layers in hidden_layers:
    print(layers)

    train_scores = []
    test_scores = []

    # `n_iter` rather than `iter`: a top-level loop variable named `iter`
    # would shadow the builtin for every cell run afterwards.
    for n_iter in iterations:
        print(n_iter)
        # Separate name so the main `classifier` trained earlier is not
        # overwritten by this sweep.
        # NOTE(review): the negative tol presumably disables the tolerance
        # based stopping so training runs for max_iter iterations; recent
        # scikit-learn releases may reject a negative tol - confirm.
        sweep_classifier = MLPClassifier(hidden_layer_sizes=layers,max_iter=n_iter,tol=-100)
        sweep_classifier.fit(x_train,y_train)

        train_score = sweep_classifier.score(x_train,y_train)
        test_score = sweep_classifier.score(x_cross,y_cross)

        train_scores.append(train_score)
        test_scores.append(test_score)

        print(train_score)
        print(test_score)

        print(sweep_classifier.n_iter_)

    # plot figure

    fig, ax = plt.subplots()

    ax.set_title("Validation Curve for MM3 (5) " + str(layers))
    ax.set_xlabel("Iterations")
    ax.set_ylabel("Accuracy")
    ax.set_ylim(0.6, 1.0)
    lw = 2
    ax.plot(iterations, train_scores, label="Training score",
            color="red", lw=lw)
    ax.plot(iterations, test_scores, label="Cross-validation score",
            color="navy", lw=lw)
    ax.legend(loc="best")

    fig.savefig("D:/Dropbox/masterthesis/thesis/plots/machine_learning/hidden_units/"+model+"_" + str(layers) + ".png")
    plt.close("all")