In [76]:
import random
from imblearn.over_sampling import SMOTE
from sklearn.neural_network import MLPClassifier
from sklearn import metrics, utils
import numpy as np
from math import sqrt
import csv

In [97]:
def _read_feature_rows(path, excluded_cols):
    """Parse one tab-separated file into a list of float feature vectors.

    Columns whose zero-based position is in ``excluded_cols`` are dropped and
    every remaining field is converted with ``float``. Blank lines are skipped
    so they do not turn into empty feature vectors.
    """
    rows = []
    with open(path) as handle:
        for fields in csv.reader(handle, delimiter="\t"):
            if not fields:
                # csv.reader yields [] for an empty line (e.g. a trailing newline)
                continue
            rows.append([float(value) for col, value in enumerate(fields)
                         if col not in excluded_cols])
    return rows


def import_data(ids, cols_to_remove):
    """Load the positive and negative samples for the given ids.

    For every entry of ``ids`` the files ``data/single_files/<id>.pos`` and
    ``data/single_files/<id>.neg`` are read as tab-separated rows of numbers.

    Parameters
    ----------
    ids : iterable of str
        File name stems to load.
    cols_to_remove : iterable of int
        Zero-based column positions to drop from every row.

    Returns
    -------
    data : list of list of float
        All positive rows (over all ids) followed by all negative rows.
    labels : list of int
        ``1`` for each positive row, then ``0`` for each negative row,
        aligned with ``data``.

    Raises
    ------
    OSError
        If one of the expected files cannot be opened.
    ValueError
        If a kept field cannot be converted to ``float``.
    """
    # a set makes the per-field membership test constant time
    excluded_cols = set(cols_to_remove)

    positives = []
    negatives = []
    for sample_id in ids:
        base_path = "data/single_files/" + sample_id
        positives.extend(_read_feature_rows(base_path + ".pos", excluded_cols))
        negatives.extend(_read_feature_rows(base_path + ".neg", excluded_cols))

    data = positives + negatives
    labels = [1] * len(positives) + [0] * len(negatives)

    return data, labels


def calculate_performance(classifier, x_test, y_test):
    """Score ``classifier`` on a labelled sample.

    Parameters
    ----------
    classifier : fitted estimator exposing ``predict``
    x_test : feature rows to predict
    y_test : true labels aligned with ``x_test``

    Returns
    -------
    list
        Seven values, each rounded to three decimals:
        ``[precision_0, precision_1, recall_0, recall_1, f1_0, f1_1, accuracy]``
        where ``_0`` / ``_1`` are the first and second entries of the
        per-class score arrays.
    """
    # predict once and reuse the result for every metric
    predictions = classifier.predict(x_test)

    # Pin the per-class order to the classifier's own classes when it exposes
    # them, so the score arrays keep one entry per known class even if this
    # sample happens to contain only one of them (otherwise [1] would fail).
    # Falls back to the library default when the attribute is missing.
    labels = getattr(classifier, "classes_", None)

    prec = metrics.precision_score(y_test, predictions, labels=labels, average=None)
    sensi = metrics.recall_score(y_test, predictions, labels=labels, average=None)
    f1 = metrics.f1_score(y_test, predictions, labels=labels, average=None)
    acc = metrics.accuracy_score(y_test, predictions)

    performance = [round(prec[0], 3), round(prec[1], 3),
                   round(sensi[0], 3), round(sensi[1], 3),
                   round(f1[0], 3), round(f1[1], 3),
                   round(acc, 3)]
    return performance


def bootstrapping(classifier, x_test, y_test, n_iterations=999, random_state=None):
    """Estimate the spread of the performance metrics by resampling.

    Each round draws, with replacement, a sample half the size of ``x_test``
    and scores ``classifier`` on it with ``calculate_performance``.

    Parameters
    ----------
    classifier : fitted estimator passed through to ``calculate_performance``
    x_test, y_test : features and labels to resample from
    n_iterations : int, default 999
        Number of resampling rounds.
    random_state : int, RandomState instance or None, default None
        Seed or generator for the resampling. ``None`` keeps the previous
        behaviour of using numpy's global random state.

    Returns
    -------
    list of list
        One ``calculate_performance`` result per round.
    """
    size = int(round(len(x_test) / 2))

    # one generator shared across rounds: an int seed then yields a different
    # but reproducible sample each round instead of the same sample every time
    rng = utils.check_random_state(random_state)

    performances = []
    for _ in range(n_iterations):
        part_x, part_y = utils.resample(x_test, y_test, replace=True,
                                        n_samples=size, random_state=rng)
        performances.append(calculate_performance(classifier, part_x, part_y))

    return performances

In [98]:
# generate id lists for train, cross-train and test splits

N_TEST_DNA = 5       # number of DNA ids held out for the test set
N_TEST_ENZYME = 36   # number of enzyme ids held out for the test set
N_SPLITS = 5         # number of splits built from the remaining ids
SPLIT_SEED = 42      # fixed seed for a reproducible partition; None = fresh random split

# dedicated generator so the partition does not depend on (or disturb) the
# global `random` state
split_rng = random.Random(SPLIT_SEED)

# one id per line; blank lines are ignored
with open("data/ids_dna.txt") as f:
    ids_dna = [row.strip() for row in f if row.strip()]

with open("data/ids_enzyme.txt") as f:
    ids_enzyme = [row.strip() for row in f if row.strip()]

# draw the held-out test ids separately per group
random_indices_dna = split_rng.sample(range(len(ids_dna)), N_TEST_DNA)
random_indices_enzyme = split_rng.sample(range(len(ids_enzyme)), N_TEST_ENZYME)

test_dna = [ids_dna[x] for x in random_indices_dna]
test_enzyme = [ids_enzyme[x] for x in random_indices_enzyme]

# everything that was not drawn for the test set stays available for the splits
held_out_dna = set(random_indices_dna)
held_out_enzyme = set(random_indices_enzyme)
ids_dna = [i for j, i in enumerate(ids_dna) if j not in held_out_dna]
ids_enzyme = [i for j, i in enumerate(ids_enzyme) if j not in held_out_enzyme]

ids = ids_dna + ids_enzyme
random_indices = split_rng.sample(range(len(ids)), len(ids))  # random permutation

# Cut the permutation into N_SPLITS contiguous, gap-free pieces; the first
# len(ids) % N_SPLITS pieces get one extra element. The previous hard-coded
# slices ([0:74], [75:149], ...) skipped the index at every boundary, so a few
# ids never reached any split.
fold_size, n_larger_folds = divmod(len(ids), N_SPLITS)
split_indices = []
fold_start = 0
for fold in range(N_SPLITS):
    fold_stop = fold_start + fold_size + (1 if fold < n_larger_folds else 0)
    split_indices.append(random_indices[fold_start:fold_stop])
    fold_start = fold_stop

# every remaining id must land in exactly one split
assert sum(len(fold) for fold in split_indices) == len(ids)

(split1_indices, split2_indices, split3_indices,
 split4_indices, split5_indices) = split_indices

test_ids = test_dna + test_enzyme
split1_ids = [ids[x] for x in split1_indices]
split2_ids = [ids[x] for x in split2_indices]
split3_ids = [ids[x] for x in split3_indices]
split4_ids = [ids[x] for x in split4_indices]
split5_ids = [ids[x] for x in split5_indices]

In [130]:
# import data needed for the current model

# zero-based column positions that import_data drops from every row
cols_to_remove = [
    3, 4,
    8, 9, 10, 11, 12, 13, 14,
    18, 19,
    23, 24, 25, 26, 27, 28, 29,
    33, 34,
    38, 39, 40, 41,
]
# label written in front of the result lines for this feature selection
model = "mm2"

x_test, y_test = import_data(test_ids, cols_to_remove)

# load the five splits in order and unpack them into their (features, labels) pairs
(
    (x_split1, y_split1),
    (x_split2, y_split2),
    (x_split3, y_split3),
    (x_split4, y_split4),
    (x_split5, y_split5),
) = [
    import_data(split_ids, cols_to_remove)
    for split_ids in (split1_ids, split2_ids, split3_ids, split4_ids, split5_ids)
]

# number of feature columns left after the removal
print(len(x_test[0]))

18


In [131]:
# train neural network and analyse performance

# splits 1-4 are used for training, split 5 is kept back for cross-validation
x_train = x_split1 + x_split2 + x_split3 + x_split4
y_train = y_split1 + y_split2 + y_split3 + y_split4
x_cross = x_split5
y_cross = y_split5

# oversample the training data with SMOTE (the held-back split is left untouched)
sm = SMOTE(random_state=42)
# `fit_resample` is the current imbalanced-learn name for this operation;
# fall back to the legacy `fit_sample` only on releases that predate it.
smote_resample = getattr(sm, "fit_resample", None) or sm.fit_sample
x_train, y_train = smote_resample(x_train, y_train)

classifier = MLPClassifier(hidden_layer_sizes=(200,), alpha=0.0001, random_state=1)
classifier.fit(x_train, y_train)

# number of training iterations the solver actually ran
print(classifier.n_iter_)

101


In [132]:
# calculate performance

# size of the first weight matrix along its first axis (sanity check)
print(len(classifier.coefs_[0]))

# one row per bootstrap round, one column per metric
performances = bootstrapping(classifier, x_cross, y_cross)
performances = np.array(performances)

mean_performances = np.mean(performances, axis=0)

# Scale by the number of rounds actually returned instead of a literal 999, so
# this stays correct if the round count in bootstrapping() is ever changed.
# NOTE(review): the spread of bootstrap replicates is commonly reported as the
# standard error as-is; dividing by sqrt(rounds) again may understate the
# uncertainty - confirm this is the intended statistic.
sd_performances = np.std(performances, axis=0)
sd_performances = sd_performances / sqrt(len(performances))

18


In [133]:
# Append one tab-separated line to each summary file:
# the model label followed by one value per metric.
for out_path, values in (
    ("data/performance/performance_mean.txt", mean_performances),
    ("data/performance/performance_sd.txt", sd_performances),
):
    line = "\t".join([model] + [str(x) for x in values]) + "\n"
    with open(out_path, "a") as f:
        f.write(line)