In [11]:
import sent2vec

expression_unified_ds = [line.strip().split("\t") for line in open("../../unified_dataset/expression.tsv")][1:]
kinaseact_unified_ds = [line.strip().split("\t") for line in open("../../unified_dataset/kinaseact.tsv")][1:]

sentences_only_expression_data = [row[0] for row in expression_unified_ds]
sentences_only_kinaseact_data = [row[0] for row in kinaseact_unified_ds]
bio_sent_vec_model_location = "../../biosentvec/model.bin"

In [12]:
model_path = bio_sent_vec_model_location
biosentvec_model = sent2vec.Sent2vecModel()
try:
    biosentvec_model.load_model(model_path)
except Exception as e:
    print(e)
print('model successfully loaded')

model successfully loaded


In [13]:
corpus_expr = biosentvec_model.embed_sentences(sentences_only_expression_data)
corpus_kinaseact = biosentvec_model.embed_sentences(sentences_only_kinaseact_data)

In [14]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import joblib

base_clf = MLPClassifier(hidden_layer_sizes=(10, 10), alpha=0.1)
training_set_expression = [corpus_expr[idx] for idx, row in enumerate(expression_unified_ds) if row[5] == "TRAINING"]
testing_set_expression = [corpus_expr[idx] for idx, row in enumerate(expression_unified_ds) if row[5] == "TESTING"]

training_set_kinaseact = [corpus_kinaseact[idx] for idx, row in enumerate(kinaseact_unified_ds) if row[5] == "TRAINING"]
testing_set_kinaseact = [corpus_kinaseact[idx] for idx, row in enumerate(kinaseact_unified_ds) if row[5] == "TESTING"]

## Task 1

### Expression

In [15]:
training_set_classes_expr = [int(row[1]) for row in expression_unified_ds if row[5] == "TRAINING"]
test_set_classes_expr = [int(row[1]) for row in expression_unified_ds if row[5] == "TESTING"]
clf = base_clf.fit(training_set_expression, training_set_classes_expr)
test_predictions = list(clf.predict(testing_set_expression))
precision, recall, fscore, support = score(test_set_classes_expr, test_predictions)
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

precision: [0.9525802  0.68907563]
recall: [0.94861111 0.70689655]
fscore: [0.95059151 0.69787234]
support: [720 116]




### Kinaseact

In [16]:
training_set_classes_kinaseact = [int(row[1]) for row in kinaseact_unified_ds if row[5] == "TRAINING"]
test_set_classes_kinaseact = [int(row[1]) for row in kinaseact_unified_ds if row[5] == "TESTING"]
clf = base_clf.fit(training_set_kinaseact, training_set_classes_kinaseact)
test_predictions = list(clf.predict(testing_set_kinaseact))
precision, recall, fscore, support = score(test_set_classes_kinaseact, test_predictions)
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

precision: [0.95689655 0.76470588]
recall: [0.96521739 0.72222222]
fscore: [0.96103896 0.74285714]
support: [345  54]




## Task 2

### Expression

In [17]:
training_set_classes_expr = [int(row[2]) for row in expression_unified_ds if row[5] == "TRAINING"]
test_set_classes_expr = [int(row[2]) for row in expression_unified_ds if row[5] == "TESTING"]
clf = base_clf.fit(training_set_expression, training_set_classes_expr)
test_predictions = list(clf.predict(testing_set_expression))
precision, recall, fscore, support = score(test_set_classes_expr, test_predictions)
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

precision: [0.9653092  0.84971098]
recall: [0.96096096 0.86470588]
fscore: [0.96313017 0.85714286]
support: [666 170]


### Kinaseact

In [18]:
training_set_classes_kinaseact = [int(row[2]) for row in kinaseact_unified_ds if row[5] == "TRAINING"]
test_set_classes_kinaseact = [int(row[2]) for row in kinaseact_unified_ds if row[5] == "TESTING"]
clf = base_clf.fit(training_set_kinaseact, training_set_classes_kinaseact)
test_predictions = list(clf.predict(testing_set_kinaseact))
precision, recall, fscore, support = score(test_set_classes_kinaseact, test_predictions)
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

precision: [0.94642857 0.77777778]
recall: [0.95783133 0.73134328]
fscore: [0.95209581 0.75384615]
support: [332  67]




## Task 3

### Expression

In [19]:
training_set_classes_expr = [int(row[3]) for row in expression_unified_ds if row[5] == "TRAINING"]
test_set_classes_expr = [int(row[3]) for row in expression_unified_ds if row[5] == "TESTING"]
clf = base_clf.fit(training_set_expression, training_set_classes_expr)
test_predictions = list(clf.predict(testing_set_expression))
precision, recall, fscore, support = score(test_set_classes_expr, test_predictions)
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

precision: [0.943074   0.89644013]
recall: [0.93950851 0.90228013]
fscore: [0.94128788 0.89935065]
support: [529 307]


### Kinaseact

In [20]:
training_set_classes_kinaseact = [int(row[3]) for row in kinaseact_unified_ds if row[5] == "TRAINING"]
test_set_classes_kinaseact = [int(row[3]) for row in kinaseact_unified_ds if row[5] == "TESTING"]
clf = base_clf.fit(training_set_kinaseact, training_set_classes_kinaseact)
test_predictions = list(clf.predict(testing_set_kinaseact))
precision, recall, fscore, support = score(test_set_classes_kinaseact, test_predictions)
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

precision: [0.95373665 0.95762712]
recall: [0.98168498 0.8968254 ]
fscore: [0.96750903 0.92622951]
support: [273 126]
