In [1]:
import sent2vec

expression_unified_ds = [line.strip().split("\t") for line in open("../unified_dataset/expression.tsv")][1:]
kinaseact_unified_ds = [line.strip().split("\t") for line in open("../unified_dataset/kinaseact.tsv")][1:]

sentences_only_expression_data = [row[0] for row in expression_unified_ds]
sentences_only_kinaseact_data = [row[0] for row in kinaseact_unified_ds]
bio_sent_vec_model_location = "../biosentvec/model.bin"

In [2]:
model_path = bio_sent_vec_model_location
biosentvec_model = sent2vec.Sent2vecModel()
try:
    biosentvec_model.load_model(model_path)
except Exception as e:
    print(e)
print('model successfully loaded')

model successfully loaded


In [3]:
corpus_expr = biosentvec_model.embed_sentences(sentences_only_expression_data)
corpus_kinaseact = biosentvec_model.embed_sentences(sentences_only_kinaseact_data)

In [4]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import joblib

base_clf = MLPClassifier(hidden_layer_sizes=(10, 10), alpha=0.1)
training_set_expression = [corpus_expr[idx] for idx, row in enumerate(expression_unified_ds) if row[5] == "TRAINING"]
testing_set_expression = [corpus_expr[idx] for idx, row in enumerate(expression_unified_ds) if row[5] == "TESTING"]

training_set_kinaseact = [corpus_kinaseact[idx] for idx, row in enumerate(kinaseact_unified_ds) if row[5] == "TRAINING"]
testing_set_kinaseact = [corpus_kinaseact[idx] for idx, row in enumerate(kinaseact_unified_ds) if row[5] == "TESTING"]

## Task 1

### Expression

In [5]:
training_set_classes_expr = [int(row[1]) for row in expression_unified_ds if row[5] == "TRAINING"]
test_set_classes_expr = [int(row[1]) for row in expression_unified_ds if row[5] == "TESTING"]
clf = base_clf.fit(training_set_expression, training_set_classes_expr)
test_predictions = list(clf.predict(testing_set_expression))
precision, recall, fscore, support = score(test_set_classes_expr, test_predictions)
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

precision: [0.95621469 0.6953125 ]
recall: [0.94553073 0.74166667]
fscore: [0.9508427  0.71774194]
support: [716 120]




### Kinaseact

In [6]:
training_set_classes_kinaseact = [int(row[1]) for row in kinaseact_unified_ds if row[5] == "TRAINING"]
test_set_classes_kinaseact = [int(row[1]) for row in kinaseact_unified_ds if row[5] == "TESTING"]
clf = base_clf.fit(training_set_kinaseact, training_set_classes_kinaseact)
test_predictions = list(clf.predict(testing_set_kinaseact))
precision, recall, fscore, support = score(test_set_classes_kinaseact, test_predictions)
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

precision: [0.96226415 0.74311927]
recall: [0.95947902 0.75700935]
fscore: [0.96086957 0.75      ]
support: [691 107]




## Task 2

### Expression

In [7]:
training_set_classes_expr = [int(row[2]) for row in expression_unified_ds if row[5] == "TRAINING"]
test_set_classes_expr = [int(row[2]) for row in expression_unified_ds if row[5] == "TESTING"]
clf = base_clf.fit(training_set_expression, training_set_classes_expr)
test_predictions = list(clf.predict(testing_set_expression))
precision, recall, fscore, support = score(test_set_classes_expr, test_predictions)
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

precision: [0.97384615 0.79032258]
recall: [0.94196429 0.89634146]
fscore: [0.95763994 0.84      ]
support: [672 164]


### Kinaseact

In [8]:
training_set_classes_kinaseact = [int(row[2]) for row in kinaseact_unified_ds if row[5] == "TRAINING"]
test_set_classes_kinaseact = [int(row[2]) for row in kinaseact_unified_ds if row[5] == "TESTING"]
clf = base_clf.fit(training_set_kinaseact, training_set_classes_kinaseact)
test_predictions = list(clf.predict(testing_set_kinaseact))
precision, recall, fscore, support = score(test_set_classes_kinaseact, test_predictions)
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

precision: [0.96078431 0.76296296]
recall: [0.95216741 0.79844961]
fscore: [0.95645646 0.78030303]
support: [669 129]




## Task 3

### Expression

In [9]:
training_set_classes_expr = [int(row[3]) for row in expression_unified_ds if row[5] == "TRAINING"]
test_set_classes_expr = [int(row[3]) for row in expression_unified_ds if row[5] == "TESTING"]
clf = base_clf.fit(training_set_expression, training_set_classes_expr)
test_predictions = list(clf.predict(testing_set_expression))
precision, recall, fscore, support = score(test_set_classes_expr, test_predictions)
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

precision: [0.93579767 0.8757764 ]
recall: [0.92322457 0.8952381 ]
fscore: [0.9294686  0.88540031]
support: [521 315]


### Kinaseact

In [10]:
training_set_classes_kinaseact = [int(row[3]) for row in kinaseact_unified_ds if row[5] == "TRAINING"]
test_set_classes_kinaseact = [int(row[3]) for row in kinaseact_unified_ds if row[5] == "TESTING"]
clf = base_clf.fit(training_set_kinaseact, training_set_classes_kinaseact)
test_predictions = list(clf.predict(testing_set_kinaseact))
precision, recall, fscore, support = score(test_set_classes_kinaseact, test_predictions)
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

precision: [0.97277677 0.94736842]
recall: [0.97632058 0.93975904]
fscore: [0.97454545 0.94354839]
support: [549 249]


