In [1]:
import random

import joblib
import sent2vec
from scipy import spatial
from numpy import loadtxt

In [2]:

# Load the sent2vec model stored at `model_path` into `biosentvec_model`.
model_path = "../biosentvec/model.bin"
biosentvec_model = sent2vec.Sent2vecModel()
try:
    biosentvec_model.load_model(model_path)
except Exception as e:
    # Show the error and stop: every later cell needs a loaded model, so
    # carrying on would only produce a more confusing failure further down.
    print(e)
    raise
else:
    # Only report success when load_model() actually returned without raising.
    print('model successfully loaded')

model successfully loaded


### For each data type, took 5 nn high, 5 med, 5 low and 15 neg, and extracted all sentences with GROBID
### Merged all sentences together and took a random sample of 1000 sentences

In [3]:
# Build the expression validation set.
# Each input file holds one tab-separated record per line; downstream cells use
# field 0 and field 1 (field 1 is the text that gets embedded).
# `with` blocks make sure every file handle is closed after reading.
with open("../extracted_sentences/GROBID_neg_sentences_expr.txt") as infile:
    neg_expr = [line.strip() for line in infile]
with open("../extracted_sentences/GROBID_validation_nnc_high_expr.txt") as infile:
    high_expr = [line.strip() for line in infile]
with open("../extracted_sentences/GROBID_validation_nnc_med_expr.txt") as infile:
    med_expr = [line.strip() for line in infile]
with open("../extracted_sentences/GROBID_validation_nnc_low_expr.txt") as infile:
    low_expr = [line.strip() for line in infile]

# Deduplicate across the four files. Lines without a tab (blank lines, or
# records whose second field was empty and got stripped away) are dropped,
# because they would have no field 1 to embed later on.
# Sorting gives a stable order: iterating a set of strings is not reproducible
# across interpreter runs.
validation_expr = sorted(
    {line for line in (*neg_expr, *high_expr, *med_expr, *low_expr) if "\t" in line}
)

# Draw up to 1000 records with a fixed seed so that Restart & Run All yields the
# same validation set. A private Random instance leaves the global RNG untouched.
validation_expr = random.Random(42).sample(validation_expr, min(1000, len(validation_expr)))
validation_expr = [line.split("\t") for line in validation_expr]

In [5]:
# Build the kinase validation set.
# Each input file holds one tab-separated record per line; downstream cells use
# field 0 and field 1 (field 1 is the text that gets embedded).
# `with` blocks make sure every file handle is closed after reading.
with open("../extracted_sentences/GROBID_neg_sentences_kinase.txt") as infile:
    neg_kinase = [line.strip() for line in infile]
with open("../extracted_sentences/GROBID_validation_nnc_high_kinase.txt") as infile:
    high_kinase = [line.strip() for line in infile]
with open("../extracted_sentences/GROBID_validation_nnc_med_kinase.txt") as infile:
    med_kinase = [line.strip() for line in infile]
with open("../extracted_sentences/GROBID_validation_nnc_low_kinase.txt") as infile:
    low_kinase = [line.strip() for line in infile]

# Deduplicate across the four files. Lines without a tab (blank lines, or
# records whose second field was empty and got stripped away) are dropped,
# because they would have no field 1 to embed later on.
# Sorting gives a stable order: iterating a set of strings is not reproducible
# across interpreter runs.
validation_kinase = sorted(
    {line for line in (*neg_kinase, *high_kinase, *med_kinase, *low_kinase) if "\t" in line}
)

# Draw up to 1000 records with a fixed seed so that Restart & Run All yields the
# same validation set. A private Random instance leaves the global RNG untouched.
validation_kinase = random.Random(42).sample(validation_kinase, min(1000, len(validation_kinase)))
validation_kinase = [line.split("\t") for line in validation_kinase]

### Get sentence embeddings on validation sets

In [6]:
# Embed field 1 of every validation record with the loaded sent2vec model,
# first for the expression set and then for the kinase set.
expr_texts = [fields[1] for fields in validation_expr]
embedded_expr = biosentvec_model.embed_sentences(expr_texts)

kinase_texts = [fields[1] for fields in validation_kinase]
embedded_kinase = biosentvec_model.embed_sentences(kinase_texts)

### Classify expression sentences using db similarity classifier

In [ ]:
# Label each expression embedding by its cosine similarity to two centroid
# embeddings loaded from CSV: "TRUE" if it is close enough to either centroid,
# "FALSE" if not, and "NA" when the embedding is all zeros.
centroid_exp_pattern_embeddings_bio = loadtxt("../sentence_embeddings/centroid_biosentvec_expr_pattern.csv")
centroid_subcellloc_embeddings_bio = loadtxt("../sentence_embeddings/centroid_biosentvec_subcellloc.csv")

# Similarity cut-off above which a sentence counts as a match.
DB_SIM_THRESHOLD = 0.45

db_sim_classifications_expr = []
for emb in embedded_expr:
    # Cosine similarity is undefined for an all-zero vector, so such embeddings
    # are reported as "NA". The test is for non-zero (not merely positive)
    # components, so a valid embedding is never mislabelled as "NA".
    if not any(feat != 0 for feat in emb):
        db_sim_classifications_expr.append("NA")
        continue

    # scipy returns the cosine *distance*; 1 - distance is the similarity.
    sim_expr_pattern = 1 - spatial.distance.cosine(centroid_exp_pattern_embeddings_bio, emb)
    sim_subcellloc = 1 - spatial.distance.cosine(centroid_subcellloc_embeddings_bio, emb)

    is_match = sim_expr_pattern > DB_SIM_THRESHOLD or sim_subcellloc > DB_SIM_THRESHOLD
    db_sim_classifications_expr.append("TRUE" if is_match else "FALSE")

### Classify expression and kinaseact sentences using classifiers trained on gold sentences

In [ ]:
# Load the six persisted sentence classifiers (three for expression, three for
# kinase) in one pass; the targets on the left line up with the paths below.
# NOTE: joblib files are pickle-based, so only load files from a trusted source.
(
    sentence_classifier_all_info_expression,
    sentence_classifier_curatable_expression,
    sentence_classifier_language_expression,
    sentence_classifier_all_info_kinase,
    sentence_classifier_curatable_kinase,
    sentence_classifier_language_kinase,
) = [
    joblib.load(classifier_path)
    for classifier_path in (
        "../classifiers/all_info_expression.joblib",
        "../classifiers/curatable_expression.joblib",
        "../classifiers/language_expression.joblib",
        "../classifiers/all_info_kinase.joblib",
        "../classifiers/curatable_kinase.joblib",
        "../classifiers/language_kinase.joblib",
    )
]

In [ ]:
# Run the three expression classifiers over the expression embeddings and keep
# each prediction array as a plain Python list.
(
    gold_sent_classifications_all_info_expr,
    gold_sent_classifications_curatable_expr,
    gold_sent_classifications_language_expr,
) = [
    classifier.predict(embedded_expr).tolist()
    for classifier in (
        sentence_classifier_all_info_expression,
        sentence_classifier_curatable_expression,
        sentence_classifier_language_expression,
    )
]

# Same for the three kinase classifiers over the kinase embeddings.
(
    gold_sent_classifications_all_info_kinase,
    gold_sent_classifications_curatable_kinase,
    gold_sent_classifications_language_kinase,
) = [
    classifier.predict(embedded_kinase).tolist()
    for classifier in (
        sentence_classifier_all_info_kinase,
        sentence_classifier_curatable_kinase,
        sentence_classifier_language_kinase,
    )
]

### Merge sentences and classifications and save to file

In [ ]:
# Write one tab-separated row per expression validation record:
#   field 0, field 1, db-similarity label, all_info, curatable, language
# The last three columns are "TRUE" when the matching classifier predicted 1,
# otherwise "FALSE".
with open("../extracted_sentences/GROBID_validation_set_filled_expr.txt", "w") as outfile:
    for i, sentence in enumerate(validation_expr):
        # Each label is computed on its own before joining. Chaining
        # `a + "TRUE" if cond else "FALSE" + ...` does not work, because the
        # conditional expression binds more loosely than `+` and swallows the
        # rest of the row (including the trailing newline).
        columns = [
            sentence[0],
            sentence[1],
            db_sim_classifications_expr[i],
            "TRUE" if gold_sent_classifications_all_info_expr[i] == 1 else "FALSE",
            "TRUE" if gold_sent_classifications_curatable_expr[i] == 1 else "FALSE",
            # Indexed by the current row `i` and compared against 1, in line with
            # the two columns above (assumes 1 is the positive label here too).
            "TRUE" if gold_sent_classifications_language_expr[i] == 1 else "FALSE",
        ]
        outfile.write("\t".join(columns) + "\n")

In [ ]:
# Write one tab-separated row per kinase validation record:
#   field 0, field 1, all_info, curatable, language
# The last three columns are "TRUE" when the matching classifier predicted 1,
# otherwise "FALSE".
with open("../extracted_sentences/GROBID_validation_set_filled_kinase.txt", "w") as outfile:
    for i, sentence in enumerate(validation_kinase):
        # Each label is computed on its own before joining. Chaining
        # `a + "TRUE" if cond else "FALSE" + ...` does not work, because the
        # conditional expression binds more loosely than `+` and swallows the
        # rest of the row (including the trailing newline).
        columns = [
            sentence[0],
            sentence[1],
            "TRUE" if gold_sent_classifications_all_info_kinase[i] == 1 else "FALSE",
            "TRUE" if gold_sent_classifications_curatable_kinase[i] == 1 else "FALSE",
            # Indexed by the current row `i` and compared against 1, in line with
            # the two columns above (assumes 1 is the positive label here too).
            "TRUE" if gold_sent_classifications_language_kinase[i] == 1 else "FALSE",
        ]
        outfile.write("\t".join(columns) + "\n")