In [1]:
import random

import joblib
import sent2vec
from scipy import spatial
from numpy import loadtxt

In [2]:

# Load the pretrained BioSentVec sentence-embedding model from disk.
model_path = "../../biosentvec/model.bin"
biosentvec_model = sent2vec.Sent2vecModel()
try:
    biosentvec_model.load_model(model_path)
except Exception as e:
    # Report the failure and stop here: every later cell embeds sentences with
    # this model, so continuing with an unloaded model would be meaningless.
    print(e)
    raise
else:
    # Only announce success when load_model() returned without raising.
    print('model successfully loaded')

model successfully loaded


### For each data type, we took 5 nn-high, 5 nn-med, 5 nn-low and 15 neg papers and extracted all of their sentences with GROBID
### All sentences were then merged together and a random sample of 1000 sentences was taken

In [6]:
# Read the GROBID-extracted expression sentences (one "<paper id>\t<sentence>"
# record per line). Context managers make sure every file handle is closed.
with open("../../extracted_sentences/GROBID_neg_sentences_expr.txt") as infile:
    neg_expr = [line.strip() for line in infile]
with open("../../extracted_sentences/GROBID_validation_nnc_high_expr.txt") as infile:
    high_expr = [line.strip() for line in infile]
with open("../../extracted_sentences/GROBID_validation_nnc_med_expr.txt") as infile:
    med_expr = [line.strip() for line in infile]
with open("../../extracted_sentences/GROBID_validation_nnc_low_expr.txt") as infile:
    low_expr = [line.strip() for line in infile]

# Deduplicate across the four sources. Sorting first removes the dependence on
# set iteration order, and the dedicated seeded generator makes the shuffled
# order identical on every run without touching the global random state.
validation_expr_lines = sorted({*neg_expr, *high_expr, *med_expr, *low_expr})
random.Random(42).shuffle(validation_expr_lines)

# Split only on the first tab so a sentence that itself contains tabs is kept
# whole in field 1; lines without any tab (no sentence part) are dropped.
validation_expr = [
    fields
    for fields in (line.split("\t", 1) for line in validation_expr_lines)
    if len(fields) > 1
]

In [8]:
# Read the GROBID-extracted kinase sentences (one "<paper id>\t<sentence>"
# record per line). Context managers make sure every file handle is closed.
with open("../../extracted_sentences/GROBID_neg_sentences_kinase.txt") as infile:
    neg_kinase = [line.strip() for line in infile]
with open("../../extracted_sentences/GROBID_validation_selected_positive_kinase.txt") as infile:
    positive_kinase = [line.strip() for line in infile]

# Deduplicate across both sources. Sorting first removes the dependence on set
# iteration order, and the dedicated seeded generator makes the shuffled order
# identical on every run without touching the global random state.
validation_kinase_lines = sorted({*neg_kinase, *positive_kinase})
random.Random(42).shuffle(validation_kinase_lines)

# Split only on the first tab so a sentence that itself contains tabs is kept
# whole in field 1; lines without any tab (no sentence part) are dropped.
validation_kinase = [
    fields
    for fields in (line.split("\t", 1) for line in validation_kinase_lines)
    if len(fields) > 1
]

### Get sentence embeddings on validation sets

In [9]:
# Embed the sentence text (field 1 of every record) of each validation set
# with the BioSentVec model.
expr_sentence_texts = [record[1] for record in validation_expr]
embedded_expr = biosentvec_model.embed_sentences(expr_sentence_texts)

kinase_sentence_texts = [record[1] for record in validation_kinase]
embedded_kinase = biosentvec_model.embed_sentences(kinase_sentence_texts)

### Classify expression sentences using db similarity classifier

In [9]:
# Similarity-based classification: a sentence is "TRUE" when its embedding is
# close enough (cosine similarity) to either of the two centroid embeddings,
# "FALSE" otherwise, and "NA" when the embedding is the zero vector.
centroid_exp_pattern_embeddings_bio = loadtxt("../../sentence_embeddings/centroid_biosentvec_expr_pattern.csv")
centroid_subcellloc_embeddings_bio = loadtxt("../../sentence_embeddings/centroid_biosentvec_subcellloc.csv")

# Minimum cosine similarity to a centroid for a positive classification.
DB_SIM_THRESHOLD = 0.45

db_sim_classifications_expr = []
for emb in embedded_expr:
    # Cosine similarity is undefined for an all-zero vector, so those sentences
    # get "NA". The check is "any component != 0" rather than "> 0": a non-zero
    # embedding whose components are all negative or zero is still a valid
    # vector and must be classified.
    # NOTE(review): zero vectors presumably come from sentences with no
    # in-vocabulary tokens - confirm against the sent2vec documentation.
    if not any(feat != 0 for feat in emb):
        db_sim_classifications_expr.append("NA")
        continue
    sim_expr_pattern = 1 - spatial.distance.cosine(centroid_exp_pattern_embeddings_bio, emb)
    sim_subcellloc = 1 - spatial.distance.cosine(centroid_subcellloc_embeddings_bio, emb)
    if sim_expr_pattern > DB_SIM_THRESHOLD or sim_subcellloc > DB_SIM_THRESHOLD:
        db_sim_classifications_expr.append("TRUE")
    else:
        db_sim_classifications_expr.append("FALSE")

### Classify expression sentences using db neural network classifier

In [10]:
# Restore the serialized "db" neural-network expression classifier and let it
# label every expression sentence embedding.
db_nn_expression_path = "../../classifiers/db_nn_expression.joblib"
db_nn_classifier_expression = joblib.load(db_nn_expression_path)
db_nn_classifications_expr = db_nn_classifier_expression.predict(
    embedded_expr
)

### Classify expression and kinaseact sentences using classifiers trained on gold sentences

In [5]:
# Restore the six serialized sentence classifiers: all_info, curatable and
# language, each for expression and for kinase. The targets on the left are
# listed in the same order as the paths on the right.
(
    sentence_classifier_all_info_expression,
    sentence_classifier_curatable_expression,
    sentence_classifier_language_expression,
    sentence_classifier_all_info_kinase,
    sentence_classifier_curatable_kinase,
    sentence_classifier_language_kinase,
) = [
    joblib.load(classifier_path)
    for classifier_path in (
        "../../classifiers/GROBID_all_info_expression.joblib",
        "../../classifiers/GROBID_curatable_expression.joblib",
        "../../classifiers/GROBID_language_expression.joblib",
        "../../classifiers/GROBID_all_info_kinase.joblib",
        "../../classifiers/GROBID_curatable_kinase.joblib",
        "../../classifiers/GROBID_language_kinase.joblib",
    )
]

In [11]:
# Run the three expression classifiers over the expression embeddings and
# convert each prediction array to a plain Python list.
(
    gold_sent_classifications_all_info_expr,
    gold_sent_classifications_curatable_expr,
    gold_sent_classifications_language_expr,
) = [
    expr_classifier.predict(embedded_expr).tolist()
    for expr_classifier in (
        sentence_classifier_all_info_expression,
        sentence_classifier_curatable_expression,
        sentence_classifier_language_expression,
    )
]

# Same for the three kinase classifiers over the kinase embeddings.
(
    gold_sent_classifications_all_info_kinase,
    gold_sent_classifications_curatable_kinase,
    gold_sent_classifications_language_kinase,
) = [
    kinase_classifier.predict(embedded_kinase).tolist()
    for kinase_classifier in (
        sentence_classifier_all_info_kinase,
        sentence_classifier_curatable_kinase,
        sentence_classifier_language_kinase,
    )
]

### Merge sentences and classifications and save to file

In [13]:
# Write one TSV row per expression validation sentence: paper id, sentence,
# the similarity label as-is, then the four 0/1 classifier predictions
# rendered as TRUE/FALSE.
with open("../../extracted_sentences/GROBID_validation_set_filled_expr.tsv", "w") as outfile:
    outfile.write("WBPaperID\tSENTENCE\tDB_SIM_CLASSIFICATION\tDB_NN_CLASSIFICATION\tGOLD_SENT_CLASS_ALL_INFO\tGOLD_SENT_CLASS_CURATABLE\tGOLD_SENT_CLASS_LANGUAGE\n")
    for row_idx, record in enumerate(validation_expr):
        # Tabs inside the sentence would break the column layout, so they are
        # replaced by two spaces.
        columns = [
            record[0],
            record[1].replace("\t", "  "),
            db_sim_classifications_expr[row_idx],
        ]
        binary_predictions = (
            db_nn_classifications_expr[row_idx],
            gold_sent_classifications_all_info_expr[row_idx],
            gold_sent_classifications_curatable_expr[row_idx],
            gold_sent_classifications_language_expr[row_idx],
        )
        for prediction in binary_predictions:
            if prediction == 1:
                columns.append("TRUE")
            else:
                columns.append("FALSE")
        outfile.write("\t".join(columns) + "\n")

In [12]:
# Write one TSV row per kinase validation sentence: paper id, sentence, then
# the three 0/1 classifier predictions rendered as TRUE/FALSE.
with open("../../extracted_sentences/GROBID_validation_set_filled_kinase.tsv", "w") as outfile:
    outfile.write("WBPaperID\tSENTENCE\tGOLD_SENT_CLASS_ALL_INFO\tGOLD_SENT_CLASS_CURATABLE\tGOLD_SENT_CLASS_LANGUAGE\n")
    for row_idx, record in enumerate(validation_kinase):
        # Tabs inside the sentence would break the column layout, so they are
        # replaced by two spaces.
        columns = [record[0], record[1].replace("\t", "  ")]
        binary_predictions = (
            gold_sent_classifications_all_info_kinase[row_idx],
            gold_sent_classifications_curatable_kinase[row_idx],
            gold_sent_classifications_language_kinase[row_idx],
        )
        for prediction in binary_predictions:
            if prediction == 1:
                columns.append("TRUE")
            else:
                columns.append("FALSE")
        outfile.write("\t".join(columns) + "\n")