In [None]:
import json
import pickle

import numpy as np
import spacy
import tqdm
from rank_bm25 import BM25Okapi

from trec_cds.data.parsers import parse_clinical_trials_from_folder, load_topics_from_xml

In [None]:

# Corpus configuration: where the clinical-trial files live and the value
# forwarded to the parser as `first_n` (presumably a cap on how many trials
# are parsed -- confirm against the parser).
folder_name = "../data/external/ClinicalTrials"
first_n = 450_000

# Parse the trials; each parsed item is later read through `.text` and `.nct_id`.
cts = parse_clinical_trials_from_folder(
    folder_name=folder_name,
    first_n=first_n,
)

In [None]:
# spaCy pipeline used to tokenize both the trial texts and the topic queries.
nlp = spacy.load(name="en_core_sci_sm")

In [None]:
# Sanity check: number of clinical trials that were parsed.
len(cts)

In [None]:
# BM25 only needs the raw tokens, so every listed pipeline component is
# switched off and only tokenization runs.
disabled_components = [
    "ner",
    "tok2vec",
    "tagger",
    "parser",
    "attribute_ruler",
    "lemmatizer",
]

# Stream the texts through `nlp.pipe` so spaCy processes them in batches
# instead of paying the per-call overhead of `nlp(...)` for every trial.
trial_texts = (clinical_trial.text for clinical_trial in cts)

# One list of non-stop-word token strings per trial, in the same order as `cts`
# (the scoring step relies on that positional alignment).
cts_tokenized = [
    [token.text for token in doc if not token.is_stop]
    for doc in tqdm.tqdm(
        nlp.pipe(trial_texts, disable=disabled_components),
        total=len(cts),
    )
]

In [None]:
# Fit the BM25 index on the tokenized trials; document i of the index
# corresponds to cts[i].
bm25 = BM25Okapi(corpus=cts_tokenized)
print(bm25)

In [None]:
# Persist the fitted BM25 index to disk.
# The context manager guarantees the file is flushed and closed even if
# pickling raises part-way through (the bare `open(...)` never closed it).
# NOTE: pickle files must only be loaded back from trusted locations.
with open("../models/bm25-baseline.p", "wb") as model_file:
    pickle.dump(bm25, model_file)

In [None]:
# Load the query topics from the topics XML file; each topic is later read
# through its `.text` and `.number` attributes.
topic_file = "../data/external/topics2021.xml"
topics = load_topics_from_xml(topic_file)

In [None]:
# Number of highest-scoring trials kept for each topic.
TOP_N = 2500

# Maps topic number -> {nct_id: BM25 score} for that topic's TOP_N best trials.
output_scores = {}
for topic in tqdm.tqdm(topics):
    # Tokenize the query like the corpus: raw token text, stop words removed.
    query_tokens = [token.text for token in nlp(topic.text) if not token.is_stop]
    doc_scores = bm25.get_scores(query_tokens)

    # Sort once and reuse the indices for both the trial ids and the scores
    # (previously the array was sorted twice: argsort + sort). The slice keeps
    # the TOP_N largest scores in ascending order; if fewer than TOP_N trials
    # exist, all of them are kept.
    top_indices = np.argsort(doc_scores)[-TOP_N:]

    # Store plain Python floats so the result is safe to serialize as JSON.
    output_scores[topic.number] = {
        cts[index].nct_id: float(doc_scores[index]) for index in top_indices
    }

In [None]:
# Sanity check: number of trials kept for topic 1; should be at most TOP_N.
# Assumes a topic whose `number` is the int 1 exists -- TODO confirm.
len(output_scores[1])

In [None]:
# Write the per-topic BM25 scores to disk as JSON.
# Note: JSON object keys are always strings, so non-string topic numbers
# are written out as strings.
with open("../data/processed/bm25-scores.json", mode="w") as scores_file:
    json.dump(output_scores, scores_file)