In [1]:
import csv
import pickle
import re

import nltk.data
import numpy as np
import pandas as pd
from rank_bm25 import BM25L
from sentence_transformers import SentenceTransformer

In [3]:
# Load the article table; the CSV is decoded as Latin-1.
pm_articles_path = "PM classified data/Final_PM_Reduced.csv"
PM_Articles = pd.read_csv(pm_articles_path, encoding='latin1')

In [4]:
# Sentence-embedding model; the same instance encodes the abstracts and the query.
embedding_model_name = 'distilbert-base-nli-stsb-mean-tokens'
embedder = SentenceTransformer(embedding_model_name)

In [6]:
# Embed every abstract (one row per article) and persist the matrix to disk
# so it can be reloaded without re-encoding.
embedding_file = "models/PM_Articles_DistilledBert_reduced.emb"
abstract_texts = PM_Articles.Abstract.tolist()
corpus_embeddings = embedder.encode(abstract_texts)
with open(embedding_file, 'wb') as out_handle:
    pickle.dump(corpus_embeddings, out_handle)

In [7]:
# Pre-trained Punkt tokenizer for English, loaded from the NLTK data directory.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Accumulator that a later cell fills with the tokenizer's output.
corpus = list()

In [8]:
# Column shortcuts used by the ranking and reporting cells.
Title, Abstract, NCT_ID = (
    PM_Articles[column] for column in ('Title', 'Abstract', 'NCT ID')
)

# Free-text query (embedded for semantic matching) and the gene of interest
# (used for the keyword/BM25 component).
gene = "BRAF (V600E)"
query = ["cancer 64-year-old male"]

In [9]:
# Split every lower-cased abstract with the Punkt tokenizer and collect the
# pieces in one flat list.
# The list is rebuilt from scratch here (instead of extending the list created
# in an earlier cell) so that re-running this cell does not append duplicates.
corpus = [
    sentence
    for article in PM_Articles.Abstract.str.lower()
    for sentence in tokenizer.tokenize(article)
]

In [10]:
# Keyword (BM25) component of the ranking.
#
# BM25L expects a list of documents where each document is a list of tokens.
# The documents must be the article abstracts themselves, in row order, so that
# BM25_Score[i] refers to the same article as corpus_embeddings[i] / NCT_ID[i].
# The flat, sentence-level `corpus` list is therefore not used here: it has one
# entry per sentence (not per article) and its entries are plain strings, which
# BM25 would iterate character by character.
word_pattern = re.compile(r"\w+")
tokenized_abstracts = [
    word_pattern.findall(abstract.lower()) for abstract in PM_Articles.Abstract
]
bm25 = BM25L(tokenized_abstracts)

# Tokenize the gene exactly like the abstracts (lower-case, punctuation
# stripped) so that "BRAF (V600E)" can match "braf" / "v600e" in the text.
tokenized_gene = word_pattern.findall(gene.lower())

# One BM25 score per article; the factor 2 is the weight of the keyword
# component when it is later added to the embedding similarity.
BM25_Score = bm25.get_scores(tokenized_gene) * 2

# Embed the free-text query with the same model used for the abstracts.
query_embeddings = embedder.encode(query)

In [11]:
# Sanity check: dimensions of the abstract embedding matrix.
corpus_embeddings.shape

(264, 768)

In [12]:
# Sanity check: dimensions of the query embedding; the second dimension must
# match the abstract embeddings for the element-wise product used below.
query_embeddings.shape

(1, 768)

In [13]:
# Number of top-ranked articles to keep.
topk = 10

# Cosine similarity between the query and every abstract embedding.
# The dot product is divided by BOTH vector norms; dividing only by the corpus
# norms would leave the score scaled by the query's norm, which distorts the
# result when it is later added to the BM25 score.
dot_products = np.sum(query_embeddings * corpus_embeddings, axis=1)
corpus_norms = np.linalg.norm(corpus_embeddings, axis=1)
query_norm = np.linalg.norm(query_embeddings, axis=1)  # shape (1,), broadcasts
score_corpus = dot_products / (corpus_norms * query_norm)

# Indices of the `topk` most similar abstracts, best first.
topk_idx = np.argsort(score_corpus)[::-1][:topk]

In [15]:
# Append the top-ranked trials for this query to the results CSV.
# The file is opened once for the whole batch instead of once per row, and the
# loop invariants (query number, column names) are set up front.
index = 10  # value written to the QueryNum column for every row of this run
fieldnames = ['QueryNum', 'Q0', 'NCT_ID', 'Score', ]

with open('Ranked PM 25 Articles new.csv', 'a', newline='') as csvfile:
    # No header row is written; rows are appended to whatever is already there.
    # 'Q0' is not supplied in the row dict, so DictWriter leaves it empty.
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    for idx in topk_idx:
        # Final score = embedding similarity + weighted BM25 keyword score.
        score = score_corpus[idx] + BM25_Score[idx]
        print(index, NCT_ID[idx])
        writer.writerow({'QueryNum': index , 'NCT_ID': NCT_ID[idx], 'Score': score})

10 NCT00002703
10 NCT00001377
10 NCT00003450
10 NCT00003147
10 NCT00002511
10 NCT00002602
10 NCT00003290
10 NCT00002924
10 NCT00002938
10 NCT00002723


In [16]:
# Print a ClinicalTrials.gov link for each of the top-ranked trials.
for trial_idx in topk_idx:
    nct = NCT_ID[trial_idx]
    url_parts = ['https://clinicaltrials.gov/ct2/show/', nct, '?term=', nct, '&draw=2&rank=1']
    print(''.join(url_parts))

https://clinicaltrials.gov/ct2/show/NCT00002703?term=NCT00002703&draw=2&rank=1
https://clinicaltrials.gov/ct2/show/NCT00001377?term=NCT00001377&draw=2&rank=1
https://clinicaltrials.gov/ct2/show/NCT00003450?term=NCT00003450&draw=2&rank=1
https://clinicaltrials.gov/ct2/show/NCT00003147?term=NCT00003147&draw=2&rank=1
https://clinicaltrials.gov/ct2/show/NCT00002511?term=NCT00002511&draw=2&rank=1
https://clinicaltrials.gov/ct2/show/NCT00002602?term=NCT00002602&draw=2&rank=1
https://clinicaltrials.gov/ct2/show/NCT00003290?term=NCT00003290&draw=2&rank=1
https://clinicaltrials.gov/ct2/show/NCT00002924?term=NCT00002924&draw=2&rank=1
https://clinicaltrials.gov/ct2/show/NCT00002938?term=NCT00002938&draw=2&rank=1
https://clinicaltrials.gov/ct2/show/NCT00002723?term=NCT00002723&draw=2&rank=1
