This notebook builds a BM25 ranking model from the tokenized document corpus, runs a sample query against it, and saves the fitted model to the working directory.

In [8]:
# Import the necessary packages.
import pandas as pd
import numpy as np
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from joblib import load, dump

In [9]:
# Read the source CSV file into a pandas DataFrame.
csv_path = "synthetic_names_samples.csv"
df = pd.read_csv(csv_path)

In [15]:
# Convert transcriptions into lowercase, whitespace-tokenized lists.
# read_csv loads empty cells as NaN (a float), which has no .lower(); replace
# missing transcriptions with "" so they become empty token lists and every
# document keeps the same position as its row in df.
docs = df['transcription'].fillna("").astype(str).tolist()
tokens = [doc.lower().split() for doc in docs]

In [16]:
# Fit the Okapi BM25 ranking model on the tokenized corpus.
bm25 = BM25Okapi(corpus=tokens)

In [17]:
# Sample query: lower-case it and split on whitespace to get its tokens.
question = "What happened to the patient called Michael Dunstan who takes hydrochlorothiazide for hypertension?"
lowered_question = question.lower()
question_tokens = [word for word in lowered_question.split()]

In [18]:
# Compute the BM25 relevance score of every document for the question, then
# rank the document indices from most to least relevant.
scores = bm25.get_scores(question_tokens)
# np.argsort sorts in ascending order, so reverse the result to put the
# highest-scoring documents first.
top_n = np.argsort(scores)[::-1]

In [19]:
# Persist the fitted BM25 model with joblib; the call's return value is left
# as the cell's last expression so the notebook still displays it.
dump(value=bm25, filename="bm25.joblib")

['bm25.joblib']