In [54]:
from joblib import dump, load
import numpy as np
import pandas as pd
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
import string
from collections import Counter

In [55]:
# Load the persisted retrieval artefacts and the document corpus.
# NOTE(review): joblib.load unpickles arbitrary Python objects -- only load
# files from a trusted source, and preferably with the same library versions
# that were used to dump them (see scikit-learn's model-persistence notes).

# TF-IDF vectoriser and nearest-neighbour model (used together below).
TFIDFEmbedding = load("TFIDFEmbedding.joblib")
KNNModel = load("KNNModel.joblib")

# Sentence encoder and the vector index it is searched against.
# NOTE(review): `faiss` here is the loaded object that exposes `.search`, not
# the faiss module -- presumably a FAISS index; confirm before importing faiss.
sbert = load("sbert.joblib")
faiss = load("faiss.joblib")

# BM25 ranker, plus the corpus: one document per row of the `transcription` column.
bm25 = load("bm25.joblib")
df = pd.read_csv("synthetic_names_samples.csv")
docs = df['transcription'].tolist()

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [18]:
# Retrieval benchmark: every question is paired with the index of the
# document that the retrievers are expected to return for it.
retrieval_cases = [
    ("What is Charlie Harlow's dosage?", 0),
    ("What is the age of the patient with name Jenna Lundsten and has bladder cancer?", 113),
    ("What was the procedure done for Richard Jasso, who had heavy tobacco abuse?", 793),
    ("How did we obtain consent for surgery for Dorothy Robinson?", 818),
    ("What is the procedure performed on the 55 year old female who is having a colonoscopy to screen for colon cancer?", 976),
    ("What happened to the Michael Dunstan who takes hydrochlorothiazide for hypertension?", 13),
    ("What is the operation done on Michael Neal?", 1162),
    ("What is the medication given to Karen Lewis?", 1449),
    ("What is the impression on Adam Juarez who took an MRI?", 1546),
    ("What were the symptoms on Jason Olguin who was given amphotericin?", 1593),
]

# Split the table into the two parallel lists the following cells consume.
questions = [text for text, _ in retrieval_cases]
answers = [doc_index for _, doc_index in retrieval_cases]


In [19]:
# Query each of the three retrievers with every benchmark question and keep
# the returned document-index arrays for scoring in the cells below.
# NOTE(review): the saved output of this cell is an AttributeError about
# BertTokenizerFast.pad_token -- presumably the pickled SBERT model was dumped
# with a different transformers version; confirm the cell runs on a fresh kernel.
TOP_K = 10  # candidates requested from FAISS and kept from the BM25 ranking

knn_answers = []
faiss_answers = []
bm25_answers = []
for question in questions:
    # TFIDF + KNN: the number of neighbours is whatever KNNModel was built with.
    tfidf = TFIDFEmbedding.transform([question])
    knn_ans = KNNModel.kneighbors(tfidf, return_distance=False)
    knn_answers.append(knn_ans)

    # SBERT + FAISS: search returns a pair; only the indices are needed here.
    sbert_embedding = sbert.encode([question]).astype('float32')
    _distances, indices = faiss.search(sbert_embedding, TOP_K)
    faiss_answers.append(indices)

    # BM25: score every document, sort best-first, keep the TOP_K indices.
    bm_tokens = question.lower().split()
    bm_ranking = np.argsort(bm25.get_scores(bm_tokens))[::-1]
    bm25_answers.append(bm_ranking[:TOP_K])


AttributeError: BertTokenizerFast has no attribute pad_token

In [47]:
# Document indices returned by TF-IDF + KNN, one array per question.
knn_answers

[array([[1312, 3694, 4104, 3194, 3363, 1390, 4168, 1804, 3148, 1705]],
       dtype=int64),
 array([[ 215,   38,  113,  715, 1149,  171,  139,  931,  173, 4523]],
       dtype=int64),
 array([[1970, 1966, 3533, 1791, 4538, 3739, 4459, 4785, 3302, 4327]],
       dtype=int64),
 array([[1853, 2657, 4529, 4952, 3951, 4916,  818, 3673,  264, 2176]],
       dtype=int64),
 array([[ 976, 3610, 3158, 3626, 4486, 3613,  985,  973, 3600, 4452]],
       dtype=int64),
 array([[  13, 1947, 3392, 4444, 4381, 1366, 3285, 2437, 2375, 3396]],
       dtype=int64),
 array([[1932, 1977, 1947, 1619, 1162, 4718, 1961,  690, 1945, 1741]],
       dtype=int64),
 array([[3324, 4357,   73, 1320, 4104, 3194, 2776, 4232, 2414, 1754]],
       dtype=int64),
 array([[2134, 1817, 1591, 2932, 4502, 2797, 1558, 1547, 2796, 1551]],
       dtype=int64),
 array([[2041, 4106, 2094, 2761, 1536, 4163, 2074, 4565, 4322, 3501]],
       dtype=int64)]

In [48]:
# Document indices returned by the SBERT + FAISS search, one array per question.
faiss_answers

[array([[2151,    0, 1784, 4819, 3944, 4381, 4257, 3170, 3183, 1742]],
       dtype=int64),
 array([[ 113, 2253, 2996, 3584,  140, 3151, 4241, 4488, 3084, 3022]],
       dtype=int64),
 array([[4126,  247, 2941, 3847, 3423, 4070, 4353, 4555, 3406, 3195]],
       dtype=int64),
 array([[2409,  818, 3418, 4455,  367, 1132, 4574,  592, 4043, 1967]],
       dtype=int64),
 array([[3613,  985, 4455, 4486, 3158, 3626, 3608, 3584,  976,  731]],
       dtype=int64),
 array([[  13, 3777, 4083, 3396, 2833, 3813, 2973, 4665, 4268, 4238]],
       dtype=int64),
 array([[ 164,  752, 1947,  500,  407, 2309,   96, 2204, 1832, 4725]],
       dtype=int64),
 array([[4257, 4465, 1449, 2141, 3591, 4731, 4374, 3357, 2907, 3329]],
       dtype=int64),
 array([[2797, 1019,  118, 1558, 4578, 1679, 2737, 4447, 3405, 3055]],
       dtype=int64),
 array([[1593, 4264, 2446, 3213, 2959, 3823,  118, 3235, 4363, 1353]],
       dtype=int64)]

In [49]:
# Highest-scoring BM25 document indices (best first), one array per question.
bm25_answers

[array([1494, 2288,    0, 4430, 1395, 3958, 4057, 3317, 1096, 3301],
       dtype=int64),
 array([ 113,   73, 1320, 1762, 4124, 1771, 4133, 1777, 4144, 4518],
       dtype=int64),
 array([ 793, 3302, 4327, 2606,  567, 4459, 3739, 3196, 4108, 4768],
       dtype=int64),
 array([4043, 4524, 2144,  568, 2673, 4529, 4952, 3353,  129, 4391],
       dtype=int64),
 array([3610,  976, 3331, 4368, 4472, 3409,  962, 3594, 3158, 3626],
       dtype=int64),
 array([  13, 4381, 3392, 4444, 4551, 4394, 2854, 3409, 4472, 1354],
       dtype=int64),
 array([4551, 1317, 3584, 4455, 2417, 1419, 4167, 2678, 2214,  798],
       dtype=int64),
 array([2414,  926, 4965, 1704, 3294, 3247, 4289, 3249, 4252, 1470],
       dtype=int64),
 array([1546, 2732, 2968, 1033, 4252, 1361, 3281, 4313, 1914, 3310],
       dtype=int64),
 array([1593, 1203,  276, 1292, 1632, 1126,  139,  931, 3844, 4554],
       dtype=int64)]

In [50]:
# Accuracy scores: for each retriever, the number of questions whose expected
# document index appears among the retrieved candidates (a hit count, not a ratio).
faiss_accuracy = 0
bm25_accuracy = 0
knn_accuracy = 0

# Walk the benchmark itself instead of a hard-coded range(10), so the counts
# stay correct when questions are added or removed.
for expected, knn_hits, faiss_hits, bm25_hits in zip(
    answers, knn_answers, faiss_answers, bm25_answers
):
    # `in` on a NumPy array is true if any element equals `expected`,
    # regardless of the array's shape.
    if expected in knn_hits:
        knn_accuracy += 1
    if expected in faiss_hits:
        faiss_accuracy += 1
    if expected in bm25_hits:
        bm25_accuracy += 1

In [51]:
# Number of benchmark questions whose expected document was among the FAISS results.
faiss_accuracy

7

In [52]:
# Number of benchmark questions whose expected document was among the BM25 results.
bm25_accuracy

7

In [53]:
# Number of benchmark questions whose expected document was among the KNN results.
knn_accuracy

5

In [43]:
# Tests for part 2 of the model: question answering

In [20]:
# Build the extractive question-answering pipeline used by the cells below.
# Hugging Face model id; the name suggests a BioBERT checkpoint fine-tuned on
# SQuAD -- confirm on the model card.
qa_model_name = "dmis-lab/biobert-large-cased-v1.1-squad"
# Tokenizer and model weights are fetched by id (downloaded on first use).
tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
model = AutoModelForQuestionAnswering.from_pretrained(qa_model_name)
# No device is passed here; the saved output reports the pipeline running on CPU.
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

Device set to use cpu


In [21]:
# Question-answering benchmark: every question is paired with the index in
# `docs` of the document handed to the QA model as its context.
qa_cases = [
    ("What is Charlie Harlow's dosage?", 0),
    ("What is the age of the patient with name Jenna Lundsten and has bladder cancer?", 113),
    ("What was the procedure done for Richard Jasso, who had heavy tobacco abuse?", 793),
    ("How did we obtain consent for surgery for Dorothy Robinson?", 818),
    ("What is the procedure performed on the 55 year old female who is having a colonoscopy to screen for colon cancer?", 976),
    ("What happened to the Michael Dunstan who takes hydrochlorothiazide for hypertension?", 13),
    ("What is the operation done on Michael Neal?", 1162),
    ("What is the medication given to Karen Lewis?", 1449),
    ("What is the impression on Adam Juarez who took an MRI?", 1546),
    ("What were the symptoms on Jason Olguin who was given amphotericin?", 1593),
]

# Parallel lists: questions[i] is answered from documents[i].
questions = [text for text, _ in qa_cases]
documents = [docs[doc_index] for _, doc_index in qa_cases]

In [38]:
# Reference answer text for each QA benchmark question, in question order.
ground_truths = [
    "Samples of Nasonex two sprays in each nostril given for three weeks.",
    "66-year-old",
    "Right common femoral artery cannulation",
    "After risk of operation was explained to this patient's family",
    "The tip of the endoscope was introduced into the rectum.",
    "a hysterectomy done in 2000 and a gallbladder resection",
    " Bilateral Myringotomy with placement of PE tubes",
    "Ativan",
    "mild degenerative",
    "HA, nausea and vomiting",
]

In [44]:
# Ask the QA pipeline each question against its paired document and keep only
# the predicted answer text.
predictions = []
for question, document in zip(questions, documents):
    # `question=` / `context=` keywords are the documented call form for the
    # question-answering pipeline (rather than passing one positional dict).
    prediction = qa_pipeline(question=question, context=document)
    predictions.append(prediction['answer'])



In [52]:
def normalize_text(s):
    """Canonicalise an answer string before token comparison.

    The text is lower-cased, every character in ``string.punctuation`` is
    deleted, the words 'a', 'an' and 'the' are dropped, and the remaining
    words are joined with single spaces.

    Args:
        s: The answer text to normalise.

    Returns:
        The normalised string (empty if nothing survives).
    """
    lowered = s.lower()
    # Delete punctuation characters outright (no replacement with a space).
    unpunctuated = lowered.translate(str.maketrans('', '', string.punctuation))
    # Splitting on whitespace and re-joining also collapses runs of spaces.
    kept_words = [word for word in unpunctuated.split() if word not in ('a', 'an', 'the')]
    return ' '.join(kept_words).strip()

def compute_f1(prediction, ground_truth):
    """Token-level F1 between a predicted answer and a reference answer.

    Both strings are passed through ``normalize_text`` and split on
    whitespace; tokens are matched as multisets.

    Args:
        prediction: Predicted answer text.
        ground_truth: Reference answer text.

    Returns:
        0 when the two share no tokens (which includes either side being
        empty after normalisation), otherwise the harmonic mean of token
        precision and recall as a float.
    """
    predicted = normalize_text(prediction).split()
    expected = normalize_text(ground_truth).split()

    # Multiset intersection: a token is shared at most as many times as it
    # occurs on both sides.
    shared = Counter(predicted) & Counter(expected)
    overlap = sum(shared.values())

    # An empty token list can share nothing, so this single guard also
    # covers the empty-prediction and empty-reference cases.
    if overlap == 0:
        return 0

    precision = overlap / len(predicted)
    recall = overlap / len(expected)
    return 2 * precision * recall / (precision + recall)

def compute_average_f1(predictions, ground_truths):
    """Mean token-level F1 over paired predictions and reference answers.

    Args:
        predictions: Sequence of predicted answer strings.
        ground_truths: Sequence of reference answer strings, aligned
            position-by-position with ``predictions``.

    Returns:
        The arithmetic mean of ``compute_f1`` over all pairs, or 0.0 when
        both sequences are empty (instead of dividing by zero).

    Raises:
        AssertionError: If the two sequences differ in length.
    """
    assert len(predictions) == len(ground_truths), "Number of predictions and ground truths must match."
    # Nothing to score: report 0.0 rather than raising ZeroDivisionError.
    if len(predictions) == 0:
        return 0.0
    f1_scores = [
        compute_f1(prediction, ground_truth)
        for prediction, ground_truth in zip(predictions, ground_truths)
    ]
    return sum(f1_scores) / len(f1_scores)


In [53]:
# Average token-level F1 of the QA model's predicted answers against the
# reference answers. `predictions` (answer text) is the right input here;
# `answers` holds the integer document indices of the retrieval benchmark.
compute_average_f1(predictions, ground_truths)

0.8