In [None]:
# loading data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import re

directory_path = '/content/drive/MyDrive/patients_data'


def _natural_sort_key(name):
    """Sort key that orders embedded numbers numerically ("p2" before "p10")."""
    # re.split with a capturing group alternates non-digit / digit pieces,
    # so every odd position holds a run of digits that is safe to convert.
    pieces = re.split(r'(\d+)', name)
    natural = [int(piece) if pos % 2 else piece.lower() for pos, piece in enumerate(pieces)]
    # The raw name breaks ties between names that differ only by letter case.
    return natural, name


text_contents = []

# os.listdir() returns entries in arbitrary order, while later cells address
# documents by position (e.g. cleaned_texts[8]); sorting makes the order
# reproducible across runs and machines.
for filename in sorted(os.listdir(directory_path), key=_natural_sort_key):
    if filename.endswith('.txt'):
        file_path = os.path.join(directory_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            text_contents.append(content)

# Fail with a clear message instead of an IndexError on the print below.
if not text_contents:
    raise FileNotFoundError(f"No .txt files found in {directory_path}")

print(text_contents[0])

Patient ID: P1001
Name: John Doe
Age: 54
Gender: Male
Medical Conditions: Hypertension, Type 2 Diabetes
Current Medications: Metformin, Lisinopril
Prescribed Treatments: Diet modification, Exercise
Medical History:
- 2024-02-10: Symptoms: Fatigue, Increased thirst; Diagnoses: Type 2 Diabetes; Medications: Metformin; Adherence: good; Side Effects: None; Barriers: Diet compliance
- 2024-04-15: Symptoms: Headache, High blood pressure; Diagnoses: Hypertension; Medications: Lisinopril; Adherence: moderate; Side Effects: Mild cough; Barriers: None
Clinical Notes: Patient shows moderate adherence to medication. Reports mild side effects but no major complications.
Missed Appointments: 1
Non-compliance Patterns: Occasional missed doses
Lab Results:
- 2024-02-10: HbA1c 7.2% (Above target)
- 2024-04-15: Blood Pressure 150/95 (Hypertensive)



In [None]:
# now we want to do some text cleaning

import re

def text_cleaning(text):
    """Return ``text`` lowercased with whitespace normalised.

    Every run of whitespace (spaces, tabs, newlines) is replaced by a single
    space, and leading/trailing whitespace is removed.
    """
    lowered = text.lower()
    single_spaced = re.sub(r'\s+', ' ', lowered)
    return single_spaced.strip()


# Clean every loaded document, keeping the same order as text_contents.
cleaned_texts = list(map(text_cleaning, text_contents))
print(cleaned_texts[0])


patient id: p1001 name: john doe age: 54 gender: male medical conditions: hypertension, type 2 diabetes current medications: metformin, lisinopril prescribed treatments: diet modification, exercise medical history: - 2024-02-10: symptoms: fatigue, increased thirst; diagnoses: type 2 diabetes; medications: metformin; adherence: good; side effects: none; barriers: diet compliance - 2024-04-15: symptoms: headache, high blood pressure; diagnoses: hypertension; medications: lisinopril; adherence: moderate; side effects: mild cough; barriers: none clinical notes: patient shows moderate adherence to medication. reports mild side effects but no major complications. missed appointments: 1 non-compliance patterns: occasional missed doses lab results: - 2024-02-10: hba1c 7.2% (above target) - 2024-04-15: blood pressure 150/95 (hypertensive)


In [None]:
# now creating questions based on the documents and also their real answers

# Evaluation set: questions[i] is answered by real_answers[i].
questions = [
    # Name spelled as in the patient record ("John Doe"); TF-IDF matches whole
    # tokens, so a misspelled name gives the query nothing to match on.
    'what is id of John Doe?',
    'how old is Jane Smith?' ,
    'what is gender of patient with id P1003?',
    'what are the medical conditions of Emily Davis?',
    # NOTE(review): "Jhonson" looks like a misspelling too - confirm against the P1005 record.
    'what current medications is William Jhonson taking?',
    'what treatments have been prescribed for Olivia Martinez?',
    'give me medical history of James Wilson',
    'what notes have been written for patient with id P1008 ?',
    'how many appointments has David Kim missed?',
    # NOTE(review): the records call this section "lab results"; "labratory return"
    # shares no token with it - confirm whether the wording is intentional.
    'what was the labratory return for Isabella Garcia?',
]

real_answers = [
    'P1001', # patient1
    '67',   # patient2
    'male',  # patient3
    'Osteoarthritis',  # patient4
    'Lisinopril, Furosemide',  # patient5
    'Daily thyroid hormone replacement', # patient6
    '- 2023-12-20: Symptoms: Low mood, anxiety; Diagnoses: Major depressive disorder; Medications: Sertraline; Adherence: moderate; Side Effects: Nausea; Barriers: Occasional missed doses', # patient7
    'Clinical Notes: Migraines well controlled with medication.',  # patient8
    '1',  # patient9
    'Lab Results:- 2024-01-30: Elevated ESR and CRP',  # patient10
]

# The evaluation pairs the two lists by position, so they must stay aligned.
assert len(questions) == len(real_answers), "questions and real_answers must have the same length"

In [None]:
# creating a function that finds the top-k strings most related to the query using TF-IDF and cosine similarity.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def find_top_k_related_strings(query, list_of_strings, k=3):
    """Rank ``list_of_strings`` by TF-IDF cosine similarity to ``query``.

    Args:
        query: The search string.
        list_of_strings: Candidate strings to rank.
        k: Maximum number of results to return (default 3).

    Returns:
        A list of ``(string, similarity)`` tuples, most similar first, with at
        most ``k`` entries. An empty list is returned when there are no
        candidates or when ``k`` is not positive.
    """
    candidates = list(list_of_strings)

    # Without this guard k=0 would return *every* candidate, because the slice
    # [-0:] selects the whole array; an empty candidate list has nothing to rank.
    if k <= 0 or len(candidates) == 0:
        return []

    # The query is vectorised together with the candidates so that all vectors
    # share one vocabulary; row 0 of the matrix is the query.
    corpus = [query] + candidates

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)

    # Similarity of the query row against every candidate row.
    cosine_similarities = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1:]).flatten()

    # argsort is ascending: take the last k indices and reverse them so the
    # best match comes first.
    top_k_idx = np.argsort(cosine_similarities)[-k:][::-1]

    return [(candidates[i], cosine_similarities[i]) for i in top_k_idx]



In [None]:
# testing document retrieval on the last question (index 9) to see if it works

# Rank all cleaned documents against the question, then show the question
# followed by the best (document, similarity) pair.
most_related_docs = find_top_k_related_strings(questions[9], cleaned_texts)
print(
    questions[9],
    f"Most related string :\n{most_related_docs[0]}",
    sep='\n',
)


what was the labratory return for Isabella Garcia?
Most related string :
('patient id: p1010 name: isabella garcia age: 42 gender: female medical conditions: rheumatoid arthritis current medications: methotrexate, prednisone prescribed treatments: immunosuppressive therapy, physical therapy medical history: - 2024-01-30: symptoms: joint pain, swelling; diagnoses: rheumatoid arthritis; medications: methotrexate, prednisone; adherence: good; side effects: mild nausea; barriers: none clinical notes: disease activity controlled with current regimen. missed appointments: 0 non-compliance patterns: none lab results: - 2024-01-30: elevated esr and crp', np.float64(0.07745367769918175))


In [None]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# now chunking the chosen string and find the most related chunk

from nltk.tokenize import word_tokenize


def string_chunker(string, chunk_size=50, overlap=15):
    """Split ``string`` into overlapping chunks of word tokens.

    The text is tokenised with ``word_tokenize``; consecutive chunks start
    ``chunk_size - overlap`` tokens apart and each holds up to ``chunk_size``
    tokens joined by single spaces.

    Args:
        string: Text to split.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared by neighbouring chunks; must be
            smaller than ``chunk_size``.

    Returns:
        The list of chunk strings, in document order (empty for empty text).

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap >= chunk_size:
        # A non-positive step would never advance the window (infinite loop).
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    tokens = word_tokenize(string)
    step = chunk_size - overlap

    return [
        " ".join(tokens[start:start + chunk_size])
        for start in range(0, len(tokens), step)
    ]

In [None]:
# testing string chunker to find the most related chunk

# Chunk the document at index 8 and rank its chunks against question 8.
chunks = string_chunker(cleaned_texts[8])
most_related_chunks = find_top_k_related_strings(questions[8], chunks)

# One print call; sep='\n' puts each item on its own line.
print(
    questions[8],
    '---------------------------------------------------------------',
    'TOTAL TEXT :',
    cleaned_texts[8],
    '---------------------------------------------------------------',
    'RELATED CHUNK :',
    most_related_chunks[0],
    sep='\n',
)

how many appointments has David Kim missed?
---------------------------------------------------------------
TOTAL TEXT :
patient id: p1009 name: david kim age: 65 gender: male medical conditions: copd, hypertension current medications: tiotropium, amlodipine prescribed treatments: smoking cessation, inhaler use medical history: - 2024-03-25: symptoms: chronic cough, shortness of breath; diagnoses: copd; medications: tiotropium; adherence: moderate; side effects: dry mouth; barriers: difficulty quitting smoking clinical notes: patient advised on smoking cessation, moderate medication adherence. missed appointments: 1 non-compliance patterns: missed doses of inhaler lab results: - 2024-03-25: fev1 55% predicted
---------------------------------------------------------------
RELATED CHUNK :
('quitting smoking clinical notes : patient advised on smoking cessation , moderate medication adherence . missed appointments : 1 non-compliance patterns : missed doses of inhaler lab results : - 2024

In [None]:
from transformers import pipeline

qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

def answer_question_with_context(model , question , context):
    """Call ``model`` with ``question`` and ``context`` as keyword arguments.

    Returns whatever ``model`` returns, unchanged.
    """
    return model(question=question, context=context)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Device set to use cpu


In [None]:
# testing the roberta model

# Answer question 8 from the best chunk found in the chunking test cell.
roberta_output = answer_question_with_context(qa_pipeline , questions[8] , most_related_chunks[0][0])

print(questions[8])
print('answer : ' , roberta_output['answer'])

how many appointments has David Kim missed?
answer :  1


In [None]:
# creating a whole function with reranker

def rag_with_reranker(question , cleaned_texts):
    """Answer ``question`` by two-stage retrieval followed by QA.

    The best-matching document in ``cleaned_texts`` is selected, split into
    chunks, and the best-matching chunk is passed as context to the
    module-level ``qa_pipeline``. Returns the ``'answer'`` field of the result.
    """
    # Stage 1: document retrieval - entry 0 is the top hit, its text is at [0].
    best_doc_text = find_top_k_related_strings(question, cleaned_texts)[0][0]

    # Stage 2: rerank the chunks of that document against the same question.
    ranked_chunks = find_top_k_related_strings(question, string_chunker(best_doc_text))
    best_chunk_text = ranked_chunks[0][0]

    prediction = answer_question_with_context(qa_pipeline , question , best_chunk_text)
    return prediction['answer']

In [None]:
# testing rag with reranker

print(questions[8])
print(rag_with_reranker(questions[8] , cleaned_texts))

how many appointments has David Kim missed?
1


In [None]:
# FID

In [None]:
# my original code

# from transformers import T5Tokenizer, T5ForConditionalGeneration
# import torch
# from transformers.modeling_outputs import BaseModelOutput

# class FID:
#     def __init__(self):
#         self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
#         self.model = T5ForConditionalGeneration.from_pretrained('t5-base')

#     def encode_and_fuse(self, question, list_of_docs):
#         inputs = [f"question: {question} context: {p}" for p in list_of_docs]

#         encoded_inputs = self.tokenizer(
#             inputs,
#             return_tensors='pt',
#             padding=True,
#             truncation=True,
#             max_length=512
#         )

#         encoder_outputs = self.model.encoder(
#             input_ids=encoded_inputs.input_ids,
#             attention_mask=encoded_inputs.attention_mask
#         )

#         # Concatenate along sequence dimension (dim=1), not batch dimension (dim=0)
#         # fused_encoder_hidden_states = torch.cat(
#         # [encoder_outputs.last_hidden_state[i] for i in range(encoder_outputs.last_hidden_state.size(0))],
#         # dim=0  # concatenate sequences, not batches
#         # ).unsqueeze(0)  # add batch dimension back
#         fused_encoder_hidden_states = encoder_outputs.last_hidden_state.reshape(1, -1, encoder_outputs.last_hidden_state.size(-1))

#         # fused_attention_mask = torch.cat(
#         # [encoded_inputs.attention_mask[i] for i in range(encoded_inputs.attention_mask.size(0))],
#         # dim=0  # concatenate sequence masks
#         # ).unsqueeze(0)
#         fused_attention_mask = encoded_inputs.attention_mask.reshape(1, -1)

#         fused_encoder_outputs = BaseModelOutput(last_hidden_state=fused_encoder_hidden_states)

#         return fused_encoder_outputs, fused_attention_mask

#     def generate_answer(self, question, list_of_docs):
#         fused_encoder_outputs, fused_attention_mask = self.encode_and_fuse(question, list_of_docs)

#         outputs = self.model.generate(
#             encoder_outputs=fused_encoder_outputs,
#             attention_mask=fused_attention_mask,
#             max_length=100,
#             num_beams=4,
#             early_stopping=True
#         )

#         answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
#         return answer



In [None]:
# a better code for fid

from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

class FID:
    """Fusion-in-Decoder style answer generation on top of a T5 checkpoint.

    Each passage is encoded independently together with the question; the
    encoder states of all passages are then concatenated along the sequence
    axis so the decoder attends to every passage while generating one answer.
    """

    def __init__(self, model_name='allenai/unifiedqa-t5-base'):
        """Load the tokenizer and the seq2seq model for ``model_name``."""
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)

    def generate_answer(self, question, list_of_docs):
        """Generate one answer to ``question`` from all of ``list_of_docs``.

        Args:
            question: The question text.
            list_of_docs: Passages (strings) to use as evidence.

        Returns:
            The decoded answer string, or ``""`` when no passage is given.
        """
        if not list_of_docs:
            return ""

        # Imported here so the class is self-contained within this cell.
        from transformers.modeling_outputs import BaseModelOutput

        # NOTE(review): the UnifiedQA model card documents a
        # "question \n context" input format - confirm this template is intended.
        inputs = [f"question: {question} context: {p}" for p in list_of_docs]

        tokenized = self.tokenizer(
            inputs,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=512
        )

        input_ids = tokenized.input_ids  # shape: (num_passages, seq_len)
        attention_mask = tokenized.attention_mask

        # Encode every passage on its own. Passing the batch straight to
        # generate() would decode one independent answer per passage, and
        # reading outputs[0] would then ignore all passages but the first.
        with torch.no_grad():
            encoder_states = self.model.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask
            ).last_hidden_state  # (num_passages, seq_len, hidden)

        # Fuse: lay the passages end to end as one long sequence of batch
        # size 1. The mask is reshaped the same way so padding stays masked.
        fused_states = encoder_states.reshape(1, -1, encoder_states.size(-1))
        fused_mask = attention_mask.reshape(1, -1)

        outputs = self.model.generate(
            encoder_outputs=BaseModelOutput(last_hidden_state=fused_states),
            attention_mask=fused_mask,
            max_length=100,
            num_beams=4,
            early_stopping=True
        )

        # The fused input has batch size 1, so outputs[0] is the single answer.
        answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return answer




In [None]:
# testing fid function

fid_obj = FID()

# Retrieve the three best documents for question 8 and keep only their text.
top_3 = find_top_k_related_strings(questions[8], cleaned_texts)
top_3_docs = [doc_text for doc_text, _similarity in top_3]

print("Question:", questions[8])
print("Top 3 related docs:", top_3_docs)

answer = fid_obj.generate_answer(questions[8], top_3_docs)
print("Generated answer:", answer)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Question: how many appointments has David Kim missed?
Top 3 related docs: ['patient id: p1009 name: david kim age: 65 gender: male medical conditions: copd, hypertension current medications: tiotropium, amlodipine prescribed treatments: smoking cessation, inhaler use medical history: - 2024-03-25: symptoms: chronic cough, shortness of breath; diagnoses: copd; medications: tiotropium; adherence: moderate; side effects: dry mouth; barriers: difficulty quitting smoking clinical notes: patient advised on smoking cessation, moderate medication adherence. missed appointments: 1 non-compliance patterns: missed doses of inhaler lab results: - 2024-03-25: fev1 55% predicted', 'patient id: p1007 name: james wilson age: 50 gender: male medical conditions: depression, anxiety current medications: sertraline prescribed treatments: cognitive behavioral therapy, medication medical history: - 2023-12-20: symptoms: low mood, anxiety; diagnoses: major depressive disorder; medications: sertraline; adhere

In [None]:
# evaluating the answers of the methods (reranker and fid)

In [None]:
!pip install evaluate
!pip install rouge_score

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=4dcc44c7c1164243e71b079b2d5c912202f2de41f1ed4a9b3bc99d2955b0040b
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
import evaluate

# Load the ROUGE metric
rouge = evaluate.load('rouge')

# Build the FiD wrapper once: constructing it loads the tokenizer and the
# model weights, so doing it inside the loop would reload them per question.
fid_obj = FID()

# zip pairs each question with its reference answer by position; this avoids
# a linear questions.index() lookup, which would also pick the wrong
# reference if the same question appeared twice.
for question, real_answer in zip(questions, real_answers):

    # finding reranker answer
    reranker_answer = rag_with_reranker(question , cleaned_texts)
    # finding fid answer
    top_3 = find_top_k_related_strings(question, cleaned_texts)
    top_3_docs = [item[0] for item in top_3]
    fid_answer = fid_obj.generate_answer(question, top_3_docs)

    print(question)
    print('REAL ANSWER : ' , real_answer)
    print('RERANKER ANSWER : ',reranker_answer)
    print('SCORE : ' , rouge.compute(predictions=[reranker_answer], references=[real_answer]))
    print('FID ANSWER : ', fid_answer)
    print('SCORE : ' , rouge.compute(predictions=[fid_answer], references=[real_answer]))
    print('___________________________________________________________________')


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

what is id of Jhon Doe?
REAL ANSWER :  P1001
RERANKER ANSWER :  p1009
SCORE :  {'rouge1': np.float64(0.0), 'rouge2': np.float64(0.0), 'rougeL': np.float64(0.0), 'rougeLsum': np.float64(0.0)}
FID ANSWER :  p1009
SCORE :  {'rouge1': np.float64(0.0), 'rouge2': np.float64(0.0), 'rougeL': np.float64(0.0), 'rougeLsum': np.float64(0.0)}
___________________________________________________________________
how old is Jane Smith?
REAL ANSWER :  67
RERANKER ANSWER :  67
SCORE :  {'rouge1': np.float64(1.0), 'rouge2': np.float64(0.0), 'rougeL': np.float64(1.0), 'rougeLsum': np.float64(1.0)}
FID ANSWER :  67
SCORE :  {'rouge1': np.float64(1.0), 'rouge2': np.float64(0.0), 'rougeL': np.float64(1.0), 'rougeLsum': np.float64(1.0)}
___________________________________________________________________
what is gender of patient with id P1003?
REAL ANSWER :  male
RERANKER ANSWER :  male
SCORE :  {'rouge1': np.float64(1.0), 'rouge2': np.float64(0.0), 'rougeL': np.float64(1.0), 'rougeLsum': np.float64(1.0)}
FID 

In [None]:
#___________________________________________________TRASH CAN___________________________________________________________________________