In [28]:
from datasets import load_dataset
dataset = load_dataset(
    "json",
    data_files={
        "train": "/content/drive/MyDrive/medical-qa-rag/artifacts/train.json",
        "validation": "/content/drive/MyDrive/medical-qa-rag/artifacts/validation.json",
        "test": "/content/drive/MyDrive/medical-qa-rag/artifacts/test.json"
    }
)


In [29]:
dataset['train'][0]

{'instruction': 'Please answer with one of the option in the bracket',
 'input': "Q:A 55-year-old woman comes to the physician because of increased blurring of vision in both eyes for the past 4 months. She has tried using over-the-counter reading glasses, but they have not helped. She has a history of hypertension, type 2 diabetes mellitus, and chronic obstructive pulmonary disease. Current medications include lisinopril, insulin, metformin, and a fluticasone-vilanterol inhaler. Vital signs are within normal limits. Examination shows visual acuity of 20/70 in each eye. A photograph of the fundoscopic examination of the right eye is shown. Which of the following is the most appropriate next step in management?? \n{'A': 'Topical timolol therapy', 'B': 'Laser photocoagulation', 'C': 'Oral ganciclovir therapy', 'D': 'Ocular massage', 'E': 'Surgical vitrectomy'},",
 'output': 'B: Laser photocoagulation'}

In [30]:
from huggingface_hub import login
login("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")


In [31]:
from transformers import AutoTokenizer
model_name = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [32]:
print("Current pad token",tokenizer.pad_token)

Current pad token None


In [33]:
tokenizer.pad_token = tokenizer.eos_token

In [34]:
special_tokens_dict = {
    'additional_special_tokens':["<INSTRUCTION>", "</INSTRUCTION>",
                                 "<QUESTION>", "</QUESTION>",
                                  "<OUTPUT>", "</OUTPUT>"]
}
tokenizer.add_special_tokens(special_tokens_dict)

6

In [35]:
import re

def clean_text(text):
    # Remove escaped newline/backslashes
    text = text.replace("\\n", " ").replace("\\", "")
    # Remove repeated spaces
    text = re.sub(r"\s+", " ", text)
    # Remove weird leftover quotes/braces
    text = re.sub(r"[\'\"{}]+", "", text)
    return text.strip()

def label_dataset(qa_pair):
    qa_pair["instruction"] = clean_text(qa_pair["instruction"])
    qa_pair["input"] = clean_text(qa_pair["input"])
    qa_pair["output"] = clean_text(qa_pair["output"])

    qa_pair["text"] = f"<INSTRUCTION> {qa_pair['instruction']} </INSTRUCTION> " \
                      f"<QUESTION> {qa_pair['input']} </QUESTION>"
    qa_pair["label_text"] = f"<OUTPUT> {qa_pair['output']} </OUTPUT>"
    return qa_pair

dataset = dataset.map(label_dataset)

Map:   0%|          | 0/7327 [00:00<?, ? examples/s]

Map:   0%|          | 0/815 [00:00<?, ? examples/s]

Map:   0%|          | 0/2036 [00:00<?, ? examples/s]

In [36]:
dataset['train'][90]

{'instruction': 'Please answer with one of the option in the bracket',
 'input': 'Q:A 67-year-old woman comes to the physician with a 4-month history of chest pain that occurs on exertion. The pain is dull, and she experiences retrosternal pressure when she walks up the stairs to her apartment on the fifth floor. The pain disappears shortly after stopping for one minute. She has hypertension, for which she takes lisinopril and metoprolol daily. She does not smoke or drink alcohol. She is 158 cm (5 ft 2 in) tall and weighs 82 kg (180 lb); BMI is 33 kg/m2. Her pulse is 72/min and blood pressure is 140/85 mm Hg. Cardiac examination shows no murmurs, rubs, or gallops. Fasting lipid studies show: Total cholesterol 196 mg/dL LDL 110 mg/dL HDL 50 mg/dL A resting ECG shows no abnormalities. A week after uneventful initiation of aspirin, the patient is started on atorvastatin. This patient is most likely to develop which of the following?? A: Bloating, B: Myositis, C: Elevated transaminases, D:

In [37]:
max_length = 1024
def tokenize(batch):
  full_texts = [
      t + " " + l for t, l in zip(batch["text"], batch["label_text"])
  ]
  model_inputs = tokenizer(
      full_texts,
      max_length=max_length,
      padding="max_length",
      truncation=True,
      return_tensors="pt"
  )
  model_inputs["labels"] = model_inputs["input_ids"].clone() #labels are same as input ids in causal lm training
  return model_inputs


In [38]:
tokenized_dataset = dataset.map(
    tokenize,
    batched=True,
    remove_columns=dataset["train"].column_names
)
print(tokenizer.decode(tokenized_dataset["train"][0]["input_ids"]))

Map:   0%|          | 0/7327 [00:00<?, ? examples/s]

Map:   0%|          | 0/815 [00:00<?, ? examples/s]

Map:   0%|          | 0/2036 [00:00<?, ? examples/s]

<s><INSTRUCTION>  Please answer with one of the option in the bracket </INSTRUCTION>  <QUESTION>  Q:A 55-year-old woman comes to the physician because of increased blurring of vision in both eyes for the past 4 months. She has tried using over-the-counter reading glasses, but they have not helped. She has a history of hypertension, type 2 diabetes mellitus, and chronic obstructive pulmonary disease. Current medications include lisinopril, insulin, metformin, and a fluticasone-vilanterol inhaler. Vital signs are within normal limits. Examination shows visual acuity of 20/70 in each eye. A photograph of the fundoscopic examination of the right eye is shown. Which of the following is the most appropriate next step in management?? A: Topical timolol therapy, B: Laser photocoagulation, C: Oral ganciclovir therapy, D: Ocular massage, E: Surgical vitrectomy, </QUESTION>  <OUTPUT>  B: Laser photocoagulation </OUTPUT></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></

In [39]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)
from peft import LoraConfig, get_peft_model
import torch

model_name = "meta-llama/Llama-2-7b-hf"

# ✅ Define quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,     # optional, improves quantization
    bnb_4bit_quant_type="nf4",          # quantization type (nf4 = good default)
    bnb_4bit_compute_dtype=torch.float16  # dtype for computation
)

# ✅ Load model with quantization config
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,     # 👈 new argument replaces deprecated ones
    device_map="auto"                   # automatically uses GPU
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [40]:
# Resize model embeddings to include the new special tokens
model.resize_token_embeddings(len(tokenizer))


Embedding(32006, 4096)

In [41]:
from peft import LoraConfig, get_peft_model
from transformers import Trainer, DataCollatorForSeq2Seq

# 1️⃣ Apply LoRA to your base model
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

In [42]:
# Apply LoRA configuration to the 4-bit LLaMA 2 model
model = get_peft_model(model, lora_config)



In [43]:
# 2️⃣ Make sure LoRA layers are trainable
model.train()
model.print_trainable_parameters()  # Should print ~8M trainable params

# 3️⃣ Prepare data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,
    return_tensors="pt"
)

# 4️⃣ Create Trainer using default arguments (without TrainingArguments)
trainer = Trainer(
    model=model,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator
)


trainable params: 8,388,608 || all params: 6,746,853,376 || trainable%: 0.1243


In [None]:
from transformers import TrainingArguments, DataCollatorForSeq2Seq, Trainer

# ----------------------------
# 1️⃣ Training arguments
# ----------------------------
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/medical-qa-rag/notebooks/fine_tuned_model",
    per_device_train_batch_size=1,         # ↓ Reduce to prevent OOM
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,         # ↑ Effective batch size = 8
    learning_rate=1.5e-4,                  # Stable for LoRA fine-tuning
    num_train_epochs=1,
    fp16=True,                             # Half precision for memory & speed
    logging_steps=10,
    warmup_steps=30,
    max_steps=500,
    save_total_limit=2,
    load_best_model_at_end=False,
    eval_strategy="no",              # Disable eval during training to save memory
    save_strategy="steps",
    save_steps=100,
    report_to="none",
    gradient_checkpointing=False,          # Must be False with 4-bit + LoRA
    lr_scheduler_type="cosine",
)

# ----------------------------
# 2️⃣ Data collator
# ----------------------------
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,
    return_tensors="pt"
)

# ----------------------------
# 3️⃣ Trainer
# ----------------------------
trainer = Trainer(
    model=model,  # ✅ LoRA-applied 4-bit model
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],  # Optional, won't run because eval_strategy="no"
    data_collator=data_collator
)

# ----------------------------
# 4️⃣ Start training
# ----------------------------
trainer.train()


Step,Training Loss
10,8.6589
20,5.4712
30,2.4658
40,1.641
50,1.3046
60,0.9371
70,0.7513
80,0.599
90,0.5106
100,0.4775




KeyboardInterrupt: 

In [None]:
model.save_pretrained("/content/drive/MyDrive/medical-qa-rag/notebooks/fine_tuned_model_lora")
tokenizer.save_pretrained("/content/drive/MyDrive/medical-qa-rag/notebooks/fine_tuned_model_lora")




('/content/drive/MyDrive/medical-qa-rag/notebooks/fine_tuned_model_lora/tokenizer_config.json',
 '/content/drive/MyDrive/medical-qa-rag/notebooks/fine_tuned_model_lora/special_tokens_map.json',
 '/content/drive/MyDrive/medical-qa-rag/notebooks/fine_tuned_model_lora/tokenizer.model',
 '/content/drive/MyDrive/medical-qa-rag/notebooks/fine_tuned_model_lora/added_tokens.json',
 '/content/drive/MyDrive/medical-qa-rag/notebooks/fine_tuned_model_lora/tokenizer.json')

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import torch
from datasets import load_dataset

# ---------------------------
# 1️⃣ Load tokenizer & dataset
# ---------------------------
tokenizer = AutoTokenizer.from_pretrained(
    "/content/drive/MyDrive/medical-qa-rag/notebooks/fine_tuned_model_lora"
)
tokenizer.pad_token = tokenizer.eos_token

dataset = load_dataset(
    "json",
    data_files={
        "test": "/content/drive/MyDrive/medical-qa-rag/artifacts/test.json"
    }
)
print("Loaded test dataset:", len(dataset["test"]))

# ---------------------------
# 2️⃣ Load base model in 4-bit
# ---------------------------
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    quantization_config=bnb_config,
    device_map="auto"
)

# ✅ Resize embeddings to match tokenizer BEFORE loading LoRA
base_model.resize_token_embeddings(len(tokenizer))
print("Resized base model embedding to:", len(tokenizer))

# ---------------------------
# 3️⃣ Load fine-tuned LoRA adapter
# ---------------------------
model = PeftModel.from_pretrained(
    base_model,
    "/content/drive/MyDrive/medical-qa-rag/notebooks/fine_tuned_model_lora"
)
model.eval()
print("✅ Loaded fine-tuned LoRA model successfully!")


Loaded test dataset: 2036


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Resized base model embedding to: 32006
✅ Loaded fine-tuned LoRA model successfully!


In [2]:
from transformers import pipeline

# Create generation pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Pick a few samples from your test dataset
for i in range(3):
    sample = dataset["test"][i]
    prompt = f"<INSTRUCTION> {sample['instruction']} </INSTRUCTION> " \
             f"<QUESTION> {sample['input']} </QUESTION>"

    print(f"\n🩺 Instruction: {sample['instruction']}")
    print(f"❓ Question: {sample['input']}")
    print("🔮 Model output:")

    output = pipe(
        prompt,
        max_new_tokens=200,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
    )[0]["generated_text"]

    # extract model output after <OUTPUT>
    if "<OUTPUT>" in output:
        output = output.split("<OUTPUT>")[-1].split("</OUTPUT>")[0].strip()

    print(output)


In [13]:
from transformers import pipeline
import evaluate
from datasets import Dataset
import torch

# ⚙️ Smaller generation settings for Colab T4
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    dtype=torch.bfloat16  # faster on T4
)

# ---------------------------
# 2️⃣ Load metrics
# ---------------------------
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

# ---------------------------
# 3️⃣ Prepare prompts and references
# ---------------------------
test_data = dataset["test"]

prompts = [
    f"<INSTRUCTION> {ex['instruction']} </INSTRUCTION> <QUESTION> {ex['input']} </QUESTION>"
    for ex in test_data
][:50]  # evaluate only 50 samples

references = [ex["output"].strip() for ex in test_data][:50]

ds = Dataset.from_dict({"prompt": prompts, "reference": references})

# ---------------------------
# 4️⃣ Generate predictions in batches
# ---------------------------
predictions = []

for i in range(0, len(ds), 2):  # small batch size for T4
    batch_prompts = ds[i : i + 2]["prompt"]

    outputs = pipe(
        batch_prompts,
        max_new_tokens=60,
        do_sample=False,
    )

    for out in outputs:
        text = out[0]["generated_text"]
        if "<OUTPUT>" in text:
            text = text.split("<OUTPUT>")[-1].split("</OUTPUT>")[0].strip()
        predictions.append(text)

# ---------------------------
# 5️⃣ Compute metrics
# ---------------------------
rouge_result = rouge.compute(predictions=predictions, references=references)
bertscore_result = bertscore.compute(predictions=predictions, references=references, lang="en")

print("✅ Evaluation Complete!")
print("ROUGE-L:", rouge_result["rougeL"])
print("BERTScore (F1):", sum(bertscore_result["f1"]) / len(bertscore_result["f1"]))


Device set to use cuda:0


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Evaluation Complete!
ROUGE-L: 0.05032862321035386
BERTScore (F1): 0.798479962348938


In [18]:
# Number of samples you want to evaluate
num_samples = 50

prompts = [
    f"<INSTRUCTION> {dataset['test']['instruction'][i]} </INSTRUCTION> "
    f"<QUESTION> {dataset['test']['input'][i]} </QUESTION>"
    for i in range(num_samples)
]

reference_texts = [dataset['test']['output'][i] for i in range(num_samples)]


In [26]:
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.meteor_score import meteor_score
import nltk

# ✅ Download WordNet for METEOR
nltk.download("wordnet")

# ---------------------------
# 6️⃣ Compute BLEU
# ---------------------------
# Prepare references and candidates for BLEU
reference_corpus = [[ref.split()] for ref in references]
candidate_corpus = [pred.split() for pred in predictions]

bleu_score = corpus_bleu(reference_corpus, candidate_corpus)

# ---------------------------
# 7️⃣ Compute METEOR
# ---------------------------
meteor_scores = [
    meteor_score([ref.split()], pred.split())
    for ref, pred in zip(references, predictions)
]
avg_meteor = sum(meteor_scores) / len(meteor_scores)

# ---------------------------
# 8️⃣ Print additional metrics
# ---------------------------
print(f"✅ BLEU Score: {bleu_score:.4f}")
print(f"✅ METEOR Score: {avg_meteor:.4f}")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


✅ BLEU Score: 0.0107
✅ METEOR Score: 0.1319


In [44]:
from sentence_transformers import SentenceTransformer, util
import torch

# Load a semantic embedding model
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode references and predictions
ref_embeds = embed_model.encode(references, convert_to_tensor=True)
pred_embeds = embed_model.encode(predictions, convert_to_tensor=True)

# Cosine similarity
cos_sim = util.cos_sim(pred_embeds, ref_embeds)
avg_similarity = torch.diagonal(cos_sim).mean().item()

print(f"✅ Average Semantic Similarity: {avg_similarity:.4f}")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Average Semantic Similarity: 0.3114
