In [101]:
# !pip install trl==0.7.4
# !pip install datasets
# !pip install transformers==4.38.2
# !pip install peft==0.10.0
# !pip install accelerate==0.28.0
# test commit

In [1]:
import os
os.environ["HF_HOME"] = r"D:\hf-cache"

In [3]:
import random

import numpy as np
import torch
import pandas as pd

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    default_data_collator,
)

def set_seed(seed_val=42):
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

#Configuration options
train_batch_size = 16
gradient_accumulation_steps = 1
learning_rate = 1e-5
eval_batch_size = 1
eval_steps = 500
max_input_length = 550
save_steps = 1000
num_train_epochs = 20
random.seed(42)




  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)

from sklearn.model_selection import train_test_split

## Creating the policy model for human Evaluation

In [7]:
df = pd.read_csv("combined_clinical_notes.csv")

# First split: 60% train, 40% temp (eval + test)
train_df, temp_df = train_test_split(df, test_size=0.4, random_state=42)

# Second split: 50% of temp (which is 20% of total) for eval, 50% for test
eval_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [8]:
for column in df.columns:
    print(column)  # Prints each column name

dataset
encounter_id
dialogue
note
source_file
id


In [9]:
import json

import pandas as pd
import torch
from datasets import load_dataset
from torch.utils.data import Dataset

# 1) 4-bit quant config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# 2) Load base model in 4-bit
base_model = AutoModelForCausalLM.from_pretrained(
    "HPAI-BSC/Qwen2.5-Aloe-Beta-7B",
    quantization_config=bnb_config,
    device_map="auto",
)
base_model = prepare_model_for_kbit_training(base_model)

tokenizer = AutoTokenizer.from_pretrained("HPAI-BSC/Qwen2.5-Aloe-Beta-7B")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# 3) Apply LoRA
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], 
)
model = get_peft_model(base_model, lora_config)

# 4) Enable gradient checkpointing
# model.enable_input_require_grads()
# model.gradient_checkpointing_enable()
base_model.config.use_cache = False

# 5) Prepare data with smaller sequence length
# notechat = load_dataset("akemiH/NoteChat")
# trainB = notechat["train"].select(range(3000))
# evalB = notechat["train"].select(range(3000, 3500))

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading shards:   0%|                                                                        | 0/4 [03:01<?, ?it/s]

KeyboardInterrupt



In [None]:
class TLDRDataset(Dataset):
    def __init__(self, train_path, tokenizer, split, max_length=4000):
        self.post_list = []
        dataset = test_df
        self.labels = []

        for sample in dataset.iterrows():
            self.post_list.append(sample[1]["dialogue"])
            self.labels.append(sample[1]["note"])

        self.tokenizer = tokenizer
        self.max_length = max_length
        self.input_ids = []
        self.attn_masks = []

    def __len__(self):
        return len(self.post_list)

    def __getitem__(self, idx):
        conversation = self.post_list[idx]
        summary = self.labels[idx]
        # label = self.labels[idx]

        # encodings_dict = self.tokenizer(txt, truncation=True, max_length=self.max_length, padding="max_length")
        # input_ids = torch.tensor(encodings_dict["input_ids"])
        # attn_masks = torch.tensor(encodings_dict["attention_mask"])
        # return {
        #     "input_ids": input_ids,
        #     "attention_mask": attn_masks,
        #     "labels": labels_ids,
        # }

        txt = f"CONVERSATION:\n{conversation}\n\nSUMMARY: \n{summary}" #IMPORTANT!!!!!!!!!!

        encodings = self.tokenizer(
            txt,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        )
         # Labels should be the same as input_ids for causal LM training
        # The model will automatically shift labels internally
        encodings["labels"] = encodings["input_ids"].clone()
        
        return {
            "input_ids": encodings["input_ids"].squeeze(),
            "attention_mask": encodings["attention_mask"].squeeze(),
            "labels": encodings["labels"].squeeze(),
        }

In [None]:
# model.resize_token_embeddings(len(tokenizer))
# model.config.end_token_id = tokenizer.eos_token_id
tokenizer.pad_token_id = tokenizer.eos_token_id
model.config.pad_token_id = model.config.eos_token_id

# Disable caching (already done, but double-check)
model.config.use_cache = False

# Enable gradient checkpointing (already done, but confirm)
# model.gradient_checkpointing_enable()

In [None]:
# Set up the datasets
data_path = "NA"
train_dataset = TLDRDataset(
    data_path,
    tokenizer,
    "train",
    max_length=256,
)


In [None]:
for i in train_dataset:
    print(i["input_ids"], i["labels"])
    break

In [None]:
torch.cuda.set_device(0)

In [None]:
output_dir = r"D:\kshitij-weights-folder\qwen-aloe-9-4-base-fine-tune-logs"

In [None]:
# Prepare the trainer and start training
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=learning_rate,
    per_device_train_batch_size=train_batch_size,
#     per_device_eval_batch_size=eval_batch_size,
    fp16=True,
    gradient_accumulation_steps=8,
    num_train_epochs=2,
    warmup_steps=100,
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    dataloader_pin_memory=True,
    dataloader_drop_last=True,
    gradient_checkpointing=True,
)

In [None]:
training_args.device.index

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
#     compute_metrics=compute_metrics,
#     data_collator=default_data_collator,
#     preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    tokenizer=tokenizer,
)
trainer.train()
# trainer.save_model(output_dir)

In [None]:
# trainer.save_model(r"D:\kshitij-weights-folder\qwen-aloe-9-4-base-fine-tune")   ##path to save policy model
# tokenizer.save_pretrained(r"D:\kshitij-weights-folder\qwen-aloe-9-4-base-fine-tune")
# model.save_pretrained(r"D:\kshitij-weights-folder\qwen-aloe-9-4-base-fine-tune-peft-adapaters")  

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("D:\kshitij-weights-folder\qwen-aloe-9-4-base-fine-tune-logs")
model_path = "HPAI-BSC/Qwen2.5-Aloe-Beta-7B"

tokenizer = AutoTokenizer.from_pretrained(model_path, truncation=True, max_length=256, padding="max_length")
text = train_df.iloc[2]["dialogue"]
tokenized_text = tokenizer(text, return_tensors="pt", max_length=256)
tokenized_text

# Policy Model Tuning 

In [1]:
import os
os.environ["HF_HOME"] = r"D:\hf-cache"

In [2]:
import torch
import transformers
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling
from trl import RewardTrainer
from datasets import Dataset
import json
import pandas as pd
from transformers import Trainer, TrainingArguments
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead, create_reference_model
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
##model path
# MODEL_PATH = "/content/drive/MyDrive/Medical Dialogue Summarization using PPO/rm_model"
MODEL_PATH = r"D:\kshitij-weights-folder\qwen-aloe-9-4-base-fine-tune"

In [4]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], 
)

In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from trl import AutoModelForCausalLMWithValueHead
from transformers import BitsAndBytesConfig  

# ---- Device Setup ----
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# ---- Paths ----
MODEL_PATH = r"D:\kshitij-weights-folder\qwen-aloe-9-4-base-fine-tune"
PEFT_ADAPTER_PATH = r"D:\kshitij-weights-folder\qwen-aloe-9-4-base-fine-tune-peft-adapaters"
REF_MODEL_PATH = r"D:\kshitij-weights-folder\qwen-aloe-9-4-base-fine-tune"

# ---- 1) 4-bit Quantization Configuration ----
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# ---- 2) Load Base Model in 4-bit ----
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    quantization_config=bnb_config,
    device_map="auto",
)
# Prepare the model for k-bit training (this typically freezes most parameters except adapter ones)
base_model = prepare_model_for_kbit_training(base_model)
base_model.gradient_checkpointing_disable()  # Disable checkpointing

# ---- 3) Load Tokenizer ----
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# ---- 4) Load the PEFT Adapter (LoRA) ----
# This reloads your fine-tuned adapter weights onto your base model.
model_with_lora = PeftModel.from_pretrained(base_model, PEFT_ADAPTER_PATH)

# ---- 5) Convert to PPO-Compatible ValueHead Model ----
# When converting, pass the peft_config from the adapter model to ensure proper initialization.
ppo_model = AutoModelForCausalLMWithValueHead.from_pretrained(
    model_with_lora,
    peft_config=lora_config
).to(device)

# ---- 6) Optionally, Load a Reference Model for KL (e.g., reward model) ----
ppo_model_ref = AutoModelForCausalLMWithValueHead.from_pretrained(
    model_with_lora,
).to(device)
ppo_model_ref.eval()  # Disable dropout/etc
for param in ppo_model_ref.parameters():
    param.requires_grad = False  # Freeze all





Using device: cuda


Loading checkpoint shards: 100%|█████████████| 4/4 [00:10<00:00,  2.58s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH) ## tokenizer of step 1 model., here since we are using same model for step 1 and 2 it doesnot matter
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# tok = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False)
# tok.pad_token = tok.eos_token
# tok.padding_side = "left"
# ppo_model.config.pad_token_id = tok.eos_token_id
# ppo_model.resize_token_embeddings(len(tok))

In [7]:
sentiment_pipe_kwargs = {"top_k": None, "function_to_apply": "none"}
project_kwargs={"logging_dir": r"D:\kshitij-weights-folder\qwen-aloe-9-4-rl-logs"}

config = PPOConfig(
    model_name=MODEL_PATH, ppo_epochs=1, project_kwargs=project_kwargs, gradient_accumulation_steps=2, steps=5, batch_size=2, mini_batch_size=1, learning_rate=2e-5, log_with='tensorboard')

In [8]:
import torch
import bitsandbytes as bnb
# optimizer = torch.optim.SGD(starcoder_model.parameters(), lr=config.learning_rate)

optimizer = bnb.optim.Adam8bit(ppo_model.parameters(), lr=config.learning_rate)

ppo_trainer = PPOTrainer(config, ppo_model,ppo_model_ref, tokenizer, optimizer=optimizer)



In [9]:
from torch.utils.data import Dataset, DataLoader

df = pd.read_csv("combined_clinical_notes.csv")[["dialogue", "note"]]

class ClinDS(Dataset):
    def __init__(self, df, tok, max_len=512):
        self.df = df.reset_index(drop=True)
        self.tok = tok
        self.L = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i):
        conv = str(self.df.iloc[i]["dialogue"])
        ref = str(self.df.iloc[i]["note"])
        prompt = f"Summarize the following conversation:\n\n{conv}"
        enc = self.tok(
            prompt,
            truncation=True,
            padding="max_length",
            max_length=self.L,
            return_tensors="pt",
        )
        return {
            "input_ids": enc.input_ids.squeeze(),
            "attention_mask": enc.attention_mask.squeeze(),
            "src_txt": prompt,
            "ref_txt": ref,
        }

loader = DataLoader(
    ClinDS(df.sample(200, random_state=0), tokenizer),
    batch_size=1, shuffle=True, pin_memory=True, drop_last=True
)

In [10]:
from random import choices
from tqdm import tqdm
import time
import numpy as np

In [11]:
gen_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "max_new_tokens": 32,
    "eos_token_id": -1,
}

In [12]:
MEDICAL_PROMPT = """
Please generate a medical summary based on the following clinical notes. The summary should include the following sections: 

CHIEF COMPLAINT
A concise statement of the patient's primary concern or reason for visiting the clinic.

HISTORY OF PRESENT ILLNESS
A detailed narrative about the patient's symptoms, their onset, duration, and any relevant medical history or previous treatments.

VITALS
Include any relevant vital signs (e.g., oxygen saturation, blood pressure) if available.

PHYSICAL EXAM 
Summarize the findings from the physical examination, including any notable abnormalities.

RESULTS 
Summarize the results of any diagnostic tests performed (e.g., lab work, imaging studies).

ASSESSMENT
The doctor's assessment of the patient's condition or diagnosis.

PLAN
The treatment plan, including prescribed medications, lifestyle recommendations, and follow-up instructions.

INSTRUCTIONS
Specific instructions for the patient regarding their treatment plan and follow-up care.

Important Note: If any section lacks relevant information, omit that section from the generated summary. Only include sections for which there is sufficient information available.
"""

In [13]:
import sys
sys.path.insert(0, r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\UniEval")

from utils import convert_to_json
from metric.evaluator import get_evaluator

sum_eval = get_evaluator("summarization", device="cpu")

@torch.inference_mode()
def unieval_4way(src, hyp, ref):
    """
    src, hyp, ref: lists of strings, length B
    returns: Tensor (B,4) with [coherence, consistency, fluency, relevance]
    """
    data = convert_to_json(
        output_list=hyp,
        src_list=src,
        ref_list=ref,
    )
    raw = sum_eval.evaluate(data)
    scores = [
        [d["coherence"], d["consistency"], d["fluency"], d["relevance"]]
        for d in raw
    ]
    return torch.tensor(scores, dtype=torch.float32)  # CPU (B,4)



In [14]:
from random import choices
from tqdm import tqdm
import time
import numpy as np
import traceback

for epoch in range(1):
    for batch_idx, batch in enumerate(loader):
        try:  # Wrap the entire batch processing in try-catch
            # Prepare inputs
            ids = batch["input_ids"].to(DEVICE)
            attn_mask = batch["attention_mask"].to(DEVICE)
            src_txt = batch["src_txt"]  # list[str]
            ref_txt = batch["ref_txt"]  # list[str]

            # Generate multiple candidates per prompt
            NUM_CANDIDATES = 2
            all_outs = []
            for _ in range(NUM_CANDIDATES):
                with torch.no_grad():
                    out = ppo_model.generate(
                        input_ids=ids,
                        attention_mask=attn_mask,
                        **gen_kwargs
                    )
                all_outs.append(out)

            # Stack outputs (B, K, L)
            outs = torch.stack(all_outs, dim=1)

            # Decode all candidates
            hyps = [
                [tokenizer.decode(outs[b, k], skip_special_tokens=True)
                for k in range(NUM_CANDIDATES)]
                for b in range(outs.size(0))
            ]

            # Compute rewards using UniEval and dominance scoring
            rewards = []
            for b in range(len(src_txt)):
                # Get scores for all candidates (K, 4)
                scores = unieval_4way(
                    [src_txt[b]] * NUM_CANDIDATES,
                    hyps[b],
                    [ref_txt[b]] * NUM_CANDIDATES
                ).numpy()

                # Compute dominance counts
                dom_counts = np.zeros(NUM_CANDIDATES)
                for i in range(NUM_CANDIDATES):
                    for j in range(NUM_CANDIDATES):
                        if i == j:
                            continue
                        # Check if i dominates j
                        if np.all(scores[i] >= scores[j]) and np.any(scores[i] > scores[j]):
                            dom_counts[i] += 1

                # Normalize to [-1, 1]
                max_dom = NUM_CANDIDATES - 1
                scalar_rewards = 2 * (dom_counts / max_dom) - 1
                rewards.append(scalar_rewards)  

            # Flatten for PPO
            flat_queries = []
            flat_responses = []
            flat_rewards = []

            for b in range(len(src_txt)):
                for k in range(NUM_CANDIDATES):
                    flat_queries.append(ids[b])
                    flat_responses.append(outs[b, k])
                    flat_rewards.append(torch.tensor([rewards[b][k]], device=DEVICE))

            # PPO step with error handling
            try:
                stats = ppo_trainer.step(
                    queries=flat_queries,
                    responses=flat_responses,
                    scores=flat_rewards
                )
            except ValueError as e:
                print(f"⚠️ PPO step failed (batch {batch_idx}): {str(e)}")
                print("Traceback:", traceback.format_exc())
                print("Skipping this batch and continuing...")
                exit()  # Skip to next batch

            # Logging
            if batch_idx % 10 == 0:
                print(f"Epoch {epoch+1}, Batch {batch_idx}")
                
        except Exception as e:  # Catch any other unexpected errors
            print(f"⚠️ Unexpected error in batch {batch_idx}: {str(e)}")
            print("Traceback:", traceback.format_exc())
            print("Skipping this batch and continuing...")
            exit()

    print(f"✅ Epoch {epoch+1}/3 complete")

print("🎉 PPO fine-tuning done")

Evaluating coherence of 2 samples !!!


100%|████████████████████████████████████████| 1/1 [00:04<00:00,  4.57s/it]


Evaluating consistency of 2 samples !!!


100%|████████████████████████████████████████| 1/1 [00:04<00:00,  4.09s/it]


Evaluating fluency of 2 samples !!!


100%|████████████████████████████████████████| 1/1 [00:02<00:00,  2.18s/it]


Evaluating relevance of 2 samples !!!


100%|████████████████████████████████████████| 1/1 [00:04<00:00,  4.05s/it]


Epoch 1, Batch 0
Evaluating coherence of 2 samples !!!


100%|████████████████████████████████████████| 1/1 [00:04<00:00,  4.52s/it]


Evaluating consistency of 2 samples !!!


100%|████████████████████████████████████████| 1/1 [00:04<00:00,  4.03s/it]


Evaluating fluency of 2 samples !!!


100%|████████████████████████████████████████| 1/1 [00:02<00:00,  2.31s/it]


Evaluating relevance of 2 samples !!!


100%|████████████████████████████████████████| 1/1 [00:03<00:00,  3.91s/it]


Evaluating coherence of 2 samples !!!


100%|████████████████████████████████████████| 1/1 [00:04<00:00,  4.40s/it]


Evaluating consistency of 2 samples !!!


100%|████████████████████████████████████████| 1/1 [00:04<00:00,  4.08s/it]


Evaluating fluency of 2 samples !!!


100%|████████████████████████████████████████| 1/1 [00:02<00:00,  2.13s/it]


Evaluating relevance of 2 samples !!!


100%|████████████████████████████████████████| 1/1 [00:04<00:00,  4.04s/it]


Evaluating coherence of 2 samples !!!


100%|████████████████████████████████████████| 1/1 [00:04<00:00,  4.39s/it]


Evaluating consistency of 2 samples !!!


100%|████████████████████████████████████████| 1/1 [00:04<00:00,  4.14s/it]


Evaluating fluency of 2 samples !!!


100%|████████████████████████████████████████| 1/1 [00:02<00:00,  2.13s/it]


Evaluating relevance of 2 samples !!!


  0%|                                                | 0/1 [00:00<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# ###saving the model
# # starcoder_model.save_pretrained("rhlfmodel/")
# # starcoder_tokenizer.save_pretrained("rhlfmodel/")

# ppo_trainer.model.pretrained_model.save_pretrained("D:\kshitij-weights-folder\qwen-aloe-rl-ppo-dom-10-5")
# tokenizer.save_pretrained("D:\kshitij-weights-folder\qwen-aloe-rl-ppo-dom-10-5")

# # if isinstance(ppo_trainer.model.pretrained_model, PeftModel):
# #     ppo_trainer.model.pretrained_model.save_adapter(
# #         "D:/kshitij-weights-folder/qwen-aloe-rl-ppo-dom-10-5-lora",
# #         "lora_adapter"
# #     )

### Testing

In [34]:
import os
os.environ["HF_HOME"] = r"D:\hf-cache"

In [36]:
import torch
import transformers
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling
from trl import RewardTrainer
from datasets import Dataset
import json
import pandas as pd
from transformers import Trainer, TrainingArguments
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead, create_reference_model
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, pipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from transformers import BitsAndBytesConfig  
from tqdm import tqdm

In [38]:
DATA = r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\combined_clinical_notes.csv"
df = pd.read_csv(DATA)

# First split: 60% train, 40% temp (eval + test)
train_df, temp_df = train_test_split(df, test_size=0.4, random_state=42)

# Second split: 50% of temp (which is 20% of total) for eval, 50% for test
eval_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
train_df.head(1)

Unnamed: 0,dataset,encounter_id,dialogue,note,source_file,id
246,aci,,[doctor] so tyler is a 56 -year-old male who p...,SUBJECTIVE\n\nDifficulty swallowing. Tyler Gre...,src_experiment_data\test1_aci_asrcorr.csv,ACI084


In [40]:
# -- Path to your LoRA weights + tokenizer --
model_dir = r"D:\kshitij-weights-folder\qwen-ppo-tuned-9-5"  

# -- 4-bit quantization config (same as training) --
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# -- 1) Load the *base* Qwen2.5 model in 4-bit --
base_model = AutoModelForCausalLM.from_pretrained(
    "HPAI-BSC/Qwen2.5-Aloe-Beta-7B",
    quantization_config=bnb_config,
    device_map="auto",
)

# model = AutoModelForCausalLM.from_pretrained(base_model_name)
# model = PeftModel.from_pretrained(model, adapter_model_name)

# -- 2) Load your fine-tuned LoRA adapters into the base model --
# The directory should contain adapter_model.bin, adapter_config.json, etc.
lora_model = PeftModel.from_pretrained(
    base_model, 
    model_dir,
    device_map="auto",
)

# -- 3) Load the tokenizer you saved to ./aloe_qwen --
tokenizer = AutoTokenizer.from_pretrained(model_dir)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

Loading checkpoint shards: 100%|█████████████| 4/4 [00:11<00:00,  2.87s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [42]:
def return_prompt(conv):
    prompt = f"""
    CONVERSATION:
    {conv}
    
    SUMMARY:
    Please generate a medical summary based on the following clinical notes. The summary should include the following sections: Chief Complaint, History of Present Illness, Vitals, Physical Exam, Results, Assessment, Plan, and Instructions.
    Please format the response as plain text, without using markdown or special formatting, and with clear headings for each section, like this:
    
    
    CHIEF COMPLAINT
    A concise statement of the patient's primary concern or reason for visiting the clinic.
    
    HISTORY OF PRESENT ILLNESS
    A detailed narrative about the patient's symptoms, their onset, duration, and any relevant medical history or previous treatments.
    
    VITALS
    Include any relevant vital signs (e.g., oxygen saturation, blood pressure) if available.
    
    PHYSICAL EXAM 
    Summarize the findings from the physical examination, including any notable abnormalities.
    
    RESULTS 
    Summarize the results of any diagnostic tests performed (e.g., lab work, imaging studies).
    
    ASSESSMENT
    The doctor's assessment of the patient's condition or diagnosis.
    
    PLAN
    The treatment plan, including prescribed medications, lifestyle recommendations, and follow-up instructions.
    
    INSTRUCTIONS
    Specific instructions for the patient regarding their treatment plan and follow-up care.
    
    Important Note: If any section lacks relevant information or if specific details are not provided (e.g., vitals are not mentioned, no abnormal findings in the physical exam), omit that section from the generated summary. Only include sections for which there is sufficient information available.
    """

    return prompt

In [44]:
eval_df = test_df

eval_df.head(1)

Unnamed: 0,dataset,encounter_id,dialogue,note,source_file,id
172,aci,D2N053,[doctor] so barbara i i know you are here for ...,CHIEF COMPLAINT\n\nItchy scalp pain.\n\nREVIEW...,challenge_data\train.csv,


In [46]:
from transformers import pipeline
import re
from tqdm import tqdm
import json
import os
from datetime import datetime

# Create a directory for the output files if it doesn't exist
output_dir = "structured_predictions"
os.makedirs(output_dir, exist_ok=True)

# Initialize summarizer pipeline
summarizer = pipeline(
    "text-generation",
    model=lora_model,
    tokenizer=tokenizer,
    device_map="auto",
)

eval_df = df[:1]  # Using first 3 samples for evaluation
# Prepare batching parameters
batch_size = 4
num_samples = len(eval_df)
num_batches = (num_samples // batch_size) + int(num_samples % batch_size != 0)

# Lists to store predictions and references
predictions = []
structured_predictions = []  # To store extracted sections
references = []

# Batching loop to generate summaries
for i in tqdm(range(num_batches), desc="Generating Summaries"):
    start = i * batch_size
    end = min(start + batch_size, num_samples)

    batch_structured_predictions = []  
    
    # Extract conversation and reference summary columns
    batch_conversations = eval_df["dialogue"][start:end].tolist()
    batch_refs = eval_df["note"][start:end].tolist()

    # Prepare prompts
    prompts = [return_prompt(conv) for conv in batch_conversations]
    
    # Generate summaries
    results = summarizer(
        prompts,
        max_new_tokens=900,
        do_sample=False,
    )
    
    # Process results
    for j, item in enumerate(results):
        output_dict = item[0]
        generated_text = output_dict["generated_text"]
        current_prompt = prompts[j]
        
        # Extract the generated summary (after prompt)
        cleaned = generated_text[len(current_prompt):].strip()
        predictions.append(cleaned)
        
        # Extract structured sections from the generated summary
        # extracted_sections = extract_sections(cleaned)
        # batch_structured_predictions.append(extracted_sections)
        # structured_predictions.append(extracted_sections)
    
    # Append the reference texts
    references.extend(batch_refs)
    # timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # batch_filename = f"{output_dir}/batch_{i}_{timestamp}.json"
    
    # with open(batch_filename, 'w') as f:
    #     json.dump({
    #         "batch_number": i,
    #         "start_index": start,
    #         "end_index": end,
    #         "structured_predictions": batch_structured_predictions,
    #         "references": batch_refs
    #     }, f, indent=2)
    
    # print(f"Saved batch {i} predictions to {batch_filename}")


# Now you have:
# predictions - raw generated summaries
# structured_predictions - dictionary of extracted sections for each summary
# references - ground truth notes

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCa

In [47]:
print(predictions[0])

CHIEF COMPLAINT
The patient presented with a chief complaint of high blood pressure, which was noted to be significantly elevated during an emergency room visit. She also reported experiencing lightheadedness and a recent episode of near-fainting while walking due to high blood pressure.

HISTORY OF PRESENT ILLNESS
Diane, a 28-year-old female with a history of depression and hypertension, presented for emergency room follow-up. She reported that her blood pressure was very high during the ER visit, reaching almost 200. She experienced a headache and lightheadedness, which led to a fall that was prevented by her boyfriend. Diane has a history of her blood pressure "skyrocketing" once a week or month, often associated with poor diet and lack of adherence to her medication regimen when traveling. She has been taking lisinopril as prescribed and has been compliant with her medication when at home. She recently started therapy for her depression, which she attends weekly, and reports having

In [None]:
src_list = eval_df["dialogue"].tolist()
ref_list = eval_df["note"].tolist()

In [None]:
output_list = []
for pred in predictions:
    # Ensure that "Summary:" exists in the string to avoid errors
    if len(pred) > 0:
        output_list.append(pred)
    else:
        # Handle cases where "Summary:" is missing (optional)
        output_list.append("")  # Or handle differently based on your needs

In [None]:
import sys
sys.path.append(r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\UniEval")

data = convert_to_json(
    src_list=src_list,
    ref_list=ref_list,
    output_list=output_list
)

filtered_data = [
    entry for entry in data
    if entry["system_output"].strip()  # Ensure non-empty system_output
]



In [None]:
print(predictions[0])

In [None]:
task = 'summarization'

# Initialize evaluator for a specific task
evaluator = get_evaluator(task)
evaluator1 = get_evaluator('fact')

In [None]:
# Get multi-dimensional evaluation scores
eval_scores = evaluator.evaluate(data, print_result=True, dims=['consistency', 'fluency', 'relevance','coherence'])

In [None]:
eval_scores1 = evaluator1.evaluate(data, print_result=True)

In [None]:
import json

# Load JSON data from a file
with open("structured_predictions/batch_0_20250508_200636.json", 'r', encoding='utf-8') as file:
    data = json.load(file)

In [None]:
for key, value in data['structured_predictions'][1].items():
    print(f"{key}:\n{value}")