In [None]:
pip install -q "transformers==4.38.2" datasets torch pandas bitsandbytes tqdm "accelerate==0.28.0" "trl==0.7.4" "peft==0.10.0"

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:

# ──────────┐
# 0) Monkey‑patch accelerate to drop dispatch_batches (fixes Trainer bug on 4.38.2)
# ──────────┘
import accelerate
from accelerate import Accelerator
_acc_init = Accelerator.__init__
def _patched_acc_init(self, *args, **kwargs):
    kwargs.pop("dispatch_batches", None)
    return _acc_init(self, *args, **kwargs)
Accelerator.__init__ = _patched_acc_init

In [None]:
import transformers
import transformers.modeling_utils as _mod_utils

# if it's already there (unlikely), skip
if not hasattr(_mod_utils, "EncoderDecoderCache"):
    class EncoderDecoderCache:
        """
        Dummy placeholder so Seq2SeqTrainer can import it.
        No functional cache behavior — Trainer won’t actually use it.
        """
        def __init__(self, **kwargs): pass

    # inject into both the submodule and top‐level namespace
    _mod_utils.EncoderDecoderCache    = EncoderDecoderCache
    transformers.EncoderDecoderCache  = EncoderDecoderCache

In [None]:
import transformers
import transformers.modeling_utils as _mod_utils

# if it's already there (unlikely), skip
if not hasattr(_mod_utils, "EncoderDecoderCache"):
    class EncoderDecoderCache:
        """
        Dummy placeholder so Seq2SeqTrainer can import it.
        No functional cache behavior — Trainer won’t actually use it.
        """
        def __init__(self, **kwargs): pass

    # inject into both the submodule and top‐level namespace
    _mod_utils.EncoderDecoderCache    = EncoderDecoderCache
    transformers.EncoderDecoderCache  = EncoderDecoderCache
import os
import torch
import pandas as pd
from datasets import load_dataset, Dataset as HFDataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    pipeline
)
from torch.utils.data import Dataset as TorchDataset

DEVICE       = "cuda" if torch.cuda.is_available() else "cpu"
BASE_MODEL   = "facebook/bart-base"
MED_FT_DIR   = r"D:\kshitij-weights-folder\bart-med-ft"
FINAL_FT_DIR = r"D:\kshitij-weights-folder\bart-final-ft"
CSV_PATH     = r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\combined_clinical_notes.csv"  # must have columns: dialogue, note

# ──────────┐
# 2) Dataset wrappers
# ──────────┘
class MedMCQADataset(TorchDataset):
    def __init__(self, hf_ds, tok, max_src=256, max_tgt=16):
        self.tok, self.max_src, self.max_tgt = tok, max_src, max_tgt
        self.examples = []
        for row in hf_ds:
            q    = str(row["question"])
            opts = [str(row[f"op{c}"]) for c in ("a","b","c","d")]
            ans  = str(row["cop"])
            prompt = f"Question: {q} Options: A){opts[0]} B){opts[1]} C){opts[2]} D){opts[3]}"
            self.examples.append((prompt, ans))
    def __len__(self): return len(self.examples)
    def __getitem__(self,i):
        prompt, ans = self.examples[i]
        src = self.tok(prompt,
                       truncation=True, padding="max_length",
                       max_length=self.max_src, return_tensors="pt")
        tgt = self.tok(ans,
                       truncation=True, padding="max_length",
                       max_length=self.max_tgt, return_tensors="pt")
        labels = tgt.input_ids.clone()
        labels[labels==self.tok.pad_token_id] = -100
        return {
          "input_ids":      src.input_ids.squeeze(),
          "attention_mask": src.attention_mask.squeeze(),
          "labels":         labels.squeeze(),
        }

class DialogueSummaryDataset(TorchDataset):
    def __init__(self, hf_ds, tok, max_src=512, max_tgt=256):
        self.ds, self.tok = hf_ds, tok
        self.max_src, self.max_tgt = max_src, max_tgt
    def __len__(self): return len(self.ds)
    def __getitem__(self,i):
        row     = self.ds[i]
        src_txt = str(row["dialogue"])
        tgt_txt = str(row["note"])
        src = self.tok(src_txt,
                       truncation=True, padding="max_length",
                       max_length=self.max_src, return_tensors="pt")
        tgt = self.tok(tgt_txt,
                       truncation=True, padding="max_length",
                       max_length=self.max_tgt, return_tensors="pt")
        labels = tgt.input_ids.clone()
        labels[labels==self.tok.pad_token_id] = -100
        return {
          "input_ids":      src.input_ids.squeeze(),
          "attention_mask": src.attention_mask.squeeze(),
          "labels":         labels.squeeze(),
        }

In [None]:
# ──────────┐
# 3) Stage 1: MedMCQA fine‑tuning
# ──────────┘
print("=== Stage 1: MedMCQA fine‑tuning ===")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model     = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL).to(DEVICE)

med_ds    = load_dataset("openlifescienceai/medmcqa")
train_med = med_ds["train"].select(range(5000))
eval_med  = med_ds["validation"].select(range(500))

train1 = MedMCQADataset(train_med, tokenizer)
eval1  = MedMCQADataset(eval_med,  tokenizer)

args1 = TrainingArguments(
    output_dir           = MED_FT_DIR,
    num_train_epochs     = 1,
    per_device_train_batch_size = 8,
    evaluation_strategy = "epoch",
    save_strategy        = "epoch",
    logging_steps        = 50,
    fp16                 = torch.cuda.is_available(),
)
trainer1 = Trainer(
    model            = model,
    args             = args1,
    train_dataset    = train1,
    eval_dataset     = eval1,
    tokenizer        = tokenizer,
)
trainer1.train()

In [None]:
trainer1.save_model(MED_FT_DIR)
model.save_pretrained(
    MED_FT_DIR,
    safe_serialization=False  # Crucial for preserving buffers
)
tokenizer.save_pretrained(MED_FT_DIR)

In [None]:

# ──────────┐
# 4) Stage 2: Clinical‑notes fine‑tuning
# ──────────┘
print("=== Stage 2: Clinical notes fine‑tuning ===")
# reload on CPU then send to DEVICE
model     = AutoModelForSeq2SeqLM.from_pretrained(MED_FT_DIR, device_map="cpu")
tokenizer = AutoTokenizer.from_pretrained(MED_FT_DIR)

# fix pad token
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id
model.resize_token_embeddings(len(tokenizer))
model = model.to(DEVICE)

df      = pd.read_csv(CSV_PATH)[["dialogue","note"]]
hf_clin = HFDataset.from_pandas(df)

train_clin = hf_clin.shuffle(seed=42).select(range(400))
eval_clin  = hf_clin.shuffle(seed=42).select(range(400,464))

train2 = DialogueSummaryDataset(train_clin, tokenizer)
eval2  = DialogueSummaryDataset(eval_clin,  tokenizer)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
args2 = TrainingArguments(
    output_dir               = FINAL_FT_DIR,
    num_train_epochs         = 1,
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 2,
    evaluation_strategy            = "epoch",
    save_strategy            = "epoch",
    logging_steps            = 50,
    fp16                     = torch.cuda.is_available(),
)
trainer2 = Trainer(
    model            = model,
    args             = args2,
    train_dataset    = train2,
    eval_dataset     = eval2,
    tokenizer        = tokenizer,
    data_collator    = data_collator,
)
trainer2.train()

In [None]:
trainer2.save_model(FINAL_FT_DIR)
tokenizer.save_pretrained(FINAL_FT_DIR)

In [None]:
# ──────────┐
# 5) Batch inference & evaluation (manual)
# ──────────┘
print("=== Stage 3: Batch inference ===")

batch_size  = 4
num_samples = len(eval_clin)
num_batches = (num_samples + batch_size - 1) // batch_size

predictions, references = [], []
for i in range(num_batches):
    start, end = i*batch_size, min((i+1)*batch_size, num_samples)
    convs = [str(x) for x in eval_clin["dialogue"][start:end]]
    refs  = [str(x) for x in eval_clin["note"][start:end]]

    prompts = [
        f"Summarize the following conversation:\n\n{c}"
        for c in convs if len(c.strip()) > 10
    ]
    if not prompts:
        continue

    # 1) Tokenize on GPU
    enc = tokenizer(
        prompts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    ).to("cuda")                     # <-- send inputs to GPU

    # 2) Generate on GPU
    with torch.no_grad():
        out_ids = model.generate(
            input_ids      = enc.input_ids,
            attention_mask = enc.attention_mask,
            max_new_tokens = 120,
            do_sample      = False
        )

    # 3) Decode back on CPU
    dec = tokenizer.batch_decode(out_ids, skip_special_tokens=True)
    predictions.extend(dec)
    references.extend(refs)

print(f"✅ Generated {len(predictions)} summaries.")


In [None]:
predictions[0]

In [None]:
import sys
sys.path.append(r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\UniEval")  # if needed to make sure your Python can import from the UniEval folder

from utils import convert_to_json
from metric.evaluator import get_evaluator

# on CUDA since you asked for it
sum_eval  = get_evaluator("summarization", device="cuda")
# fact_eval = get_evaluator("fact",          device="cuda")


In [None]:
import json
with open(r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\unieval_data.json", "r") as f:
    data = json.load(f)

sum_scores = sum_eval.evaluate(
    data,
    print_result=True,
)

In [None]:
fact_dicts = fact_eval.evaluate(data, print_result = True)

In [None]:
from utils            import convert_to_json
from metric.evaluator import get_evaluator

import os, sys, gc, torch, pandas as pd
from torch.utils.data import Dataset, DataLoader
# sum_eval  = get_evaluator("summarization", device="cpu")

@torch.inference_mode()
def unieval_4way(src, hyp):
    """
    src, hyp: list[str] of equal length → (B,4) Tensor on CPU:
      [coherence, consistency, fluency, factual]
    """
    data = convert_to_json(output_list=hyp, src_list=src)
    arr = sum_eval.evaluate(data, print_result=False)  # shape (B,4) but only first 3 used
    # the summarization evaluator by default returns [coherence, consistency, fluency, relevance]
    coh = arr[:,0].tolist()
    con = arr[:,1].tolist()
    flu = arr[:,2].tolist()
    rel = arr[:,3].tolist()
    return torch.tensor([coh,con,flu,rel]).T  # (B,4) on CPU

In [None]:
from utils            import convert_to_json
from metric.evaluator import get_evaluator

import os, sys, gc, torch, pandas as pd
from torch.utils.data import Dataset, DataLoader


sum_eval  = get_evaluator("summarization", device="cpu")

@torch.inference_mode()
def unieval_4way(src, hyp):
     """
     src, hyp: list[str] → (B,4) Tensor on CPU:
       [coherence, consistency, fluency, relevance]
     """

     data = convert_to_json(output_list=hyp, src_list=src)
     arr = sum_eval.evaluate(data, print_result=False)   # → numpy (B,4)
     coh, con, flu, rel = arr.T.tolist()
     return torch.tensor([coh, con, flu, rel]).T       # (B,4) on CPU

In [None]:
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BitsAndBytesConfig
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training
)
from trl import (
    PPOConfig,
    PPOTrainer,
    AutoModelForSeq2SeqLMWithValueHead
)

DEVICE   = "cuda" if torch.cuda.is_available() else "cpu"
SFT_DIR  = r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\bart_clinical_ft-20250422T192130Z-001\bart_clinical_ft"  # your SFT BART folder

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import PPOConfig, PPOTrainer, AutoModelForCausalLMWithValueHead

# 2a) 4‑bit quant config
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit            = True,
    bnb_4bit_quant_type     = "nf4",
    bnb_4bit_compute_dtype  = torch.float16,
)

# 2b) causal‐LM load (avoids the Seq2Seq final_logits_bias mismatch)
base = AutoModelForCausalLM.from_pretrained(
    SFT_DIR,
    quantization_config = bnb_cfg,
    device_map          = "auto",
)
# freeze + prepare for k‑bit + gradient checkpoint
base = prepare_model_for_kbit_training(base)
base.gradient_checkpointing_enable()
base.config.use_cache = False

# 2c) attach LoRA for causal‑LM
lora_cfg = LoraConfig(
    task_type="CAUSAL_LM",
    r          = 8,
    lora_alpha = 32,
    lora_dropout=0.05,
    target_modules=["q_proj","k_proj","v_proj","o_proj"],
)
model = get_peft_model(base, lora_cfg).to(DEVICE)

# 2d) tokenizer
tok = AutoTokenizer.from_pretrained(SFT_DIR, use_fast=False)
tok.pad_token = tok.eos_token
model.config.pad_token_id = tok.eos_token_id
model.resize_token_embeddings(len(tok))

In [None]:
# 2e) wrap for PPO
ppo_model     = AutoModelForCausalLMWithValueHead.from_pretrained(
    model, peft_config=lora_cfg
).to(DEVICE)
ppo_ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(
    model, peft_config=lora_cfg
).to(DEVICE).eval()
for p in ppo_ref_model.parameters(): p.requires_grad = False


In [None]:
# ─────────────────────────────────────────────────────────────────────
# 3) same ClinDS + loader as before
# ─────────────────────────────────────────────────────────────────────
# [unchanged code defining ClinDS and loader...]

# ─────────────────────────────────────────────────────────────────────
# 4) PPO setup (same)
# ─────────────────────────────────────────────────────────────────────
ppo_cfg = PPOConfig(
    batch_size      = 2,
    mini_batch_size = 1,
)
optimizer = torch.optim.AdamW(
    filter(lambda p: p.requires_grad, ppo_model.parameters()),
    lr=2e-5
)
ppo_trainer = PPOTrainer(
    config     = ppo_cfg,
    model      = ppo_model,
    ref_model  = ppo_ref_model,
    tokenizer  = tok,
    optimizer  = optimizer,
)

In [None]:
# ─────────────────────────────────────────────────────────────────────
# 5) PC‑Grad utilities
# ─────────────────────────────────────────────────────────────────────
def pc_merge(flats: list[torch.Tensor]) -> torch.Tensor:
    # Yu et al. 2020 PC‑Grad
    for i in range(len(flats)):
        for j in range(i+1, len(flats)):
            dot = torch.dot(flats[i], flats[j])
            if dot < 0:
                flats[i] -= (dot / (flats[j].norm()**2 + 1e-12)) * flats[j]
    return torch.stack(flats).mean(0)

def flat_param_grads(model) -> torch.Tensor:
    return torch.cat([
        p.grad.detach().flatten()
        for p in model.parameters() if p.grad is not None
    ])

def scatter_flat_grads(model, flat: torch.Tensor):
    idx = 0
    for p in model.parameters():
        if p.grad is None: continue
        n = p.grad.numel()
        p.grad.data = flat[idx:idx+n].view_as(p).clone()
        idx += n


In [None]:
import pandas as pd
from datasets import load_dataset, Dataset

# ─────────────────────────────────────────────────────────────────────
# 3) Dataset & DataLoader (clinical_notes.csv must have 'dialogue','note')
# ─────────────────────────────────────────────────────────────────────
df = pd.read_csv(r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\combined_clinical_notes.csv")[["dialogue","note"]]

class ClinDS(Dataset):
    def __init__(self, df, tok, max_len=512):
        self.df, self.tok, self.L = df.reset_index(drop=True), tok, max_len
    def __len__(self): return len(self.df)
    def __getitem__(self, i):
        txt = str(self.df.iloc[i]["dialogue"])
        enc = self.tok(txt,
                       truncation=True,
                       max_length=self.L,
                       padding="max_length",
                       return_tensors="pt")
        return {
            "input_ids":      enc.input_ids.squeeze(),
            "attention_mask": enc.attention_mask.squeeze(),
            "src_txt":        txt
        }

loader = DataLoader(
    ClinDS(df.sample(200, random_state=0), tok),
    batch_size=2,
    shuffle=True,
    pin_memory=True,
    drop_last=True
)

In [None]:
import sys
sys.path.append(r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\UniEval")  # if needed to make sure your Python can import from the UniEval folder

from utils import convert_to_json
from metric.evaluator import get_evaluator

In [None]:
import transformers
import transformers.modeling_utils as _mod_utils

# if it's already there (unlikely), skip
if not hasattr(_mod_utils, "EncoderDecoderCache"):
    class EncoderDecoderCache:
        """
        Dummy placeholder so Seq2SeqTrainer can import it.
        No functional cache behavior — Trainer won’t actually use it.
        """
        def __init__(self, **kwargs): pass

    # inject into both the submodule and top‐level namespace
    _mod_utils.EncoderDecoderCache    = EncoderDecoderCache
    transformers.EncoderDecoderCache  = EncoderDecoderCache

run these 2 cells above first, before running the RL code below



REINFORCEMENT LEARNING CODE !!! (NEW TECHNIQUE USED)

In [None]:
import numpy as np
import torch
from utils            import convert_to_json
from metric.evaluator import get_evaluator

sum_eval = get_evaluator("summarization", device="cuda")

# @torch.inference_mode()
# def unieval_4way(src: list[str], hyp: list[str], ref: list[str]) -> torch.Tensor:
#     """
#     src, hyp, ref: lists of strings, length B
#     Returns a (B,4) CPU tensor with columns [coherence, consistency, fluency, relevance].
#     """
#     data = convert_to_json(
#         output_list = hyp,
#         src_list    = src,
#         ref_list    = ref,
#     )
#     # This returns a list of dicts: [{'coherence':…, 'consistency':…, 'fluency':…, 'relevance':…}, …]
#     raw = sum_eval.evaluate(data, print_result=False)

#     dims = ["coherence", "consistency", "fluency", "relevance"]
#     arr  = np.array([[d[dim] for dim in dims] for d in raw], dtype=np.float32)
#     return torch.from_numpy(arr)  # shape (B,4)

In [None]:
from accelerate import Accelerator
accelerator = Accelerator()
accelerator.state.skip_key = ["src_txt", "ref_txt"]


In [None]:
# ════════════════════════════════════════════════════════════════
# Requirements:
#   pip install trl==0.7.4 transformers==4.38.2 peft==0.10.0 \
#               accelerate==0.28.0 bitsandbytes datasets evaluate pandas
# ════════════════════════════════════════════════════════════════
import os, gc, torch, pandas as pd
import numpy as np
import torch
torch.device('cpu')
from torch.utils.data import Dataset, DataLoader

# ────────────────────────────────────────────────────────────────
# 1) UniEval multi‑dim evaluator (CPU only, load once)
# ────────────────────────────────────────────────────────────────
import sys
sys.path.append(r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\UniEval")  # if needed to make sure your Python can import from the UniEval folder
from utils import convert_to_json
from metric.evaluator import get_evaluator

sum_eval = get_evaluator("summarization", device="cpu")

@torch.inference_mode()
def unieval_4way(src, hyp, ref):
    """
    src, hyp, ref: lists of strings, length B
    returns: Tensor (B,4) with [coherence, consistency, fluency, relevance]
    """
    data = convert_to_json(
        output_list=hyp,
        src_list=src,
        ref_list=ref,
    )
    raw = sum_eval.evaluate(data)
    scores = [
        [d["coherence"], d["consistency"], d["fluency"], d["relevance"]]
        for d in raw
    ]
    return torch.tensor(scores, dtype=torch.float32).cpu()  # CPU (B,4)

# ────────────────────────────────────────────────────────────────
# 2) Load your SFT‑finetuned BART in 4‑bit + LoRA
# ────────────────────────────────────────────────────────────────
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import PPOConfig, PPOTrainer, AutoModelForCausalLMWithValueHead

# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = "cuda"
SFT_DIR = r"D:\kshitij-weights-folder\bart_clinical_ft-20250422T192130Z-001\bart_clinical_ft"

bnb = BitsAndBytesConfig(
    load_in_4bit            = True,
    bnb_4bit_quant_type     = "nf4",
    bnb_4bit_compute_dtype  = torch.float32,   # ← here
)
base = AutoModelForCausalLM.from_pretrained(
    SFT_DIR,
    quantization_config=bnb,
    device_map="auto",
    max_memory={0: "16GiB"},
)


base = prepare_model_for_kbit_training(base)
base.gradient_checkpointing_enable()
base.config.use_cache = False

# 2b) Attach fresh LoRA
lora_cfg = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)
model = get_peft_model(base, lora_cfg).to(DEVICE)

# 2c) Tokenizer (decoder‑only → left‑pad)
tok = AutoTokenizer.from_pretrained(SFT_DIR, use_fast=False)
tok.pad_token = tok.eos_token
tok.padding_side = "left"
model.config.pad_token_id = tok.eos_token_id
model.resize_token_embeddings(len(tok))

# … after you do your existing wrap …

ppo_model     = AutoModelForCausalLMWithValueHead.from_pretrained(
    model, peft_config=lora_cfg
).to(DEVICE)
ppo_ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(
    model, peft_config=lora_cfg
).to(DEVICE).eval()

# Here’s the key: cast **both** to fp16 so the value head and the backbone match
ppo_model     = ppo_model
ppo_ref_model = ppo_ref_model

for p in ppo_ref_model.parameters():
    p.requires_grad = False


# ────────────────────────────────────────────────────────────────
# 3) Prepare your DataLoader (with references)
# ────────────────────────────────────────────────────────────────
df = pd.read_csv(r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\combined_clinical_notes.csv")[["dialogue", "note"]]

class ClinDS(Dataset):
    def __init__(self, df, tok, max_len=512):
        self.df = df.reset_index(drop=True)
        self.tok = tok
        self.L = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i):
        conv = str(self.df.iloc[i]["dialogue"])
        ref = str(self.df.iloc[i]["note"])
        prompt = f"Summarize the following conversation:\n\n{conv}"
        enc = self.tok(
            prompt,
            truncation=True,
            padding="max_length",
            max_length=self.L,
            return_tensors="pt",
        )
        return {
            "input_ids": enc.input_ids.squeeze(),
            "attention_mask": enc.attention_mask.squeeze(),
            "src_txt": prompt,
            "ref_txt": ref,
        }

loader = DataLoader(
    ClinDS(df.sample(200, random_state=0), tok),
    batch_size=1, shuffle=True, pin_memory=False, drop_last=True
)

# ────────────────────────────────────────────────────────────────
# 4) Build PPOTrainer + optimizer
# ────────────────────────────────────────────────────────────────
NUM_CANDIDATES = 2
ppo_cfg = PPOConfig(
  batch_size      = loader.batch_size * NUM_CANDIDATES,  # e.g. 1 * 2 = 2
  mini_batch_size = 2,          # or split it if you like
  # log_with        = "tensorboard"
)

optimizer = torch.optim.AdamW(
    filter(lambda p: p.requires_grad, ppo_model.parameters()),
    lr=2e-5
)

ppo_trainer = PPOTrainer(
    config=ppo_cfg,
    model=ppo_model,
    ref_model=ppo_ref_model,
    tokenizer=tok,
    optimizer=optimizer,
)

# ────────────────────────────────────────────────────────────────
# 5) Training loop with candidate generation and dominance rewards
# ────────────────────────────────────────────────────────────────
gen_kwargs = {
    "max_new_tokens": 64,
    "do_sample": True,
    "pad_token_id": tok.eos_token_id,
    "top_p": 0.9,
    "temperature": 0.7,
}

for epoch in range(1):
    for batch_idx, batch in enumerate(loader):
        ids       = batch["input_ids"].to(DEVICE)
        attn_mask = batch["attention_mask"].to(DEVICE)
        src_txt    = batch["src_txt"]
        ref_txt    = batch["ref_txt"]

        # Generate multiple candidates per prompt
        NUM_CANDIDATES = 2
        all_outs = []
        for _ in range(NUM_CANDIDATES):
            with torch.no_grad():
                out = ppo_model.generate(
                    input_ids=ids,
                    attention_mask=attn_mask,
                    **gen_kwargs
                )
            all_outs.append(out)

        # Stack outputs (B, K, L)
        outs = torch.stack(all_outs, dim=1)

        # Decode all candidates
        hyps = [
            [tok.decode(outs[b, k], skip_special_tokens=True)
            for k in range(NUM_CANDIDATES)]
            for b in range(outs.size(0))
        ]

        # Compute rewards using UniEval and dominance scoring
        rewards = []
        for b in range(len(src_txt)):
            # Get scores for all candidates (K, 4)
            scores = unieval_4way(
                [src_txt[b]] * NUM_CANDIDATES,
                hyps[b],
                [ref_txt[b]] * NUM_CANDIDATES
            ).numpy()

            # Compute dominance counts
            dom_counts = np.zeros(NUM_CANDIDATES)
            for i in range(NUM_CANDIDATES):
                for j in range(NUM_CANDIDATES):
                    if i == j:
                        continue
                    # Check if i dominates j
                    if np.all(scores[i] >= scores[j]) and np.any(scores[i] > scores[j]):
                        dom_counts[i] += 1

            # Normalize to [-1, 1]
            max_dom = NUM_CANDIDATES - 1
            scalar_rewards = 2 * (dom_counts / max_dom) - 1
            rewards.append(scalar_rewards)

        # Flatten for PPO
        flat_queries = []
        flat_responses = []
        flat_rewards = []

        for b in range(len(src_txt)):
            for k in range(NUM_CANDIDATES):
                flat_queries.append(ids[b])
                flat_responses.append(outs[b, k])
                flat_rewards.append(torch.tensor([rewards[b][k]], device=DEVICE))

        # PPO step
        # … after your `for b in …` loops that populate flat_queries, flat_responses, flat_rewards …

# make sure your PPOConfig was set to batch_size = loader.batch_size * NUM_CANDIDATES
# so here batch_size == len(flat_queries) == len(flat_responses) == len(flat_rewards)

        stats = ppo_trainer.step(
            queries   = flat_queries,    # e.g. [ q0, q0 ]
            responses = flat_responses,  # e.g. [ r0, r1 ]
            scores    = flat_rewards     # e.g. [ s0, s1 ]
        )


        # Logging
        if batch_idx % 10 == 0:
            print(f"Epoch {epoch+1}, Batch {batch_idx}")
            print(f"Sample output: {hyps[0][0][:100]}...")
            print(f"Average reward: {np.mean([r.item() for r in flat_rewards]):.4f}")

    print(f"✅ Epoch {epoch+1}/3 complete")

print("🎉 PPO fine-tuning done")

In [1]:
# ════════════════════════════════════════════════════════════════
# Requirements:
#   pip install trl==0.7.4 transformers==4.38.2 peft==0.10.0 \
#               accelerate==0.28.0 bitsandbytes datasets evaluate pandas
# ════════════════════════════════════════════════════════════════
import os, gc, torch, pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Set environment variable to debug CUDA issues
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# ────────────────────────────────────────────────────────────────
# 1) UniEval multi‑dim evaluator (CPU only, load once)
# ────────────────────────────────────────────────────────────────
import sys
sys.path.append(r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\UniEval")
from utils import convert_to_json
from metric.evaluator import get_evaluator

# Initialize evaluator on CPU to avoid device conflicts
sum_eval = get_evaluator("summarization")

@torch.inference_mode()
def unieval_4way(src, hyp, ref):
    """
    src, hyp, ref: lists of strings, length B
    returns: Tensor (B,4) with [coherence, consistency, fluency, relevance]
    """
    data = convert_to_json(
        output_list=hyp,
        src_list=src,
        ref_list=ref,
    )
    raw = sum_eval.evaluate(data)
    scores = [
        [d["coherence"], d["consistency"], d["fluency"], d["relevance"]]
        for d in raw
    ]
    return torch.tensor(scores, dtype=torch.float32).cpu()  # CPU (B,4)

# ────────────────────────────────────────────────────────────────
# 2) Load your SFT‑finetuned BART (try without quantization first)
# ────────────────────────────────────────────────────────────────
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from trl import PPOConfig, PPOTrainer, AutoModelForCausalLMWithValueHead

# Try using CPU if CUDA is unstable
# Uncomment the line below to use CPU instead
# DEVICE = "cpu"  
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Use consistent compute dtype
COMPUTE_DTYPE = torch.float32  # Using float32 to avoid dtype issues

SFT_DIR = r"D:\kshitij-weights-folder\bart_clinical_ft-20250422T192130Z-001\bart_clinical_ft"

# OPTION 1: Try without quantization first to eliminate that source of errors
base = AutoModelForCausalLM.from_pretrained(
    SFT_DIR,
    torch_dtype=COMPUTE_DTYPE,
    device_map={"": 0} if DEVICE == "cuda" else "auto",  # Explicitly set device mapping
    # Reduce the model footprint
    low_cpu_mem_usage=True,
)

# 2b) Attach fresh LoRA
lora_cfg = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)
model = get_peft_model(base, lora_cfg)

# 2c) Tokenizer (ensure padding_side is explicitly set)
tok = AutoTokenizer.from_pretrained(SFT_DIR, use_fast=False)
tok.pad_token = tok.eos_token
tok.padding_side = "left"  # Explicitly set left padding for decoder-only models
model.config.pad_token_id = tok.eos_token_id
model.resize_token_embeddings(len(tok))

# 2d) Wrap for PPO with consistent dtype
ppo_model = AutoModelForCausalLMWithValueHead.from_pretrained(
    model, 
    peft_config=lora_cfg,
    torch_dtype=COMPUTE_DTYPE,
).to(DEVICE)

ppo_ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(
    model, 
    peft_config=lora_cfg,
    torch_dtype=COMPUTE_DTYPE,
).to(DEVICE).eval()

# Freeze reference model
for p in ppo_ref_model.parameters():
    p.requires_grad = False

# ────────────────────────────────────────────────────────────────
# 3) Prepare your DataLoader (with references) - NO PIN MEMORY
# ────────────────────────────────────────────────────────────────
df = pd.read_csv(r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\combined_clinical_notes.csv")[["dialogue", "note"]]

class ClinDS(Dataset):
    def __init__(self, df, tok, max_len=512):
        self.df = df.reset_index(drop=True)
        self.tok = tok
        self.L = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i):
        conv = str(self.df.iloc[i]["dialogue"])
        ref = str(self.df.iloc[i]["note"])
        prompt = f"Summarize the following conversation:\n\n{conv}"
        enc = self.tok(
            prompt,
            truncation=True,
            padding="max_length",
            max_length=self.L,
            return_tensors="pt",
        )
        return {
            "input_ids": enc.input_ids.squeeze(),
            "attention_mask": enc.attention_mask.squeeze(),
            "src_txt": prompt,
            "ref_txt": ref,
        }

# Disable pin_memory to avoid CUDA memory issues
loader = DataLoader(
    ClinDS(df.sample(50, random_state=0), tok),  # Reduce sample size to 50
    batch_size=1, 
    shuffle=True, 
    pin_memory=False,  # IMPORTANT: Disabled pin_memory
    drop_last=True,
    num_workers=0,  # Use single-process data loading
)

# ────────────────────────────────────────────────────────────────
# 4) Build PPOTrainer + optimizer
# ────────────────────────────────────────────────────────────────
NUM_CANDIDATES = 2

# Configure PPO with exactly matching batch sizes
ppo_cfg = PPOConfig(
    batch_size=loader.batch_size * NUM_CANDIDATES,  # Must match the actual number of examples
    mini_batch_size=loader.batch_size * NUM_CANDIDATES,  # Process all at once for simplicity
    gradient_accumulation_steps=1,
    optimize_device_cache=True,  # Use newer parameter
    learning_rate=2e-5,
    log_with=None,  # Disable wandb/tensorboard logging
)

# Optimizer with explicit dtype
optimizer = torch.optim.AdamW(
    filter(lambda p: p.requires_grad, ppo_model.parameters()),
    lr=2e-5,
    eps=1e-5,  # Slightly larger epsilon for stability
)

# Create PPO trainer
ppo_trainer = PPOTrainer(
    config=ppo_cfg,
    model=ppo_model,
    ref_model=ppo_ref_model,
    tokenizer=tok,
    optimizer=optimizer,
    data_collator=None,  # Don't use a data collator
)

# ────────────────────────────────────────────────────────────────
# 5) Simplified PPO Training Loop
# ────────────────────────────────────────────────────────────────
def get_rewards(src, hyp, ref):
    """Compute rewards using UniEval and dominance scoring."""
    # Get scores using UniEval
    scores = unieval_4way(src, hyp, ref).numpy()
    
    # Count dominance relationships
    k = len(hyp)
    dom_counts = np.zeros(k)
    
    for i in range(k):
        for j in range(k):
            if i == j:
                continue
            # Check dominance: i dominates j if all scores are >= and at least one is >
            if np.all(scores[i] >= scores[j]) and np.any(scores[i] > scores[j]):
                dom_counts[i] += 1
    
    # Convert to [-1, 1] range reward
    max_dom = k - 1
    if max_dom > 0:
        rewards = 2 * (dom_counts / max_dom) - 1
    else:
        rewards = np.zeros(k)
    
    return rewards

# Generation parameters
gen_kwargs = {
    "max_new_tokens": 64,
    "do_sample": True,
    "pad_token_id": tok.eos_token_id,
    "top_p": 0.9,
    "temperature": 0.7,
}

# Main training function
def train():
    try:
        for epoch in range(1):
            print(f"Starting epoch {epoch+1}")
            
            # Process one batch at a time
            for batch_idx, batch in enumerate(loader):
                try:
                    print(f"Processing batch {batch_idx}")
                    
                    # Clear GPU cache before each batch
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()
                    
                    # Get inputs
                    ids = batch["input_ids"].to(DEVICE)
                    attn_mask = batch["attention_mask"].to(DEVICE)
                    src_txt = batch["src_txt"]
                    ref_txt = batch["ref_txt"]
                    
                    # 1. Generate candidates
                    all_outs = []
                    for _ in range(NUM_CANDIDATES):
                        with torch.no_grad():
                            try:
                                out = ppo_model.generate(
                                    input_ids=ids,
                                    attention_mask=attn_mask,
                                    **gen_kwargs
                                )
                                all_outs.append(out)
                            except Exception as e:
                                print(f"Error during generation: {e}")
                                # Try to continue with what we have
                                if not all_outs:
                                    raise  # Re-raise if we have no outputs
                    
                    # Stack and decode
                    outs = torch.stack(all_outs, dim=1)
                    B, K, _ = outs.shape
                    
                    # Decode outputs for evaluation
                    hyps = []
                    for b in range(B):
                        hyps_b = []
                        for k in range(K):
                            try:
                                text = tok.decode(outs[b, k], skip_special_tokens=True)
                                hyps_b.append(text)
                            except Exception as e:
                                print(f"Error decoding text: {e}")
                                hyps_b.append("")  # Add empty string as fallback
                        hyps.append(hyps_b)
                    
                    # 2. Compute rewards
                    flat_queries, flat_responses, flat_rewards = [], [], []
                    
                    for b in range(B):
                        try:
                            # Calculate rewards
                            rewards_b = get_rewards(
                                [src_txt[b]] * K, 
                                hyps[b],
                                [ref_txt[b]] * K
                            )
                            
                            # Flatten for PPO
                            for k in range(K):
                                flat_queries.append(ids[b])
                                flat_responses.append(outs[b, k])
                                flat_rewards.append(torch.tensor([rewards_b[k]], device=DEVICE, dtype=COMPUTE_DTYPE))
                                
                        except Exception as e:
                            print(f"Error computing rewards: {e}")
                            continue
                    
                    # Safety check
                    if len(flat_queries) != ppo_cfg.batch_size:
                        print(f"Batch size mismatch: expected {ppo_cfg.batch_size}, got {len(flat_queries)}")
                        continue
                    
                    # 3. PPO step
                    try:
                        # Verify shapes match
                        print(f"Queries: {len(flat_queries)}, Responses: {len(flat_responses)}, Rewards: {len(flat_rewards)}")
                        
                        # Manual memory management
                        torch.cuda.empty_cache()
                        
                        # Do PPO step
                        stats = ppo_trainer.step(
                            queries=flat_queries,
                            responses=flat_responses,
                            scores=flat_rewards
                        )
                        
                        # Success! Log the output
                        print(f"Batch {batch_idx} PPO step successful!")
                        print(f"Sample output: {hyps[0][0][:100]}...")
                        avg_reward = np.mean([r.item() for r in flat_rewards])
                        print(f"Average reward: {avg_reward:.4f}")
                        
                    except RuntimeError as e:
                        print(f"Error in PPO step: {e}")
                        
                        # If still running into CUDA errors, try moving to CPU
                        if "CUDA" in str(e) and DEVICE != "cpu":
                            print("\nContinuing to encounter CUDA errors. Try two options:")
                            print("1. Change DEVICE = 'cpu' at the top of the script")
                            print("2. Or use the non-quantized model version\n")
                            
                        # Clear memory and continue
                        if torch.cuda.is_available():
                            torch.cuda.empty_cache()
                            
                except Exception as e:
                    print(f"Error processing batch {batch_idx}: {e}")
                    continue
                
                # Break after a few batches during testing
                if batch_idx >= 2:  # Process 3 batches for testing
                    print("Processed 3 test batches successfully, exiting test run")
                    break
                
            print(f"✅ Epoch {epoch+1} complete")
        
        print("🎉 PPO fine-tuning done")
        
    except KeyboardInterrupt:
        print("Training interrupted by user")
    except Exception as e:
        print(f"Unexpected error: {e}")
        import traceback
        traceback.print_exc()

train()

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BartForCausalLM were not initialized from the model checkpoint at D:\kshitij-weights-folder\bart_clinical_ft-20250422T192130Z-001\bart_clinical_ft and are newly initialized: ['lm_head.weight', 'model.decoder.embed_tokens.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Starting epoch 1
Processing batch 0


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Evaluating coherence of 2 samples !!!


100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.35it/s]


Evaluating consistency of 2 samples !!!


100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.97it/s]


Evaluating fluency of 2 samples !!!


100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.48it/s]


Evaluating relevance of 2 samples !!!


100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.75it/s]


Queries: 2, Responses: 2, Rewards: 2
Error in PPO step: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


Continuing to encounter CUDA errors. Try two options:
1. Change DEVICE = 'cpu' at the top of the script
2. Or use the non-quantized model version

Processing batch 1
Error processing batch 1: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Processing batch 2
Error processing batch 2: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Processing batch 3
Error processing batch 3: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Processing batch 4
Error processing batch 4: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Processing batch 5
Error processing batch 5: CUDA error: device-si

In [1]:
# ════════════════════════════════════════════════════════════════
# Final Working PPO Implementation (Fixed Device Tracking)
# ════════════════════════════════════════════════════════════════
import os, gc, torch, pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# ────────────────────────────────────────────────────────────────
# 1) Device Configuration
# ────────────────────────────────────────────────────────────────
# Check if CUDA is available
CUDA_AVAILABLE = torch.cuda.is_available()
DEVICE = "cuda" if CUDA_AVAILABLE else "cpu"
print(f"Using device: {DEVICE}")

# Create save directory if it doesn't exist
SAVE_DIR = "ppo_checkpoints"
os.makedirs(SAVE_DIR, exist_ok=True)
print(f"Will save checkpoints to: {SAVE_DIR}")

# ────────────────────────────────────────────────────────────────
# 2) UniEval multi‑dim evaluator (always on CPU)
# ────────────────────────────────────────────────────────────────
import sys
sys.path.append(r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\UniEval")
from utils import convert_to_json
from metric.evaluator import get_evaluator

# Initialize evaluator on CPU (always keep on CPU)
sum_eval = get_evaluator("summarization", device="cpu")

@torch.inference_mode()
def unieval_4way(src, hyp, ref):
    """
    src, hyp, ref: lists of strings, length B
    returns: Tensor (B,4) with [coherence, consistency, fluency, relevance]
    """
    data = convert_to_json(
        output_list=hyp,
        src_list=src,
        ref_list=ref,
    )
    raw = sum_eval.evaluate(data)
    scores = [
        [d["coherence"], d["consistency"], d["fluency"], d["relevance"]]
        for d in raw
    ]
    # Return scores on CPU, will move to GPU if needed
    return torch.tensor(scores, dtype=torch.float32)  # CPU (B,4)

# ────────────────────────────────────────────────────────────────
# 3) Load your SFT‑finetuned BART (no BitsAndBytes)
# ────────────────────────────────────────────────────────────────
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
from trl import PPOConfig, PPOTrainer, AutoModelForCausalLMWithValueHead

# Use consistent compute dtype
COMPUTE_DTYPE = torch.float32  # Using float32 for stability

SFT_DIR = r"D:\kshitij-weights-folder\bart_clinical_ft-20250422T192130Z-001\bart_clinical_ft"

# Load model - NO BitsAndBytes
base = AutoModelForCausalLM.from_pretrained(
    SFT_DIR,
    torch_dtype=COMPUTE_DTYPE,
)
# Move to device after loading
base = base.to(DEVICE)

# Add LoRA 
lora_cfg = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)
model = get_peft_model(base, lora_cfg)
model = model.to(DEVICE)  # Move model to device

# Tokenizer setup
tok = AutoTokenizer.from_pretrained(SFT_DIR, use_fast=False)
tok.pad_token = tok.eos_token
tok.padding_side = "left"  # Important for decoder-only models
model.config.pad_token_id = tok.eos_token_id
model.resize_token_embeddings(len(tok))

# PPO models
ppo_model = AutoModelForCausalLMWithValueHead.from_pretrained(
    model, 
    peft_config=lora_cfg,
    torch_dtype=COMPUTE_DTYPE,
).to(DEVICE)

ppo_ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(
    model, 
    peft_config=lora_cfg,
    torch_dtype=COMPUTE_DTYPE,
).to(DEVICE).eval()

# Freeze reference model
for p in ppo_ref_model.parameters():
    p.requires_grad = False

# ────────────────────────────────────────────────────────────────
# 4) Prepare your DataLoader
# ────────────────────────────────────────────────────────────────
df = pd.read_csv(r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\combined_clinical_notes.csv")[["dialogue", "note"]]

class ClinDS(Dataset):
    def __init__(self, df, tok, max_len=512):
        self.df = df.reset_index(drop=True)
        self.tok = tok
        self.L = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i):
        conv = str(self.df.iloc[i]["dialogue"])
        ref = str(self.df.iloc[i]["note"])
        prompt = f"Summarize the following conversation:\n\n{conv}"
        enc = self.tok(
            prompt,
            truncation=True,
            padding="max_length",
            max_length=self.L,
            return_tensors="pt",
        )
        return {
            "input_ids": enc.input_ids.squeeze(),
            "attention_mask": enc.attention_mask.squeeze(),
            "src_txt": prompt,
            "ref_txt": ref,
        }

# Small dataset size for testing
loader = DataLoader(
    ClinDS(df.sample(5, random_state=0), tok),  # Start with just 5 samples
    batch_size=1, 
    shuffle=True, 
    pin_memory=False,  # Disable pin_memory
    drop_last=True,
    num_workers=0,  # Use single-process data loading
)

# ────────────────────────────────────────────────────────────────
# 5) Build PPOTrainer + optimizer
# ────────────────────────────────────────────────────────────────
NUM_CANDIDATES = 2

# Configure PPO with matching batch sizes
ppo_cfg = PPOConfig(
    batch_size=loader.batch_size * NUM_CANDIDATES,
    mini_batch_size=loader.batch_size * NUM_CANDIDATES,
    gradient_accumulation_steps=1,
    learning_rate=2e-5,
    optimize_device_cache=True if CUDA_AVAILABLE else False,
    log_with=None,  # Disable wandb/tensorboard logging
)

# Optimizer
optimizer = torch.optim.AdamW(
    filter(lambda p: p.requires_grad, ppo_model.parameters()),
    lr=2e-5,
    eps=1e-5,
)

# Create PPO trainer
ppo_trainer = PPOTrainer(
    config=ppo_cfg,
    model=ppo_model,
    ref_model=ppo_ref_model,
    tokenizer=tok,
    optimizer=optimizer,
    data_collator=None,
)

# ────────────────────────────────────────────────────────────────
# 6) Improved PPO Training Loop with Careful Device Management
# ────────────────────────────────────────────────────────────────
# Generation parameters
gen_kwargs = {
    "max_new_tokens": 64,
    "do_sample": True,
    "pad_token_id": tok.eos_token_id,
    "top_p": 0.9,
    "temperature": 0.7,
}

def get_model_device(model):
    """Helper to get the device of a model by checking its parameters"""
    return next(model.parameters()).device

def train_ppo_loop():
    """Improved PPO training loop with careful device management"""
    print(f"Starting PPO training on {DEVICE}")
    print(f"Model device: {get_model_device(ppo_model)}")
    
    try:
        for epoch in range(1):
            print(f"Starting epoch {epoch+1}")
            
            for batch_idx, batch in enumerate(loader):
                print(f"\nProcessing batch {batch_idx}")
                
                try:
                    # 1. Move batch to device
                    ids = batch["input_ids"].to(DEVICE)
                    attn_mask = batch["attention_mask"].to(DEVICE)
                    src_txt = batch["src_txt"]
                    ref_txt = batch["ref_txt"]
                    
                    print(f"Input tensor device: {ids.device}")
                    
                    # 2. Generate candidates
                    print("Generating candidates...")
                    all_candidates = []
                    decoded_candidates = []
                    
                    for c_idx in range(NUM_CANDIDATES):
                        with torch.no_grad():
                            # Generate
                            output = ppo_model.generate(
                                input_ids=ids,
                                attention_mask=attn_mask,
                                **gen_kwargs
                            )
                            all_candidates.append(output)
                            
                            # Decode for evaluation
                            decoded = tok.decode(output[0], skip_special_tokens=True)
                            decoded_candidates.append(decoded)
                            print(f"Candidate {c_idx+1}: {decoded[:50]}...")
                    
                    # Stack candidates
                    candidates = torch.stack(all_candidates, dim=1)
                    B, K, _ = candidates.shape
                    print(f"Candidates shape: {candidates.shape}, device: {candidates.device}")
                    
                    # 3. Get scores using UniEval
                    print("Calculating UniEval scores...")
                    scores = unieval_4way(
                        [src_txt[0]] * K,
                        decoded_candidates,
                        [ref_txt[0]] * K
                    ).numpy()
                    print(f"UniEval scores: {scores}")
                    
                    # 4. Calculate dominance rewards
                    print("Calculating dominance rewards...")
                    dom_counts = np.zeros(K)
                    for i in range(K):
                        for j in range(K):
                            if i == j:
                                continue
                            # Check if i dominates j
                            if np.all(scores[i] >= scores[j]) and np.any(scores[i] > scores[j]):
                                dom_counts[i] += 1
                    
                    max_dom = K - 1
                    rewards = 2 * (dom_counts / max_dom) - 1 if max_dom > 0 else np.zeros(K)
                    print(f"Dominance rewards: {rewards}")
                    
                    # 5. Prepare inputs for PPO step
                    print("Preparing PPO inputs...")
                    flat_queries = []
                    flat_responses = []
                    flat_rewards = []
                    
                    for b in range(B):
                        for k in range(K):
                            flat_queries.append(ids[b])
                            flat_responses.append(candidates[b, k])
                            # Create reward tensor on correct device
                            reward_tensor = torch.tensor([rewards[k]], dtype=COMPUTE_DTYPE, device=DEVICE)
                            flat_rewards.append(reward_tensor)
                    
                    # Verify shapes for PPO
                    print(f"Queries: {len(flat_queries)}, device: {flat_queries[0].device}")
                    print(f"Responses: {len(flat_responses)}, device: {flat_responses[0].device}")
                    print(f"Rewards: {len(flat_rewards)}, device: {flat_rewards[0].device}")
                    
                    # 6. Run PPO step
                    print("Executing PPO step...")
                    stats = ppo_trainer.step(
                        queries=flat_queries,
                        responses=flat_responses,
                        scores=flat_rewards
                    )
                    
                    # 7. Log success
                    print(f"✅ PPO step successful for batch {batch_idx}!")
                    print(f"Stats: {stats}")
                    
                    # Clean up memory
                    if CUDA_AVAILABLE:
                        torch.cuda.empty_cache()
                    gc.collect()
                    
                except Exception as e:
                    print(f"❌ Error processing batch {batch_idx}: {e}")
                    import traceback
                    traceback.print_exc()
                    
                    # Try to recover and continue with next batch
                    if CUDA_AVAILABLE:
                        torch.cuda.empty_cache()
                    gc.collect()
                    continue
                
                # For testing purposes, just do a few batches
                if batch_idx >= 2:
                    print("\nTest run completed with 3 batches")
                    break
            
            print(f"\n✅ Epoch {epoch+1} complete")
            
            # Save the model after each epoch
            print("Saving model checkpoint...")
            save_path = os.path.join(SAVE_DIR, f"epoch_{epoch+1}")
            os.makedirs(save_path, exist_ok=True)
            ppo_model.save_pretrained(save_path)
            
            # Save the tokenizer
            tok.save_pretrained(save_path)
        
        print("\n🎉 PPO fine-tuning complete")
        
    except Exception as e:
        print(f"\n❌ Training error: {e}")
        import traceback
        traceback.print_exc()

# Run training
train_ppo_loop() 

Using device: cuda
Will save checkpoints to: ppo_checkpoints


  from .autonotebook import tqdm as notebook_tqdm
Some weights of BartForCausalLM were not initialized from the model checkpoint at D:\kshitij-weights-folder\bart_clinical_ft-20250422T192130Z-001\bart_clinical_ft and are newly initialized: ['lm_head.weight', 'model.decoder.embed_tokens.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Starting PPO training on cuda
Model device: cuda:0
Starting epoch 1

Processing batch 0
Input tensor device: cuda:0
Generating candidates...


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Candidate 1: Summarize the following conversation:

[doctor] so...
Candidate 2: Summarize the following conversation:

[doctor] so...
Candidates shape: torch.Size([1, 2, 576]), device: cuda:0
Calculating UniEval scores...
Evaluating coherence of 2 samples !!!


100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.09s/it]


Evaluating consistency of 2 samples !!!


100%|██████████████████████████████████████████████████████████████████████████| 8/8 [01:52<00:00, 14.12s/it]


Evaluating fluency of 2 samples !!!


100%|██████████████████████████████████████████████████████████████████████████| 8/8 [00:04<00:00,  1.75it/s]


Evaluating relevance of 2 samples !!!


100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.93s/it]

UniEval scores: [[0.05010976 0.90272814 0.5243288  0.39502192]
 [0.01079702 0.90268236 0.5245611  0.04552022]]
Calculating dominance rewards...
Dominance rewards: [-1. -1.]
Preparing PPO inputs...
Queries: 2, device: cuda:0
Responses: 2, device: cuda:0
Rewards: 2, device: cuda:0
Executing PPO step...
❌ Error processing batch 0: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)`

❌ Training error: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.




Traceback (most recent call last):
  File "C:\Users\BMSCE CSE.DESKTOP-IUB6THA\AppData\Local\Temp\ipykernel_17688\3337554649.py", line 288, in train_ppo_loop
    stats = ppo_trainer.step(
            ^^^^^^^^^^^^^^^^^
  File "C:\Users\BMSCE CSE.DESKTOP-IUB6THA\.conda\envs\kshitij\Lib\contextlib.py", line 81, in inner
    return func(*args, **kwds)
           ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\BMSCE CSE.DESKTOP-IUB6THA\.conda\envs\kshitij\Lib\site-packages\trl\trainer\ppo_trainer.py", line 706, in step
    all_logprobs, logits_or_none, values, masks = self.batched_forward_pass(
                                                  ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\BMSCE CSE.DESKTOP-IUB6THA\.conda\envs\kshitij\Lib\contextlib.py", line 81, in inner
    return func(*args, **kwds)
           ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\BMSCE CSE.DESKTOP-IUB6THA\.conda\envs\kshitij\Lib\site-packages\trl\trainer\ppo_trainer.py", line 978, in batched_forward_pass
    logits, _, values = model(

In [9]:
print("STAR FROM HEREEEEE")

STAR FROM HEREEEEE


In [1]:
# ════════════════════════════════════════════════════════════════
# Custom Reward-Based Training (Without TRL)
# ════════════════════════════════════════════════════════════════
import os, gc, torch, pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Disable CUDA to avoid device conflicts
os.environ["CUDA_VISIBLE_DEVICES"] = ""
DEVICE = "cpu"
print(f"Using CPU for all operations")

# Create save directory
SAVE_DIR = "reward_checkpoints"
os.makedirs(SAVE_DIR, exist_ok=True)

# ────────────────────────────────────────────────────────────────
# 1) UniEval scorer
# ────────────────────────────────────────────────────────────────
import sys
sys.path.append(r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\UniEval")
from utils import convert_to_json
from metric.evaluator import get_evaluator

sum_eval = get_evaluator("summarization", device="cpu")

@torch.inference_mode()
def get_unieval_scores(src, hyp, ref):
    """Get UniEval scores for generated summaries"""
    data = convert_to_json(
        output_list=hyp,
        src_list=src,
        ref_list=ref,
    )
    raw = sum_eval.evaluate(data)
    scores = [
        [d["coherence"], d["consistency"], d["fluency"], d["relevance"]]
        for d in raw
    ]
    return torch.tensor(scores, dtype=torch.float32)

# ────────────────────────────────────────────────────────────────
# 2) Load GPT-2 Model and tokenizer
# ────────────────────────────────────────────────────────────────
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from peft import LoraConfig, get_peft_model

BASE_MODEL = "gpt2"
base = GPT2LMHeadModel.from_pretrained(BASE_MODEL)
tokenizer = GPT2Tokenizer.from_pretrained(BASE_MODEL)

# Set padding
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    base.config.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"

# Add LoRA for efficient fine-tuning
lora_cfg = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["c_attn", "c_proj"],
)
model = get_peft_model(base, lora_cfg)

# ────────────────────────────────────────────────────────────────
# 3) Dataset with sampling
# ────────────────────────────────────────────────────────────────
df = pd.read_csv(r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\combined_clinical_notes.csv")[["dialogue", "note"]]

class ClinicalDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=512):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        dialogue = str(self.df.iloc[idx]["dialogue"])
        reference = str(self.df.iloc[idx]["note"])
        prompt = f"Summarize the following conversation:\n\n{dialogue}"
        
        inputs = self.tokenizer(
            prompt, 
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        
        return {
            "input_ids": inputs.input_ids.squeeze(),
            "attention_mask": inputs.attention_mask.squeeze(),
            "prompt": prompt,
            "reference": reference,
            "dialogue": dialogue
        }

# Create a small dataset to demonstrate the concept
dataset = ClinicalDataset(df.sample(10, random_state=42), tokenizer)
loader = DataLoader(dataset, batch_size=1, shuffle=True)

# ────────────────────────────────────────────────────────────────
# 4) Custom reward-weighted training loop
# ────────────────────────────────────────────────────────────────
NUM_EPOCHS = 3
NUM_CANDIDATES = 2
LEARNING_RATE = 5e-5  # Higher learning rate for more impact

# Initialize optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# We'll keep track of performance
all_metrics = []

def train_with_rewards():
    print("Starting reward-based training (no TRL)")
    
    for epoch in range(NUM_EPOCHS):
        print(f"\n{'='*30}\nEpoch {epoch+1}/{NUM_EPOCHS}\n{'='*30}")
        epoch_metrics = []
        
        for batch_idx, batch in enumerate(tqdm(loader, desc=f"Epoch {epoch+1}")):
            try:
                # Each batch contains a single example
                input_ids = batch["input_ids"]
                attention_mask = batch["attention_mask"]
                dialogue = batch["dialogue"][0]  # String
                reference = batch["reference"][0]  # String
                
                # 1. Generate multiple candidates
                candidates = []
                candidate_texts = []
                
                # Generate candidates
                for _ in range(NUM_CANDIDATES):
                    # Forward pass with model to get generation loss
                    output = model(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=input_ids  # Using input as target for causal LM
                    )
                    
                    # Generate text
                    with torch.no_grad():
                        generated = model.generate(
                            input_ids=input_ids,
                            attention_mask=attention_mask,
                            do_sample=True,
                            temperature=0.7,
                            max_new_tokens=64,
                            num_return_sequences=1,
                            pad_token_id=tokenizer.eos_token_id
                        )
                    
                    # Save generation and decode
                    candidates.append(generated)
                    text = tokenizer.decode(generated[0], skip_special_tokens=True)
                    candidate_texts.append(text)
                    print(f"Candidate {len(candidate_texts)}: {text[:50]}...")
                
                # 2. Score candidates with UniEval
                scores = get_unieval_scores(
                    src=[dialogue] * NUM_CANDIDATES,
                    hyp=candidate_texts,
                    ref=[reference] * NUM_CANDIDATES
                ).numpy()
                
                print(f"UniEval scores: {scores}")
                
                # 3. Identify the best candidate
                total_scores = scores.sum(axis=1)
                best_idx = np.argmax(total_scores)
                best_score = total_scores[best_idx]
                
                print(f"Best candidate: {best_idx+1} with score {best_score:.4f}")
                
                # 4. Retrain model to make best candidate more likely
                # Get the model output for the best candidate
                optimizer.zero_grad()
                
                # Run model on best candidate
                best_candidate = candidates[best_idx]
                
                # Compute language modeling loss for the best candidate
                outputs = model(
                    input_ids=best_candidate,
                    labels=best_candidate
                )
                loss = outputs.loss
                
                # Weight the loss by the score - better generations = stronger signal
                weighted_loss = loss * (1.0 / (best_score + 1.0))  # +1 to prevent division by zero
                
                # Backpropagate and update
                weighted_loss.backward()
                optimizer.step()
                
                # Log metrics
                metric_entry = {
                    "batch": batch_idx,
                    "epoch": epoch,
                    "best_score": best_score.item(),
                    "loss": loss.item(),
                    "weighted_loss": weighted_loss.item()
                }
                epoch_metrics.append(metric_entry)
                
                print(f"Loss: {loss.item():.4f}, Weighted Loss: {weighted_loss.item():.4f}")
                
            except Exception as e:
                print(f"Error processing batch {batch_idx}: {e}")
                import traceback
                traceback.print_exc()
                continue
                
            # For demonstration, process a few batches
            if batch_idx >= 3 and epoch == 0:
                print("Processed 4 batches in first epoch for demonstration")
                break
                
        # Save model after each epoch
        save_path = os.path.join(SAVE_DIR, f"epoch_{epoch+1}")
        os.makedirs(save_path, exist_ok=True)
        model.save_pretrained(save_path)
        tokenizer.save_pretrained(save_path)
        
        # Save metrics
        all_metrics.extend(epoch_metrics)
        metrics_df = pd.DataFrame(all_metrics)
        metrics_df.to_csv(os.path.join(SAVE_DIR, "training_metrics.csv"), index=False)
        
        epoch_avg_score = np.mean([m["best_score"] for m in epoch_metrics])
        print(f"Epoch {epoch+1} complete. Average score: {epoch_avg_score:.4f}")
    
    print("\n🎉 Reward-based training complete!")
    return all_metrics

# Run the training
metrics = train_with_rewards()

# ────────────────────────────────────────────────────────────────
# 5) Visualize training progress
# ────────────────────────────────────────────────────────────────
import matplotlib.pyplot as plt

metrics_df = pd.DataFrame(metrics)

plt.figure(figsize=(10, 6))
plt.plot(metrics_df["batch"], metrics_df["best_score"], marker='o')
plt.xlabel('Batch')
plt.ylabel('Best Candidate Score')
plt.title('Training Progress: Best Candidate Score Over Time')
plt.savefig(os.path.join(SAVE_DIR, "training_progress.png"))
plt.close()

# Plot loss
plt.figure(figsize=(10, 6))
plt.plot(metrics_df["batch"], metrics_df["loss"], label='Original Loss')
plt.plot(metrics_df["batch"], metrics_df["weighted_loss"], label='Weighted Loss')
plt.xlabel('Batch')
plt.ylabel('Loss')l
plt.title('Training Losses Over Time')
plt.legend()
plt.savefig(os.path.join(SAVE_DIR, "training_losses.png"))

Using CPU for all operations


  from .autonotebook import tqdm as notebook_tqdm
Could not load bitsandbytes native library: list index out of range
Traceback (most recent call last):
  File "C:\Users\BMSCE CSE.DESKTOP-IUB6THA\.conda\envs\kshitij\Lib\site-packages\bitsandbytes\cextension.py", line 85, in <module>
    lib = get_native_library()
          ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\BMSCE CSE.DESKTOP-IUB6THA\.conda\envs\kshitij\Lib\site-packages\bitsandbytes\cextension.py", line 64, in get_native_library
    cuda_specs = get_cuda_specs()
                 ^^^^^^^^^^^^^^^^
  File "C:\Users\BMSCE CSE.DESKTOP-IUB6THA\.conda\envs\kshitij\Lib\site-packages\bitsandbytes\cuda_specs.py", line 38, in get_cuda_specs
    highest_compute_capability=(get_compute_capabilities()[-1]),
                                ~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^
IndexError: list index out of range

CUDA Setup failed despite CUDA being available. Please run the following command to get more information:

python -m bitsandbytes

Inspect the 

Starting reward-based training (no TRL)

Epoch 1/3


Epoch 1:   0%|                                                                        | 0/10 [00:00<?, ?it/s]

Candidate 1: Summarize the following conversation:

[doctor] he...
Candidate 2: Summarize the following conversation:

[doctor] he...
Evaluating coherence of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.05s/it][A


Evaluating consistency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.74s/it][A


Evaluating fluency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.35s/it][A


Evaluating relevance of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.03s/it][A


UniEval scores: [[0.8156485  0.51292425 0.62240136 0.19431536]
 [0.67899764 0.66789716 0.5546173  0.18404898]]
Best candidate: 1 with score 2.1453


Epoch 1:  10%|██████▍                                                         | 1/10 [00:17<02:38, 17.62s/it]

Loss: 3.2263, Weighted Loss: 1.0258
Candidate 1: Summarize the following conversation:

[doctor] hi...
Candidate 2: Summarize the following conversation:

[doctor] hi...
Evaluating coherence of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.07s/it][A


Evaluating consistency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.04s/it][A


Evaluating fluency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.62s/it][A


Evaluating relevance of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.06s/it][A


UniEval scores: [[0.12032465 0.09880885 0.17077254 0.35931796]
 [0.23346081 0.2542215  0.1789969  0.32776153]]
Best candidate: 2 with score 0.9944


Epoch 1:  20%|████████████▊                                                   | 2/10 [00:32<02:09, 16.24s/it]

Loss: 3.2906, Weighted Loss: 1.6499
Candidate 1: Summarize the following conversation:

[doctor] ok...
Candidate 2: Summarize the following conversation:

[doctor] ok...
Evaluating coherence of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.20s/it][A


Evaluating consistency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.28s/it][A


Evaluating fluency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.59s/it][A


Evaluating relevance of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.19s/it][A


UniEval scores: [[0.87280875 0.89389765 0.5762318  0.78019637]
 [0.7488114  0.73546976 0.73058736 0.58123803]]
Best candidate: 1 with score 3.1231


Epoch 1:  30%|███████████████████▏                                            | 3/10 [00:48<01:51, 15.97s/it]

Loss: 3.1939, Weighted Loss: 0.7746
Candidate 1: Summarize the following conversation:

[doctor] hi...
Candidate 2: Summarize the following conversation:

[doctor] hi...
Evaluating coherence of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.02s/it][A


Evaluating consistency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:08<00:00,  8.70s/it][A


Evaluating fluency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.10s/it][A


Evaluating relevance of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.07s/it][A


UniEval scores: [[0.75040156 0.7281368  0.7843533  0.72957665]
 [0.36572725 0.54545105 0.7890035  0.49600145]]
Best candidate: 1 with score 2.9925


Epoch 1:  30%|███████████████████▏                                            | 3/10 [01:11<02:47, 23.97s/it]

Loss: 3.4739, Weighted Loss: 0.8701
Processed 4 batches in first epoch for demonstration





Epoch 1 complete. Average score: 2.3138

Epoch 2/3


Epoch 2:   0%|                                                                        | 0/10 [00:00<?, ?it/s]

Candidate 1: Summarize the following conversation:

[doctor] hi...
Candidate 2: Summarize the following conversation:

[doctor] hi...
Evaluating coherence of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.10s/it][A


Evaluating consistency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.10s/it][A


Evaluating fluency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.48s/it][A


Evaluating relevance of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.98s/it][A


UniEval scores: [[0.9460837  0.93651366 0.79167867 0.6155328 ]
 [0.9339346  0.94431704 0.7979428  0.62661076]]
Best candidate: 2 with score 3.3028


Epoch 2:  10%|██████▍                                                         | 1/10 [00:15<02:16, 15.12s/it]

Loss: 2.7943, Weighted Loss: 0.6494
Candidate 1: Summarize the following conversation:

[doctor] ka...
Candidate 2: Summarize the following conversation:

[doctor] ka...
Evaluating coherence of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.14s/it][A


Evaluating consistency of 2 samples !!!



  0%|                                                                                 | 0/11 [00:00<?, ?it/s][A
  9%|██████▋                                                                  | 1/11 [00:14<02:23, 14.31s/it][A
 18%|█████████████▎                                                           | 2/11 [00:28<02:08, 14.29s/it][A
 27%|███████████████████▉                                                     | 3/11 [00:43<01:56, 14.57s/it][A
 36%|██████████████████████████▌                                              | 4/11 [00:57<01:40, 14.34s/it][A
 45%|█████████████████████████████████▏                                       | 5/11 [01:12<01:27, 14.61s/it][A
 55%|███████████████████████████████████████▊                                 | 6/11 [01:26<01:11, 14.37s/it][A
 64%|██████████████████████████████████████████████▍                          | 7/11 [01:41<00:58, 14.57s/it][A
 73%|█████████████████████████████████████████████████████                    | 8/11 [01:55<00:

Evaluating fluency of 2 samples !!!



  0%|                                                                                 | 0/11 [00:00<?, ?it/s][A
  9%|██████▋                                                                  | 1/11 [00:00<00:07,  1.36it/s][A
 18%|█████████████▎                                                           | 2/11 [00:01<00:04,  2.01it/s][A
 27%|███████████████████▉                                                     | 3/11 [00:01<00:03,  2.34it/s][A
 36%|██████████████████████████▌                                              | 4/11 [00:01<00:02,  2.35it/s][A
 45%|█████████████████████████████████▏                                       | 5/11 [00:02<00:02,  2.45it/s][A
 55%|███████████████████████████████████████▊                                 | 6/11 [00:02<00:02,  2.19it/s][A
 64%|██████████████████████████████████████████████▍                          | 7/11 [00:03<00:01,  2.46it/s][A
 73%|█████████████████████████████████████████████████████                    | 8/11 [00:03<00:

Evaluating relevance of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.04s/it][A


UniEval scores: [[0.9370204 0.8676779 0.6193464 0.9311132]
 [0.9311306 0.8629612 0.6395685 0.9278866]]
Best candidate: 2 with score 3.3615


Epoch 2:  20%|████████████▌                                                  | 2/10 [03:00<13:48, 103.59s/it]

Loss: 3.1011, Weighted Loss: 0.7110
Candidate 1: Summarize the following conversation:

[doctor] th...
Candidate 2: Summarize the following conversation:

[doctor] th...
Evaluating coherence of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.01s/it][A


Evaluating consistency of 2 samples !!!



  0%|                                                                                  | 0/7 [00:00<?, ?it/s][A
 14%|██████████▌                                                               | 1/7 [00:15<01:35, 15.96s/it][A
 29%|█████████████████████▏                                                    | 2/7 [00:29<01:13, 14.72s/it][A
 43%|███████████████████████████████▋                                          | 3/7 [00:45<01:01, 15.26s/it][A
 57%|██████████████████████████████████████████▎                               | 4/7 [00:59<00:44, 14.67s/it][A
 71%|████████████████████████████████████████████████████▊                     | 5/7 [01:15<00:30, 15.13s/it][A
 86%|███████████████████████████████████████████████████████████████▍          | 6/7 [01:29<00:14, 14.69s/it][A
100%|██████████████████████████████████████████████████████████████████████████| 7/7 [01:37<00:00, 13.91s/it][A


Evaluating fluency of 2 samples !!!



  0%|                                                                                  | 0/7 [00:00<?, ?it/s][A
 14%|██████████▌                                                               | 1/7 [00:00<00:03,  1.59it/s][A
 29%|█████████████████████▏                                                    | 2/7 [00:01<00:02,  1.71it/s][A
 43%|███████████████████████████████▋                                          | 3/7 [00:01<00:02,  1.61it/s][A
 57%|██████████████████████████████████████████▎                               | 4/7 [00:02<00:01,  1.85it/s][A
 71%|████████████████████████████████████████████████████▊                     | 5/7 [00:02<00:01,  1.88it/s][A
 86%|███████████████████████████████████████████████████████████████▍          | 6/7 [00:03<00:00,  1.73it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 7/7 [00:03<00:00,  1.79it/s][A


Evaluating relevance of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.97s/it][A


UniEval scores: [[0.54006886 0.8651444  0.4970894  0.5230327 ]
 [0.6105341  0.8697496  0.45273086 0.6482121 ]]
Best candidate: 2 with score 2.5812


Epoch 2:  30%|██████████████████▉                                            | 3/10 [04:52<12:31, 107.29s/it]

Loss: 3.8384, Weighted Loss: 1.0718
Candidate 1: Summarize the following conversation:

[doctor] pa...
Candidate 2: Summarize the following conversation:

[doctor] pa...
Evaluating coherence of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.03s/it][A


Evaluating consistency of 2 samples !!!



  0%|                                                                                 | 0/11 [00:00<?, ?it/s][A
  9%|██████▋                                                                  | 1/11 [00:14<02:21, 14.18s/it][A
 18%|█████████████▎                                                           | 2/11 [00:27<02:05, 13.94s/it][A
 27%|███████████████████▉                                                     | 3/11 [00:43<01:58, 14.79s/it][A
 36%|██████████████████████████▌                                              | 4/11 [00:58<01:42, 14.69s/it][A
 45%|█████████████████████████████████▏                                       | 5/11 [01:15<01:32, 15.49s/it][A
 55%|███████████████████████████████████████▊                                 | 6/11 [01:29<01:14, 14.92s/it][A
 64%|██████████████████████████████████████████████▍                          | 7/11 [01:44<01:00, 15.15s/it][A
 73%|█████████████████████████████████████████████████████                    | 8/11 [01:58<00:

Evaluating fluency of 2 samples !!!



  0%|                                                                                 | 0/11 [00:00<?, ?it/s][A
  9%|██████▋                                                                  | 1/11 [00:00<00:05,  1.95it/s][A
 18%|█████████████▎                                                           | 2/11 [00:00<00:03,  2.37it/s][A
 27%|███████████████████▉                                                     | 3/11 [00:01<00:03,  2.51it/s][A
 36%|██████████████████████████▌                                              | 4/11 [00:01<00:02,  2.50it/s][A
 45%|█████████████████████████████████▏                                       | 5/11 [00:02<00:02,  2.32it/s][A
 55%|███████████████████████████████████████▊                                 | 6/11 [00:02<00:01,  2.51it/s][A
 64%|██████████████████████████████████████████████▍                          | 7/11 [00:02<00:01,  2.51it/s][A
 73%|█████████████████████████████████████████████████████                    | 8/11 [00:03<00:

Evaluating relevance of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.97s/it][A


UniEval scores: [[0.963919   0.91738725 0.80172455 0.9281089 ]
 [0.95988935 0.9361755  0.7940443  0.9193229 ]]
Best candidate: 1 with score 3.6111


Epoch 2:  40%|█████████████████████████▏                                     | 4/10 [07:40<13:06, 131.13s/it]

Loss: 2.4791, Weighted Loss: 0.5376
Candidate 1: Summarize the following conversation:

[doctor] he...
Candidate 2: Summarize the following conversation:

[doctor] he...
Evaluating coherence of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.95s/it][A


Evaluating consistency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.00s/it][A


Evaluating fluency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.44s/it][A


Evaluating relevance of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.94s/it][A


UniEval scores: [[0.93311125 0.93273103 0.62301916 0.5334728 ]
 [0.9153697  0.916284   0.6688446  0.5619631 ]]
Best candidate: 2 with score 3.0625


Epoch 2:  50%|████████████████████████████████                                | 5/10 [07:54<07:25, 89.16s/it]

Loss: 2.7213, Weighted Loss: 0.6699
Candidate 1: Summarize the following conversation:

[doctor] he...
Candidate 2: Summarize the following conversation:

[doctor] he...
Evaluating coherence of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.04s/it][A


Evaluating consistency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.97s/it][A


Evaluating fluency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.53s/it][A


Evaluating relevance of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.98s/it][A


UniEval scores: [[0.5331384  0.5413694  0.6151794  0.21584497]
 [0.8403657  0.8543995  0.63494474 0.19231147]]
Best candidate: 2 with score 2.5220


Epoch 2:  60%|██████████████████████████████████████▍                         | 6/10 [08:09<04:15, 63.92s/it]

Loss: 3.1836, Weighted Loss: 0.9039
Candidate 1: Summarize the following conversation:

[doctor] hi...
Candidate 2: Summarize the following conversation:

[doctor] hi...
Evaluating coherence of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.08s/it][A


Evaluating consistency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.97s/it][A


Evaluating fluency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.45s/it][A


Evaluating relevance of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.97s/it][A


UniEval scores: [[0.13086264 0.17930661 0.1790385  0.38674003]
 [0.3100764  0.73759663 0.1800412  0.3527536 ]]
Best candidate: 2 with score 1.5805


Epoch 2:  70%|████████████████████████████████████████████▊                   | 7/10 [08:24<02:23, 47.86s/it]

Loss: 3.2222, Weighted Loss: 1.2487
Candidate 1: Summarize the following conversation:

[doctor] hi...
Candidate 2: Summarize the following conversation:

[doctor] hi...
Evaluating coherence of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.06s/it][A


Evaluating consistency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:06<00:00,  6.23s/it][A


Evaluating fluency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.27s/it][A


Evaluating relevance of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.08s/it][A


UniEval scores: [[0.6837006  0.73258626 0.8014618  0.73102564]
 [0.35653263 0.4764218  0.54118085 0.5983527 ]]
Best candidate: 1 with score 2.9488


Epoch 2:  80%|███████████████████████████████████████████████████▏            | 8/10 [08:44<01:17, 39.00s/it]

Loss: 3.3408, Weighted Loss: 0.8460
Candidate 1: Summarize the following conversation:

[doctor] hi...
Candidate 2: Summarize the following conversation:

[doctor] hi...
Evaluating coherence of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.15s/it][A


Evaluating consistency of 2 samples !!!



  0%|                                                                                 | 0/10 [00:00<?, ?it/s][A
 10%|███████▎                                                                 | 1/10 [00:16<02:30, 16.74s/it][A
 20%|██████████████▌                                                          | 2/10 [00:30<02:00, 15.01s/it][A
 30%|█████████████████████▉                                                   | 3/10 [00:46<01:48, 15.48s/it][A
 40%|█████████████████████████████▏                                           | 4/10 [01:00<01:28, 14.83s/it][A
 50%|████████████████████████████████████▌                                    | 5/10 [01:16<01:16, 15.22s/it][A
 60%|███████████████████████████████████████████▊                             | 6/10 [01:30<00:59, 14.79s/it][A
 70%|███████████████████████████████████████████████████                      | 7/10 [01:46<00:45, 15.22s/it][A
 80%|██████████████████████████████████████████████████████████▍              | 8/10 [02:00<00:

Evaluating fluency of 2 samples !!!



  0%|                                                                                 | 0/10 [00:00<?, ?it/s][A
 10%|███████▎                                                                 | 1/10 [00:00<00:04,  2.21it/s][A
 20%|██████████████▌                                                          | 2/10 [00:00<00:03,  2.01it/s][A
 30%|█████████████████████▉                                                   | 3/10 [00:01<00:03,  2.29it/s][A
 40%|█████████████████████████████▏                                           | 4/10 [00:01<00:02,  2.30it/s][A
 50%|████████████████████████████████████▌                                    | 5/10 [00:02<00:02,  2.31it/s][A
 60%|███████████████████████████████████████████▊                             | 6/10 [00:02<00:01,  2.29it/s][A
 70%|███████████████████████████████████████████████████                      | 7/10 [00:03<00:01,  2.32it/s][A
 80%|██████████████████████████████████████████████████████████▍              | 8/10 [00:03<00:

Evaluating relevance of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.98s/it][A


UniEval scores: [[0.87769604 0.9130933  0.6716667  0.8509167 ]
 [0.94064003 0.9007478  0.6831777  0.89207697]]
Best candidate: 2 with score 3.4166


Epoch 2:  90%|█████████████████████████████████████████████████████████▌      | 9/10 [11:18<01:14, 74.95s/it]

Loss: 2.7297, Weighted Loss: 0.6180
Candidate 1: Summarize the following conversation:

[doctor] ok...
Candidate 2: Summarize the following conversation:

[doctor] ok...
Evaluating coherence of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.05s/it][A


Evaluating consistency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.99s/it][A


Evaluating fluency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.47s/it][A


Evaluating relevance of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.04s/it][A


UniEval scores: [[0.867584   0.8780972  0.6831967  0.76737076]
 [0.908229   0.9096109  0.59641767 0.7629183 ]]
Best candidate: 1 with score 3.1962


Epoch 2: 100%|███████████████████████████████████████████████████████████████| 10/10 [11:33<00:00, 69.34s/it]

Loss: 3.1470, Weighted Loss: 0.7499





Epoch 2 complete. Average score: 2.9583

Epoch 3/3


Epoch 3:   0%|                                                                        | 0/10 [00:00<?, ?it/s]

Candidate 1: Summarize the following conversation:

[doctor] ka...
Candidate 2: Summarize the following conversation:

[doctor] ka...
Evaluating coherence of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.04s/it][A


Evaluating consistency of 2 samples !!!



  0%|                                                                                 | 0/11 [00:00<?, ?it/s][A
  9%|██████▋                                                                  | 1/11 [00:15<02:39, 15.95s/it][A
 18%|█████████████▎                                                           | 2/11 [00:29<02:11, 14.66s/it][A
 27%|███████████████████▉                                                     | 3/11 [00:45<02:02, 15.25s/it][A
 36%|██████████████████████████▌                                              | 4/11 [00:59<01:43, 14.83s/it][A
 45%|█████████████████████████████████▏                                       | 5/11 [01:15<01:31, 15.25s/it][A
 55%|███████████████████████████████████████▊                                 | 6/11 [01:29<01:13, 14.75s/it][A
 64%|██████████████████████████████████████████████▍                          | 7/11 [01:45<01:00, 15.10s/it][A
 73%|█████████████████████████████████████████████████████                    | 8/11 [01:59<00:

Evaluating fluency of 2 samples !!!



  0%|                                                                                 | 0/11 [00:00<?, ?it/s][A
  9%|██████▋                                                                  | 1/11 [00:00<00:07,  1.42it/s][A
 18%|█████████████▎                                                           | 2/11 [00:01<00:04,  1.97it/s][A
 27%|███████████████████▉                                                     | 3/11 [00:01<00:03,  2.30it/s][A
 36%|██████████████████████████▌                                              | 4/11 [00:01<00:02,  2.36it/s][A
 45%|█████████████████████████████████▏                                       | 5/11 [00:02<00:02,  2.62it/s][A
 55%|███████████████████████████████████████▊                                 | 6/11 [00:02<00:02,  2.34it/s][A
 64%|██████████████████████████████████████████████▍                          | 7/11 [00:02<00:01,  2.56it/s][A
 73%|█████████████████████████████████████████████████████                    | 8/11 [00:03<00:

Evaluating relevance of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.94s/it][A


UniEval scores: [[0.9176087  0.85493135 0.68302125 0.9105813 ]
 [0.93535894 0.9076342  0.6554592  0.9227168 ]]
Best candidate: 2 with score 3.4212


Epoch 3:  10%|██████▎                                                        | 1/10 [02:50<25:34, 170.47s/it]

Loss: 3.0959, Weighted Loss: 0.7002
Candidate 1: Summarize the following conversation:

[doctor] th...
Candidate 2: Summarize the following conversation:

[doctor] th...
Evaluating coherence of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.10s/it][A


Evaluating consistency of 2 samples !!!



  0%|                                                                                  | 0/7 [00:00<?, ?it/s][A
 14%|██████████▌                                                               | 1/7 [00:14<01:27, 14.62s/it][A
 29%|█████████████████████▏                                                    | 2/7 [00:28<01:10, 14.15s/it][A
 43%|███████████████████████████████▋                                          | 3/7 [00:44<00:59, 14.88s/it][A
 57%|██████████████████████████████████████████▎                               | 4/7 [00:57<00:43, 14.45s/it][A
 71%|████████████████████████████████████████████████████▊                     | 5/7 [01:13<00:29, 14.86s/it][A
 86%|███████████████████████████████████████████████████████████████▍          | 6/7 [01:27<00:14, 14.49s/it][A
100%|██████████████████████████████████████████████████████████████████████████| 7/7 [01:38<00:00, 14.07s/it][A


Evaluating fluency of 2 samples !!!



  0%|                                                                                  | 0/7 [00:00<?, ?it/s][A
 14%|██████████▌                                                               | 1/7 [00:00<00:03,  1.59it/s][A
 29%|█████████████████████▏                                                    | 2/7 [00:01<00:02,  1.75it/s][A
 43%|███████████████████████████████▋                                          | 3/7 [00:01<00:02,  1.64it/s][A
 57%|██████████████████████████████████████████▎                               | 4/7 [00:02<00:01,  1.88it/s][A
 71%|████████████████████████████████████████████████████▊                     | 5/7 [00:02<00:01,  1.87it/s][A
 86%|███████████████████████████████████████████████████████████████▍          | 6/7 [00:03<00:00,  1.72it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 7/7 [00:03<00:00,  1.84it/s][A


Evaluating relevance of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.97s/it][A


UniEval scores: [[0.80291355 0.8594468  0.48933992 0.82005095]
 [0.78175014 0.87534386 0.48537263 0.85947704]]
Best candidate: 2 with score 3.0019


Epoch 3:  20%|████████████▌                                                  | 2/10 [04:43<18:12, 136.54s/it]

Loss: 3.7720, Weighted Loss: 0.9426
Candidate 1: Summarize the following conversation:

[doctor] hi...
Candidate 2: Summarize the following conversation:

[doctor] hi...
Evaluating coherence of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.04s/it][A


Evaluating consistency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.03s/it][A


Evaluating fluency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.48s/it][A


Evaluating relevance of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.08s/it][A


UniEval scores: [[0.09042832 0.06878943 0.21315213 0.37108418]
 [0.1441121  0.18272017 0.17061749 0.3931847 ]]
Best candidate: 2 with score 0.8906


Epoch 3:  30%|███████████████████▏                                            | 3/10 [04:58<09:27, 81.05s/it]

Loss: 3.2289, Weighted Loss: 1.7079
Candidate 1: Summarize the following conversation:

[doctor] he...
Candidate 2: Summarize the following conversation:

[doctor] he...
Evaluating coherence of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.98s/it][A


Evaluating consistency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.22s/it][A


Evaluating fluency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.59s/it][A


Evaluating relevance of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.95s/it][A


UniEval scores: [[0.87785083 0.87469226 0.6273687  0.1789501 ]
 [0.81112665 0.8312623  0.5284885  0.18352531]]
Best candidate: 1 with score 2.5589


Epoch 3:  40%|█████████████████████████▌                                      | 4/10 [05:13<05:29, 54.99s/it]

Loss: 3.0553, Weighted Loss: 0.8585
Candidate 1: Summarize the following conversation:

[doctor] ok...
Candidate 2: Summarize the following conversation:

[doctor] ok...
Evaluating coherence of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.01s/it][A


Evaluating consistency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.04s/it][A


Evaluating fluency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.44s/it][A


Evaluating relevance of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.97s/it][A


UniEval scores: [[0.8433821  0.8518076  0.60936636 0.73403615]
 [0.8929249  0.9073397  0.6951779  0.75595355]]
Best candidate: 2 with score 3.2514


Epoch 3:  50%|████████████████████████████████                                | 5/10 [05:28<03:22, 40.52s/it]

Loss: 3.0752, Weighted Loss: 0.7233
Candidate 1: Summarize the following conversation:

[doctor] hi...
Candidate 2: Summarize the following conversation:

[doctor] hi...
Evaluating coherence of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.97s/it][A


Evaluating consistency of 2 samples !!!



  0%|                                                                                 | 0/10 [00:00<?, ?it/s][A
 10%|███████▎                                                                 | 1/10 [00:14<02:10, 14.54s/it][A
 20%|██████████████▌                                                          | 2/10 [00:28<01:53, 14.13s/it][A
 30%|█████████████████████▉                                                   | 3/10 [00:44<01:44, 14.88s/it][A
 40%|█████████████████████████████▏                                           | 4/10 [00:57<01:26, 14.44s/it][A
 50%|████████████████████████████████████▌                                    | 5/10 [01:13<01:14, 14.88s/it][A
 60%|███████████████████████████████████████████▊                             | 6/10 [01:27<00:58, 14.67s/it][A
 70%|███████████████████████████████████████████████████                      | 7/10 [01:43<00:44, 14.95s/it][A
 80%|██████████████████████████████████████████████████████████▍              | 8/10 [01:57<00:

Evaluating fluency of 2 samples !!!



  0%|                                                                                 | 0/10 [00:00<?, ?it/s][A
 10%|███████▎                                                                 | 1/10 [00:00<00:04,  1.91it/s][A
 20%|██████████████▌                                                          | 2/10 [00:01<00:04,  1.73it/s][A
 30%|█████████████████████▉                                                   | 3/10 [00:01<00:03,  2.10it/s][A
 40%|█████████████████████████████▏                                           | 4/10 [00:01<00:02,  2.07it/s][A
 50%|████████████████████████████████████▌                                    | 5/10 [00:02<00:02,  2.05it/s][A
 60%|███████████████████████████████████████████▊                             | 6/10 [00:02<00:01,  2.05it/s][A
 70%|███████████████████████████████████████████████████                      | 7/10 [00:03<00:01,  2.09it/s][A
 80%|██████████████████████████████████████████████████████████▍              | 8/10 [00:03<00:

Evaluating relevance of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.10s/it][A


UniEval scores: [[0.74687743 0.8746346  0.6658515  0.6630001 ]
 [0.9751532  0.9156578  0.6380515  0.92391837]]
Best candidate: 2 with score 3.4528


Epoch 3:  60%|██████████████████████████████████████▍                         | 6/10 [08:08<05:24, 81.11s/it]

Loss: 2.6165, Weighted Loss: 0.5876
Candidate 1: Summarize the following conversation:

[doctor] he...
Candidate 2: Summarize the following conversation:

[doctor] he...
Evaluating coherence of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.17s/it][A


Evaluating consistency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.69s/it][A


Evaluating fluency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.34s/it][A


Evaluating relevance of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.17s/it][A


UniEval scores: [[0.8441484  0.45822147 0.49691284 0.6073074 ]
 [0.8685372  0.8697154  0.6761829  0.5553409 ]]
Best candidate: 2 with score 2.9698


Epoch 3:  70%|████████████████████████████████████████████▊                   | 7/10 [08:25<03:01, 60.39s/it]

Loss: 2.6237, Weighted Loss: 0.6609
Candidate 1: Summarize the following conversation:

[doctor] pa...
Candidate 2: Summarize the following conversation:

[doctor] pa...
Evaluating coherence of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.09s/it][A


Evaluating consistency of 2 samples !!!



  0%|                                                                                 | 0/11 [00:00<?, ?it/s][A
  9%|██████▋                                                                  | 1/11 [00:16<02:41, 16.12s/it][A
 18%|█████████████▎                                                           | 2/11 [00:30<02:14, 14.95s/it][A
 27%|███████████████████▉                                                     | 3/11 [00:46<02:04, 15.53s/it][A
 36%|██████████████████████████▌                                              | 4/11 [01:00<01:45, 15.02s/it][A
 45%|█████████████████████████████████▏                                       | 5/11 [01:16<01:32, 15.43s/it][A
 55%|███████████████████████████████████████▊                                 | 6/11 [01:30<01:14, 14.96s/it][A
 64%|██████████████████████████████████████████████▍                          | 7/11 [01:47<01:01, 15.35s/it][A
 73%|█████████████████████████████████████████████████████                    | 8/11 [02:00<00:

Evaluating fluency of 2 samples !!!



  0%|                                                                                 | 0/11 [00:00<?, ?it/s][A
  9%|██████▋                                                                  | 1/11 [00:00<00:04,  2.33it/s][A
 18%|█████████████▎                                                           | 2/11 [00:00<00:04,  2.18it/s][A
 27%|███████████████████▉                                                     | 3/11 [00:01<00:03,  2.39it/s][A
 36%|██████████████████████████▌                                              | 4/11 [00:01<00:02,  2.44it/s][A
 45%|█████████████████████████████████▏                                       | 5/11 [00:02<00:02,  2.37it/s][A
 55%|███████████████████████████████████████▊                                 | 6/11 [00:02<00:02,  2.49it/s][A
 64%|██████████████████████████████████████████████▍                          | 7/11 [00:02<00:01,  2.71it/s][A
 73%|█████████████████████████████████████████████████████                    | 8/11 [00:03<00:

Evaluating relevance of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.97s/it][A


UniEval scores: [[0.9614441  0.91740984 0.80754805 0.927141  ]
 [0.96298325 0.91512823 0.79483646 0.92092985]]
Best candidate: 1 with score 3.6135


Epoch 3:  80%|███████████████████████████████████████████████████▏            | 8/10 [11:23<03:15, 97.74s/it]

Loss: 2.4185, Weighted Loss: 0.5242
Candidate 1: Summarize the following conversation:

[doctor] hi...
Candidate 2: Summarize the following conversation:

[doctor] hi...
Evaluating coherence of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.17s/it][A


Evaluating consistency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.57s/it][A


Evaluating fluency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.51s/it][A


Evaluating relevance of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.21s/it][A


UniEval scores: [[0.9212868  0.9364356  0.84396064 0.6389437 ]
 [0.96445614 0.80126816 0.40919822 0.65483016]]
Best candidate: 1 with score 3.3406


Epoch 3:  90%|█████████████████████████████████████████████████████████▌      | 9/10 [11:41<01:12, 72.75s/it]

Loss: 2.7187, Weighted Loss: 0.6263
Candidate 1: Summarize the following conversation:

[doctor] hi...
Candidate 2: Summarize the following conversation:

[doctor] hi...
Evaluating coherence of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.01s/it][A


Evaluating consistency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.03s/it][A


Evaluating fluency of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.60s/it][A


Evaluating relevance of 2 samples !!!



  0%|                                                                                  | 0/1 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.94s/it][A


UniEval scores: [[0.43471366 0.44009072 0.6994359  0.631724  ]
 [0.80084926 0.7911317  0.7820975  0.76566577]]
Best candidate: 2 with score 3.1397


Epoch 3: 100%|███████████████████████████████████████████████████████████████| 10/10 [11:56<00:00, 71.62s/it]

Loss: 3.4237, Weighted Loss: 0.8270





Epoch 3 complete. Average score: 2.9640

🎉 Reward-based training complete!


ModuleNotFoundError: No module named 'matplotlib'

In [3]:
# ════════════════════════════════════════════════════════════════
# Testing Script for PPO-Trained LoRA Weights
# ════════════════════════════════════════════════════════════════
import os
import torch
import pandas as pd
from tqdm import tqdm
import numpy as np

# Check if CUDA is available for inference
CUDA_AVAILABLE = torch.cuda.is_available()
DEVICE = "cuda" if CUDA_AVAILABLE else "cpu"
print(f"Using device: {DEVICE} for inference")

# ────────────────────────────────────────────────────────────────
# 1) Load the trained model weights
# ────────────────────────────────────────────────────────────────
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from peft import PeftModel, PeftConfig

# Path to the saved LoRA weights
LORA_PATH = "reward_checkpoints/epoch_3"  # Updated to use epoch 3 weights

# First load the base model
base_model = "gpt2"
base = GPT2LMHeadModel.from_pretrained(base_model)
tokenizer = GPT2Tokenizer.from_pretrained(LORA_PATH)  # Load from checkpoint

# Ensure pad token is set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    base.config.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"  # Important for decoder-only models

# Load the LoRA weights
print(f"Loading LoRA weights from {LORA_PATH}")
model = PeftModel.from_pretrained(base, LORA_PATH)
model = model.to(DEVICE)
model.eval()  # Set to evaluation mode

# ────────────────────────────────────────────────────────────────
# 2) Prepare test dataset
# ────────────────────────────────────────────────────────────────
# Load test data
df = pd.read_csv(r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\combined_clinical_notes.csv")
test_df = df.sample(20, random_state=42)  # Using 20 samples for testing

# Prepare test dialogues and references
dialogues = test_df["dialogue"].tolist()
references = test_df["note"].tolist()

# ────────────────────────────────────────────────────────────────
# 3) Generate summaries using trained model
# ────────────────────────────────────────────────────────────────
print("Generating summaries...")
batch_size = 4
num_samples = len(dialogues)
num_batches = (num_samples + batch_size - 1) // batch_size
predictions = []

for i in tqdm(range(num_batches)):
    start, end = i*batch_size, min((i+1)*batch_size, num_samples)
    convs = dialogues[start:end]
    prompts = [
        f"Summarize the following conversation:\n\n{c}"
        for c in convs if len(str(c).strip()) > 10
    ]
    if not prompts:
        continue
    
    # Tokenize
    enc = tokenizer(
        prompts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    ).to(DEVICE)
    
    # Generate
    with torch.no_grad():
        out_ids = model.generate(
            input_ids=enc.input_ids,
            attention_mask=enc.attention_mask,
            max_new_tokens=120,
            do_sample=False,
            num_beams=3,
            no_repeat_ngram_size=2
        )
    
    # Decode
    dec = tokenizer.batch_decode(out_ids, skip_special_tokens=True)
    predictions.extend(dec)

print(f"✅ Generated {len(predictions)} summaries.")

# ────────────────────────────────────────────────────────────────
# 4) Evaluate with UniEval
# ────────────────────────────────────────────────────────────────
import sys
sys.path.append(r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\UniEval")
from utils import convert_to_json
from metric.evaluator import get_evaluator

# Keep UniEval on CPU (more stable)
sum_eval = get_evaluator("summarization", device="cpu")

# Trim predictions and references to the same length
min_len = min(len(predictions), len(references), len(dialogues))
predictions = predictions[:min_len]
references = references[:min_len]
dialogues = dialogues[:min_len]

# Create JSON data for UniEval
print("Creating evaluation data...")
data = convert_to_json(
    src_list=[str(d) for d in dialogues],
    ref_list=[str(r) for r in references],
    output_list=[str(p) for p in predictions]
)

# Run evaluation
print("Running UniEval...")
scores = sum_eval.evaluate(data, print_result=True)

# ────────────────────────────────────────────────────────────────
# 5) Display results
# ────────────────────────────────────────────────────────────────
# Extract scores for each metric
coherence_scores = [item["coherence"] for item in scores]
consistency_scores = [item["consistency"] for item in scores]
fluency_scores = [item["fluency"] for item in scores]
relevance_scores = [item["relevance"] for item in scores]

# Calculate averages
avg_coherence = sum(coherence_scores) / len(coherence_scores)
avg_consistency = sum(consistency_scores) / len(consistency_scores)
avg_fluency = sum(fluency_scores) / len(fluency_scores)
avg_relevance = sum(relevance_scores) / len(relevance_scores)

print("\n=== Summary Evaluation Results ===")
print(f"Average Coherence: {avg_coherence:.4f}")
print(f"Average Consistency: {avg_consistency:.4f}")
print(f"Average Fluency: {avg_fluency:.4f}")
print(f"Average Relevance: {avg_relevance:.4f}")
print(f"Overall Average: {(avg_coherence + avg_consistency + avg_fluency + avg_relevance) / 4:.4f}")

# ────────────────────────────────────────────────────────────────
# 6) Sample output comparison
# ────────────────────────────────────────────────────────────────
print("\n=== Sample Output Comparison ===")
for i in range(min(3, min_len)):  # Show up to 3 examples
    print(f"\nExample {i+1}:")
    print(f"Dialogue: {dialogues[i][:100]}...")
    print(f"Reference: {references[i][:100]}...")
    print(f"Generated: {predictions[i][:100]}...")
    print(f"Scores: Coherence={coherence_scores[i]:.2f}, Consistency={consistency_scores[i]:.2f}, "
          f"Fluency={fluency_scores[i]:.2f}, Relevance={relevance_scores[i]:.2f}")

# ────────────────────────────────────────────────────────────────
# 7) Save results
# ────────────────────────────────────────────────────────────────
# Create a results dataframe
results_df = pd.DataFrame({
    'dialogue': dialogues[:min_len],
    'reference': references[:min_len],
    'prediction': predictions[:min_len],
    'coherence': coherence_scores,
    'consistency': consistency_scores,
    'fluency': fluency_scores,
    'relevance': relevance_scores
})

# Save to CSVl
results_file = "ppo_evaluation_results.csv"
results_df.to_csv(results_file, index=False)
print(f"\nResults saved to {results_file}")

Using device: cuda for inference




Loading LoRA weights from reward_checkpoints/epoch_3
Generating summaries...


  0%|                                                                                  | 0/5 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 20%|██████████████▊                                                           | 1/5 [00:02<00:11,  2.94s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 40%|█████████████████████████████▌                                            | 2/5 [00:05<00:08,  2.83s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 60%|████████████████████████████████████████████▍                             | 3/5 [00:08<00:05,  2.72s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 80%|███████████████████████████████████████████████████████████▏              | 4/5 [00:11<00:02,  2.72s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
100%|██████████████████████████████████████████████████████████████████████████| 5/5 [00:13<00:

✅ Generated 20 summaries.
Creating evaluation data...
Running UniEval...
Evaluating coherence of 20 samples !!!


100%|██████████████████████████████████████████████████████████████████████████| 3/3 [00:34<00:00, 11.44s/it]


Evaluating consistency of 20 samples !!!


100%|████████████████████████████████████████████████████████████████████████| 43/43 [10:06<00:00, 14.10s/it]


Evaluating fluency of 20 samples !!!


100%|████████████████████████████████████████████████████████████████████████| 43/43 [01:09<00:00,  1.62s/it]


Evaluating relevance of 20 samples !!!


100%|██████████████████████████████████████████████████████████████████████████| 3/3 [00:35<00:00, 11.72s/it]


Evaluation scores are shown below:
+-------------+----------+
|  Dimensions |  Score   |
+-------------+----------+
|  coherence  | 0.776413 |
| consistency | 0.710692 |
|   fluency   | 0.680258 |
|  relevance  | 0.648008 |
|   overall   | 0.703843 |
+-------------+----------+

=== Summary Evaluation Results ===
Average Coherence: 0.7764
Average Consistency: 0.7107
Average Fluency: 0.6803
Average Relevance: 0.6480
Overall Average: 0.7038

=== Sample Output Comparison ===

Example 1:
Dialogue: [doctor] hi bruce , how are you ?
[patient] hey , good to see you .
[doctor] good to see you as well...
Reference: CHIEF COMPLAINT

Follow up of chronic problems.

HISTORY OF PRESENT ILLNESS

Bruce Howard is a 60-ye...
Generated: Summarize the following conversation:

[doctor] hi bruce, how are you?
[patient] hey, good to see yo...
Scores: Coherence=0.95, Consistency=0.90, Fluency=0.71, Relevance=0.91

Example 2:
Dialogue: [doctor] okay michael so i see in here that you're here because you're exp




In [5]:
# ════════════════════════════════════════════════════════════════
# Baseline GPT-2 Testing Script (For Comparison)
# ════════════════════════════════════════════════════════════════
import os
import torch
import pandas as pd
from tqdm import tqdm
import numpy as np

# Check if CUDA is available for inference
CUDA_AVAILABLE = torch.cuda.is_available()
DEVICE = "cuda" if CUDA_AVAILABLE else "cpu"
print(f"Using device: {DEVICE} for inference")

# ────────────────────────────────────────────────────────────────
# 1) Load the base GPT-2 model (without LoRA)
# ────────────────────────────────────────────────────────────────
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load base model directly
base_model = "gpt2"
model = GPT2LMHeadModel.from_pretrained(base_model)
tokenizer = GPT2Tokenizer.from_pretrained(base_model)

# Ensure pad token is set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"  # Important for decoder-only models

# Move model to appropriate device
model = model.to(DEVICE)
model.eval()  # Set to evaluation mode

print(f"Loaded baseline GPT-2 model (without LoRA weights)")

# ────────────────────────────────────────────────────────────────
# 2) Prepare the same test dataset as before
# ────────────────────────────────────────────────────────────────
# Load test data - ensure we use the same samples as before
df = pd.read_csv(r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\combined_clinical_notes.csv")
test_df = df.sample(20, random_state=42)  # Using same random seed as LoRA test

# Prepare test dialogues and references
dialogues = test_df["dialogue"].tolist()
references = test_df["note"].tolist()

# ────────────────────────────────────────────────────────────────
# 3) Generate summaries using baseline model
# ────────────────────────────────────────────────────────────────
print("Generating summaries with baseline GPT-2...")
batch_size = 4
num_samples = len(dialogues)
num_batches = (num_samples + batch_size - 1) // batch_size
baseline_predictions = []

for i in tqdm(range(num_batches)):
    start, end = i*batch_size, min((i+1)*batch_size, num_samples)
    convs = dialogues[start:end]
    prompts = [
        f"Summarize the following conversation:\n\n{c}"
        for c in convs if len(str(c).strip()) > 10
    ]
    if not prompts:
        continue
    
    # Tokenize
    enc = tokenizer(
        prompts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    ).to(DEVICE)
    
    # Generate - using same parameters as LoRA model for fair comparison
    with torch.no_grad():
        out_ids = model.generate(
            input_ids=enc.input_ids,
            attention_mask=enc.attention_mask,
            max_new_tokens=120,
            do_sample=False,
            num_beams=3,
            no_repeat_ngram_size=2
        )
    
    # Decode
    dec = tokenizer.batch_decode(out_ids, skip_special_tokens=True)
    baseline_predictions.extend(dec)

print(f"✅ Generated {len(baseline_predictions)} baseline summaries.")

# ────────────────────────────────────────────────────────────────
# 4) Evaluate with UniEval
# ────────────────────────────────────────────────────────────────
import sys
sys.path.append(r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\UniEval")
from utils import convert_to_json
from metric.evaluator import get_evaluator

# Keep UniEval on CPU
sum_eval = get_evaluator("summarization", device="cpu")

# Trim predictions and references to the same length
min_len = min(len(baseline_predictions), len(references), len(dialogues))
baseline_predictions = baseline_predictions[:min_len]
references = references[:min_len]
dialogues = dialogues[:min_len]

# Create JSON data for UniEval
print("Creating evaluation data...")
data = convert_to_json(
    src_list=[str(d) for d in dialogues],
    ref_list=[str(r) for r in references],
    output_list=[str(p) for p in baseline_predictions]
)

# Run evaluation
print("Running UniEval on baseline model outputs...")
baseline_scores = sum_eval.evaluate(data, print_result=True)

# ────────────────────────────────────────────────────────────────
# 5) Display baseline results
# ────────────────────────────────────────────────────────────────
# Extract scores for each metric
baseline_coherence = [item["coherence"] for item in baseline_scores]
baseline_consistency = [item["consistency"] for item in baseline_scores]
baseline_fluency = [item["fluency"] for item in baseline_scores]
baseline_relevance = [item["relevance"] for item in baseline_scores]

# Calculate averages
avg_coherence = sum(baseline_coherence) / len(baseline_coherence)
avg_consistency = sum(baseline_consistency) / len(baseline_consistency)
avg_fluency = sum(baseline_fluency) / len(baseline_fluency)
avg_relevance = sum(baseline_relevance) / len(baseline_relevance)

print("\n=== Baseline GPT-2 Summary Evaluation Results ===")
print(f"Average Coherence: {avg_coherence:.4f}")
print(f"Average Consistency: {avg_consistency:.4f}")
print(f"Average Fluency: {avg_fluency:.4f}")
print(f"Average Relevance: {avg_relevance:.4f}")
print(f"Overall Average: {(avg_coherence + avg_consistency + avg_fluency + avg_relevance) / 4:.4f}")

# ────────────────────────────────────────────────────────────────
# 6) Sample output comparison
# ────────────────────────────────────────────────────────────────
print("\n=== Sample Baseline Outputs ===")
for i in range(min(3, min_len)):  # Show up to 3 examples
    print(f"\nExample {i+1}:")
    print(f"Dialogue: {dialogues[i][:100]}...")
    print(f"Reference: {references[i][:100]}...")
    print(f"Generated: {baseline_predictions[i][:100]}...")
    print(f"Scores: Coherence={baseline_coherence[i]:.2f}, Consistency={baseline_consistency[i]:.2f}, "
          f"Fluency={baseline_fluency[i]:.2f}, Relevance={baseline_relevance[i]:.2f}")

# ────────────────────────────────────────────────────────────────
# 7) Save results
# ────────────────────────────────────────────────────────────────
# Create a results dataframe
results_df = pd.DataFrame({
    'dialogue': dialogues[:min_len],
    'reference': references[:min_len],
    'baseline_prediction': baseline_predictions[:min_len],
    'coherence': baseline_coherence,
    'consistency': baseline_consistency,
    'fluency': baseline_fluency,
    'relevance': baseline_relevance
})

# Save to CSV
results_file = "baseline_gpt2_evaluation_results.csv"
results_df.to_csv(results_file, index=False)
print(f"\nBaseline results saved to {results_file}")

# ────────────────────────────────────────────────────────────────
# 8) Try to load LoRA results for comparison (if available)
# ────────────────────────────────────────────────────────────────
try:
    lora_results = pd.read_csv("ppo_evaluation_results.csv")
    
    # Check if we have matching entries
    if len(lora_results) == len(results_df):
        print("\n=== Comparison: Baseline vs. LoRA PPO ===")
        
        # Calculate average score improvements
        lora_coherence_avg = lora_results['coherence'].mean()
        lora_consistency_avg = lora_results['consistency'].mean()
        lora_fluency_avg = lora_results['fluency'].mean()
        lora_relevance_avg = lora_results['relevance'].mean()
        lora_overall = (lora_coherence_avg + lora_consistency_avg + lora_fluency_avg + lora_relevance_avg) / 4
        
        baseline_overall = (avg_coherence + avg_consistency + avg_fluency + avg_relevance) / 4
        
        # Print comparison
        print(f"{'Metric':<12} {'Baseline':<10} {'LoRA PPO':<10} {'Difference':<10}")
        print(f"{'-'*42}")
        print(f"{'Coherence':<12} {avg_coherence:.4f}     {lora_coherence_avg:.4f}     {lora_coherence_avg-avg_coherence:+.4f}")
        print(f"{'Consistency':<12} {avg_consistency:.4f}     {lora_consistency_avg:.4f}     {lora_consistency_avg-avg_consistency:+.4f}")
        print(f"{'Fluency':<12} {avg_fluency:.4f}     {lora_fluency_avg:.4f}     {lora_fluency_avg-avg_fluency:+.4f}")
        print(f"{'Relevance':<12} {avg_relevance:.4f}     {lora_relevance_avg:.4f}     {lora_relevance_avg-avg_relevance:+.4f}")
        print(f"{'-'*42}")
        print(f"{'OVERALL':<12} {baseline_overall:.4f}     {lora_overall:.4f}     {lora_overall-baseline_overall:+.4f}")
        
        # Create a combined CSV for easy comparison
        combined_df = results_df.copy()
        combined_df['lora_prediction'] = lora_results['prediction']
        combined_df['lora_coherence'] = lora_results['coherence']
        combined_df['lora_consistency'] = lora_results['consistency']
        combined_df['lora_fluency'] = lora_results['fluency']
        combined_df['lora_relevance'] = lora_results['relevance']
        
        # Calculate per-example improvement
        combined_df['coherence_diff'] = combined_df['lora_coherence'] - combined_df['coherence']
        combined_df['consistency_diff'] = combined_df['lora_consistency'] - combined_df['consistency']
        combined_df['fluency_diff'] = combined_df['lora_fluency'] - combined_df['fluency']
        combined_df['relevance_diff'] = combined_df['lora_relevance'] - combined_df['relevance']
        
        # Save combined results
        combined_file = "comparison_results.csv"
        combined_df.to_csv(combined_file, index=False)
        print(f"\nComparison results saved to {combined_file}")
        
except FileNotFoundError:
    print("\nNo LoRA results file found for comparison. Run the LoRA testing script first.")
except Exception as e:
    print(f"\nError comparing results: {e}")

Using device: cpu for inference




Loaded baseline GPT-2 model (without LoRA weights)
Generating summaries with baseline GPT-2...


  0%|                                                                                  | 0/5 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 20%|██████████████▊                                                           | 1/5 [00:14<00:56, 14.21s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 40%|█████████████████████████████▌                                            | 2/5 [00:27<00:41, 13.93s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 60%|████████████████████████████████████████████▍                             | 3/5 [00:41<00:27, 13.89s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 80%|███████████████████████████████████████████████████████████▏              | 4/5 [00:55<00:13, 13.83s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
100%|██████████████████████████████████████████████████████████████████████████| 5/5 [01:09<00:

✅ Generated 20 baseline summaries.
Creating evaluation data...
Running UniEval on baseline model outputs...
Evaluating coherence of 20 samples !!!


100%|██████████████████████████████████████████████████████████████████████████| 3/3 [00:38<00:00, 12.76s/it]


Evaluating consistency of 20 samples !!!


100%|████████████████████████████████████████████████████████████████████████| 45/45 [11:40<00:00, 15.56s/it]


Evaluating fluency of 20 samples !!!


100%|████████████████████████████████████████████████████████████████████████| 45/45 [01:05<00:00,  1.46s/it]


Evaluating relevance of 20 samples !!!


100%|██████████████████████████████████████████████████████████████████████████| 3/3 [00:37<00:00, 12.58s/it]


Evaluation scores are shown below:
+-------------+----------+
|  Dimensions |  Score   |
+-------------+----------+
|  coherence  | 0.777173 |
| consistency | 0.623779 |
|   fluency   | 0.672602 |
|  relevance  | 0.653257 |
|   overall   | 0.681703 |
+-------------+----------+

=== Baseline GPT-2 Summary Evaluation Results ===
Average Coherence: 0.7772
Average Consistency: 0.6238
Average Fluency: 0.6726
Average Relevance: 0.6533
Overall Average: 0.6817

=== Sample Baseline Outputs ===

Example 1:
Dialogue: [doctor] hi bruce , how are you ?
[patient] hey , good to see you .
[doctor] good to see you as well...
Reference: CHIEF COMPLAINT

Follow up of chronic problems.

HISTORY OF PRESENT ILLNESS

Bruce Howard is a 60-ye...
Generated: Summarize the following conversation:

[doctor] hi bruce, how are you?
[patient] hey, good to see yo...
Scores: Coherence=0.97, Consistency=0.88, Fluency=0.71, Relevance=0.93

Example 2:
Dialogue: [doctor] okay michael so i see in here that you're here beca




In [None]:
import os
os.environ["HF_HOME"] = r"D:\hf-cache"

In [None]:
# ────────────────────────────────────────────────────────────────
# 1) UniEval multi‑dim evaluator (CPU only, load once)
# ────────────────────────────────────────────────────────────────
import sys
sys.path.append(r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\UniEval")  # if needed to make sure your Python can import from the UniEval folder
from utils import convert_to_json
from metric.evaluator import get_evaluator
import torch

sum_eval = get_evaluator("summarization", device="cpu")

@torch.inference_mode()
def unieval_4way(src, hyp, ref):
    """
    src, hyp, ref: lists of strings, length B
    returns: Tensor (B,4) with [coherence, consistency, fluency, relevance]
    """
    data = convert_to_json(
        output_list=hyp,
        src_list=src,
        ref_list=ref,
    )
    raw = sum_eval.evaluate(data, print_result=True)
    scores = [
        [d["coherence"], d["consistency"], d["fluency"], d["relevance"]]
        for d in raw
    ]
    return torch.tensor(scores, dtype=torch.float32)  # CPU (B,4)


In [None]:
# ════════════════════════════════════════════════════════════════
# Requirements:
#   pip install trl==0.7.4 transformers==4.38.2 peft==0.10.0 \
#               accelerate==0.28.0 bitsandbytes datasets evaluate pandas
# ════════════════════════════════════════════════════════════════
import os, gc, torch, pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
# ────────────────────────────────────────────────────────────────
# 2) Load your SFT‑finetuned BART in 4‑bit + LoRA
# ────────────────────────────────────────────────────────────────
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import PPOConfig, PPOTrainer, AutoModelForSeq2SeqLMWithValueHead

# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = "cpu"
SFT_DIR = r"D:\kshitij-weights-folder\bart_clinical_ft-20250422T192130Z-001\bart_clinical_ft"

# 2a) Quantize & prepare
bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
base = AutoModelForSeq2SeqLM.from_pretrained(
    SFT_DIR,
    quantization_config=bnb,
    device_map="auto"
)
base = prepare_model_for_kbit_training(base)
base.gradient_checkpointing_enable()
base.config.use_cache = False

# 2b) Attach fresh LoRA
lora_cfg = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)
model = get_peft_model(base, lora_cfg).to(DEVICE)

# 2c) Tokenizer (decoder‑only → left‑pad)
tok = AutoTokenizer.from_pretrained(SFT_DIR, use_fast=False)
tok.pad_token = tok.eos_token
tok.padding_side = "right"
model.config.pad_token_id = tok.eos_token_id
model.resize_token_embeddings(len(tok))

# 2d) Wrap for PPO
ppo_model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(
    model, peft_config=lora_cfg
).to(DEVICE)
ppo_ref_model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(
    model, peft_config=lora_cfg
).to(DEVICE).eval()
for p in ppo_ref_model.parameters():
    p.requires_grad = False

# ────────────────────────────────────────────────────────────────
# 3) Prepare your DataLoader (with references)
# ────────────────────────────────────────────────────────────────
df = pd.read_csv(r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\combined_clinical_notes.csv")[["dialogue", "note"]]

class ClinDS(Dataset):
    def __init__(self, df, tok, max_len=512):
        self.df = df.reset_index(drop=True)
        self.tok = tok
        self.L = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i):
        conv = str(self.df.iloc[i]["dialogue"])
        ref = str(self.df.iloc[i]["note"])
        prompt = f"Summarize the following conversation:\n\n{conv}"
        enc = self.tok(
            prompt,
            truncation=True,
            padding="max_length",
            max_length=self.L,
            return_tensors="pt",
        )
        return {
            "input_ids": enc.input_ids.squeeze(),
            "attention_mask": enc.attention_mask.squeeze(),
            "src_txt": prompt,
            "ref_txt": ref,
        }

loader = DataLoader(
    ClinDS(df.sample(200, random_state=0), tok),
    batch_size=1, shuffle=True, pin_memory=True, drop_last=True
)

# ────────────────────────────────────────────────────────────────
# 4) Build PPOTrainer + optimizer
# ────────────────────────────────────────────────────────────────
ppo_cfg = PPOConfig(
    batch_size=1,
    mini_batch_size=1,
    log_with="tensorboard",
    project_kwargs={"logging_dir": "./logs"},
)

optimizer = torch.optim.AdamW(
    filter(lambda p: p.requires_grad, ppo_model.parameters()),
    lr=2e-5
)

ppo_trainer = PPOTrainer(
    config=ppo_cfg,
    model=ppo_model,
    ref_model=ppo_ref_model,
    tokenizer=tok,
    optimizer=optimizer,
)

# ────────────────────────────────────────────────────────────────
# 5) Training loop with candidate generation and dominance rewards
# ────────────────────────────────────────────────────────────────
gen_kwargs = {
    "max_new_tokens": 64,
    "do_sample": True,
    "pad_token_id": tok.eos_token_id,
    "top_p": 0.9,
    "temperature": 0.7,
}

for epoch in range(1):
    for batch_idx, batch in enumerate(loader):
        # Prepare inputs
        ids = batch["input_ids"].to(DEVICE)
        attn_mask = batch["attention_mask"].to(DEVICE)
        src_txt = batch["src_txt"]  # list[str]
        ref_txt = batch["ref_txt"]  # list[str]

        # Generate multiple candidates per prompt
        NUM_CANDIDATES = 1
        all_outs = []
        for _ in range(NUM_CANDIDATES):
            with torch.no_grad():
                out = ppo_model.generate(
                    input_ids=ids,
                    attention_mask=attn_mask,
                    **gen_kwargs
                )
            all_outs.append(out)

        # Stack outputs (B, K, L)
        outs = torch.stack(all_outs, dim=1)

        # Decode all candidates
        hyps = [
            [tok.decode(outs[b, k], skip_special_tokens=True)
            for k in range(NUM_CANDIDATES)]
            for b in range(outs.size(0))
        ]

        # Compute rewards using UniEval and dominance scoring
        rewards = []
        for b in range(len(src_txt)):
            # Get scores for all candidates (K, 4)
            scores = unieval_4way(
                [src_txt[b]] * NUM_CANDIDATES,
                hyps[b],
                [ref_txt[b]] * NUM_CANDIDATES
            ).numpy()

            # Compute dominance counts
            dom_counts = np.zeros(NUM_CANDIDATES)
            for i in range(NUM_CANDIDATES):
                for j in range(NUM_CANDIDATES):
                    if i == j:
                        continue
                    # Check if i dominates j
                    if np.all(scores[i] >= scores[j]) and np.any(scores[i] > scores[j]):
                        dom_counts[i] += 1

            # Normalize to [-1, 1]
            max_dom = NUM_CANDIDATES - 1
            scalar_rewards = 2 * (dom_counts / max_dom) - 1
            rewards.append(scalar_rewards)

        # Flatten for PPO
        flat_queries = []
        flat_responses = []
        flat_rewards = []

        for b in range(len(src_txt)):
            for k in range(NUM_CANDIDATES):
                flat_queries.append(ids[b])
                flat_responses.append(outs[b, k])
                flat_rewards.append(torch.tensor([rewards[b][k]], device=DEVICE))

        # PPO step
        stats = ppo_trainer.step(
            queries=flat_queries,
            responses=flat_responses,
            scores=flat_rewards
        )

        # Logging
        if batch_idx % 10 == 0:
            print(f"Epoch {epoch+1}, Batch {batch_idx}")
            print(f"Sample output: {hyps[0][0][:100]}...")
            print(f"Average reward: {np.mean([r.item() for r in flat_rewards]):.4f}")

    print(f"✅ Epoch {epoch+1}/3 complete")

print("🎉 PPO fine-tuning done")

In [None]:
import os
os.environ["HF_HOME"] = r"D:\hf-cache"

In [None]:
# # ════════════════════════════════════════════════════════════════
# # 0) Force CPU Execution
# # ════════════════════════════════════════════════════════════════
# import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
# os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  # For CPU BLAS
# import torch
# torch.set_default_tensor_type(torch.FloatTensor)
# torch.use_deterministic_algorithms(True)
# import gc, pandas as pd
# import numpy as np
# from torch.utils.data import Dataset, DataLoader

# # ════════════════════════════════════════════════════════════════
# # 1) UniEval Setup (CPU only)
# # ════════════════════════════════════════════════════════════════
# import sys
# sys.path.append(r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\UniEval")
# from utils import convert_to_json
# from metric.evaluator import get_evaluator

# sum_eval = get_evaluator("summarization", device="cpu")

# @torch.inference_mode()
# def unieval_4way(src, hyp, ref):
#     data = convert_to_json(output_list=hyp, src_list=src, ref_list=ref)
#     raw = sum_eval.evaluate(data)
#     return torch.tensor([
#         [d["coherence"], d["consistency"], d["fluency"], d["relevance"]]
#         for d in raw
#     ], dtype=torch.float32)

# # ════════════════════════════════════════════════════════════════
# # 2) Causal LM Setup (BART as decoder-only)
# # ════════════════════════════════════════════════════════════════
# from transformers import AutoTokenizer, AutoModelForCausalLM
# from peft import LoraConfig, get_peft_model
# from trl import PPOConfig, PPOTrainer, AutoModelForCausalLMWithValueHead

# DEVICE = "cpu"
# SFT_DIR = r"D:\kshitij-weights-folder\bart_clinical_ft-20250422T192130Z-001\bart_clinical_ft"

# # 2a) Load model without quantization
# base = AutoModelForCausalLM.from_pretrained(
#     SFT_DIR,
#     trust_remote_code=True
# ).float().cpu()

# # 2b) Causal LM LoRA config
# lora_cfg = LoraConfig(
#     task_type="CAUSAL_LM",
#     r=4,  # Reduced for CPU
#     lora_alpha=16,
#     target_modules=["q_proj", "v_proj"],  # Simplified
#     lora_dropout=0.05
# )
# model = get_peft_model(base, lora_cfg).cpu()

# # 2c) Tokenizer config
# tok = AutoTokenizer.from_pretrained(SFT_DIR)
# tok.pad_token = tok.eos_token
# tok.padding_side = "left"  # Causal LM standard

# # 2d) PPO models
# ppo_model = AutoModelForCausalLMWithValueHead(model).cpu()
# ppo_ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(SFT_DIR).cpu().eval()
# for p in ppo_ref_model.parameters():
#     p.requires_grad = False

# # ════════════════════════════════════════════════════════════════
# # 3) Data Loading with Dominance Prep
# # ════════════════════════════════════════════════════════════════
# # class ClinDS(Dataset):
# #     def __init__(self, df, tok, max_len=256):  # Reduced length
# #         self.df = df.reset_index(drop=True)
# #         self.tok = tok
# #         self.L = max_len

# #     def __getitem__(self, i):
# #         conv = str(self.df.iloc[i]["dialogue"])
# #         ref = str(self.df.iloc[i]["note"])
# #         prompt = f"Summarize:\n\n{conv}\n\nSummary:"
# #         enc = self.tok(
# #             prompt,
# #             truncation=True,
# #             padding="max_length",
# #             max_length=self.L,
# #             return_tensors="pt",
# #         )
# #         return {
# #             "input_ids": enc.input_ids.squeeze(),
# #             "attention_mask": enc.attention_mask.squeeze(),
# #             "src_txt": conv,
# #             "ref_txt": ref,
# #         }

# # df = pd.read_csv(r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\combined_clinical_notes.csv")
# # loader = DataLoader(
# #     ClinDS(df.sample(50, random_state=0), tok),  # Smaller sample
# #     batch_size=1,
# #     shuffle=True,
# #     pin_memory=False
# # )
# df = pd.read_csv(r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\combined_clinical_notes.csv")[["dialogue", "note"]]

# class ClinDS(Dataset):
#     def __init__(self, df, tok, max_len=512):
#         self.df = df.reset_index(drop=True)
#         self.tok = tok
#         self.L = max_len

#     def __len__(self):
#         return len(self.df)

#     def __getitem__(self, i):
#         conv = str(self.df.iloc[i]["dialogue"])
#         ref = str(self.df.iloc[i]["note"])
#         prompt = f"Summarize the following conversation:\n\n{conv}"
#         enc = self.tok(
#             prompt,
#             truncation=True,
#             padding="max_length",
#             max_length=self.L,
#             return_tensors="pt",
#         )
#         return {
#             "input_ids": enc.input_ids.squeeze(),
#             "attention_mask": enc.attention_mask.squeeze(),
#             "src_txt": prompt,
#             "ref_txt": ref,
#         }

# loader = DataLoader(
#     ClinDS(df.sample(200, random_state=0), tok),
#     batch_size=1, shuffle=True, pin_memory=True, drop_last=True
# )
# # ════════════════════════════════════════════════════════════════
# # 4) Training Loop with Dominance Scoring
# # ════════════════════════════════════════════════════════════════
# gen_kwargs = {
#     "max_new_tokens": 48,
#     "do_sample": True,
#     "temperature": 0.7,
#     "top_p": 0.9,
#     "pad_token_id": tok.eos_token_id,
#     # "no_cuda": True
# }

# for epoch in range(1):
#     for batch_idx, batch in enumerate(loader):
#         # Explicit CPU handling
#         ids = batch["input_ids"].clone().detach().cpu()
#         attn_mask = batch["attention_mask"].clone().detach().cpu()
        
#         # Generate candidates
#         NUM_CANDIDATES = 3  # Reduced for CPU
#         all_outs = []
#         for _ in range(NUM_CANDIDATES):
#             with torch.no_grad():
#                 out = ppo_model.generate(
#                     input_ids=ids,
#                     attention_mask=attn_mask,
#                     **gen_kwargs
#                 ).cpu()
#             all_outs.append(out)
        
#         # Process outputs
#         outs = torch.stack(all_outs, dim=1)
#         hyps = [
#             [tok.decode(outs[b, k], skip_special_tokens=True)
#             for k in range(NUM_CANDIDATES)]
#             for b in range(outs.size(0))
#         ]
        
#         # Dominance scoring
#         rewards = []
#         for b in range(len(batch["src_txt"])):
#             scores = unieval_4way(
#                 [batch["src_txt"][b]] * NUM_CANDIDATES,
#                 hyps[b],
#                 [batch["ref_txt"][b]] * NUM_CANDIDATES
#             ).numpy()
            
#             # Pairwise comparison
#             dom_matrix = np.zeros((NUM_CANDIDATES, NUM_CANDIDATES))
#             for i in range(NUM_CANDIDATES):
#                 for j in range(NUM_CANDIDATES):
#                     if i == j: continue
#                     dom_matrix[i,j] = np.all(scores[i] >= scores[j]) and np.any(scores[i] > scores[j])
            
#             # Calculate dominance scores
#             dom_counts = dom_matrix.sum(axis=1)
#             scalar_rewards = 2 * (dom_counts / (NUM_CANDIDATES-1)) - 1
#             rewards.append(scalar_rewards)
        
#         # PPO step
#         stats = ppo_trainer.step(
#             queries=[ids]*NUM_CANDIDATES,
#             responses=outs[0].unbind(),
#             scores=[torch.tensor(r, dtype=torch.float32) for r in rewards]
#         )

#         if batch_idx % 2 == 0:
#             print(f"Batch {batch_idx} | Avg Reward: {np.mean(scalar_rewards):.2f}")
#             print(f"Sample: {hyps[0][0][:60]}...")

# print("✅ Training Complete")

In [None]:
pip install transformers datasets torch trl pandas tqdm bitsandbytes #better we add these in requirements.txt

In [None]:
# Colab-friendly two-stage fine-tuning & inference
# ------------------------------------------------
# 1) MedMCQA adaptation on BART-base
# 2) Dialogue→structured-summary adaptation on clinical_notes.csv (column “note”)
# 3) Batch inference with final model
#
# Requirements (install in Colab):
#   !pip install transformers datasets torch bitsandbytes peft pandas tqdm

import os
import torch
import pandas as pd
from datasets import load_dataset, Dataset as HFDataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    pipeline
)
from torch.utils.data import Dataset as TorchDataset

# ———————————————————————————————
# Configuration
# ———————————————————————————————
DEVICE       = "cpu"
BASE_MODEL   = "facebook/bart-base"
MED_FT_DIR   = "/content/bart_medmcqa_ft"
FINAL_FT_DIR = "/content/bart_clinical_ft"
CSV_PATH     = "/content/clinical_notes.csv"  # must have columns: dialogue, note

# ———————————————————————————————
# 1) Datasets
# ———————————————————————————————
class MedMCQADataset(TorchDataset):
    def __init__(self, hf_ds, tok, max_src=256, max_tgt=16):
        self.tok, self.max_src, self.max_tgt = tok, max_src, max_tgt
        self.examples = []
        for row in hf_ds:
            q = str(row.get("question",""))
            opts = [str(row.get(k,"")) for k in ("opa","opb","opc","opd")]
            ans = str(row.get("cop",""))
            prompt = f"Question: {q} Options: A){opts[0]} B){opts[1]} C){opts[2]} D){opts[3]}"
            self.examples.append((prompt, ans))
    def __len__(self): return len(self.examples)
    def __getitem__(self,i):
        prompt, ans = self.examples[i]
        src = self.tok(prompt, truncation=True, padding="max_length",
                       max_length=self.max_src, return_tensors="pt")
        tgt = self.tok(ans,    truncation=True, padding="max_length",
                       max_length=self.max_tgt, return_tensors="pt")
        labels = tgt.input_ids.clone()
        labels[labels==self.tok.pad_token_id] = -100
        return {
            "input_ids":      src.input_ids.squeeze(),
            "attention_mask": src.attention_mask.squeeze(),
            "labels":         labels.squeeze(),
        }

class DialogueSummaryDataset(TorchDataset):
    def __init__(self, hf_ds, tok, max_src=512, max_tgt=256):
        self.ds, self.tok = hf_ds, tok
        self.max_src, self.max_tgt = max_src, max_tgt
    def __len__(self): return len(self.ds)
    def __getitem__(self,i):
        row = self.ds[i]
        src_txt = str(row["dialogue"])
        tgt_txt = str(row["note"])  # use "note" column
        src = self.tok(src_txt, truncation=True, padding="max_length",
                       max_length=self.max_src, return_tensors="pt")
        tgt = self.tok(tgt_txt, truncation=True, padding="max_length",
                       max_length=self.max_tgt, return_tensors="pt")
        labels = tgt.input_ids.clone()
        labels[labels==self.tok.pad_token_id] = -100
        return {
            "input_ids":      src.input_ids.squeeze(),
            "attention_mask": src.attention_mask.squeeze(),
            "labels":         labels.squeeze(),
        }

# ———————————————————————————————
# 2) Stage 1: MedMCQA fine-tuning
# ———————————————————————————————
print("=== Stage 1: MedMCQA fine-tuning ===")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model     = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL).to(DEVICE)

med_ds = load_dataset("openlifescienceai/medmcqa")
train_med = med_ds["train"].select(range(5000))
eval_med  = med_ds["validation"].select(range(500))

train_med_ds = MedMCQADataset(train_med, tokenizer)
eval_med_ds  = MedMCQADataset(eval_med,  tokenizer)

args1 = TrainingArguments(
    output_dir=MED_FT_DIR,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    fp16=torch.cuda.is_available(),
)
trainer1 = Trainer(
    model=model,
    args=args1,
    train_dataset=train_med_ds,
    eval_dataset=eval_med_ds,
    tokenizer=tokenizer,
)
trainer1.train()
trainer1.save_model(MED_FT_DIR)
tokenizer.save_pretrained(MED_FT_DIR)

# ———————————————————————————————
# 3) Stage 2: Clinical notes fine-tuning
# ———————————————————————————————
print("=== Stage 2: Clinical notes fine-tuning ===")
# reload on CPU for fixes
model     = AutoModelForSeq2SeqLM.from_pretrained(MED_FT_DIR, device_map="cpu")
tokenizer = AutoTokenizer.from_pretrained(MED_FT_DIR)

# fix pad_token & resize embeddings
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id
model.resize_token_embeddings(len(tokenizer))

model = model.to(DEVICE)

df = pd.read_csv(CSV_PATH)  # must have columns 'dialogue','note'
hf_clin = HFDataset.from_pandas(df)

train_clin = hf_clin.shuffle(42).select(range(400))
eval_clin  = hf_clin.shuffle(123).select(range(400,464))

train_ds2 = DialogueSummaryDataset(train_clin, tokenizer)
eval_ds2  = DialogueSummaryDataset(eval_clin,  tokenizer)

args2 = TrainingArguments(
    output_dir=FINAL_FT_DIR,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    fp16=torch.cuda.is_available(),
)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer2 = Trainer(
    model=model,
    args=args2,
    train_dataset=train_ds2,
    eval_dataset=eval_ds2,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer2.train()
trainer2.save_model(FINAL_FT_DIR)
tokenizer.save_pretrained(FINAL_FT_DIR)

# ———————————————————————————————
# 4) Batch inference
# ———————————————————————————————
print("=== Stage 3: Batch inference ===")
summarizer = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
)

batch_size  = 4
num_samples = len(eval_ds2)
num_batches = (num_samples + batch_size - 1) // batch_size

predictions, references = [], []
for i in range(num_batches):
    start, end = i*batch_size, min((i+1)*batch_size, num_samples)
    convs = [str(x) for x in eval_clin["dialogue"][start:end]]
    refs  = [str(x) for x in eval_clin["note"][start:end]]
    prompts = [
        f"Summarize the following conversation:\n\n{c}"
        for c in convs if len(c.strip())>10
    ]
    if not prompts:
        continue
    outs = summarizer(prompts, max_new_tokens=120, do_sample=False)
    predictions.extend([o["summary_text"] for o in outs])
    references.extend(refs)

print(f"Generated {len(predictions)} summaries.")


In [None]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import pandas as pd
from datasets import Dataset as HFDataset

# 1) Reload & repair checkpoint
MODEL_DIR = "/content/drive/MyDrive/bart_clinical_ft"
device    = "cuda"

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model     = AutoModelForSeq2SeqLM.from_pretrained(MODEL_DIR, device_map="cuda")

# force pad_token = eos_token and resize to avoid any OOB ID
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id
model.resize_token_embeddings(len(tokenizer))


# 2) Load your eval set
df      = pd.read_csv("/content/clinical_notes.csv")  # columns: dialogue, note
hf_clin = HFDataset.from_pandas(df).shuffle(seed=42)
eval_ds = hf_clin.select(range(400,464))

# 3) Batch‐wise manual generation
batch_size  = 4
num_items   = len(eval_ds)
predictions = []
references  = []

for i in range(0, num_items, batch_size):
    # grab slices of the two columns as plain Python lists
    convs = [str(x) for x in eval_ds["dialogue"][i : i + batch_size]]
    refs  = [str(x) for x in eval_ds["note"][i : i + batch_size]]

    # build prompts and skip short ones
    prompts = [
        f"Summarize the following conversation:\n\n{c}"
        for c in convs if len(c.strip()) > 10
    ]
    if not prompts:
        continue

    # tokenize *with* truncation & max_length
    enc = tokenizer(
        prompts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    ).to(device)

    # generate on the same device
    with torch.no_grad():
        out_ids = model.generate(
            input_ids=enc.input_ids,
            attention_mask=enc.attention_mask,
            max_new_tokens=120,
            do_sample=False
        )

    # decode & store
    dec = tokenizer.batch_decode(out_ids, skip_special_tokens=True)
    predictions.extend(dec)
    references.extend(refs)

print(f"✅ Done — generated {len(predictions)} summaries.")


In [None]:
! CUDA_LAUNCH_BLOCKING=1

In [None]:
pip install -U "transformers>=4.39" datasets

In [None]:
import os, torch, pandas as pd
from datasets import load_dataset, Dataset
from transformers import (AutoTokenizer, AutoModelForSeq2SeqLM,
                          Trainer, TrainingArguments)
from torch.utils.data import Dataset as TorchDataset
from datasets import Dataset as HFDataset


In [None]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BASE_MODEL = "facebook/bart-base"        # starting checkpoint
MED_FT_DIR  = "./bart_medmcqa_ft"        # after Stage‑1
FINAL_FT_DIR = "./bart_clinical_ft"      # after Stage‑2
CSV_PATH = "/content/clinical_notes.csv"

In [None]:

# -------------------------------------------------------------
# 1. DATASET HELPERS
# -------------------------------------------------------------
class MedMCQADataset(TorchDataset):
    def __init__(self, hf_ds, tokenizer,
                 max_src=256, max_tgt=16):
        self.tok, self.max_src, self.max_tgt = tokenizer, max_src, max_tgt
        self.examples = []

        # ----------  PATCH START  ----------
        # safe helper that returns a clean string
        def safe_str(val):
            return str(val) if val is not None else ""
        # loop over the HF examples
        for row in hf_ds:
            self.examples.append(
                {
                    "q": safe_str(row["question"]),
                    "options": [safe_str(row["opa"]),
                                safe_str(row["opb"]),
                                safe_str(row["opc"]),
                                safe_str(row["opd"])],
                    "ans": safe_str(row["cop"]),
                }
            )
        # ----------  PATCH END  ----------

    def __len__(self): return len(self.examples)

    def __getitem__(self, idx):
        ex = self.examples[idx]
        prompt = f"Question: {ex['q']} Options: " \
                 f"A) {ex['options'][0]} B) {ex['options'][1]} " \
                 f"C) {ex['options'][2]} D) {ex['options'][3]}"
        target = ex['ans']
        src = self.tok(prompt, truncation=True, max_length=self.max_src,
                       padding="max_length", return_tensors="pt")
        tgt = self.tok(target, truncation=True, max_length=self.max_tgt,
                       padding="max_length", return_tensors="pt")
        labels = tgt["input_ids"].clone()
        labels[labels == self.tok.pad_token_id] = -100
        return {"input_ids": src["input_ids"].squeeze(),
                "attention_mask": src["attention_mask"].squeeze(),
                "labels": labels.squeeze()}


In [None]:
class DialogueSummaryDataset(TorchDataset):
    """
    dialogue  ->  structured summary
    Expects a HF `Dataset` with columns 'dialogue' & 'summary'
    """
    def __init__(self, hf_ds, tokenizer,
                 max_src=512, max_tgt=256):
        self.tok, self.max_src, self.max_tgt = tokenizer, max_src, max_tgt
        self.dialogues = hf_ds["dialogue"]
        self.summaries = hf_ds["note"]

    def __len__(self): return len(self.dialogues)

    def __getitem__(self, idx):
        src_txt = self.dialogues[idx]
        tgt_txt = self.summaries[idx]
        src = self.tok(src_txt, truncation=True, max_length=self.max_src,
                       padding="max_length", return_tensors="pt")
        tgt = self.tok(tgt_txt, truncation=True, max_length=self.max_tgt,
                       padding="max_length", return_tensors="pt")
        labels = tgt["input_ids"].clone()
        labels[labels == self.tok.pad_token_id] = -100
        return {"input_ids": src["input_ids"].squeeze(),
                "attention_mask": src["attention_mask"].squeeze(),
                "labels": labels.squeeze()}

In [None]:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL).to(DEVICE)

In [None]:
# -------------------------------------------------------------
# 3. STAGE‑1 : MedMCQA FINE‑TUNING
# -------------------------------------------------------------
print("\n=== Stage 1: MedMCQA fine‑tuning ===")
med_ds = load_dataset("openlifescienceai/medmcqa")
train_med = med_ds["train"].select(range(5000))       # small subset
eval_med  = med_ds["validation"].select(range(500))   # small subset

train_med_ds = MedMCQADataset(train_med, tokenizer)
eval_med_ds  = MedMCQADataset(eval_med, tokenizer)

args_med = TrainingArguments(
    output_dir=MED_FT_DIR,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    eval_strategy="epoch",
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=2,
)

trainer_med = Trainer(
    model=model,
    args=args_med,
    train_dataset=train_med_ds,
    eval_dataset=eval_med_ds,
    tokenizer=tokenizer,
)

trainer_med.train()

In [None]:
trainer_med.save_model(MED_FT_DIR)
tokenizer.save_pretrained(MED_FT_DIR)

In [None]:
# -------------------------------------------------------------
# 4. STAGE‑2 : Dialogue‑>Structured Summary FINE‑TUNING
# -------------------------------------------------------------
print("\n=== Stage 2: Clinical‑notes fine‑tuning ===")

# 1️⃣  Always load on CPU first
model = AutoModelForSeq2SeqLM.from_pretrained(MED_FT_DIR, device_map="cpu")
tokenizer = AutoTokenizer.from_pretrained(MED_FT_DIR)

# 2️⃣  Ensure vocabulary / embedding sizes match
vocab_len = len(tokenizer)
if model.config.vocab_size != vocab_len:
    print(f"‑ Resizing embeddings: {model.config.vocab_size}  →  {vocab_len}")
    model.resize_token_embeddings(vocab_len)

# 3️⃣  OPTIONAL: sanity‑check for NaNs / Infs in the checkpoint
with torch.no_grad():
    for name, p in model.named_parameters():
        if torch.isnan(p).any() or torch.isinf(p).any():
            raise RuntimeError(f"NaNs/Infs found in {name}")

# 4️⃣  Now move to GPU
model = model.to(DEVICE)


# Load your CSV → HF Dataset
df = pd.read_csv(CSV_PATH, sep=",")          # adjust sep if tabs
hf_clinical = Dataset.from_pandas(df)

train_clin = hf_clinical.shuffle(seed=42).select(range(400))
eval_clin  = hf_clinical.shuffle(seed=123).select(range(400,464))

train_clin_ds = DialogueSummaryDataset(train_clin, tokenizer)
eval_clin_ds  = DialogueSummaryDataset(eval_clin, tokenizer)

In [None]:
args_clin = TrainingArguments(
    output_dir=FINAL_FT_DIR,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,    # effective 8
    eval_strategy="epoch",
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=2,
)

trainer_clin = Trainer(
    model=model,
    args=args_clin,
    train_dataset=train_clin_ds,
    eval_dataset=eval_clin_ds,
    tokenizer=tokenizer,
)

trainer_clin.train()

In [None]:
trainer_clin.save_model(FINAL_FT_DIR)
tokenizer.save_pretrained(FINAL_FT_DIR)

print("\n✅ Two‑stage fine‑tuning complete. Final model saved to:", FINAL_FT_DIR)

In [None]:
# -------------------------------------------------------------
# 5. QUICK TEST (optional)
# -------------------------------------------------------------
if __name__ == "__main__":
    from transformers import pipeline
    summarizer = pipeline("summarization",
                          model=FINAL_FT_DIR,
                          tokenizer=FINAL_FT_DIR,
                          device=0 if torch.cuda.is_available() else -1)
    test_dialogue = df["dialogue"][0]
    print("\n--- SAMPLE SUMMARY ---")
    print(summarizer(test_dialogue, max_length=200,
                     min_length=80, do_sample=False)[0]["summary_text"])

In [None]:
# two_stage_bart_ft.py  ─────────────────────────────────────────
#   Two‑stage BART fine‑tune:
#     1. MedMCQA  (QA adaptation)
#     2. clinical_notes.csv  (dialogue ➜ structured summary)
#   Requires: transformers >=4.39  datasets  torch  pandas  tqdm
# ----------------------------------------------------------------
import os, torch, pandas as pd
from datasets import load_dataset, Dataset
from transformers import (AutoTokenizer, AutoModelForSeq2SeqLM,
                          Trainer, TrainingArguments)
from torch.utils.data import Dataset as TorchDataset

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

BASE_MODEL   = "facebook/bart-base"
MED_FT_DIR   = "./bart_medmcqa_ft"
FINAL_FT_DIR = "./bart_clinical_ft"
CSV_PATH     = "./clinical_notes.csv"        # 👈 your file (dialogue, summary)

# ----------------------------------------------------------------
#  Dataset helpers
# ----------------------------------------------------------------
class MedMCQADataset(TorchDataset):
    """ Question + 4 options  -->  correct option letter """
    def __init__(self, hf_ds, tokenizer, max_src=256, max_tgt=16):
        self.tok, self.max_src, self.max_tgt = tokenizer, max_src, max_tgt
        self.examples = []

        def s(val):               # safe cast
            return str(val) if val is not None else ""

        for row in hf_ds:
            self.examples.append(
                dict(
                    prompt=f"Question: {s(row['question'])} Options: "
                           f"A) {s(row['opa'])} B) {s(row['opb'])} "
                           f"C) {s(row['opc'])} D) {s(row['opd'])}",
                    answer=s(row['cop']),
                )
            )

    def __len__(self): return len(self.examples)

    def __getitem__(self, idx):
        ex = self.examples[idx]
        src = self.tok(ex["prompt"], max_length=self.max_src,
                       truncation=True, padding="max_length",
                       return_tensors="pt")
        tgt = self.tok(ex["answer"], max_length=self.max_tgt,
                       truncation=True, padding="max_length",
                       return_tensors="pt")
        labels = tgt["input_ids"]
        labels[labels == self.tok.pad_token_id] = -100
        return {"input_ids": src["input_ids"].squeeze(),
                "attention_mask": src["attention_mask"].squeeze(),
                "labels": labels.squeeze()}


class DialogueSummaryDataset(TorchDataset):
    """ dialogue  ->  structured summary """
    def __init__(self, hf_ds, tokenizer, max_src=512, max_tgt=256):
        self.tok, self.max_src, self.max_tgt = tokenizer, max_src, max_tgt
        self.dialogues = hf_ds["dialogue"]
        self.summaries = hf_ds["note"]

    def __len__(self): return len(self.dialogues)

    def __getitem__(self, idx):
        src_txt, tgt_txt = self.dialogues[idx], self.summaries[idx]
        src = self.tok(src_txt, max_length=self.max_src,
                       truncation=True, padding="max_length",
                       return_tensors="pt")
        tgt = self.tok(tgt_txt, max_length=self.max_tgt,
                       truncation=True, padding="max_length",
                       return_tensors="pt")
        labels = tgt["input_ids"]
        labels[labels == self.tok.pad_token_id] = -100
        return {"input_ids": src["input_ids"].squeeze(),
                "attention_mask": src["attention_mask"].squeeze(),
                "labels": labels.squeeze()}

# ----------------------------------------------------------------
#  Stage‑1 : MedMCQA fine‑tuning
# ----------------------------------------------------------------
print("\n=== Stage 1 : MedMCQA fine‑tuning ===")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model     = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL)

med_ds = load_dataset("openlifescienceai/medmcqa")
train_med = med_ds["train"].select(range(5000))
eval_med  = med_ds["validation"].select(range(500))

train_med_ds = MedMCQADataset(train_med, tokenizer)
eval_med_ds  = MedMCQADataset(eval_med, tokenizer)

args_med = TrainingArguments(
    output_dir=MED_FT_DIR,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    eval_strategy="epoch",
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=2,
)

Trainer(
    model=model,
    args=args_med,
    train_dataset=train_med_ds,
    eval_dataset=eval_med_ds,
    tokenizer=tokenizer,
).train()

model.save_pretrained(MED_FT_DIR)
tokenizer.save_pretrained(MED_FT_DIR)

# ----------------------------------------------------------------
#  Stage‑2 : clinical_notes.csv fine‑tuning
# ----------------------------------------------------------------
print("\n=== Stage 2 : Clinical‑notes fine‑tuning ===")

# 1️⃣ load on CPU first
model = AutoModelForSeq2SeqLM.from_pretrained(MED_FT_DIR, device_map="cpu")
tokenizer = AutoTokenizer.from_pretrained(MED_FT_DIR)

# 2️⃣ ensure pad / eos ids are valid
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

# 3️⃣ resize embeddings ↔ tokenizer
model.resize_token_embeddings(len(tokenizer))

# 4️⃣ move to GPU (now safe)
model = model.to(DEVICE)

# build HF dataset from CSV
df = pd.read_csv(CSV_PATH)
hf_clin = Dataset.from_pandas(df)

# small sample for demo
train_clin = hf_clin.shuffle(seed=42).select(range(400))
eval_clin  = hf_clin.shuffle(seed=123).select(range(400, 464))

train_clin_ds = DialogueSummaryDataset(train_clin, tokenizer)
eval_clin_ds  = DialogueSummaryDataset(eval_clin, tokenizer)

args_clin = TrainingArguments(
    output_dir=FINAL_FT_DIR,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,   # effective 8
    eval_strategy="epoch",
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=2,
)

Trainer(
    model=model,
    args=args_clin,
    train_dataset=train_clin_ds,
    eval_dataset=eval_clin_ds,
    tokenizer=tokenizer,
).train()

model.save_pretrained(FINAL_FT_DIR)
tokenizer.save_pretrained(FINAL_FT_DIR)

print("\n✅ Finished.  Final model stored at:", FINAL_FT_DIR)

In [None]:
from transformers import pipeline
summarizer = pipeline("text-generation",
                      model=FINAL_FT_DIR,
                      tokenizer=FINAL_FT_DIR,
                      device=0 if torch.cuda.is_available() else -1)
sample = df["dialogue"][0]
print("\n--- SAMPLE SUMMARY ---")
print(summarizer(sample, max_length=200,
                  min_length=80, do_sample=False)[0]["summary_text"])


OLD CODE BELOW

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, TrainingArguments
import torch
from datasets import load_dataset
from torch.utils.data import Dataset

# Custom Dataset class for MedMCQA
class MedMCQADataset(Dataset):
    def __init__(self, hf_dataset, tokenizer, max_input_length=256, max_output_length=128):
        self.tokenizer = tokenizer

        # Convert dataset to list of dictionaries for easy iteration
        self.data = hf_dataset.to_dict()
        num_examples = len(next(iter(self.data.values())))

        # Build list of examples as dicts
        self.examples = [
            {key: self.data[key][i] for key in self.data}
            for i in range(num_examples)
        ]

        self.inputs = []
        self.targets = []

        # Construct inputs and targets as strings
        for item in self.examples:
            # Ensure all fields exist and are strings; replace missing values if needed
            question = str(item.get('question', ''))
            opa = str(item.get('opa', ''))
            opb = str(item.get('opb', ''))
            opc = str(item.get('opc', ''))
            opd = str(item.get('opd', ''))
            cop = str(item.get('cop', ''))

            self.inputs.append(f"Question: {question} Options: {opa} {opb} {opc} {opd}")
            self.targets.append(cop)

        self.max_input_length = max_input_length
        self.max_output_length = max_output_length

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_text = self.inputs[idx]
        target_text = self.targets[idx]

        input_encodings = self.tokenizer(
            input_text,
            truncation=True,
            max_length=self.max_input_length,
            padding="max_length",
            return_tensors="pt"
        )

        target_encodings = self.tokenizer(
            target_text,
            truncation=True,
            max_length=self.max_output_length,
            padding="max_length",
            return_tensors="pt"
        )

        # Replace padding token id's of the labels by -100 so it's ignored by the loss
        labels = target_encodings['input_ids']
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": input_encodings['input_ids'].squeeze(),
            "attention_mask": input_encodings['attention_mask'].squeeze(),
            "labels": labels.squeeze()
        }

# Initialize model and tokenizer
base_model_name = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(base_model_name)
model.resize_token_embeddings(len(tokenizer))

# Load the MedMCQA dataset
med_dataset = load_dataset("openlifescienceai/medmcqa")

# Select subsets for training and evaluation
train_med_subset = med_dataset['train'].select(range(5000))
eval_med_subset = med_dataset['validation'].select(range(500))

# Create dataset objects using the custom class
train_med_dataset = MedMCQADataset(train_med_subset, tokenizer)
eval_med_dataset = MedMCQADataset(eval_med_subset, tokenizer)

# Define training arguments for medical fine-tuning
training_args_med = TrainingArguments(
    output_dir="./medical_ft_model",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    eval_strategy="epoch",
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=2,
)

# Initialize Trainer
trainer_med = Trainer(
    model=model,
    args=training_args_med,
    train_dataset=train_med_dataset,
    eval_dataset=eval_med_dataset,
    tokenizer=tokenizer,
)

# Start medical fine-tuning
trainer_med.train()

# Save model and tokenizer after medical fine-tuning
trainer_med.save_model("./medical_ft_model")
tokenizer.save_pretrained("./medical_ft_model")


In [None]:
from transformers import (AutoModelForSeq2SeqLM, AutoTokenizer,
                          Trainer, TrainingArguments, DataCollatorForSeq2Seq)
import torch, pandas as pd
from datasets import Dataset as HFDataset
from torch.utils.data import Dataset as TorchDataset

# ──────────────────────────────────────────────────────────────
#  PyTorch Dataset wrapper
# ──────────────────────────────────────────────────────────────
class ConversationSummaryDataset(TorchDataset):
    """
    Expects an HF Dataset with columns: 'dialogue' and 'summary'
    """
    def __init__(self, hf_ds, tokenizer,
                 max_input_len=512, max_output_len=256):
        self.ds, self.tok = hf_ds, tokenizer
        self.max_in, self.max_out = max_input_len, max_output_len

    def __len__(self): return len(self.ds)

    def __getitem__(self, idx):
        row = self.ds[idx]
        src_txt = str(row["dialogue"])
        tgt_txt = str(row["note"])      # <- was 'note'

        src = self.tok(src_txt, max_length=self.max_in,
                       truncation=True, padding="max_length",
                       return_tensors="pt")
        tgt = self.tok(tgt_txt, max_length=self.max_out,
                       truncation=True, padding="max_length",
                       return_tensors="pt")

        labels = tgt["input_ids"].clone()
        labels[labels == self.tok.pad_token_id] = -100
        return {"input_ids":      src["input_ids"].squeeze(),
                "attention_mask": src["attention_mask"].squeeze(),
                "labels":         labels.squeeze()}

# ──────────────────────────────────────────────────────────────
#  Load Stage‑1 checkpoint
# ──────────────────────────────────────────────────────────────
MODEL_DIR = "./medical_ft_model"
model     = AutoModelForSeq2SeqLM.from_pretrained(MODEL_DIR)
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

# make sure pad token is defined
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id
model.resize_token_embeddings(len(tokenizer))

# ──────────────────────────────────────────────────────────────
#  Build HF Dataset from CSV
# ──────────────────────────────────────────────────────────────
df = pd.read_csv("/content/clinical_notes.csv")  # columns: dialogue, summary
hf_clin = HFDataset.from_pandas(df)

train_ds = hf_clin.shuffle(seed=42).select(range(400))
eval_ds  = hf_clin.shuffle(seed=123).select(range(400, 464))

train_dataset = ConversationSummaryDataset(train_ds, tokenizer)
eval_dataset  = ConversationSummaryDataset(eval_ds, tokenizer)

# ──────────────────────────────────────────────────────────────
#  Training setup
# ──────────────────────────────────────────────────────────────
training_args = TrainingArguments(
    output_dir="./final_sft_model",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    eval_strategy="epoch",          # <- fixed name
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=2,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# ──────────────────────────────────────────────────────────────
#  Fine‑tune and save
# ──────────────────────────────────────────────────────────────
trainer.train()
trainer.save_model("./final_sft_model")
tokenizer.save_pretrained("./final_sft_model")


In [None]:
!zip -r /content/final_sft_model.zip /content/final_sft_model

In [None]:
from google.colab import files
files.download("/content/final_sft_model.zip")

In [None]:
from transformers import pipeline

# Initialize the summarization pipeline with the fine-tuned model
summarizer = pipeline(
    "summarization",
    model="./final_sft_model",
    tokenizer="./final_sft_model",
    device=0 if torch.cuda.is_available() else -1
)

# Define a custom conversation
custom_conversation = """
Doctor: Hi, Mr. X, I'm Dr. Y. How are you feeling today?
Patient: Not too good, doctor. I've been feeling really sick lately.
Doctor: I understand. Can you tell me what symptoms you're experiencing?
Patient: Yes, I've been having a fever, a dry cough, and dyspnea.
Doctor: I see. You were hospitalized due to moderate ARDS from COVID-19, is that correct?
Patient: Yes, that's correct.
Doctor: During your physical therapy, we encountered some difficulties. Can you tell me more about that?
Patient: Yes, I had trouble with position changes and deep breathing. Every time I tried to change my position or take a deep breath, I would start coughing and it would make me really short of breath.
Doctor: I understand. To avoid rapid deterioration and respiratory failure, we instructed you to change positions very slowly and step-by-step, right?
Patient: Yes, that's right.
Doctor: And I see that this approach increased your oxygen saturation, for example, on day 5 with 6 L/min of oxygen from 93% to 97%.
Patient: Yes, that's correct.
Doctor: Good. We also had to adapt your breathing exercises to avoid prolonged coughing and oxygen desaturation. Can you tell me more about that?
Patient: Yes, I was instructed to stop every deep breath before coughing and to hold my breath for better air distribution.
Doctor: I see that you performed the breathing exercises well and managed to increase your oxygen saturation.
Patient: Yes, I did my best.
Doctor: You also had difficulty maintaining sufficient oxygen saturation during physical activity, is that correct?
Patient: Yes, I did. But with close monitoring and frequent breaks, I was able to perform low-level strength and walking exercises without any significant deoxygenation.
Doctor: I see that your exercise progression was low on days 1 to 5, but then increased daily until your hospital discharge to a rehabilitation clinic on day 10.
Patient: Yes, that's correct.
Doctor: Great. I'd like to keep monitoring your progress and see how you're doing. Can you keep me updated on any changes in your symptoms?
Patient: Yes, of course, doctor.
Doctor: Alright, let's keep in touch. If you have any questions or concerns, don't hesitate to reach out to me.
Patient: Thank you, doctor.
"""

# Generate summary
summary = summarizer(custom_conversation, min_length=40, do_sample=False)
print("Generated Summary:")
print(summary[0]['summary_text'])


In [None]:
!pip install evaluate

In [None]:
!pip install rouge_score # Install the rouge_score dependency

In [None]:
import evaluate
from tqdm import tqdm

rouge = evaluate.load("rouge")

In [None]:
from datasets import load_dataset # import the library
import evaluate
from tqdm import tqdm

rouge = evaluate.load("rouge")

# Load the dataset
notechat = load_dataset("akemiH/NoteChat") # load the dataset and assign it to a variable

eval_df = notechat["train"].select(range(3000, 3500)).to_pandas()

# Check for missing values and clean if necessary
if eval_df.isnull().values.any():
    print("Found missing values in the evaluation set. Dropping them.")
    eval_df = eval_df.dropna()

batch_size = 4
num_samples = len(eval_df)
num_batches = num_samples // batch_size + int(num_samples % batch_size != 0)

predictions = []
references = []

In [None]:
summarizer = pipeline("summarization", device=0)  # Use device=-1 for CPU if GPU is unavailable

# Iterate through batches and generate summaries
for i in tqdm(range(num_batches), desc="Generating Summaries"):
    # Define the start and end indices for the current batch
    start = i * batch_size
    end = min(start + batch_size, num_samples)

    # Extract batch conversations and references
    batch_conversations = eval_df["conversation"][start:end].tolist()
    batch_refs = eval_df["data"][start:end].tolist()  # Adjust column name if necessary

    # Prepare prompts for summarization
    prompts = [
        f"Summarize the following conversation:\n\n{conv}" for conv in batch_conversations
    ]

    # Generate summaries for the batch
    results = summarizer(
        prompts,
        max_new_tokens=100,  # Adjust token limit based on your model's capabilities
        do_sample=False,
        truncation=True
    )

    # Extract the predicted summaries and add them to the list
    predictions.extend([r["summary_text"] for r in results])

    # Collect the references for the batch
    references.extend(batch_refs)

In [None]:
from transformers import pipeline

summarizer = pipeline(
    "summarization",
    model="/content/drive/MyDrive/bart_clinical_ft",
    tokenizer="/content/drive/MyDrive/final_sft_model",
    device=0 if torch.cuda.is_available() else -1,
)

batch_size = 4
num_samples = len(eval_ds)
num_batches = (num_samples + batch_size - 1) // batch_size

predictions = []
references = []

for i in tqdm(range(num_batches), desc="Generating summaries"):
    start, end = i * batch_size, min((i + 1) * batch_size, num_samples)

    batch_conversations = [str(c) for c in eval_ds["dialogue"][start:end]]
    batch_refs = [str(r) for r in eval_ds["note"][start:end]]

    prompts = []
    for conv in batch_conversations:
        if isinstance(conv, str) and len(conv.strip()) > 5:
            prompts.append(f"Summarize the following conversation:\n\n{conv}")

    if not prompts:
        continue  # skip empty batch

    for prompt in prompts:
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids
        if input_ids.shape[-1] > 900:    # avoid too long prompts
            print(f"⚠️ Skipping overly long prompt with {input_ids.shape[-1]} tokens")
            continue

        result = summarizer(
            prompt,
            max_new_tokens=120,
            do_sample=False,
        )
        predictions.append(result[0]["summary_text"])

    references.extend(batch_refs)

print(f"Generated {len(predictions)} summaries.")


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
import torch, pandas as pd
from datasets import Dataset as HFDataset

# ──────────────────────────────────────────────────────────
# 1) Load & repair (CPU), then move to GPU yourself
# ──────────────────────────────────────────────────────────
MODEL_DIR = "/content/drive/MyDrive/final_sft_model"

# a) load normally (no device_map)
model     = AutoModelForSeq2SeqLM.from_pretrained(MODEL_DIR)
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model.resize_token_embeddings(len(tokenizer))

# b) fix pad/EOS and resize embeddings
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_special_tokens({'pad_token': '<pad>'})

# c) move to GPU (or CPU)
device_id = 0 if torch.cuda.is_available() else -1
model = model.to(f"cuda:{device_id}" if device_id >= 0 else "cpu")

# ──────────────────────────────────────────────────────────
# 2) Build the pipeline with the same device
# ──────────────────────────────────────────────────────────
summarizer = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
    device=device_id            # <— this makes inputs & model live together
)

# ──────────────────────────────────────────────────────────
# 3) Generate in batches (all on GPU)
# ──────────────────────────────────────────────────────────
df     = pd.read_csv("/content/clinical_notes.csv")  # columns: dialogue, note
hf_clin = HFDataset.from_pandas(df).shuffle(seed=42)
eval_ds = hf_clin.select(range(400, 464))

batch_size  = 4
num_samples = len(eval_ds)
num_batches = (num_samples + batch_size - 1) // batch_size

predictions, references = [], []

for i in range(num_batches):
    start, end = i * batch_size, min((i + 1) * batch_size, num_samples)
    convs = [str(x) for x in eval_ds["dialogue"][start:end]]
    refs  = [str(x) for x in eval_ds["note"][start:end]]

    prompts = [
        f"Summarize the following conversation:\n\n{c}"
        for c in convs if len(c.strip()) > 10
    ]
    if not prompts:
        continue

    # this call now runs entirely on GPU
    outputs = summarizer(prompts, max_new_tokens=100, do_sample=False)

    predictions.extend([o["summary_text"] for o in outputs])
    references.extend(refs)

print(f"Generated {len(predictions)} summaries.")


In [None]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [None]:
predictions

In [None]:
!git clone https://github.com/maszhongming/UniEval.git

In [None]:
import sys
sys.path.append("/content/UniEval")  # if needed to make sure your Python can import from the UniEval folder

from utils import convert_to_json
from metric.evaluator import get_evaluator

In [None]:
!pip install -r UniEval/requirements.txt

In [None]:
# Lists for UniEval
src_list = eval_clin["dialogue"]          # already a list
ref_list = eval_clin["note"]           # <- was "note"

In [None]:
output_list = []
for pred in predictions: # Loop over each conversation string
    output_list.append(pred)

In [None]:
data = convert_to_json(
    src_list=src_list,
    ref_list=ref_list,
    output_list=output_list
)


In [None]:
import json

# Assuming you already have `data` from convert_to_json
with open("/content/unieval_data.json", "w") as f:
    json.dump(data, f, indent=2)


In [None]:
import json
with open("/content/unieval_data.json", "r") as f:
    data = json.load(f)


In [None]:
import os
os.environ[“CUDA_LAUNCH_BLOCKING”] = 1

In [None]:
task = "summarization, fact"
evaluator = get_evaluator(task, device="cuda")

In [None]:
import nltk

# Download the standard 'punkt' tokenizer
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
eval_scores = evaluator.evaluate(data, print_result=True)

In [None]:
from transformers import (
    AutoModelForSeq2SeqLM, AutoTokenizer,
    AutoModelForCausalLM, BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch
from transformers import GenerationConfig


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# ──────────────────────────────────────────────────────────────
# 1) LOAD YOUR SFT‐FINE‐TUNED BART MODEL (no need to re‐SFT)
# ──────────────────────────────────────────────────────────────
SFT_DIR = "/content/drive/MyDrive/bart_clinical_ft"
# If you used Seq2SeqLM for summarization, load with that:
gen_cfg = GenerationConfig.from_pretrained(SFT_DIR)
gen_cfg.early_stopping = False

base = AutoModelForSeq2SeqLM.from_pretrained(SFT_DIR, device_map="cpu", generation_config=gen_cfg,)
# Or if you prefer causal‐LM interface, switch to AutoModelForCausalLM.

tok  = AutoTokenizer.from_pretrained(SFT_DIR)
tok.pad_token = tok.eos_token

# ──────────────────────────────────────────────────────────────
# 2) (Optional) QUANTIZE + PREPARE FOR K‐BIT + LoRA ADAPTER
# ──────────────────────────────────────────────────────────────
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)
# re‐load base model in 4‑bit directly:
base = AutoModelForCausalLM.from_pretrained(
    SFT_DIR,
    quantization_config=bnb_config,
    device_map="auto",
    generation_config=gen_cfg,
)
# freeze all except LoRA
base = prepare_model_for_kbit_training(base)
base.gradient_checkpointing_enable()

lora_cfg = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj","k_proj","v_proj","o_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(base, lora_cfg).to(DEVICE)

# make sure pad token is set
model.config.pad_token_id = tok.eos_token_id
model.resize_token_embeddings(len(tok))


In [None]:
from torch.utils.data import Dataset

class ConvSumDS(Dataset):
    def __init__(self, hf_ds, tokenizer, max_in=512, max_out=256):
        self.ds, self.tok = hf_ds, tokenizer
        self.max_in, self.max_out = max_in, max_out
    def __len__(self): return len(self.ds)
    def __getitem__(self, i):
        row = self.ds[i]
        src, tgt = str(row["dialogue"]), str(row["note"])
        enc_in  = self.tok(src, max_length=self.max_in,
                           truncation=True, padding="max_length",
                           return_tensors="pt")
        enc_out = self.tok(tgt, max_length=self.max_out,
                           truncation=True, padding="max_length",
                           return_tensors="pt")
        labels = enc_out["input_ids"]
        labels[labels == self.tok.pad_token_id] = -100
        return {"input_ids":enc_in["input_ids"].squeeze(),
                "attention_mask":enc_in["attention_mask"].squeeze(),
                "labels":labels.squeeze()}


In [None]:
# ---- PCGrad (Yu et al. 2020) -------------
class PCGrad:
    def __init__(self, optimizer):
        self._optim = optimizer
    def zero_grad(self): self._optim.zero_grad()
    @torch.no_grad()
    def pc_backward(self, grads: list[torch.Tensor]):
        """grads = list of flat grad tensors (one per objective)"""
        # 1) project conflicting grads
        for i in range(len(grads)):
            for j in range(i+1, len(grads)):
                gij = torch.dot(grads[i], grads[j])
                if gij < 0:    # conflict
                    grads[i] -= (gij / grads[j].norm()**2) * grads[j]
        # 2) average the (now non‑conflicting) grads
        merged = torch.stack(grads, dim=0).mean(0)
        # 3) copy into model params
        idx = 0
        for p in model.parameters():
            if p.requires_grad:
                sz = p.numel()
                p.grad = merged[idx:idx+sz].view_as(p).clone()
                idx += sz
    def step(self): self._optim.step()


In [None]:
import sys
sys.path.append("/content/UniEval")
from utils import convert_to_json
from metric.evaluator import get_evaluator

# 1) Instantiate two evaluators **on CPU**:
sum_eval  = get_evaluator("summarization", device="cpu")   # coherence, consistency, fluency (+ relevance)
fact_eval = get_evaluator("fact",          device="cpu")   # factual consistency

def batched_unieval(src_list, hyp_list):
    """
    Given a batch of sources and hypotheses (summaries),
    returns a dict of lists with keys
      'coherence', 'consistency', 'fluency', 'factual'
    each list has length == len(src_list).
    """
    # 2) prepare JSON for UniEval
    data = convert_to_json(output_list=hyp_list, src_list=src_list)

    # 3) summarization dims in one call
    #    dims can be any subset of ['coherence','consistency','fluency','relevance']
    sum_scores = sum_eval.evaluate(
        data,
        dims=["coherence","consistency","fluency"],
        individual=True,      # one dict per example
        overall=False
    )
    # sum_scores: numpy array of shape (batch,3)

    # 4) split into python lists
    coherence   = sum_scores[:,0].tolist()
    consistency = sum_scores[:,1].tolist()
    fluency     = sum_scores[:,2].tolist()

    # 5) factual consistency
    fact_dicts  = fact_eval.evaluate(data)    # returns list[{'consistency':…},…]
    factual     = [d["consistency"] for d in fact_dicts]

    return {
        "coherence":   coherence,
        "consistency": consistency,
        "fluency":     fluency,
        "factual":     factual
    }


In [None]:
# ════════════════════════════════════════════════════════════════
#  pip install -q "trl==0.7.1" "transformers>=4.39" bitsandbytes peft
#  git clone https://github.com/yangkevin2/UniEval.git   # (or pip install)
# ════════════════════════════════════════════════════════════════
import os, sys, gc, torch, pandas as pd, numpy as np
from torch.utils.data import Dataset, DataLoader

# ────────────────────────────────────────────────────────────────
# 1) UniEval helpers  (CPU only, load once)
# ────────────────────────────────────────────────────────────────
sys.path.append("/content/UniEval")                # path to UniEval
from utils            import convert_to_json
from metric.evaluator import get_evaluator

sum_eval  = get_evaluator("summarization", device="cpu")
fact_eval = get_evaluator("fact",          device="cpu")

@torch.inference_mode()
def unieval_4way(src: list[str], hyp: list[str]) -> torch.Tensor:
    """returns (B,4) tensor: [coh,cons,flu,fact]"""
    data = convert_to_json(output_list=hyp, src_list=src)

    tri = sum_eval.evaluate(data,
                            dims=["coherence","consistency","fluency"],
                            individual=True, overall=False)      # ndarray
    coh, con, flu = tri.T.tolist()
    fact = [d["consistency"] for d in fact_eval.evaluate(data)]
    return torch.tensor([coh, con, flu, fact]).T       # on CPU
# ────────────────────────────────────────────────────────────────


# ────────────────────────────────────────────────────────────────
# 2) load 4‑bit BART‑SFT  +  LoRA adapter
# ────────────────────────────────────────────────────────────────
from transformers import (AutoTokenizer, AutoModelForSeq2SeqLM,
                          BitsAndBytesConfig, GenerationConfig)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl  import PPOConfig, PPOTrainer, AutoModelForSeq2SeqLMWithValueHead

DEVICE   = "cuda" if torch.cuda.is_available() else "cpu"
CKPT_DIR = "/content/drive/MyDrive/final_sft_model"     # ★ your ckpt

bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# 2‑A)  load WITHOUT passing generation_config
base = AutoModelForSeq2SeqLM.from_pretrained(
    CKPT_DIR, quantization_config=bnb_cfg, device_map="auto"
)

# 2‑B) patch the generation config safely
gc_obj = base.generation_config or GenerationConfig()
if gc_obj.early_stopping is None:
    gc_obj.early_stopping = False          # must be bool / "never"
base.generation_config = gc_obj

# 2‑C) make it train‑friendly & add LoRA
base = prepare_model_for_kbit_training(base)
base.gradient_checkpointing_enable(); base.config.use_cache=False

lora_cfg = LoraConfig(
        task_type="SEQ_2_SEQ_LM", r=8, lora_alpha=32, lora_dropout=0.05,
        target_modules=["q_proj","k_proj","v_proj","o_proj"])

model = get_peft_model(base, lora_cfg).to(DEVICE)

tok = AutoTokenizer.from_pretrained(CKPT_DIR, use_fast=False)
tok.pad_token = tok.eos_token
model.resize_token_embeddings(len(tok))

# Value‑head wrappers for trl
ppo_model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(
              model, peft_config=lora_cfg).to(DEVICE)
ppo_ref   = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(
              model, peft_config=lora_cfg).to(DEVICE).eval()
for p in ppo_ref.parameters(): p.requires_grad=False
# ────────────────────────────────────────────────────────────────


# ────────────────────────────────────────────────────────────────
# 3) tiny sample dataset (200 rows)
# ────────────────────────────────────────────────────────────────
df = pd.read_csv("/content/clinical_notes.csv")[["dialogue","note"]]

class NoteSet(Dataset):
    def __init__(self, frame, tok, L=512):
        self.f   = frame.reset_index(drop=True)
        self.tok = tok; self.L = L
    def __len__(self):  return len(self.f)
    def __getitem__(self, i):
        txt = str(self.f.iloc[i]["dialogue"])
        enc = self.tok(txt, truncation=True, max_length=self.L,
                       return_tensors="pt")
        return {"input_ids":enc["input_ids"].squeeze(),
                "attention_mask":enc["attention_mask"].squeeze(),
                "src_txt":txt}

loader = DataLoader(NoteSet(df.sample(200,random_state=0),tok),
                    batch_size=2, shuffle=True, pin_memory=True)
# ────────────────────────────────────────────────────────────────


# ────────────────────────────────────────────────────────────────
# 4) PPO trainer (trl‑0.7.1 only needs bs & mb_size)
# ────────────────────────────────────────────────────────────────
ppo_cfg = PPOConfig(batch_size=2, mini_batch_size=2, output_dir='/content/ppologs')

optimizer = torch.optim.AdamW(
    filter(lambda p: p.requires_grad, ppo_model.parameters()), lr=2e-5)

ppo_trainer = PPOTrainer(ppo_cfg, ppo_model,
                         ref_model = ppo_ref,
                        )
# ────────────────────────────────────────────────────────────────


# ────────────────────────────────────────────────────────────────
# 5) PC‑Grad utilities
# ────────────────────────────────────────────────────────────────
def pc_merge(flat_grads):
    for i in range(len(flat_grads)):
        for j in range(i+1, len(flat_grads)):
            dot = torch.dot(flat_grads[i], flat_grads[j])
            if dot < 0:
                flat_grads[i] -= (dot / (flat_grads[j].norm()**2 + 1e-12)) * flat_grads[j]
    return torch.stack(flat_grads).mean(0)

def flat_param_grads(model):
    return torch.cat([p.grad.flatten() for p in model.parameters()
                      if p.grad is not None])

def scatter_flat_grads(model, flat):
    idx = 0
    for p in model.parameters():
        if p.grad is None: continue
        n = p.grad.numel()
        p.grad.data = flat[idx:idx+n].view_as(p).clone()
        idx += n
# ────────────────────────────────────────────────────────────────


# ────────────────────────────────────────────────────────────────
# 6)   PPO  ✕  PC‑Grad  ✕  UniEval  training loop
# ────────────────────────────────────────────────────────────────
gen_kwargs = dict(max_new_tokens=64, do_sample=True,
                  pad_token_id=tok.eos_token_id)

for ep in range(3):
    for batch in loader:
        queries = batch["input_ids"].to(DEVICE)
        attn    = batch["attention_mask"].to(DEVICE)
        src_txt = batch["src_txt"]

        # rollout
        with torch.no_grad():
            responses = ppo_model.generate(queries, attention_mask=attn,
                                           **gen_kwargs)

        hyp_txt = tok.batch_decode(responses, skip_special_tokens=True)

        # 4‑way reward (CPU → GPU)
        R = unieval_4way(src_txt, hyp_txt).to(DEVICE)   # (B,4)

        # collect per‑objective grads
        flat_grads = []
        for k in range(4):
            rew_k = R[:, k]
            loss_k, *_ = ppo_trainer._loss(queries, responses, rew_k)
            optimizer.zero_grad()
            loss_k.backward(retain_graph=True)
            flat_grads.append(flat_param_grads(ppo_model))

        # PC‑Grad merge → apply
        merged = pc_merge(flat_grads)
        scatter_flat_grads(ppo_model, merged)
        torch.nn.utils.clip_grad_norm_(ppo_model.parameters(), 1.0)
        optimizer.step(); optimizer.zero_grad()

        torch.cuda.empty_cache(); gc.collect()

    print(f"✓ epoch {ep+1}/3 finished")

print("done ✅")


In [None]:
!pip install --quiet --no-cache-dir \
    "transformers==4.39.3" \
    "trl==0.7.1" \
    bitsandbytes peft datasets evaluate

In [None]:
import importlib, pkg_resources, sys, warnings
print("transformers →", pkg_resources.get_distribution("transformers").version)
print("trl           →", pkg_resources.get_distribution("trl").version)

# quick sanity check that failed before
from trl import PPOConfig
warnings.filterwarnings("ignore")   # suppress the cuda‑cache msg
print("✅  PPOConfig imported – versions are compatible")


In [None]:
pip install -U --no‑cache‑dir "transformers==4.37.0" "trl==0.7.1"

In [None]:
# 1️⃣  remove the too‑new wheel
!pip uninstall -y transformers



In [None]:

# 2️⃣  install the last compatible release
!pip install --no-cache-dir "transformers==4.37.0"



In [None]:
#  (optional) re‑install trl in case pip removed dependencies
!pip install --no-cache-dir "trl==0.7.1"

In [None]:
import importlib.util, transformers, peft
print("transformers:", transformers.__version__)
print("peft       :", peft.__version__)

from trl import PPOConfig
print("✅  trl and peft now import together!")


In [None]:
pip install --no-cache-dir peft==0.3.0