In [1]:
# ════════════════════════════════════════════════════════════════
# PPO Training with TRL (Replaces Custom Training Loop)
# ════════════════════════════════════════════════════════════════
import os, gc, torch, pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, GPT2LMHeadModel
from peft import LoraConfig, get_peft_model
from trl import PPOConfig, PPOTrainer, AutoModelForCausalLMWithValueHead

# Set device and enable TF32 for faster math
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
os.environ["CUDA_VISIBLE_DEVICES"] = ""
DEVICE = "cpu"
print(f"Using CPU for all operations")

# ────────────────────────────────────────────────────────────────
# 1) UniEval scorer
# ────────────────────────────────────────────────────────────────
import sys
sys.path.append(r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\UniEval")
from utils import convert_to_json
from metric.evaluator import get_evaluator

sum_eval = get_evaluator("summarization", device=DEVICE)

@torch.inference_mode()
def get_unieval_scores(src, hyp, ref):
    """Get UniEval scores for generated summaries"""
    data = convert_to_json(
        output_list=hyp,
        src_list=src,
        ref_list=ref,
    )
    raw = sum_eval.evaluate(data)
    scores = [
        [d["coherence"], d["consistency"], d["fluency"], d["relevance"]]
        for d in raw
    ]
    return torch.tensor(scores, dtype=torch.float32)


# ────────────────────────────────────────────────────────────────
# 1) PPO Setup with Value Head
# ────────────────────────────────────────────────────────────────
BASE_MODEL = "gpt2"
NUM_CANDIDATES = 2
ppo_config = PPOConfig(
    batch_size=NUM_CANDIDATES,  # Matches our candidate count
    mini_batch_size=1,
    learning_rate=1.5e-5,
    log_with=None,
    optimize_cuda_cache=True,
)

# Load base model with value head
base_model = AutoModelForCausalLMWithValueHead.from_pretrained(
    BASE_MODEL,
    peft_config=LoraConfig(
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        task_type="CAUSAL_LM",
        target_modules=["c_attn", "c_proj"],
    ),
).to(DEVICE)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# Initialize reference model (frozen)
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(BASE_MODEL).to(DEVICE)
ref_model.eval()
for param in ref_model.parameters():
    param.requires_grad = False

# ────────────────────────────────────────────────────────────────
# 3) Dataset with sampling
# ────────────────────────────────────────────────────────────────
df = pd.read_csv(r"C:\Users\BMSCE CSE.DESKTOP-IUB6THA\Downloads\kshitij\combined_clinical_notes.csv")[["dialogue", "note"]]

class ClinicalDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=512):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        dialogue = str(self.df.iloc[idx]["dialogue"])
        reference = str(self.df.iloc[idx]["note"])
        prompt = f"Summarize the following conversation:\n\n{dialogue}"
        
        inputs = self.tokenizer(
            prompt, 
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        
        return {
            "input_ids": inputs.input_ids.squeeze(),
            "attention_mask": inputs.attention_mask.squeeze(),
            "prompt": prompt,
            "reference": reference,
            "dialogue": dialogue
        }

# Create a small dataset to demonstrate the concept
dataset = ClinicalDataset(df.sample(10, random_state=42), tokenizer)
loader = DataLoader(dataset, batch_size=1, shuffle=True, pin_memory=True)

# ────────────────────────────────────────────────────────────────
# 2) Modified Training Loop with PPO
# ────────────────────────────────────────────────────────────────
def ppo_dominance_train():
    ppo_trainer = PPOTrainer(
        config=ppo_config,
        model=base_model,
        ref_model=ref_model,
        tokenizer=tokenizer,
    )

    for epoch in range(3):
        for batch in tqdm(loader, desc=f"Epoch {epoch+1}"):
            # Explicit device transfer
            # inputs = batch["input_ids"].to(DEVICE)
            # masks = batch["attention_mask"].to(DEVICE)
            
            # Get input data
            dialogues = batch["dialogue"]
            references = batch["reference"]
            inputs = batch["input_ids"]
            masks = batch["attention_mask"]
            
            # Generate multiple candidates per input
            response_tensors = []
            texts = []
            for _ in range(NUM_CANDIDATES):
                inputs_gpu = inputs.squeeze().to(DEVICE)
                masks_gpu = masks.squeeze().to(DEVICE)
                
                generation = ppo_trainer.generate(
                    inputs.squeeze(),
                    attention_mask=masks.squeeze(),
                    max_new_tokens=64,
                    do_sample=True,
                    top_p=0.9,
                    temperature=0.7,
                    pad_token_id=tokenizer.eos_token_id,
                )
                response_tensors.append(generation.squeeze().to(DEVICE))
                texts.append(tokenizer.decode(generation.cpu(), skip_special_tokens=True))

            # Calculate dominance rewards
            src_texts = [dialogues[0]] * NUM_CANDIDATES
            ref_texts = [references[0]] * NUM_CANDIDATES
            scores = get_unieval_scores(src_texts, texts, ref_texts).numpy()
            
            # Dominance scoring logic
            dom_counts = np.zeros(NUM_CANDIDATES)
            for i in range(NUM_CANDIDATES):
                for j in range(NUM_CANDIDATES):
                    if i == j: continue
                    if np.all(scores[i] >= scores[j]) and np.any(scores[i] > scores[j]):
                        dom_counts[i] += 1
            
            # Normalize rewards [-1, 1]
            rewards = (2 * (dom_counts / (NUM_CANDIDATES-1))) - 1
            reward_tensors = [torch.tensor(r, dtype=torch.float32).to(DEVICE) for r in rewards]
            
            # PPO Update with device-aware tensors
            query_tensors = [t.to(DEVICE) for t in [inputs.squeeze().clone()]*NUM_CANDIDATES]
            response_tensors = [r.to(DEVICE) for r in response_tensors]
            
            stats = ppo_trainer.step(
                queries=query_tensors,
                responses=response_tensors,
                scores=[torch.tensor(r, dtype=torch.float32) for r in rewards]
            )

            torch.cuda.empty_cache()
            del inputs, masks, generation
            gc.collect()
            
            # Log metrics
            print(f"Batch Reward Mean: {np.mean(rewards):.2f}")
            print(f"PPO Stats: {stats}")

    # Save final model
    base_model.save_pretrained("ppo_dominance_model")
    return ppo_trainer

# Run the training
ppo_trainer = ppo_dominance_train()

  from .autonotebook import tqdm as notebook_tqdm


Using CPU for all operations


Epoch 1:   0%|                                                                        | 0/10 [00:00<?, ?it/s]


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)