In [None]:
# fix_codet5p_q4_lora_final_fixed.py
# Fine-tune CodeT5p-220M to generate XML outputs using 4-bit QLoRA with LoRA
# Fixed version to handle 'context' field + flattening + tensor creation issues

import os, gc, json, torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    BitsAndBytesConfig,
)
from datasets import DatasetDict, Dataset
import numpy as np
import evaluate
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training,TaskType

# ------------------------------------------------------------------
# 0️⃣  CUDA + Memory Configuration
# ------------------------------------------------------------------
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# ------------------------------------------------------------------
# 1️⃣  Model + Tokenizer Setup
# ------------------------------------------------------------------
model_name = "Salesforce/codet5p-220m"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

print("🚀 Loading quantized model (4-bit)...")
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    # padding_side="left",
    add_bos_token=True,
    add_eos_token=True,
    use_fast=False,
)
tokenizer.pad_token = tokenizer.eos_token
print("✅ Model & Tokenizer loaded.")

# ------------------------------------------------------------------
# 2️⃣  Prepare for LoRA (k-bit training)
# ------------------------------------------------------------------
model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable()

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "k", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)
peft_model = get_peft_model(model, lora_config)
peft_model.config.use_cache = False

def print_trainable_parameters(model):
    trainable_params, all_param = 0, 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"Trainable params: {trainable_params:,} / {all_param:,} "
          f"({100 * trainable_params / all_param:.2f}%)")

print_trainable_parameters(peft_model)

# ------------------------------------------------------------------
# 3️⃣  Dataset Loading (with flatten fix)
# ------------------------------------------------------------------
def flatten_jsonl(path):
    fixed = []
    with open(path) as f:
        for line in f:
            if not line.strip():
                continue
            obj = json.loads(line)

            # Combine fields cleanly
            context = obj.get("context", "")
            prompt = obj.get("prompt", "")
            output = obj.get("output", "")

            # Flatten lists into strings
            if isinstance(context, list): context = " ".join(map(str, context))
            if isinstance(prompt, list): prompt = " ".join(map(str, prompt))
            if isinstance(output, list): output = " ".join(map(str, output))

            # Create combined input text
            input_text = f"{context.strip()} {prompt.strip()}".strip()

            fixed.append({
                "input_text": input_text,
                "output_text": str(output),
            })
    return fixed

data_files = {
    "train": "/home/sysadm/Music/unitime_nlp/data/Courseofferings_dataset/train.jsonl",
    "validation": "/home/sysadm/Music/unitime_nlp/data/Courseofferings_dataset/validation.jsonl",
    "test": "/home/sysadm/Music/unitime_nlp/data/Courseofferings_dataset/test.jsonl",
}

print("📂 Loading and flattening dataset...")
splits = {k: flatten_jsonl(v) for k, v in data_files.items()}

dataset_dict = DatasetDict({
    "train": Dataset.from_list(splits["train"]),
    "validation": Dataset.from_list(splits["validation"]),
    "test": Dataset.from_list(splits["test"]),
})
print(dataset_dict)

# ------------------------------------------------------------------
# 4️⃣  Tokenization
# ------------------------------------------------------------------
MAX_INPUT_LENGTH = 256
MAX_TARGET_LENGTH = 256

def tokenize_function(batch):
    model_inputs = tokenizer(
        batch["input_text"],
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding="max_length",
    )
    labels = tokenizer(
        batch["output_text"],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding="max_length",
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

print("🧠 Tokenizing...")
tokenized_datasets = dataset_dict.map(
    tokenize_function,
    batched=True,
    remove_columns=["input_text", "output_text"],
)
print("✅ Tokenization complete.")

# ------------------------------------------------------------------
# 5️⃣  Evaluation Metrics
# ------------------------------------------------------------------
cer_metric = evaluate.load("cer")
bleu_metric = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple) or (hasattr(preds, "ndim") and preds.ndim == 3):
        pred_ids = np.argmax(preds, axis=-1)
    else:
        pred_ids = preds
    labels = np.where(labels == -100, tokenizer.pad_token_id, labels)
    decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    exact_match = np.mean([p.strip() == l.strip() for p, l in zip(decoded_preds, decoded_labels)])
    cer = cer_metric.compute(predictions=decoded_preds, references=decoded_labels)
    decoded_labels_for_bleu = [[label] for label in decoded_labels]
    bleu = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels_for_bleu)
    return {
        "exact_match": round(float(exact_match), 4),
        "cer": round(float(cer), 4),
        "bleu": round(float(bleu["score"]), 4),
    }




🚀 Loading quantized model (4-bit)...
✅ Model & Tokenizer loaded.
Trainable params: 2,654,208 / 154,757,376 (1.72%)
📂 Loading and flattening dataset...
DatasetDict({
    train: Dataset({
        features: ['input_text', 'output_text'],
        num_rows: 700
    })
    validation: Dataset({
        features: ['input_text', 'output_text'],
        num_rows: 150
    })
    test: Dataset({
        features: ['input_text', 'output_text'],
        num_rows: 150
    })
})
🧠 Tokenizing...


Map:   0%|          | 0/700 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

✅ Tokenization complete.


In [31]:
# ------------------------------------------------------------------
# 6️⃣  Training Arguments (memory-safe)
# ------------------------------------------------------------------
output_dir = "./codet5p-finetuned-nlp-to-xml"

# training_args = TrainingArguments(
#     output_dir=output_dir,
#     num_train_epochs=3,
#     per_device_train_batch_size=1,
#     per_device_eval_batch_size=1,
#     gradient_accumulation_steps=8,
#     warmup_steps=1,
#     weight_decay=0.01,
#     learning_rate=2e-4,
#     optim="paged_adamw_8bit",
#     logging_dir="./logs",
#     logging_steps=100,
#     eval_strategy="steps",
#     eval_steps=30,
#     save_strategy="steps",
#     save_steps=30,
#     do_eval=True,
#     gradient_checkpointing=True,
#     load_best_model_at_end=False,
#     metric_for_best_model="exact_match",
#     greater_is_better=True,
#     fp16=True,
#     report_to="none",
#     remove_unused_columns=False,
# )
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    warmup_steps=1,
    weight_decay=0.01,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_dir="./logs",
    logging_steps=30,

    # ✅ Disable evaluation during training
    eval_strategy="no",
    do_eval=False,  # ✅ no evaluation

    # ✅ Remove or keep save only if you still want checkpoints
    save_strategy="steps",
    save_steps=30,

    gradient_checkpointing=True,
    load_best_model_at_end=False,  # no eval, so no "best model"
    fp16=True,
    report_to="none",
    remove_unused_columns=False,
)

print("✅ TrainingArguments configured.")

# ------------------------------------------------------------------
# 7️⃣  Trainer
# ------------------------------------------------------------------
data_collator = DataCollatorForSeq2Seq(tokenizer, model=peft_model, padding="longest")

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# ------------------------------------------------------------------
# 8️⃣  Training (with cleanup)
# ------------------------------------------------------------------
def pre_train_cleanup():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

pre_train_cleanup()

print("\n🔥 Starting fine-tuning (memory-safe QLoRA)...")
trainer.train()
print("🎉 Fine-tuning complete!")




  trainer = Trainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


✅ TrainingArguments configured.

🔥 Starting fine-tuning (memory-safe QLoRA)...


Step,Training Loss
30,2.6655
60,1.5586


🎉 Fine-tuning complete!


In [None]:
# # (Your trainer.train() line)
# print("🎉 Fine-tuning complete!")

# print("\n🧹 Freeing VRAM... (deleting trainer and train data)")
# # --- THIS IS THE MEMORY FIX ---
# del trainer
# del tokenized_datasets["train"] # We don't need the training set anymore
# gc.collect()
# torch.cuda.empty_cache()
# # --- END OF MEMORY FIX ---

# print("✅ VRAM cleared. Starting memory-safe manual evaluation...")

🎉 Fine-tuning complete!

🧹 Freeing VRAM... (deleting trainer and train data)
✅ VRAM cleared. Starting memory-safe manual evaluation...


In [None]:
# # This should now work without an OOM error
# print("\n📊 Running evaluation...")
# metrics = trainer.evaluate(eval_dataset=tokenized_datasets["validation"])
# print("📊 Evaluation metrics:", metrics)


📊 Running evaluation...


NameError: name 'trainer' is not defined

In [34]:
# (After trainer.train() and VRAM cleanup)

print("\n📊 Running manual evaluation (memory-safe)...")
from torch.utils.data import DataLoader
import gc
import numpy as np

# Set model to evaluation mode
peft_model.eval()

eval_dataset = tokenized_datasets["validation"]

eval_loader = DataLoader(
    eval_dataset, 
    batch_size=1, 
    collate_fn=data_collator
)

all_preds, all_labels = [], []

# Get the max length you defined during tokenization
MAX_TARGET_LENGTH = 256 

for batch in eval_loader:
    # Move inputs to GPU
    input_ids = batch["input_ids"].to("cuda")
    attention_mask = batch["attention_mask"].to("cuda")
    
    # Labels stay on CPU as numpy
    labels = batch["labels"].numpy() # Shape is (1, 256)

    with torch.no_grad():
        # Generate predictions
        generated_ids = peft_model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=MAX_TARGET_LENGTH,
            num_beams=1,
            pad_token_id=tokenizer.pad_token_id
        )
    
    # Move generated IDs to CPU
    preds = generated_ids.cpu().numpy() # Shape is (1, variable_length)

    # --- Padding Fix Starts Here Padding Fix ---
    
    # Get the length of the padded labels (e.g., 256)
    label_length = labels.shape[1]
    
    # Create an empty numpy array filled with the pad token, 
    # matching the shape of the labels.
    padded_preds = np.full((preds.shape[0], label_length), tokenizer.pad_token_id)
    
    # Find how many tokens were actually generated
    current_pred_length = preds.shape[1]
    
    # Make sure we don't try to copy more than fits
    copy_length = min(current_pred_length, label_length)
    
    # Copy the generated tokens into the padded array
    padded_preds[:, :copy_length] = preds[:, :copy_length]
    
    # --- Padding Fix Ends Here Padding Fix ---
    
    all_preds.extend(padded_preds)  # Add the PADDED predictions
    all_labels.extend(labels)       # Add the PADDED labels

    # Aggressively clean up memory
    del input_ids, attention_mask, labels, batch, generated_ids, preds, padded_preds
    gc.collect()
    torch.cuda.empty_cache()

# Now, compute metrics
print("...Evaluation complete. Computing metrics.")
# This will now work, as both arrays will have shape (num_examples, 256)
metrics = compute_metrics((np.array(all_preds), np.array(all_labels)))
print("📊 Evaluation metrics:", metrics)


📊 Running manual evaluation (memory-safe)...


...Evaluation complete. Computing metrics.
📊 Evaluation metrics: {'exact_match': 0.0, 'cer': 0.9827, 'bleu': 0.1836}
