In [18]:
# fix_codet5p_q4_lora_final_fixed.py
# Fine-tune CodeT5p-220M to generate XML outputs using 4-bit QLoRA with LoRA
# Fixed version to handle 'context' field + flattening + tensor creation issues

import os, gc, json, torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    BitsAndBytesConfig,
)
from datasets import DatasetDict, Dataset
import numpy as np
import evaluate
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training,TaskType

# ------------------------------------------------------------------
# 0️⃣  CUDA + Memory Configuration
# ------------------------------------------------------------------
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# ------------------------------------------------------------------
# 1️⃣  Model + Tokenizer Setup
# ------------------------------------------------------------------
model_name = "Salesforce/codet5p-220m"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

print("🚀 Loading quantized model (4-bit)...")
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    # padding_side="left",
    add_bos_token=True,
    add_eos_token=True,
    use_fast=False,
)
tokenizer.pad_token = tokenizer.eos_token
print("✅ Model & Tokenizer loaded.")

# ------------------------------------------------------------------
# 2️⃣  Prepare for LoRA (k-bit training)
# ------------------------------------------------------------------
model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable()

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "k", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)
peft_model = get_peft_model(model, lora_config)
peft_model.config.use_cache = False

def print_trainable_parameters(model):
    trainable_params, all_param = 0, 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"Trainable params: {trainable_params:,} / {all_param:,} "
          f"({100 * trainable_params / all_param:.2f}%)")

print_trainable_parameters(peft_model)

# ------------------------------------------------------------------
# 3️⃣  Dataset Loading (with flatten fix)
# ------------------------------------------------------------------
def flatten_jsonl(path):
    fixed = []
    with open(path) as f:
        for line in f:
            if not line.strip():
                continue
            obj = json.loads(line)

            # Combine fields cleanly
            context = obj.get("context", "")
            prompt = obj.get("prompt", "")
            output = obj.get("output", "")

            # Flatten lists into strings
            if isinstance(context, list): context = " ".join(map(str, context))
            if isinstance(prompt, list): prompt = " ".join(map(str, prompt))
            if isinstance(output, list): output = " ".join(map(str, output))

            # Create combined input text
            input_text = f"{context.strip()} {prompt.strip()}".strip()

            fixed.append({
                "input_text": input_text,
                "output_text": str(output),
            })
    return fixed

data_files = {
    "train": "/home/sysadm/Music/unitime_nlp/data_generator/data/Courseofferings_dataset/train.jsonl",
    "validation": "/home/sysadm/Music/unitime_nlp/data_generator/data/Courseofferings_dataset/validation.jsonl",
    "test": "/home/sysadm/Music/unitime_nlp/data_generator/data/Courseofferings_dataset/test.jsonl",
}

print("📂 Loading and flattening dataset...")
splits = {k: flatten_jsonl(v) for k, v in data_files.items()}

dataset_dict = DatasetDict({
    "train": Dataset.from_list(splits["train"]),
    "validation": Dataset.from_list(splits["validation"]),
    "test": Dataset.from_list(splits["test"]),
})
print(dataset_dict)

# ------------------------------------------------------------------
# 4️⃣  Tokenization
# ------------------------------------------------------------------
MAX_INPUT_LENGTH = 512
MAX_TARGET_LENGTH = 512

def tokenize_function(batch):
    model_inputs = tokenizer(
        batch["input_text"],
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding="max_length",
    )
    labels = tokenizer(
        batch["output_text"],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding="max_length",
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

print("🧠 Tokenizing...")
tokenized_datasets = dataset_dict.map(
    tokenize_function,
    batched=True,
    remove_columns=["input_text", "output_text"],
)
print("✅ Tokenization complete.")

# ------------------------------------------------------------------
# 5️⃣  Evaluation Metrics
# ------------------------------------------------------------------
cer_metric = evaluate.load("cer")
bleu_metric = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple) or (hasattr(preds, "ndim") and preds.ndim == 3):
        pred_ids = np.argmax(preds, axis=-1)
    else:
        pred_ids = preds
    labels = np.where(labels == -100, tokenizer.pad_token_id, labels)
    decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    exact_match = np.mean([p.strip() == l.strip() for p, l in zip(decoded_preds, decoded_labels)])
    cer = cer_metric.compute(predictions=decoded_preds, references=decoded_labels)
    decoded_labels_for_bleu = [[label] for label in decoded_labels]
    bleu = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels_for_bleu)
    return {
        "exact_match": round(float(exact_match), 4),
        "cer": round(float(cer), 4),
        "bleu": round(float(bleu["score"]), 4),
    }




🚀 Loading quantized model (4-bit)...


✅ Model & Tokenizer loaded.
Trainable params: 2,654,208 / 154,757,376 (1.72%)
📂 Loading and flattening dataset...
DatasetDict({
    train: Dataset({
        features: ['input_text', 'output_text'],
        num_rows: 1400
    })
    validation: Dataset({
        features: ['input_text', 'output_text'],
        num_rows: 300
    })
    test: Dataset({
        features: ['input_text', 'output_text'],
        num_rows: 300
    })
})
🧠 Tokenizing...


Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

✅ Tokenization complete.


In [19]:

# training_args = TrainingArguments(
#     output_dir=output_dir,
#     num_train_epochs=3,
#     per_device_train_batch_size=1,
#     per_device_eval_batch_size=1,
#     gradient_accumulation_steps=8,
#     warmup_steps=1,
#     weight_decay=0.01,
#     learning_rate=2e-4,
#     optim="paged_adamw_8bit",
#     logging_dir="./logs",
#     logging_steps=100,
#     eval_strategy="steps",
#     eval_steps=30,
#     save_strategy="steps",
#     save_steps=30,
#     do_eval=True,
#     gradient_checkpointing=True,
#     load_best_model_at_end=False,
#     metric_for_best_model="exact_match",
#     greater_is_better=True,
#     fp16=True,
#     report_to="none",
#     remove_unused_columns=False,
# )

In [None]:
# ------------------------------------------------------------------
# 6️⃣  Training Arguments (memory-safe)
# ------------------------------------------------------------------
output_dir = "./Offereing-nlp-to-xml"
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=7,
    per_device_train_batch_size=4,
    # per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    warmup_steps=1,
    weight_decay=0.01,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_dir="./logs",
    logging_steps=30,

    # ✅ Disable evaluation during training
    eval_strategy="no",
    do_eval=False,  # ✅ no evaluation

    # ✅ Remove or keep save only if you still want checkpoints
    save_strategy="steps",
    save_steps=100,

    gradient_checkpointing=True,
    load_best_model_at_end=False,  # no eval, so no "best model"
    fp16=True,
    report_to="none",
    remove_unused_columns=False,
)

print("✅ TrainingArguments configured.")

# ------------------------------------------------------------------
# 7️⃣  Trainer
# ------------------------------------------------------------------
data_collator = DataCollatorForSeq2Seq(tokenizer, model=peft_model, padding="longest")

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# ------------------------------------------------------------------
# 8️⃣  Training (with cleanup)
# ------------------------------------------------------------------
def pre_train_cleanup():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

pre_train_cleanup()

print("\n🔥 Starting fine-tuning (memory-safe QLoRA)...")
trainer.train()
print("🎉 Fine-tuning complete!")




  trainer = Trainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


✅ TrainingArguments configured.

🔥 Starting fine-tuning (memory-safe QLoRA)...


Step,Training Loss
30,1.6653
60,0.8059
90,0.2748
120,0.1042
150,0.0685
180,0.0567
210,0.051
240,0.0474
270,0.0457
300,0.0452


🎉 Fine-tuning complete!


In [21]:
# # (Your trainer.train() line)
# print("🎉 Fine-tuning complete!")

# print("\n🧹 Freeing VRAM... (deleting trainer and train data)")
# # --- THIS IS THE MEMORY FIX ---
# del trainer
# del tokenized_datasets["train"] # We don't need the training set anymore
# gc.collect()
# torch.cuda.empty_cache()
# # --- END OF MEMORY FIX ---

# print("✅ VRAM cleared. Starting memory-safe manual evaluation...")

In [22]:
# # This should now work without an OOM error
# print("\n📊 Running evaluation...")
# metrics = trainer.evaluate(eval_dataset=tokenized_datasets["validation"])
# print("📊 Evaluation metrics:", metrics)

In [None]:
# (After trainer.train() and VRAM cleanup)

print("\n📊 Running manual evaluation (memory-safe)...")
from torch.utils.data import DataLoader
import gc
import numpy as np

peft_model.eval()
eval_dataset = tokenized_datasets["validation"]

eval_loader = DataLoader(
    eval_dataset, 
    batch_size=1, 
    collate_fn=data_collator
)

all_preds, all_labels = [], []

MAX_TARGET_LENGTH = 256 

for batch in eval_loader:
    input_ids = batch["input_ids"].to("cuda")
    attention_mask = batch["attention_mask"].to("cuda")
    
    labels = batch["labels"].numpy() 

    with torch.no_grad():
        generated_ids = peft_model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=MAX_TARGET_LENGTH,
            num_beams=1,
            pad_token_id=tokenizer.pad_token_id
        )
    
    # Move generated IDs to CPU
    preds = generated_ids.cpu().numpy() 

    
    label_length = labels.shape[1]
    
    padded_preds = np.full((preds.shape[0], label_length), tokenizer.pad_token_id)
    
    current_pred_length = preds.shape[1]
    
    # Make sure we don't try to copy more than fits
    copy_length = min(current_pred_length, label_length)
    
    # Copy the generated tokens into the padded array
    padded_preds[:, :copy_length] = preds[:, :copy_length]
    
    # --- Padding Fix Ends Here Padding Fix ---
    
    all_preds.extend(padded_preds) 
    all_labels.extend(labels)       

    # Aggressively clean up memory
    del input_ids, attention_mask, labels, batch, generated_ids, preds, padded_preds
    gc.collect()
    torch.cuda.empty_cache()

print("...Evaluation complete. Computing metrics.")
metrics = compute_metrics((np.array(all_preds), np.array(all_labels)))
print("📊 Evaluation metrics:", metrics)


📊 Running manual evaluation (memory-safe)...
...Evaluation complete. Computing metrics.
📊 Evaluation metrics: {'exact_match': 0.0, 'cer': 0.0291, 'bleu': 97.4163}


In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig
from peft import PeftModel
import gc

# --- 1. Define Model Names ---
base_model_name = "Salesforce/codet5p-220m"
adapter_path = "/home/sysadm/Music/unitime_nlp/test/codet5p-finetuned-nlp-to-xml/checkpoint-308"
# --- 2. Load 4-bit Config ---
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# --- 3. Load Base Model (Quantized) ---
print(f"Loading base model: {base_model_name}")
model = AutoModelForSeq2SeqLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto",  # Automatically uses your GPU
    trust_remote_code=True,
)

# --- 4. Load Tokenizer ---
# (Must match the settings you trained with)
tokenizer = AutoTokenizer.from_pretrained(
    base_model_name,
    trust_remote_code=True,
    add_bos_token=True,
    add_eos_token=True,
    use_fast=False,
)
# tokenizer.pad_token = tokenizer.eos_token

# --- 5. Load the PEFT Adapter ---
print(f"Loading adapter from: {adapter_path}")
# This line merges your saved adapter onto the base model
model = PeftModel.from_pretrained(model, adapter_path)
model.eval()  # Set to evaluation mode

print("✅ Model is ready for inference!")



Loading base model: Salesforce/codet5p-220m
Loading adapter from: /home/sysadm/Music/unitime_nlp/test/codet5p-finetuned-nlp-to-xml/checkpoint-308
✅ Model is ready for inference!


In [3]:
# --- 6. Define Input ---
context = "COURSE OFFERING REQUEST"
prompt_text = "Add a new class: DLCS 10 (Deep Learning). Place it in EDUC on MWF between 0830 and 0920. It's a Lab with a limit of 25."

# Format input just like training
input_text = f"{context.strip()} {prompt_text.strip()}".strip()
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

# --- 7. Generate ---
print("...Generating XML...")
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,   # Your MAX_TARGET_LENGTH
        num_beams=4,          # Use beam search for better results
        early_stopping=False,
        pad_token_id=tokenizer.pad_token_id
    )

xml_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("\n--- Generated XML ---")
print(xml_output)

...Generating XML...

--- Generated XML ---
<offerings campus="woebegon" year="2010" term="Fal" dateFormat="yyyy/M/d" timeFormat="HHmm" created="Sat Oct 18 19:33:17 CEST 2025" includeExams="none">
  <offering id="132371" offered="true" action="insert">
    <course id="737984" subject="CUBE" courseNbr="106" controlling="true" title="CUBE_106"/>
    <config name="1" limit="25">
      <subpart type="Lab" suffix="" minPerWeek="150"/>
      <class id="CUBE 10 Lab L1" type="Lab" suffix="L1" limit="25" studentScheduling="true" displayInScheduleBook="true" cancelled="false" managingDept="0100">
        <time days="MWF" startTime="0830" endTime="0920" timePattern="3 x 50"/>
        <room building="EDUC" roomNbr="0830"/>
      </class>
    </config>
  </offering>
</offerings>
