In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments, DataCollatorForSeq2Seq, BitsAndBytesConfig
from datasets import load_dataset
import torch
import json
import numpy as np
import evaluate
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training # Import PEFT libraries

# ✅ STEP 1: Configure QLoRA - BitsAndBytesConfig
# -------------------------------------------------------------------
# This configuration object tells the model how to be quantized
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16,
#     bnb_4bit_use_double_quant=True,
# )
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False,
    )
# ✅ STEP 2: Load Model and Tokenizer (with Quantization)
# -------------------------------------------------------------------
model_name = "Salesforce/codet5p-220m"

# Load the model with the 4-bit quantization config
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto", 
)

# Load the tokenizer as before
# tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name,trust_remote_code=True,padding_side="left",add_eos_token=True,add_bos_token=True,use_fast=False)
tokenizer.pad_token = tokenizer.eos_token

device = "cuda" if torch.cuda.is_available() else "cpu"
print("✅ Using device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")

# ✅ STEP 3: Configure LoRA - LoraConfig
# -------------------------------------------------------------------
# This configuration tells PEFT how to apply LoRA to the quantized model

# Prepare the model for k-bit training
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=32, # Rank of the update matrices. Lower rank means less trainable parameters.
    lora_alpha=32, # Alpha parameter for scaling. A common setting is 2 * r.
    target_modules=[
       "q", "k", "v"
    ], # Target modules to apply LoRA to. For T5, these are the query, key, and value matrices in the attention layers.
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM" # 
)
model.gradient_checkpointing_enable()
# Apply PEFT to the model
peft_model = get_peft_model(model, lora_config)

# --- Sanity Check: Print the number of trainable parameters ---
def print_trainable_parameters(model):
    """Prints the number of trainable parameters in the model."""
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )

print("Trainable parameters after applying LoRA:")
# print(print_number_of_trainable_model_parameters(peft_model))

print_trainable_parameters(peft_model)
# -------------------------------------------------------------------


✅ Using device: NVIDIA GeForce RTX 2080 Ti
Trainable parameters after applying LoRA:
trainable params: 5308416 || all params: 157411584 || trainable%: 3.37


In [2]:
# # from google.colab import files
# import json
# # Upload (it will prompt you)
# # uploaded = files.upload()

# # Suppose you upload train.json, val.json, test.json

# def load_jsonl_dataset(filename):
#     data = []
#     with open(filename, "r") as f:
#         for line in f:
#             if line.strip():  # skip empty lines
#                 item = json.loads(line)
#                 data.append({"input": item["prompt"], "output": item["output"]})
#     return data

# # train_data = load_jsonl_dataset("train.jsonl")
# # val_data = load_jsonl_dataset("validation.jsonl")
# # test_data = load_jsonl_dataset("test.jsonl")

# train_data = load_jsonl_dataset("/home/sysadm/Music/unitime_nlp/data/Courseofferings_dataset/train.jsonl")
# val_data = load_jsonl_dataset("/home/sysadm/Music/unitime_nlp/data/Courseofferings_dataset/validation.jsonl")
# test_data = load_jsonl_dataset("/home/sysadm/Music/unitime_nlp/data/Courseofferings_dataset/test.jsonl")

# # full_dataset = DatasetDict({
# #     "train": Dataset.from_list(train_data),
# #     "validation": Dataset.from_list(val_data),
# #     "test": Dataset.from_list(test_data)
# # })


# from datasets import load_dataset

# # data_files = {
# #     "train": "train.jsonl",
# #     "validation": "validation.jsonl",
# #     "test": "test.jsonl"
# # }
# full_dataset = load_dataset("json", data_files=data_files)

In [3]:
from datasets import load_dataset

data_files = {
    "train": "/home/sysadm/Music/unitime_nlp/data/Courseofferings_dataset/train.jsonl",
    "validation": "/home/sysadm/Music/unitime_nlp/data/Courseofferings_dataset/validation.jsonl",
    "test": "/home/sysadm/Music/unitime_nlp/data/Courseofferings_dataset/test.jsonl"
}

full_dataset = load_dataset("json", data_files=data_files)

In [4]:
def tokenize_function(batch):
    model_inputs = tokenizer(batch["prompt"], max_length=512, truncation=True)
    with tokenizer.as_target_tokenizer(): labels = tokenizer(batch["output"], max_length=512, truncation=True)
    model_inputs["labels"] = labels["input_ids"]; return model_inputs
print("\n✅ Tokenizing datasets...")
tokenized_datasets = full_dataset.map(tokenize_function, batched=True).remove_columns(["prompt", "output"])
# val_tokenized = full_dataset["validation"]
# train_tokenized = full_dataset["train"]


✅ Tokenizing datasets...


In [5]:
import evaluate
cer_metric = evaluate.load("cer")
bleu_metric = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # preds are the model's logits, so we need to decode them
    # -100 is a special value used for padding, so we ignore it
    labels[labels == -100] = tokenizer.pad_token_id
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # --- Calculate Exact Match ---
    exact_match = np.mean([1 if pred.strip() == label.strip() else 0 for pred, label in zip(decoded_preds, decoded_labels)])

    # --- Calculate Character Error Rate (CER) ---
    cer = cer_metric.compute(predictions=decoded_preds, references=decoded_labels)

    # --- Calculate BLEU Score ---
    # Sacrebleu expects references to be a list of lists
    decoded_labels_for_bleu = [[label] for label in decoded_labels]
    bleu = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels_for_bleu)

    return {
        "exact_match": round(exact_match, 4),
        "cer": round(cer, 4),
        "bleu": round(bleu["score"], 4)
    }
# -------------------------------------------------------------------


In [6]:
output_dir = "./codet5p-finetuned-nlp-to-xml"
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=4,
    warmup_steps=1,
    gradient_accumulation_steps=4,
    weight_decay=0.01,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_dir='./logs',
    logging_steps=25,
    eval_strategy="steps",
    eval_steps=25,
    save_strategy="steps",
    save_steps=25,
    do_eval=True,
    gradient_checkpointing=True,
    load_best_model_at_end=True,
    metric_for_best_model="exact_match", # ⭐ NEW! Use exact_match to find the best model
    greater_is_better=True,              # ⭐ NEW! Higher exact_match is better
    fp16=False,
    report_to="none",
)
print("\n✅ Training Arguments configured.")

peft_model.config.use_cache = False

# # ✅ STEP 4: Set training arguments
# training_args = TrainingArguments(
#     output_dir="./results",
#     per_device_train_batch_size=4,
#     per_device_eval_batch_size=4,
#     eval_strategy="steps",
#     save_strategy="epoch",
#     logging_dir="./logs",
#     num_train_epochs=5,
#     fp16=True if torch.cuda.is_available() else False,
#     report_to="none"
# )


# # # ✅ STEP 5: Train the model
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_tokenized,
#     eval_dataset=val_tokenized,
#     tokenizer=tokenizer
# )

# print("🚀 Starting training...")
# trainer.train()
# print("✅ Training complete")



✅ Training Arguments configured.


In [9]:
# ✅ STEP 7: Create the Trainer (No changes here)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=peft_model)
trainer = Trainer(
    model=peft_model, args=training_args, train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"], tokenizer=tokenizer,
    data_collator=data_collator,compute_metrics=compute_metrics,
)


# ✅ STEP 8: Start Fine-Tuning
print("\n🔥 Starting fine-tuning with QLoRA...")
trainer.train()
print("🎉 Fine-tuning complete!")

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.



🔥 Starting fine-tuning with QLoRA...


Step,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.14 GiB. GPU 0 has a total capacity of 10.56 GiB of which 1.01 GiB is free. Process 90107 has 5.89 GiB memory in use. Process 3951507 has 732.00 MiB memory in use. Including non-PyTorch memory, this process has 2.62 GiB memory in use. Of the allocated memory 1.56 GiB is allocated by PyTorch, and 883.59 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)