In [1]:
import os, gc, json, torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
    BitsAndBytesConfig,
)
from datasets import DatasetDict, Dataset
import numpy as np
import evaluate
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType, PeftModel

# ------------------------------------------------------------------
# 0Ô∏è‚É£  CUDA + Memory Configuration
# ------------------------------------------------------------------
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# ------------------------------------------------------------------
# 1Ô∏è‚É£  Model + Tokenizer Setup
# ------------------------------------------------------------------
# CHANGE 1: Updated to 770M model
model_name = "Salesforce/codet5p-770m"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

print(f"üöÄ Loading quantized model ({model_name})...")
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    add_bos_token=True,
    add_eos_token=True,
    use_fast=False,
)
tokenizer.pad_token = tokenizer.eos_token
print("‚úÖ Model & Tokenizer loaded.")

# ------------------------------------------------------------------
# 2Ô∏è‚É£  Prepare for LoRA (k-bit training)
# ------------------------------------------------------------------
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "k", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)
peft_model = get_peft_model(model, lora_config)
peft_model.config.use_cache = False

def print_trainable_parameters(model):
    trainable_params, all_param = 0, 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"Trainable params: {trainable_params:,} / {all_param:,} "
          f"({100 * trainable_params / all_param:.2f}%)")

print_trainable_parameters(peft_model)

# ------------------------------------------------------------------
# 3Ô∏è‚É£  Dataset Loading
# ------------------------------------------------------------------
def flatten_jsonl(path):
    fixed = []
    with open(path) as f:
        for line in f:
            if not line.strip():
                continue
            obj = json.loads(line)

            # Extract prompt and output based on your dataset generator
            prompt = obj.get("prompt", "")
            output = obj.get("output", "")

            # Ensure strings
            if isinstance(prompt, list): prompt = " ".join(map(str, prompt))
            if isinstance(output, list): output = " ".join(map(str, output))

            # Input is just the prompt (no context field in your generator)
            input_text = prompt.strip()

            fixed.append({
                "input_text": input_text,
                "output_text": str(output),
            })
    return fixed

# Update paths to your generated dataset location
data_files = {
    "train": "/home/sysadm/Music/unitime/unitime_update_dataset/train.jsonl",
    "validation": "/home/sysadm/Music/unitime/unitime_update_dataset/validation.jsonl",
    "test": "/home/sysadm/Music/unitime/unitime_update_dataset/test.jsonl",

}

print("üìÇ Loading and flattening dataset...")
# Check if files exist before loading to avoid cryptic errors
if not os.path.exists(data_files["train"]):
    raise FileNotFoundError("Run your dataset generator script first to create ./unitime_update_dataset/")

splits = {k: flatten_jsonl(v) for k, v in data_files.items()}

dataset_dict = DatasetDict({
    "train": Dataset.from_list(splits["train"]),
    "validation": Dataset.from_list(splits["validation"]),
    "test": Dataset.from_list(splits["test"]),
})
print(dataset_dict)

# ------------------------------------------------------------------
# 4Ô∏è‚É£  Tokenization
# ------------------------------------------------------------------
MAX_INPUT_LENGTH = 512
MAX_TARGET_LENGTH = 512

def tokenize_function(batch):
    model_inputs = tokenizer(
        batch["input_text"],
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding="max_length",
    )
    labels = tokenizer(
        batch["output_text"],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding="max_length",
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

print("üß† Tokenizing...")
tokenized_datasets = dataset_dict.map(
    tokenize_function,
    batched=True,
    remove_columns=["input_text", "output_text"],
)
print("‚úÖ Tokenization complete.")

# ------------------------------------------------------------------
# 5Ô∏è‚É£  Evaluation Metrics
# ------------------------------------------------------------------
cer_metric = evaluate.load("cer")
bleu_metric = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple) or (hasattr(preds, "ndim") and preds.ndim == 3):
        pred_ids = np.argmax(preds, axis=-1)
    else:
        pred_ids = preds
    
    labels = np.where(labels == -100, tokenizer.pad_token_id, labels)
    decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    exact_match = np.mean([p.strip() == l.strip() for p, l in zip(decoded_preds, decoded_labels)])
    cer = cer_metric.compute(predictions=decoded_preds, references=decoded_labels)
    decoded_labels_for_bleu = [[label] for label in decoded_labels]
    bleu = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels_for_bleu)
    
    return {
        "exact_match": round(float(exact_match), 4),
        "cer": round(float(cer), 4),
        "bleu": round(float(bleu["score"]), 4),
    }


üöÄ Loading quantized model (Salesforce/codet5p-770m)...


config.json:   0%|          | 0.00/770 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.48G [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

‚úÖ Model & Tokenizer loaded.
Trainable params: 7,077,888 / 493,059,072 (1.44%)
üìÇ Loading and flattening dataset...
DatasetDict({
    train: Dataset({
        features: ['input_text', 'output_text'],
        num_rows: 4200
    })
    validation: Dataset({
        features: ['input_text', 'output_text'],
        num_rows: 900
    })
    test: Dataset({
        features: ['input_text', 'output_text'],
        num_rows: 900
    })
})
üß† Tokenizing...


Map:   0%|          | 0/4200 [00:00<?, ? examples/s]

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

‚úÖ Tokenization complete.


In [None]:
# ------------------------------------------------------------------
# 6Ô∏è‚É£  Training Arguments
# ------------------------------------------------------------------
output_dir = "./CodeT5p-770m-XML-Tuning"

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=5,       # 770m learns faster, 5 epochs might be enough
    
    # CHANGE 2: Reduced batch size (770m is larger than 220m)
    per_device_train_batch_size=2,
    
    # CHANGE 3: Increased accumulation to maintain effective batch size
    gradient_accumulation_steps=16, 
    
    warmup_steps=1,
    weight_decay=0.01,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_dir="./logs",
    logging_steps=10,
    eval_strategy="no",       
    do_eval=False,
    save_strategy="steps",    
    save_steps=50,
    gradient_checkpointing=True,
    load_best_model_at_end=False,
    fp16=True,
    report_to="none",
    remove_unused_columns=False,
)

print("‚úÖ TrainingArguments configured.")

# ------------------------------------------------------------------
# 7Ô∏è‚É£  Trainer
# ------------------------------------------------------------------
data_collator = DataCollatorForSeq2Seq(tokenizer, model=peft_model, padding="longest")

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# ------------------------------------------------------------------
# 8Ô∏è‚É£  Training
# ------------------------------------------------------------------
def pre_train_cleanup():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

pre_train_cleanup()

print(f"\nüî• Starting fine-tuning {model_name}...")
trainer.train()

# SAVE FINAL ADAPTER
final_adapter_path = os.path.join(output_dir, "final_adapter")
trainer.save_model(final_adapter_path)
print(f"üéâ Fine-tuning complete! Adapter saved to '{final_adapter_path}'.")



‚úÖ TrainingArguments configured.


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.



üî• Starting fine-tuning Salesforce/codet5p-770m...


Step,Training Loss
10,2.5408
20,1.2601
30,0.9952
40,0.7091
50,0.4195
60,0.1789
70,0.0668
80,0.0352
90,0.0234
100,0.0173


In [1]:
import os, json, torch
import numpy as np
import evaluate
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig
from peft import PeftModel
from tqdm import tqdm

# ------------------------------------------------------------------
# 1Ô∏è‚É£ Setup (Must match your training config)
# ------------------------------------------------------------------
base_model_name = "Salesforce/codet5p-770m"
adapter_path = "/home/sysadm/Music/unitime/unitime_nlp/data_generator/CodeT5p-770m-XML-Tuning/final_adapter" # Path to your saved adapter
test_file_path = "/home/sysadm/Music/unitime/unitime_update_dataset/test.jsonl"

# Load Metric Calculators
cer_metric = evaluate.load("cer")
bleu_metric = evaluate.load("sacrebleu")

# ------------------------------------------------------------------
# 2Ô∏è‚É£ Load Model & Tokenizer
# ------------------------------------------------------------------
print(f"üöÄ Loading base model: {base_model_name}")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForSeq2SeqLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

print(f"üîó Loading adapter from: {adapter_path}")
model = PeftModel.from_pretrained(model, adapter_path)
model.eval() # Set to evaluation mode

tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# ------------------------------------------------------------------
# 3Ô∏è‚É£ Load & Prepare Test Data
# ------------------------------------------------------------------
print("üìÇ Loading test data...")
input_texts = []
reference_texts = []

with open(test_file_path, "r") as f:
    for line in f:
        if not line.strip(): continue
        obj = json.loads(line)
        
        # Match the flattening logic from training
        prompt = obj.get("prompt", "")
        if isinstance(prompt, list): prompt = " ".join(map(str, prompt))
        input_texts.append(prompt.strip())
        
        output = obj.get("output", "")
        if isinstance(output, list): output = " ".join(map(str, output))
        reference_texts.append(output.strip())

print(f"‚úÖ Found {len(input_texts)} test examples.")

# ------------------------------------------------------------------
# 4Ô∏è‚É£ Generation Loop (Batching for speed)
# ------------------------------------------------------------------
BATCH_SIZE = 4 # Increase if you have 24GB+ VRAM, decrease if OOM
generated_texts = []

print("‚ö° Starting generation...")
for i in tqdm(range(0, len(input_texts), BATCH_SIZE)):
    batch_inputs = input_texts[i : i + BATCH_SIZE]
    
    # Tokenize
    inputs = tokenizer(
        batch_inputs, 
        return_tensors="pt", 
        padding=True, 
        truncation=True, 
        max_length=512
    ).to("cuda")

    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            num_beams=1, # greedy search is faster for eval
            pad_token_id=tokenizer.pad_token_id
        )

    # Decode
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    generated_texts.extend(decoded)

# ------------------------------------------------------------------
# 5Ô∏è‚É£ Calculate Metrics
# ------------------------------------------------------------------
print("\nüìä Computing metrics...")

# 1. Exact Match (Strict)
exact_matches = [1 if gen.strip() == ref.strip() else 0 for gen, ref in zip(generated_texts, reference_texts)]
exact_match_score = np.mean(exact_matches) * 100

# 2. CER (Character Error Rate) - Lower is better
cer_score = cer_metric.compute(predictions=generated_texts, references=reference_texts)

# 3. BLEU - Higher is better
# BLEU expects references to be a list of lists [[ref1], [ref2]]
bleu_refs = [[ref] for ref in reference_texts]
bleu_score = bleu_metric.compute(predictions=generated_texts, references=bleu_refs)

print("\n" + "="*30)
print("   üèÜ EVALUATION RESULTS üèÜ")
print("="*30)
print(f"‚úÖ Exact Match: {exact_match_score:.2f}%")
print(f"üìâ CER:         {cer_score:.4f}  (Lower is better)")
print(f"üìà BLEU:        {bleu_score['score']:.2f}  (Higher is better)")
print("="*30)

# Optional: Save failures to inspect
with open("eval_failures.txt", "w") as f:
    for gen, ref in zip(generated_texts, reference_texts):
        if gen.strip() != ref.strip():
            f.write(f"EXPECTED:\n{ref}\n\nGOT:\n{gen}\n\n{'='*20}\n")
print(f"üìù Incorrect predictions saved to 'eval_failures.txt' for debugging.")

üöÄ Loading base model: Salesforce/codet5p-770m
üîó Loading adapter from: /home/sysadm/Music/unitime/unitime_nlp/data_generator/CodeT5p-770m-XML-Tuning/final_adapter
üìÇ Loading test data...
‚úÖ Found 900 test examples.
‚ö° Starting generation...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 225/225 [51:45<00:00, 13.80s/it]



üìä Computing metrics...

   üèÜ EVALUATION RESULTS üèÜ
‚úÖ Exact Match: 79.11%
üìâ CER:         0.0011  (Lower is better)
üìà BLEU:        99.51  (Higher is better)
üìù Incorrect predictions saved to 'eval_failures.txt' for debugging.


In [6]:
# ------------------------------------------------------------------
# 9Ô∏è‚É£  Inference Check
# ------------------------------------------------------------------
print("\nüîç Running Inference Check...")
# del peft_model, trainer, model
# gc.collect()
# torch.cuda.empty_cache()

# Load Base Model (770M)
model_name = "Salesforce/codet5p-770m"
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

# Load Adapter
final_adapter_path="/home/sysadm/Music/unitime/unitime_nlp/data_generator/CodeT5p-770m-XML-Tuning/final_adapter"
print(f"Loading adapter from: {final_adapter_path}")
model = PeftModel.from_pretrained(model, final_adapter_path)
model.eval()

# Test Input (Matching your generator logic)
prompt_text = "Add a new course offering: DLCS 10 titled 'Deep Learning' as a Lab in EDUC room 106 on MWF 0830-0920 with limit 25."
input_text = prompt_text.strip()
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

print("...Generating XML...")
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        num_beams=4,
        early_stopping=False,
        pad_token_id=tokenizer.pad_token_id
    )

xml_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("\n--- Generated XML ---")
print(xml_output)


üîç Running Inference Check...
Loading adapter from: /home/sysadm/Music/unitime/unitime_nlp/data_generator/CodeT5p-770m-XML-Tuning/final_adapter
...Generating XML...

--- Generated XML ---
<offerings campus="woebegon"
           year="2010"
           term="Fal"
           dateFormat="yyyy/M/d"
           timeFormat="HHmm"
           created="Tue Nov 25 01:14:05 CEST 2025"
           includeExams="none">

  <offering offered="true" action="insert">
    <course subject="DLCS" courseNbr="10" controlling="true" title="Deep Learning"/>
    <config name="1" limit="25">
      <subpart type="Lab" suffix="" minPerWeek="150"/>
      <class type="Lab" suffix="L1" limit="25"
             studentScheduling="true" displayInScheduleBook="true"
             cancelled="false" managingDept="0100">
        <time days="MWF" startTime="0830" endTime="0920" timePattern="3 x 50"/>
        <room building="EDUC" roomNbr="106"/>
      </class>
    </config>
  </offering>
</offerings>
