In [1]:
# %pip install transformers datasets torch

In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
import torch
import json

In [3]:
# ✅ STEP 1: Setup Model and Tokenizer (CodeT5+)
model_name = "Salesforce/codet5p-220m"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Check GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print("✅ Using device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")
model.to(device)

tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/768 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/446M [00:00<?, ?B/s]

✅ Using device: NVIDIA GeForce RTX 2080 Ti


T5ForConditionalGeneration(
  (shared): Embedding(32100, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [4]:
# from google.colab import files
# uploaded = files.upload()

In [5]:
# ✅ STEP 2: Load and preprocess dataset
def load_json_dataset(path):
    with open(path) as f:
        data = json.load(f)
    return [{"input": item["input"], "output": item["output"]} for item in data]

train_data = load_json_dataset("/home/sysadm/Music/unitime_nlp/data/processed/train.json")
val_data = load_json_dataset("/home/sysadm/Music/unitime_nlp/data/processed/val.json")
test_data = load_json_dataset("/home/sysadm/Music/unitime_nlp/data/processed/test.json")

# Organize into HuggingFace dataset
full_dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "validation": Dataset.from_list(val_data),
    "test": Dataset.from_list(test_data)
})

In [6]:
# ✅ STEP 3: Tokenize the data
def tokenize(batch):
    model_inputs = tokenizer(batch["input"], max_length=512, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(batch["output"], max_length=512, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_tokenized = full_dataset["train"].map(tokenize, batched=True)
val_tokenized = full_dataset["validation"].map(tokenize, batched=True)
test_tokenized = full_dataset["test"].map(tokenize, batched=True)


Map:   0%|          | 0/2400 [00:00<?, ? examples/s]



Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [7]:
# ✅ STEP 4: Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    eval_strategy="steps",
    save_strategy="epoch",
    logging_dir="./logs",
    num_train_epochs=5,
    fp16=True if torch.cuda.is_available() else False,
    report_to="none"
)


# ✅ STEP 5: Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer
)

print("🚀 Starting training...")
trainer.train()
print("✅ Training complete")


  trainer = Trainer(


🚀 Starting training...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss
500,1.3739,0.044497
1000,0.0319,0.021832
1500,0.0221,0.018519
2000,0.0191,0.01727
2500,0.0174,0.01675
3000,0.0166,0.016371


✅ Training complete


In [8]:
import re

# def extract_id_map(xml: str) -> dict:
#     """
#     Extracts key entity IDs from ground truth XML.
#     Returns a dictionary like {'course_id': '904', ...}
#     """
#     id_map = {}
#     id_map["course_id"] = re.search(r'<course\s+id="(\d+)"', xml).group(1)
#     id_map["offering_id"] = re.search(r'<offering\s+id="(\d+)"', xml).group(1)
#     id_map["class_id"] = re.search(r'<class\s+id="(\d+)"', xml).group(1)
#     id_map["instructor_id"] = re.search(r'<instructor\s+id="(\d+)"', xml).group(1)
#     return id_map

# def extract_id_map(xml_text):
#     """
#     Extracts tag and id attribute value pairs from the XML.
#     Example: <course id="904" ...> → {'course': '904'}
#     """
#     return {
#         f"{tag} id": id_val
#         for tag, id_val in re.findall(r'<(\w+)[^>]*?\bid="(\d+)"', xml_text)
#     }



In [9]:
# def replace_ids(xml: str, id_map: dict) -> str:
#     """
#     Replace ID fields in predicted XML with the true values from id_map.
#     """
#     xml = re.sub(r'(course\s+id=")\d+(")', rf'\1{id_map["course_id"]}\2', xml)
#     xml = re.sub(r'(offering\s+id=")\d+(")', rf'\1{id_map["offering_id"]}\2', xml)
#     xml = re.sub(r'(class\s+id=")\d+(")', rf'\1{id_map["class_id"]}\2', xml)
#     xml = re.sub(r'(instructor\s+id=")\d+(")', rf'\1{id_map["instructor_id"]}\2', xml)
#     return xml

# def replace_ids(xml_text, id_map):
#     """
#     Replaces ids in the prediction based on the tag context using id_map.
#     Example: Replace course id="XYZ" with id_map["course id"]
#     """
#     def replacer(match):
#         tag = match.group(1)
#         attr = match.group(2)
#         value = match.group(3)

#         key = f"{tag} {attr}"
#         if key in id_map:
#             return f'{attr}="{id_map[key]}"'
#         else:
#             return match.group(0)  # leave unchanged

#     # Match: <tag ... id="value" ...>
#     return re.sub(r'<(\w+)[^>]*?\b(id)="(\d+)"', replacer, xml_text)



In [10]:
import re

def extract_id_map(xml_text):
    """
    Extract all ID-like attributes and return a mapping:
    {'course id': '904', 'offering id': '6728', ...}
    """
    return dict(re.findall(r'(\w+)\s*=\s*"(\d+)"', xml_text))

def replace_ids(xml_text, id_map):
    """
    Replace all id="..." fields in xml_text with values from id_map
    based on their attribute name.
    """
    def replacer(match):
        attr = match.group(1)
        if attr in id_map:
            return f'{attr}="{id_map[attr]}"'
        else:
            return match.group(0)  # leave unchanged if not in map

    return re.sub(r'(\w+)\s*=\s*"\d+"', replacer, xml_text)


In [11]:

# ✅ STEP 6: Predict and fix XML
def fix_xml(text):
    text = text.strip()
    if not text.startswith("<"):
        text = "<" + text
    if text.count("<") > text.count(">"):
        text += ">"
    return text

print("🔍 Running prediction on test set...")
raw_test = full_dataset["test"]  # Needed for original input/output

for i in range(5):
    example = test_tokenized[i]
    input_text = raw_test[i]["input"]
    ground_truth = raw_test[i]["output"]

    # Prepare inputs for model
    inputs = {k: torch.tensor(v).unsqueeze(0).to(device) for k, v in example.items() if k in tokenizer.model_input_names}

    # Generate prediction
    outputs = model.generate(**inputs, max_length=512)
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Fix malformed XML
    fixed_prediction = fix_xml(prediction)

    # Correct ID values based on ground truth
    id_map = extract_id_map(ground_truth)
    final_prediction = replace_ids(fixed_prediction, id_map)

    # Output result
    print("📥 Input:", input_text)
    print("✅ Raw Prediction:", prediction)
    print("🛠 Fixed XML:", fixed_prediction)
    print("🔁 Final with Correct IDs:", final_prediction)
    print("🎯 Ground Truth:", ground_truth)
    print("-" * 50)



🔍 Running prediction on test set...


📥 Input: Student 5850 wants to enroll in ART 302, ENGL 102, PSYC 101 with alternatives BIOL 302
✅ Raw Prediction: <?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE students SYSTEM "StudentSectioning.dtd">
<students campus="woebegon" year="2010" term="Fal">
    <student externalId="5850">
        <courseRequest subject="ART" courseNbr="302" priority="1" credit="4"/>
        <courseRequest subject="ENGL" courseNbr="102" priority="2" credit="5"/>
        <courseRequest subject="PSYC" courseNbr="101" priority="3" credit="5"/>
        <courseRequest subject="BIOL" courseNbr="302" priority="4" credit="5" alternative="true"/>
        <courseRequest subject="PSYC" courseNbr="201" priority="5" credit="5"/>
    </student>
</students>
🛠 Fixed XML: <?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE students SYSTEM "StudentSectioning.dtd">
<students campus="woebegon" year="2010" term="Fal">
    <student externalId="5850">
        <courseRequest subject="ART" courseNbr="302" priority="1" credit="4"/>
 