In [None]:
# Install the libraries imported by the next cell (transformers, datasets, torch).
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases
# than the ones this notebook was developed against — consider pinning.
%pip install transformers datasets torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
import torch
import json

In [None]:
# ✅ STEP 1: Setup Model and Tokenizer (CodeT5+)
model_name = "Salesforce/codet5p-220m"

# Load the tokenizer and the seq2seq model for the checkpoint named above
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Use the GPU when one is visible, otherwise fall back to CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print("✅ Using device:", torch.cuda.get_device_name(0) if device == "cuda" else "CPU")

# Trailing semicolon stops IPython from echoing the whole module tree as cell output
model.to(device);

✅ Using device: Tesla T4


T5ForConditionalGeneration(
  (shared): Embedding(32100, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [None]:
# Upload the train/val/test JSON files through Colab's upload helper (Colab-only import).
# NOTE(review): the recorded output of this cell shows the uploads being saved as
# "train_dataset (1).json", "val_dataset (1).json" and "test_dataset (1).json", while the
# loading cell opens "train_dataset.json" etc. — confirm the loader is not reading stale
# copies left over from an earlier upload.
from google.colab import files
uploaded = files.upload()

Saving test_dataset.json to test_dataset (1).json
Saving train_dataset.json to train_dataset (1).json
Saving val_dataset.json to val_dataset (1).json


In [None]:
# ✅ STEP 2: Load and preprocess dataset
def load_json_dataset(path):
    """Load a JSON file of examples, keeping only the "input" and "output" fields.

    Args:
        path: Path to a JSON file whose top level is a list of objects, each
            with at least an "input" and an "output" key.

    Returns:
        A list of ``{"input": ..., "output": ...}`` dicts, one per item, in
        file order. Any other keys on the items are dropped.

    Raises:
        KeyError: If an item lacks "input" or "output".
    """
    # Read as UTF-8 explicitly so decoding does not depend on the platform's default encoding.
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    return [{"input": item["input"], "output": item["output"]} for item in data]

# Read the three splits from their JSON files (paths relative to the working directory)
train_data, val_data, test_data = map(
    load_json_dataset,
    ("train_dataset.json", "val_dataset.json", "test_dataset.json"),
)

# Wrap each list of records in a HuggingFace Dataset and group them by split name
full_dataset = DatasetDict(
    {
        split_name: Dataset.from_list(records)
        for split_name, records in (
            ("train", train_data),
            ("validation", val_data),
            ("test", test_data),
        )
    }
)

In [None]:
# ✅ STEP 3: Tokenize the data
def tokenize(batch):
    """Tokenize source and target text for seq2seq fine-tuning.

    Args:
        batch: Mapping with "input" and "output" entries — either single
            strings or lists of strings (the latter is what ``Dataset.map``
            passes with ``batched=True``).

    Returns:
        The tokenizer's encoding of ``batch["input"]`` (truncated/padded to
        512 tokens) with an added "labels" entry holding the token ids of
        ``batch["output"]``, in which padding positions are replaced by -100.
    """
    model_inputs = tokenizer(batch["input"], max_length=512, truncation=True, padding="max_length")
    # `text_target=` is the replacement for the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=batch["output"], max_length=512, truncation=True, padding="max_length")

    # -100 is the label value the loss ignores. Without this masking the model is
    # trained (and its loss reported) mostly on predicting the padding that fills
    # every label row up to 512 tokens.
    # NOTE(review): assumes the pad token is distinct from the end-of-sequence token — confirm.
    pad_id = tokenizer.pad_token_id

    def mask_padding(ids):
        return [tok if tok != pad_id else -100 for tok in ids]

    label_ids = labels["input_ids"]
    if isinstance(batch["output"], str):
        # Un-batched call: a single flat list of ids
        model_inputs["labels"] = mask_padding(label_ids)
    else:
        model_inputs["labels"] = [mask_padding(ids) for ids in label_ids]
    return model_inputs

# Apply `tokenize` batch-wise to each split; the three results are unpacked in split order
train_tokenized, val_tokenized, test_tokenized = (
    full_dataset[split_name].map(tokenize, batched=True)
    for split_name in ("train", "validation", "test")
)


Map:   0%|          | 0/1600 [00:00<?, ? examples/s]



Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
# ✅ STEP 4: Set training arguments
# Options handed to the Trainer; half precision is requested only when CUDA is available
training_args = TrainingArguments(
    # where outputs and logs are written
    output_dir="./results",
    logging_dir="./logs",
    report_to="none",
    # schedule
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    eval_strategy="steps",
    save_strategy="epoch",
    # precision
    fp16=torch.cuda.is_available(),
)


# ✅ STEP 5: Train the model
# Pass the tokenizer as `processing_class`: the `tokenizer=` keyword of Trainer is
# deprecated (this cell's recorded output contains the resulting warning).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    processing_class=tokenizer,
)

print("🚀 Starting training...")
trainer.train()
print("✅ Training complete")


  trainer = Trainer(


🚀 Starting training...


Step,Training Loss,Validation Loss
500,0.0229,0.022678
1000,0.0199,0.022586
1500,0.0202,0.022648
2000,0.0181,0.022871


✅ Training complete


In [None]:
import re

# NOTE: everything below in this cell is commented out — two earlier drafts of
# extract_id_map kept for reference. The cell itself only imports `re`; the active
# definition lives in the later cell that defines extract_id_map and replace_ids.

# Draft 1: one hard-coded regex per entity (course / offering / class / instructor).
# def extract_id_map(xml: str) -> dict:
#     """
#     Extracts key entity IDs from ground truth XML.
#     Returns a dictionary like {'course_id': '904', ...}
#     """
#     id_map = {}
#     id_map["course_id"] = re.search(r'<course\s+id="(\d+)"', xml).group(1)
#     id_map["offering_id"] = re.search(r'<offering\s+id="(\d+)"', xml).group(1)
#     id_map["class_id"] = re.search(r'<class\s+id="(\d+)"', xml).group(1)
#     id_map["instructor_id"] = re.search(r'<instructor\s+id="(\d+)"', xml).group(1)
#     return id_map

# Draft 2: generic version keyed by "<tag> id".
# def extract_id_map(xml_text):
#     """
#     Extracts tag and id attribute value pairs from the XML.
#     Example: <course id="904" ...> → {'course id': '904'}
#     """
#     return {
#         f"{tag} id": id_val
#         for tag, id_val in re.findall(r'<(\w+)[^>]*?\bid="(\d+)"', xml_text)
#     }



In [None]:
# NOTE: this cell is entirely commented out — two earlier drafts of replace_ids kept
# for reference. The active definition lives in the later cell that defines
# extract_id_map and replace_ids.

# Draft 1: one hard-coded substitution per entity, using '<entity>_id' keys.
# def replace_ids(xml: str, id_map: dict) -> str:
#     """
#     Replace ID fields in predicted XML with the true values from id_map.
#     """
#     xml = re.sub(r'(course\s+id=")\d+(")', rf'\1{id_map["course_id"]}\2', xml)
#     xml = re.sub(r'(offering\s+id=")\d+(")', rf'\1{id_map["offering_id"]}\2', xml)
#     xml = re.sub(r'(class\s+id=")\d+(")', rf'\1{id_map["class_id"]}\2', xml)
#     xml = re.sub(r'(instructor\s+id=")\d+(")', rf'\1{id_map["instructor_id"]}\2', xml)
#     return xml

# Draft 2: generic version that looks up "<tag> id" keys.
# def replace_ids(xml_text, id_map):
#     """
#     Replaces ids in the prediction based on the tag context using id_map.
#     Example: Replace course id="XYZ" with id_map["course id"]
#     """
#     def replacer(match):
#         tag = match.group(1)
#         attr = match.group(2)
#         value = match.group(3)

#         key = f"{tag} {attr}"
#         if key in id_map:
#             return f'{attr}="{id_map[key]}"'
#         else:
#             return match.group(0)  # leave unchanged

#     # Match: <tag ... id="value" ...>
#     return re.sub(r'<(\w+)[^>]*?\b(id)="(\d+)"', replacer, xml_text)



In [None]:
import re

# Matches an opening tag up to and including its numeric `id` attribute.
#   head  = everything from '<' through the opening quote of the id value
#   tag   = the element name
#   value = the digits of the id
#   tail  = the closing quote
# `[^<>]*?` keeps the match inside a single tag; `\s` before `id` prevents matching
# attributes that merely end in "id".
_TAG_ID_RE = re.compile(
    r'(?P<head><(?P<tag>\w+)\b[^<>]*?\sid\s*=\s*")(?P<value>\d+)(?P<tail>")'
)


def extract_id_map(xml_text):
    """Collect the numeric ``id`` attribute of each element in an XML string.

    Args:
        xml_text: XML markup to scan.

    Returns:
        A dict keyed by ``"<tag> id"``, e.g.
        ``{'course id': '904', 'offering id': '6728', ...}``.
        Keying by tag keeps the ids of different elements apart; keying by the
        bare attribute name would collapse every ``id`` into a single entry.
        If the same tag occurs more than once, the last occurrence wins.
    """
    return {
        f"{match.group('tag')} id": match.group("value")
        for match in _TAG_ID_RE.finditer(xml_text)
    }


def replace_ids(xml_text, id_map):
    """Overwrite element ids in ``xml_text`` with the values from ``id_map``.

    Only the ``id`` attribute is touched; other numeric attributes are left as
    they are.

    Args:
        xml_text: XML markup whose ids should be rewritten.
        id_map: Mapping of ``"<tag> id"`` to the id value to write, as produced
            by ``extract_id_map``.

    Returns:
        ``xml_text`` with the id of every element whose ``"<tag> id"`` key is
        in ``id_map`` replaced; elements without an entry are left unchanged.
    """
    def replacer(match):
        key = f"{match.group('tag')} id"
        if key not in id_map:
            return match.group(0)  # no entry for this element: leave unchanged
        # Re-emit the original text around the digits so spacing and other attributes are preserved
        return f"{match.group('head')}{id_map[key]}{match.group('tail')}"

    return _TAG_ID_RE.sub(replacer, xml_text)


In [None]:

# ✅ STEP 6: Predict and fix XML
def fix_xml(text):
    """Apply two cheap repairs to a generated XML string.

    Surrounding whitespace is stripped, a leading "<" is added if the text does
    not start with one, and a single ">" is appended when the text contains more
    "<" than ">" characters.

    Args:
        text: Decoded model output.

    Returns:
        The repaired string.
    """
    cleaned = text.strip()
    if cleaned[:1] != "<":
        cleaned = f"<{cleaned}"
    opens, closes = cleaned.count("<"), cleaned.count(">")
    return cleaned + ">" if opens > closes else cleaned

print("🔍 Running prediction on test set...")
raw_test = full_dataset["test"]  # Needed for original input/output

# Put the model in inference mode so its Dropout layers are inactive while generating
model.eval()

# Preview at most five examples, without indexing past the end of a smaller test split
for i in range(min(5, len(raw_test))):
    example = test_tokenized[i]
    input_text = raw_test[i]["input"]
    ground_truth = raw_test[i]["output"]

    # Keep only the fields the model consumes, add a batch dimension, and move them to the model's device
    inputs = {k: torch.tensor(v).unsqueeze(0).to(device) for k, v in example.items() if k in tokenizer.model_input_names}

    # Generate prediction
    outputs = model.generate(**inputs, max_length=512)
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Fix malformed XML
    fixed_prediction = fix_xml(prediction)

    # Overwrite predicted ids with the ground-truth ids before display
    id_map = extract_id_map(ground_truth)
    final_prediction = replace_ids(fixed_prediction, id_map)

    # Output result
    print("📥 Input:", input_text)
    print("✅ Raw Prediction:", prediction)
    print("🛠 Fixed XML:", fixed_prediction)
    print("🔁 Final with Correct IDs:", final_prediction)
    print("🎯 Ground Truth:", ground_truth)
    print("-" * 50)



🔍 Running prediction on test set...
📥 Input: Create course offering ART401 ART Course with Dr. Williams on Friday from 9:00 AM to 10:00 AM in room THTR202 with limit 20 students
✅ Raw Prediction: <offerings campus="MAIN" year="2024" term="Fall"><offering id="6896" offered="true"><course id="945" subject="ART" courseNbr="401" title="ART Course"><class id="57703" suffix="1" type="SEM" limit="20"><time days="F" startTime="0900" endTime="1000"/><room building="THTR" roomNbr="202"/><instructor id="85" fname="Dr." lname="Williams" lead="true"/></class></course></offering></offerings>
🛠 Fixed XML: <offerings campus="MAIN" year="2024" term="Fall"><offering id="6896" offered="true"><course id="945" subject="ART" courseNbr="401" title="ART Course"><class id="57703" suffix="1" type="SEM" limit="20"><time days="F" startTime="0900" endTime="1000"/><room building="THTR" roomNbr="202"/><instructor id="85" fname="Dr." lname="Williams" lead="true"/></class></course></offering></offerings>
🔁 Final with 