In [1]:
%pip install transformers datasets torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
from google.colab import files
uploaded = files.upload()


Saving test_dataset.json to test_dataset.json
Saving train_dataset.json to train_dataset.json
Saving val_dataset.json to val_dataset.json


In [3]:
import json
from datasets import Dataset, DatasetDict

def load_dataset(path):
    with open(path) as f:
        data = json.load(f)
    return Dataset.from_list([{"input_text": d["input"], "target_text": d["output"]} for d in data])

def get_dataset():
    train_data = load_dataset("train_dataset.json")
    val_data = load_dataset("val_dataset.json")
    test_data = load_dataset("test_dataset.json")  # note the typo, may be 'test_dataset.json'

    print("✅ Loaded datasets:")
    print(f"Train size: {len(train_data)}, Val size: {len(val_data)}, Test size: {len(test_data)}")

    return DatasetDict({
        "train": train_data,
        "validation": val_data,
        "test": test_data
    })


In [4]:
from transformers import T5Tokenizer

def tokenize_data(dataset, tokenizer, max_input_len=256, max_target_len=512):
    def preprocess(example):
        input_enc = tokenizer(example["input_text"], truncation=True, padding="max_length", max_length=max_input_len)
        target_enc = tokenizer(example["target_text"], truncation=True, padding="max_length", max_length=max_target_len)
        return {
            "input_ids": input_enc["input_ids"],
            "attention_mask": input_enc["attention_mask"],
            "labels": target_enc["input_ids"]
        }
    tokenized = dataset.map(preprocess, batched=True)
    print("✅ Tokenization complete")
    return tokenized


In [5]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

def fine_tune(tokenized_datasets):
    model = T5ForConditionalGeneration.from_pretrained("t5-base")

    training_args = TrainingArguments(
        output_dir="./results",
        eval_strategy="steps",
        eval_steps=100,
        logging_steps=50,
        save_steps=200,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=3,
        save_total_limit=2,
        fp16=False,
        logging_dir="./logs",
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"]
    )

    print("🚀 Starting training...")
    trainer.train()
    print("✅ Training complete")
    model.save_pretrained("t5-custom-finetuned")
    return model


In [8]:
import torch

In [9]:
def predict(model, tokenizer, dataset, max_input_len=256, max_target_len=512):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    for example in dataset.select(range(3)):
        inputs = tokenizer(example["input_text"], return_tensors="pt", padding=True, truncation=True, max_length=max_input_len)
        inputs = {k: v.to(device) for k, v in inputs.items()}  # Move input to GPU
        output_ids = model.generate(**inputs, max_length=max_target_len)
        prediction = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        print("📥 Input:", example["input_text"])
        print("✅ Prediction:", prediction)
        print("🎯 Ground Truth:", example["target_text"])
        print("-" * 50)


In [10]:
from transformers import T5Tokenizer

if __name__ == "__main__":
    tokenizer = T5Tokenizer.from_pretrained("t5-base")

    # Load and preprocess
    datasets = get_dataset()
    tokenized_datasets = tokenize_data(datasets, tokenizer)

    # Train
    model = fine_tune(tokenized_datasets)

    # Predict
    print("🔍 Running prediction on test set...")
    predict(model, tokenizer, datasets["test"])


✅ Loaded datasets:
Train size: 1600, Val size: 200, Test size: 200


Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

✅ Tokenization complete
🚀 Starting training...


Step,Training Loss,Validation Loss
100,0.4212,0.161472
200,0.1422,0.057624
300,0.0824,0.037516
400,0.0607,0.030647
500,0.0495,0.027939
600,0.0461,0.026339
700,0.0422,0.02556
800,0.0403,0.025174
900,0.0374,0.024838
1000,0.0427,0.024617


✅ Training complete
🔍 Running prediction on test set...
📥 Input: Create course offering ART401 ART Course with Dr. Williams on Friday from 9:00 AM to 10:00 AM in room THTR202 with limit 20 students
✅ Prediction: offerings campus="MAIN" year="2024" term="Fall">offering id="562" offered="true">course id="390" subject="ART" courseNbr="401" title="ART Course">class id="71726" suffix="1" type="LEC" limit="20">time days="F" startTime="0900" endTime="1000"/>room building="THTR" roomNbr="202"/>instructor id="67" fname="Dr." lname="Williams" lead="true"/>/class>/course>/offering>/offerings>
🎯 Ground Truth: <offerings campus="MAIN" year="2024" term="Fall"><offering id="5858" offered="true"><course id="999" subject="ART" courseNbr="401" title="ART Course"><class id="92516" suffix="1" type="SEM" limit="20"><time days="F" startTime="0900" endTime="1000"/><room building="THTR" roomNbr="202"/><instructor id="79" fname="Dr." lname="Williams" lead="true"/></class></course></offering></offerings>
------