In [3]:
%pip install transformers datasets





[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import json
from datasets import Dataset, DatasetDict

def load_dataset(path):
    """Load a JSON file of input/output pairs into a ``Dataset``.

    The file must hold a JSON array of objects, each with an ``"input"`` and
    an ``"output"`` key. Every record is re-keyed to ``"input_text"`` /
    ``"target_text"`` before being handed to ``Dataset.from_list``.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The object produced by ``Dataset.from_list`` for the re-keyed records.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If the file content is not valid JSON.
        KeyError: If a record lacks the ``"input"`` or ``"output"`` key.
    """
    # Read explicitly as UTF-8 instead of the platform default encoding
    # (e.g. cp1252 on Windows), which would corrupt or reject non-ASCII text.
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    records = [{"input_text": d["input"], "target_text": d["output"]} for d in data]
    return Dataset.from_list(records)

def get_dataset(data_dir="data/processed"):
    """Load the train, validation and test splits into a ``DatasetDict``.

    For every split a list of candidate file names is tried in order and the
    first one that exists is loaded with ``load_dataset``. The file names
    this function has always used (``train_datset.json``,
    ``test_dataset.jon``) look misspelled, so they are tried first to keep
    existing data directories working, followed by the corrected spellings
    (``train_dataset.json``, ``test_dataset.json``).

    Args:
        data_dir: Directory that contains the split files. Defaults to the
            previously hard-coded ``"data/processed"``.

    Returns:
        A ``DatasetDict`` with the keys ``"train"``, ``"validation"`` and
        ``"test"``.

    Raises:
        FileNotFoundError: If none of the candidate files of a split exists.
            The message lists every path that was tried.
    """
    import os

    # (split name, candidate file names in lookup order)
    split_files = (
        ("train", ("train_datset.json", "train_dataset.json")),
        ("validation", ("val_dataset.json",)),
        ("test", ("test_dataset.jon", "test_dataset.json")),
    )

    splits = {}
    for split, candidates in split_files:
        paths = [f"{data_dir}/{name}" for name in candidates]
        # The first existing candidate wins.
        chosen = next((p for p in paths if os.path.exists(p)), None)
        if chosen is None:
            raise FileNotFoundError(
                f"No data file found for the '{split}' split; tried: {', '.join(paths)}"
            )
        splits[split] = load_dataset(chosen)

    train_data = splits["train"]
    val_data = splits["validation"]
    test_data = splits["test"]

    print("✅ Loaded datasets:")
    print(f"Train size: {len(train_data)}, Val size: {len(val_data)}, Test size: {len(test_data)}")

    return DatasetDict({
        "train": train_data,
        "validation": val_data,
        "test": test_data
    })


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from transformers import T5Tokenizer

def tokenize_data(dataset, tokenizer, max_input_len=256, max_target_len=512):
    """Tokenize the ``input_text`` / ``target_text`` columns for seq2seq training.

    Both columns are truncated and padded to a fixed length. In the labels,
    every padding position is replaced by -100 so that the training loss
    skips it; otherwise the model would also be trained to emit padding.

    Args:
        dataset: Object exposing ``map(fn, batched=True)`` whose batches
            provide ``"input_text"`` and ``"target_text"``.
        tokenizer: Callable tokenizer returning ``"input_ids"`` and
            ``"attention_mask"``. Its ``pad_token_id`` attribute, when
            present, identifies the padding positions in the labels.
        max_input_len: Fixed length of the encoded inputs.
        max_target_len: Fixed length of the encoded targets.

    Returns:
        Whatever ``dataset.map`` returns, now carrying ``"input_ids"``,
        ``"attention_mask"`` and ``"labels"``.
    """
    # Label value excluded from the loss (see the T5 training documentation).
    ignore_index = -100
    # If the tokenizer defines no pad id, nothing matches and labels stay as-is.
    pad_id = getattr(tokenizer, "pad_token_id", None)

    def preprocess(example):
        input_enc = tokenizer(example["input_text"], truncation=True, padding="max_length", max_length=max_input_len)
        target_enc = tokenizer(example["target_text"], truncation=True, padding="max_length", max_length=max_target_len)
        # map() runs with batched=True, so "input_ids" is a list of sequences.
        labels = [
            [ignore_index if token == pad_id else token for token in sequence]
            for sequence in target_enc["input_ids"]
        ]
        return {
            "input_ids": input_enc["input_ids"],
            "attention_mask": input_enc["attention_mask"],
            "labels": labels
        }

    tokenized = dataset.map(preprocess, batched=True)
    print("✅ Tokenization complete")
    return tokenized


None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [6]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

def fine_tune(tokenized_datasets):
    """Fine-tune ``t5-base`` on the tokenized train/validation splits.

    Training runs for 3 epochs with a per-device batch size of 4, evaluating
    every 100 steps and checkpointing every 200 steps into ``./results``
    (at most 2 checkpoints kept). After training, the model is saved to
    ``t5-custom-finetuned``. Only ``model.save_pretrained`` is called here;
    the tokenizer is not saved by this function.

    Args:
        tokenized_datasets: Mapping with ``"train"`` and ``"validation"``
            entries, passed to ``Trainer`` as train and eval datasets.

    Returns:
        The fine-tuned model instance.
    """
    import inspect

    model = T5ForConditionalGeneration.from_pretrained("t5-base")

    # Newer transformers releases name this option `eval_strategy`; older ones
    # only know `evaluation_strategy`. Use whichever keyword the installed
    # TrainingArguments accepts so construction does not fail with a TypeError.
    accepted = inspect.signature(TrainingArguments).parameters
    strategy_key = "eval_strategy" if "eval_strategy" in accepted else "evaluation_strategy"

    training_args = TrainingArguments(
        output_dir="./results",
        eval_steps=100,
        logging_steps=50,
        save_steps=200,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=3,
        save_total_limit=2,
        fp16=False,
        logging_dir="./logs",
        report_to="none",
        **{strategy_key: "steps"}
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"]
    )

    print("🚀 Starting training...")
    trainer.train()
    print("✅ Training complete")
    model.save_pretrained("t5-custom-finetuned")
    return model


In [7]:
from transformers import T5Tokenizer

def predict(model, tokenizer, dataset, max_input_len=256, max_target_len=512, num_examples=3):
    """Generate and print predictions for the first few examples of ``dataset``.

    For each selected example the input text, the decoded model output and
    the reference target text are printed, followed by a separator line.

    Args:
        model: Model exposing ``eval()`` and ``generate(...)``. If it has a
            ``device`` attribute, the encoded inputs are moved to it.
        tokenizer: Tokenizer used to encode the input and decode the output.
        dataset: Dataset supporting ``len()`` and ``select(indices)`` whose
            rows provide ``"input_text"`` and ``"target_text"``.
        max_input_len: Truncation length for the encoded input.
        max_target_len: ``max_length`` passed to ``model.generate``.
        num_examples: Maximum number of examples to run (default 3, the
            previously hard-coded value).

    Returns:
        None. Results are printed.
    """
    model.eval()

    # Never request more rows than the dataset holds; selecting indices past
    # the end would fail on datasets with fewer than `num_examples` rows.
    sample_count = max(0, min(num_examples, len(dataset)))

    # After training, the model may live on an accelerator while freshly
    # tokenized inputs are created on the CPU.
    device = getattr(model, "device", None)

    for example in dataset.select(range(sample_count)):
        inputs = tokenizer(example["input_text"], return_tensors="pt", padding=True, truncation=True, max_length=max_input_len)
        if device is not None and hasattr(inputs, "to"):
            inputs = inputs.to(device)
        output_ids = model.generate(inputs["input_ids"], attention_mask=inputs["attention_mask"], max_length=max_target_len)
        prediction = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        print("📥 Input:", example["input_text"])
        print("✅ Prediction:", prediction)
        print("🎯 Ground Truth:", example["target_text"])
        print("-" * 50)


In [8]:
from transformers import T5Tokenizer

# End-to-end driver: load data, tokenize, fine-tune, then print sample predictions.
if __name__ == "__main__":
    # Tokenizer for the same "t5-base" checkpoint that fine_tune() loads.
    # T5Tokenizer needs the SentencePiece package; without it this line raises
    # the ImportError captured in this cell's output.
    tokenizer = T5Tokenizer.from_pretrained("t5-base")
    
    # Load and preprocess
    datasets = get_dataset()
    tokenized_datasets = tokenize_data(datasets, tokenizer)
    
    # Train
    model = fine_tune(tokenized_datasets)
    
    # Predict
    # predict() receives the raw (untokenized) test split because it reads the
    # "input_text" / "target_text" fields and tokenizes each example itself.
    print("🔍 Running prediction on test set...")
    predict(model, tokenizer, datasets["test"])


ImportError: 
T5Tokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.
