In [None]:
# --- 1. Install Libraries ---
!pip install transformers datasets sacrebleu torch

# --- 2. Import All Dependencies ---
import torch
import re
from datasets import load_dataset, Dataset
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments
)
from google.colab import drive

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.2.0 sacrebleu-2.5.1


In [None]:
# --- Load Raw Data (Our "Target" Text) ---
# The untouched WikiText rows are the "correct" side of every training pair.
print("Loading wikitext dataset...")
full_dataset = load_dataset(
    "wikitext",
    "wikitext-103-v1",
    split="train",
)

# How many leading rows of the train split to keep.
# ~5,000 is enough for a very fast test run.
# NOTE(review): the recorded output reports 50000 rows, so it was produced
# with a different value than the one set here.
slice_size = 100000
dataset_slice = full_dataset.select(range(slice_size))

# Report the slice size and show one raw row as a sanity check.
num_docs = len(dataset_slice)
sample_text = dataset_slice[5]['text']
print(f"\nCreated a working slice of {num_docs} documents.")
print(f"Example 'good' text: \n'{sample_text}'")

Loading wikitext dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

wikitext-103-v1/test-00000-of-00001.parq(…):   0%|          | 0.00/722k [00:00<?, ?B/s]

wikitext-103-v1/train-00000-of-00002.par(…):   0%|          | 0.00/156M [00:00<?, ?B/s]

wikitext-103-v1/train-00001-of-00002.par(…):   0%|          | 0.00/156M [00:00<?, ?B/s]

wikitext-103-v1/validation-00000-of-0000(…):   0%|          | 0.00/655k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]


Created a working slice of 50000 documents.
Example 'good' text: 
' It met with positive sales in Japan , and was praised by both Japanese and western critics . After release , it received downloadable content , along with an expanded edition in November of that year . It was also adapted into manga and an original video animation series . Due to low sales of Valkyria Chronicles II , Valkyria Chronicles III was not localized , but a fan translation compatible with the game 's expanded edition was released in 2014 . Media.Vision would return to the franchise with the development of Valkyria : Azure Revolution for the PlayStation 4 . 
'


In [None]:
def create_seq2seq_examples(example):
    """Turn one raw dataset row into an (input, target) pair for the seq2seq model.

    The target is the stripped original text. The input is the same text
    lowercased, with punctuation removed and whitespace normalized, prefixed
    with the task prefix ``"correct: "``.

    Args:
        example: Mapping with a ``'text'`` key holding one raw line.

    Returns:
        dict with ``"input_text"`` and ``"target_text"``. Both are empty
        strings for rows that should be dropped (blank lines, lines starting
        with "=", lines with fewer than 5 whitespace-separated tokens) so the
        caller can filter them out afterwards.
    """
    text = example['text'].strip()

    # 1. Reject empty lines, headings ("= Title =") and very short lines.
    if not text or text.startswith("=") or len(text.split()) < 5:
        return {"input_text": "", "target_text": ""}

    # 2. Target (Y): the original, correct text.
    target_text = text

    # 3. Input (X): lowercase and keep only word characters and whitespace.
    #    Note that \w also keeps underscores.
    stripped = re.sub(r'[^\w\s]', '', text.lower())

    # The corpus writes punctuation as separate, space-delimited tokens
    # ("classes : Scouts , ..."), so deleting the marks leaves double spaces
    # and a trailing space. Collapse them so words are separated by exactly
    # one space.
    broken_text = " ".join(stripped.split())

    # 4. Prepend the T5 task prefix that tells the model which task to perform.
    input_text = "correct: " + broken_text

    return {"input_text": input_text, "target_text": target_text}

In [None]:
print("Applying preprocessing to all examples...")

# Build the input/target columns for every row with 4 worker processes.
# The raw 'text' column is dropped because both new columns derive from it.
raw_dataset = dataset_slice.map(
    create_seq2seq_examples,
    remove_columns=['text'],
    num_proc=4,
)


def _has_usable_input(row):
    """Keep rows whose input is longer than 10 characters (rejected rows are empty)."""
    return len(row['input_text']) > 10


processed_dataset = raw_dataset.filter(_has_usable_input)

# Summarize the result and show one training pair as a sanity check.
num_valid = len(processed_dataset)
sample_pair = processed_dataset[5]
print(f"\nFinished processing. We have {num_valid} valid examples.")
print("\nExample of a training pair:")
print(f"INPUT (X):  '{sample_pair['input_text']}'")
print(f"TARGET (Y): '{sample_pair['target_text']}'")

Applying preprocessing to all examples...


Map (num_proc=4):   0%|          | 0/50000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/50000 [00:00<?, ? examples/s]


Finished processing. We have 23163 valid examples.

Example of a training pair:
INPUT (X):  'correct: troops are divided into five classes  scouts  unk  engineers  lancers and armored soldier  troopers can switch classes by changing their assigned weapon  changing class does not greatly affect the stats gained while in a previous class  with victory in battle  experience points are awarded to the squad  which are distributed into five different attributes shared by the entire squad  a feature differing from early games  method of distributing to different unit types '
TARGET (Y): 'Troops are divided into five classes : Scouts , <unk> , Engineers , Lancers and Armored Soldier . Troopers can switch classes by changing their assigned weapon . Changing class does not greatly affect the stats gained while in a previous class . With victory in battle , experience points are awarded to the squad , which are distributed into five different attributes shared by the entire squad , a feature dif

In [None]:
# --- Load Model and Tokenizer ---
model_name = "t5-small"
print(f"Loading '{model_name}' tokenizer and model...")

# Both objects come from the same pretrained checkpoint name.
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)
print(f"\nModel loaded and moved to {device}.")

Loading 't5-small' tokenizer and model...


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]


Model loaded and moved to cuda.


In [None]:
# --- Define Tokenization Function ---

# Inputs and targets are both truncated to this many tokens to keep training fast.
MAX_LENGTH = 128

def tokenize_function(examples):
    """Tokenize a batch of (input_text, target_text) pairs for seq2seq training.

    Args:
        examples: Batch mapping with ``"input_text"`` and ``"target_text"``
            entries, each a list of strings.

    Returns:
        The tokenizer output for the inputs, with an added ``"labels"`` entry
        holding the token ids of the targets. Sequences longer than
        ``MAX_LENGTH`` tokens are truncated.
    """
    # Tokenize the "inputs" (our broken text).
    model_inputs = tokenizer(
        examples["input_text"],
        max_length=MAX_LENGTH,
        truncation=True
    )

    # Tokenize the "targets" (our correct text). `text_target=` is the
    # supported way to mark text as labels; it replaces the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["target_text"],
        max_length=MAX_LENGTH,
        truncation=True
    )

    # Attach the tokenized labels to the model inputs.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# --- Apply Tokenization and Split ---
print("Tokenizing all examples...")

# Tokenize in batches. The text columns are dropped afterwards because the
# trainer only consumes the token ids.
text_columns = ['input_text', 'target_text']
tokenized_dataset = processed_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=text_columns,
)

# Hold out 10% of the rows for validation; the fixed seed keeps the split repeatable.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = split_dataset['train'], split_dataset['test']

print("\nData is tokenized and split:")
print(f"Training examples:   {len(train_dataset)}")
print(f"Validation examples: {len(eval_dataset)}")

Tokenizing all examples...


Map:   0%|          | 0/23163 [00:00<?, ? examples/s]




Data is tokenized and split:
Training examples:   20846
Validation examples: 2317


In [None]:
# --- Mount Google Drive ---
# The training output directory used later lives under this mount point.
drive_mount_point = '/content/drive'
print("Mounting Google Drive... Please authorize.")
drive.mount(drive_mount_point)
print("Google Drive mounted successfully.")

Mounting Google Drive... Please authorize.
Mounted at /content/drive
Google Drive mounted successfully.


In [None]:
# --- Set Up Trainer ---

# 1. Import the Seq2Seq-specific trainer classes
from transformers import (
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)


# 2. Directory in Google Drive where checkpoints are written
output_dir = "/content/drive/MyDrive/t5-punctuation-model"

# 3. This collator pads both the inputs and the labels of each batch
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


# 4. Seq2SeqTrainingArguments accepts 'predict_with_generate'
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,                     # 3 epochs is a good start
    per_device_train_batch_size=8,          # 8 is safe for 't5-small' on a T4 GPU
    per_device_eval_batch_size=8,
    weight_decay=0.01,                      # Adds regularization

    # Evaluation and Saving
    eval_strategy="epoch",                # Run validation every epoch
    save_strategy="epoch",                # Save a checkpoint every epoch
    # Reload the best checkpoint's weights into `model` when training ends.
    # This does NOT delete the other checkpoints: every per-epoch checkpoint
    # stays in output_dir (set save_total_limit to cap how many are kept).
    load_best_model_at_end=True,

    # Use generate() when evaluation needs predicted text for metrics
    predict_with_generate=True,

    report_to="none"                        # Disables online logging
)

# 5. Build the trainer.
# `processing_class` replaces the deprecated `tokenizer` argument, which is the
# likely source of the truncated warning recorded in this cell's output
# (requires a transformers release that has `processing_class`).
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
)

print("\nTrainer initialized successfully with Seq2SeqTrainer.")

  trainer = Seq2SeqTrainer(



Trainer initialized successfully with Seq2SeqTrainer.


In [None]:
# --- Train the Model! ---
print("Starting training...")
trainer.train()

# Training only writes per-epoch "checkpoint-N" subfolders. Because
# load_best_model_at_end=True, the trainer now holds the best checkpoint's
# weights; save them at the top level of output_dir so the message below is
# accurate and the model can be loaded from that path directly.
trainer.save_model(output_dir)

print("\nTraining complete!")
print(f"The best model has been saved to: {output_dir}")

Starting training...


Epoch,Training Loss,Validation Loss
1,0.4898,0.367846
2,0.4253,0.334047
3,0.4046,0.324816


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].



Training complete!
The best model has been saved to: /content/drive/MyDrive/t5-punctuation-model


In [None]:
# Also write the trainer's current model to the notebook's working directory.
local_save_path = "./my-local-t5-model"
trainer.save_model(output_dir=local_save_path)
print(f"A temporary local copy has also been saved to: {local_save_path}")

A temporary local copy has also been saved to: ./my-local-t5-model


In [None]:
# --- Test Your Trained Model ---
from pathlib import Path

from transformers import T5ForConditionalGeneration, T5Tokenizer

# 1. Locate the saved model on Google Drive.
# The checkpoint folder name contains the training step count, which depends
# on dataset size, batch size and epoch count, so it is resolved at run time
# instead of being hard-coded.
model_root = Path("/content/drive/MyDrive/t5-punctuation-model")


def _checkpoint_step(path):
    """Return N for a 'checkpoint-N' folder, or -1 if the suffix is not a number."""
    suffix = path.name.rsplit("-", 1)[-1]
    return int(suffix) if suffix.isdigit() else -1


if (model_root / "config.json").is_file():
    # A model was saved directly in the output directory: use it.
    model_path = str(model_root)
else:
    # Otherwise fall back to the newest checkpoint (highest step number).
    # NOTE: this is the most recent checkpoint, not necessarily the one with
    # the lowest validation loss.
    checkpoints = sorted(
        (p for p in model_root.glob("checkpoint-*") if p.is_dir()),
        key=_checkpoint_step,
    )
    if not checkpoints:
        raise FileNotFoundError(
            f"No saved model or checkpoint-* folder found in {model_root}"
        )
    model_path = str(checkpoints[-1])

print(f"Loading model from: {model_path}")
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)

# 2. Make sure model is on the GPU (when available) and in inference mode
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()
print("Loaded fine-tuned model from Google Drive.")

# 3. Create the prediction function
def correct(text, max_length=128, num_beams=4):
    """Restore capitalization and punctuation in `text` using the fine-tuned model.

    Args:
        text: The text to correct. It is lowercased and stripped of punctuation
            before being sent to the model, so any existing casing and
            punctuation is ignored.
        max_length: Token limit applied both to the encoded input (longer
            inputs are truncated) and to the generated output.
        num_beams: Beam width used for generation.

    Returns:
        The model's corrected text, or "" when `text` contains no word
        characters.
    """
    # The training corpus writes punctuation as separate, space-delimited
    # tokens ("game 's", "classes : Scouts"), so after stripping, the training
    # inputs look like "game s". Replacing each mark with a space (rather than
    # deleting it) reproduces that for ordinary text: "it's" -> "it s" and
    # "non-trivial" -> "non trivial", not "its" / "nontrivial".
    words = re.sub(r'[^\w\s]', ' ', text.lower()).split()
    if not words:
        # Nothing to correct; skip the model call entirely.
        return ""
    input_text = "correct: " + " ".join(words)

    # Tokenize the input
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        max_length=max_length,
        truncation=True
    ).to(device)

    # 4. Generate the corrected text. Unpacking `inputs` passes the attention
    # mask along with the input ids.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True
        )

    # 5. Decode the output and return it
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# --- Let's try it! ---
banner = "=" * 30
print("\n" + banner)
print("--- TESTING THE NEW S2S MODEL ---")
print(banner + "\n")

text_1 = "hello my name is shahaan what is yours"
text_2 = "this is a test of the punctuation model i hope it works"
text_3 = "the game was fun but i think it could be better"

# Run every sample through the model and print it next to the model's output.
for sample in (text_1, text_2, text_3):
    print(f"Input:    '{sample}'")
    print(f"Output:   '{correct(sample)}'")
    print("-" * 20)

Loaded fine-tuned model from Google Drive.

--- TESTING THE NEW S2S MODEL ---

Input:    'hello my name is shahaan what is yours'
Output:   'Hello, my name is Shahaan, what is yours?'
--------------------
Input:    'this is a test of the punctuation model i hope it works'
Output:   'This is a test of the punctuation model, I hope it works.'
--------------------
Input:    'the game was fun but i think it could be better'
Output:   'The game was fun but I think it could be better.'
--------------------


In [None]:
# A longer, multi-sentence input with no capitalization or sentence punctuation.
# It still contains an apostrophe, quotes and a hyphen, which correct() strips
# before the text reaches the model.
test_string = "the quick brown fox jumps over the lazy dog this is a classic sentence used for typing practice but it also serves as a good test for our model i wonder if it will know where to put the period and how to capitalize the word 'this' in the middle of the text it's a non-trivial task because the model has to understand context not just individual words for example will it know what to do with a sentence like this what do you think the final output will be i am excited to see the result"

# Run the prediction function on the long input
corrected_version = correct(test_string)

# Show the original and the model's output one after the other for comparison
print(f"Input:    '{test_string}'")
print(f"Output:   '{corrected_version}'")

Input:    'the quick brown fox jumps over the lazy dog this is a classic sentence used for typing practice but it also serves as a good test for our model i wonder if it will know where to put the period and how to capitalize the word 'this' in the middle of the text it's a non-trivial task because the model has to understand context not just individual words for example will it know what to do with a sentence like this what do you think the final output will be i am excited to see the result'
Output:   'The quick brown fox jumps over the lazy dog. This is a classic sentence used for typing practice, but it also serves as a good test for our model. I wonder if it will know where to put the period and how to capitalize the word. This in the middle of the text, its a nontrivial task because the model has to understand context, not just individual words. For example, will it know what to do with a sentence like this? What do you think the final output will be. I am excited to see the resu