In [None]:
import os

# Set WANDB_DISABLED up front so the Weights & Biases integration stays off
# for everything executed later in this session.
os.environ.update({"WANDB_DISABLED": "true"})

In [None]:
# Install required libraries
!pip install transformers datasets evaluate sentencepiece --quiet

In [None]:
# Standard library
import os

# Third-party
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

# Keep the Weights & Biases integration switched off for this session.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Toy parallel corpus: each tuple is (user prompt, target instruction).
_prompt_instruction_pairs = [
    (
        "Build a login page with email and password",
        "Create HTML form with email/password fields and JS validation",
    ),
    (
        "Create a dashboard with charts and filters",
        "Use chart.js and dropdown filters in a dashboard layout",
    ),
    (
        "Generate a REST API for user data",
        "Define GET/POST endpoints using Flask or Express",
    ),
    (
        "Design a contact form with validation",
        "Build form with required fields and regex validation",
    ),
    (
        "Make a landing page with hero section",
        "Design responsive hero section with CTA button",
    ),
]

# Split the pairs back into the two parallel columns the rest of the notebook expects.
data = {
    "prompt": [prompt for prompt, _ in _prompt_instruction_pairs],
    "instruction": [instruction for _, instruction in _prompt_instruction_pairs],
}

# pandas frame -> Hugging Face Dataset with "prompt" and "instruction" columns.
df = pd.DataFrame(data)
dataset = Dataset.from_pandas(df)

In [None]:
# Single checkpoint name shared by both loads so tokenizer and model always match.
model_name = "t5-small"

# The two loads are independent of each other: weights first, then the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def preprocess(example, max_length=64):
    """Tokenize a prompt/instruction example for seq2seq training.

    Works on a single example (string fields) or on a batch (list-of-string
    fields, e.g. when used with ``dataset.map(..., batched=True)``).

    Args:
        example: Mapping with a "prompt" entry (model input text) and an
            "instruction" entry (target text).
        max_length: Length that both the input and the target are padded or
            truncated to. Defaults to 64.

    Returns:
        The tokenizer output for the prompt, with an added "labels" entry
        holding the target token ids. Padding positions in the labels are set
        to -100 so the loss ignores them.
    """
    inputs = tokenizer(example["prompt"], padding="max_length", truncation=True, max_length=max_length)
    targets = tokenizer(example["instruction"], padding="max_length", truncation=True, max_length=max_length)

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # -100 is the index the cross-entropy loss skips; without this the model
        # is trained to predict the padding that fills most of each target.
        return [-100 if tok == pad_id else tok for tok in token_ids]

    label_ids = targets["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        inputs["labels"] = [_mask_padding(seq) for seq in label_ids]
    else:
        inputs["labels"] = _mask_padding(label_ids)
    return inputs

# Tokenize every row and drop the raw text columns so only model inputs remain.
tokenized_dataset = dataset.map(
    function=preprocess,
    remove_columns=["prompt", "instruction"],
)

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [None]:
from transformers import Seq2SeqTrainingArguments

# Hyperparameters for the toy fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    num_train_epochs=10,
    logging_dir="./logs",
    # Log once per epoch. The default step-based interval (every 500 steps) is
    # never reached in a run this short, which leaves the loss table empty.
    logging_strategy="epoch",
    save_strategy="no",  # do not write checkpoints
    predict_with_generate=True,
    report_to="none"  # ✅ disables wandb and other loggers
)

In [None]:
from transformers import DataCollatorForSeq2Seq

# Batch builder for the trainer; it is given both the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

In [None]:
from transformers import Seq2SeqTrainer

# `processing_class` replaces the deprecated `tokenizer` argument
# (transformers >= 4.46); passing `tokenizer=` emits a deprecation warning.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
)

  trainer = Seq2SeqTrainer(


In [None]:
# Run the fine-tuning loop configured on `trainer`. As the cell's last expression,
# the returned TrainOutput (global step, training loss, runtime metrics) is displayed.
trainer.train()

Step,Training Loss


TrainOutput(global_step=30, training_loss=10.694913736979167, metrics={'train_runtime': 3.5086, 'train_samples_per_second': 14.251, 'train_steps_per_second': 8.55, 'total_flos': 845886259200.0, 'train_loss': 10.694913736979167, 'epoch': 10.0})

In [None]:
def generate_instruction(prompt, max_length=64):
    """Generate an instruction string for ``prompt`` with the notebook's model.

    Args:
        prompt: Natural-language request to turn into an instruction.
        max_length: ``max_length`` passed to ``model.generate``. Defaults to 64.

    Returns:
        The decoded text of the first generated sequence, with special tokens
        removed.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # Training leaves the model in train mode and generate() does not change it;
    # switch to eval mode so dropout is off and the output is deterministic.
    model.eval()

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=max_length)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Smoke test: run one unseen prompt through the fine-tuned model.
test_prompt = "Build a signup page with username and password"
print(f"Prompt: {test_prompt}")
print(f"Instruction: {generate_instruction(test_prompt)}")

Prompt: Build a signup page with username and password
Instruction: Create a signup page with username and password
