In [None]:
import os

# Keep Weights & Biases out of this run (no login, no remote logging).
# WANDB_DISABLED is the switch read by the Hugging Face Trainer integration,
# but it is deprecated in recent transformers releases. WANDB_MODE is the
# switch read by the wandb client itself, so set both. These must be set
# before the training cell runs.
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "disabled"


In [None]:
# -----------------------------
# 0️⃣ Install/Upgrade dependencies
# -----------------------------
!pip install --upgrade transformers datasets torch --quiet

# -----------------------------
# 1️⃣ Import libraries
# -----------------------------
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset
import torch

# -----------------------------
# 2️⃣ Load dataset
# -----------------------------
# WikiText-2 (raw variant): a small, publicly available text corpus.
dataset = load_dataset(path="wikitext", name="wikitext-2-raw-v1")
# Only the training split is used for fine-tuning.
train_dataset = dataset["train"]

# -----------------------------
# 3️⃣ Load tokenizer and model
# -----------------------------
# Tokenizer and weights are loaded from the same DistilGPT-2 checkpoint.
checkpoint = "distilgpt2"
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)
model = GPT2LMHeadModel.from_pretrained(checkpoint)

# ⚠️ GPT-2 defines no padding token, so the end-of-sequence token doubles as
# padding, both in the tokenizer and in the model config.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

# -----------------------------
# 4️⃣ Tokenize dataset
# -----------------------------
IGNORE_INDEX = -100  # label value that the causal-LM loss skips
MIN_REAL_TOKENS = 2  # next-token loss needs at least one (input, target) pair


def tokenize_function(examples):
    """Tokenize text into fixed-length GPT-2 inputs and build the loss labels.

    Args:
        examples: A mapping with a ``"text"`` entry. With
            ``Dataset.map(..., batched=True)`` this is a list of strings; a
            single string (one row) is handled as well.

    Returns:
        The tokenizer output (padded/truncated to 128 tokens) with an added
        ``"labels"`` entry. Labels equal ``input_ids`` on real tokens and
        ``IGNORE_INDEX`` on padding positions.
    """
    tokens = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

    def mask_padding(ids, mask):
        # The pad token is the EOS token here, so padding cannot be told apart
        # by token id; the attention mask is the reliable marker.
        return [tok if keep else IGNORE_INDEX for tok, keep in zip(ids, mask)]

    input_ids = tokens["input_ids"]
    attention = tokens["attention_mask"]
    if input_ids and isinstance(input_ids[0], (list, tuple)):
        # Batched call: one id list per row.
        tokens["labels"] = [mask_padding(ids, mask) for ids, mask in zip(input_ids, attention)]
    else:
        # Single row (or an empty batch): a flat list of ids.
        tokens["labels"] = mask_padding(input_ids, attention)
    return tokens


# Rows with fewer than MIN_REAL_TOKENS real tokens (e.g. the blank lines in the
# corpus) have no trainable target once padding is masked. A batch made only of
# such rows would give an undefined loss, so they are dropped here.
tokenized_datasets = train_dataset.map(tokenize_function, batched=True).filter(
    lambda row: sum(row["attention_mask"]) >= MIN_REAL_TOKENS
)

# -----------------------------
# 5️⃣ Define training arguments
# -----------------------------
# Hyperparameters for a short, single-epoch fine-tune. Checkpoints are written
# under ./results (save_steps=500, at most save_total_limit=2 kept) and the
# training loss is logged every 100 steps.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    num_train_epochs=1,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    # Explicitly turn off external experiment trackers (wandb, etc.). This is
    # the supported replacement for the deprecated WANDB_DISABLED env var.
    report_to="none",
)

# -----------------------------
# 6️⃣ Create Trainer
# -----------------------------
# Bundle the model, the hyperparameters and the tokenized training split.
# No evaluation dataset is supplied, so this run only trains.
trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_datasets)

# -----------------------------
# 7️⃣ Train model
# -----------------------------
trainer.train()

# -----------------------------
# 8️⃣ Generate text using fine-tuned model
# -----------------------------
prompt = "Once upon a time"

# After training the model is still in train mode (dropout active); switch to
# eval mode so generation is not perturbed by dropout.
model.eval()

# The Trainer moves the model to its training device (a GPU when one is
# available), while the tokenizer returns CPU tensors. Move the prompt to the
# model's device to avoid a device-mismatch error.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Pass pad_token_id explicitly: GPT-2 has no native pad token and EOS is used
# for padding throughout this notebook.
outputs = model.generate(**inputs, max_length=50, pad_token_id=tokenizer.eos_token_id)
print("Generated text:\n")
print(tokenizer.decode(outputs[0]))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
[34m[1mwandb[0m: Currently logged in as: [33mpreethikuppuri1309[0m ([33mpreethikuppuri1309-umbc[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,2.9796
200,1.616
300,1.3825
400,1.5241
500,1.448
600,1.5389
700,1.4798
800,1.5038
900,1.3641
1000,1.3642
