# In [None]:  (notebook cell prompt left over from export)
# STEP 0: Install packages (skip if already done)
# !pip install transformers datasets accelerate --quiet

# STEP 1: Imports
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import torch

# STEP 2: Load tokenizer & model
# Tokenizer and model are loaded from the same pretrained checkpoint name.
checkpoint_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForCausalLM.from_pretrained(checkpoint_name)

# GPT-2 has no pad_token by default, so the end-of-sequence token is reused
# for padding, and the model config is pointed at that same token id.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

# STEP 3: Prepare your custom dataset
# Twenty short one-sentence statements; they form the single "text" column.
farcaster_sentences = [
    "Farcaster is a decentralized social protocol.",
    "Farcaster is open-source social media.",
    "Farcaster is user-owned social data.",
    "Farcaster is blockchain-based social.",
    "Farcaster is censorship-resistant.",
    "Farcaster is not controlled by big tech.",
    "Farcaster is where you own your content.",
    "Farcaster is a new way to socialize online.",
    "Farcaster is for building social apps.",
    "Farcaster is an alternative to Twitter.",
    "Farcaster is growing fast.",
    "Farcaster is developer-friendly.",
    "Farcaster is privacy-focused.",
    "Farcaster is community-driven.",
    "Farcaster is about digital ownership.",
    "Farcaster is more than just a feed.",
    "Farcaster is an evolving ecosystem.",
    "Farcaster is for the future of social.",
    "Farcaster is a web3 social layer.",
    "Farcaster is truly decentralized social.",
]
data = {"text": farcaster_sentences}

# Wrap the column dictionary in a Hugging Face Dataset.
dataset = Dataset.from_dict(data)

# STEP 4: Tokenize properly (no padding needed here)
def tokenize_function(examples):
    """Tokenize a batch of raw sentences for causal-LM training.

    Args:
        examples: A batch as supplied by ``Dataset.map(..., batched=True)``;
            only its ``"text"`` entry is read.

    Returns:
        The output of the module-level ``tokenizer`` for that batch. No
        padding is requested, so each sequence keeps its own length.
    """
    # truncation=True caps every sequence at the tokenizer's maximum length,
    # so an over-long sentence added to the dataset later cannot produce a
    # sequence longer than the model accepts. For the short sentences used
    # here it changes nothing.
    return tokenizer(examples["text"], truncation=True)

# Run the tokenizer over the whole dataset, one batch of rows at a time.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
)

# STEP 5: Group texts into full sequences
# Skipped: with a dataset this small there is no need for group_texts().

# STEP 6: Data collator for CausalLM (no MLM for GPT-style)
# mlm=False turns off masked-language-model masking.
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    tokenizer=tokenizer,
)

# STEP 7: Training args
training_args = TrainingArguments(
    # Output location and checkpointing.
    output_dir="./farcaster-model",
    overwrite_output_dir=True,
    save_total_limit=1,
    save_steps=1000,  # large interval: frequent checkpoints are not needed
    # Schedule: many epochs because the dataset is very small.
    per_device_train_batch_size=2,
    num_train_epochs=30,
    # Optimizer settings; a slightly lower learning rate is worth trying.
    weight_decay=0.01,
    learning_rate=5e-5,
    # Logging: report every 5 steps, with no external reporting integration.
    report_to="none",
    logging_steps=5,
)

# STEP 8: Trainer setup
# Bundle the model, its training configuration, the tokenized sentences and
# the collator into one Trainer.
trainer = Trainer(
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
    args=training_args,
    model=model,
)

# STEP 9: Train the model
trainer.train()

# STEP 10: Save trained model
# The model and the tokenizer are written to the same directory.
export_dir = "./farcaster-model"
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)
