In [None]:
from datasets import load_dataset

# Fetch the TinyStories corpus via the `datasets` library.
dataset = load_dataset("roneneldan/TinyStories")

# Sanity check: print the first record of the training split to see its fields.
train_split = dataset["train"]
print(train_split[0])

In [None]:
import torch

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f'Running on: {device}')

In [None]:
from transformers import AutoTokenizer

# Use GPT-Neo tokenizer (standard practice for this replication)
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
# Reuse EOS as the padding token so that batches can be padded later on.
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize a batch of stories, truncating each one to at most 512 tokens.

    Args:
        examples: A batch from `dataset.map(..., batched=True)`; its "text"
            entry holds the raw story strings.

    Returns:
        The tokenizer output for the batch. Sequences keep their natural
        (variable) length: no padding is applied here, so the data collator
        can pad each batch only up to its own longest sequence instead of
        storing and training on 512-token rows full of padding.
    """
    return tokenizer(examples["text"], truncation=True, max_length=512)

# Tokenize every split of the dataset (this is the full corpus, not a subset),
# spreading the work over 16 worker processes.
tokenized_datasets = dataset.map(tokenize_function, batched=True, num_proc=16)

In [None]:
# Display the tokenized dataset's repr to inspect the result of the map step above.
tokenized_datasets

In [None]:
from transformers import AutoModelForCausalLM, GPTNeoConfig

# Architecture of the small GPT-Neo model that is trained from scratch below.
config = GPTNeoConfig(
    vocab_size=len(tokenizer),          # one embedding row per tokenizer entry
    max_position_embeddings=512,        # same limit as the tokenization max_length
    hidden_size=64,                     # embedding / residual-stream width
    num_layers=8,                       # 8 transformer blocks
    num_heads=16,                       # 16 attention heads -> 64 / 16 = 4 dims per head
    attention_types=[[['local'], 8]],   # the pattern ['local'] repeated 8 times: one entry per layer
)

# Build the model from the config only: weights are freshly initialized,
# nothing is loaded from a pre-trained checkpoint.
model = AutoModelForCausalLM.from_config(config)

param_count_millions = model.num_parameters() / 1_000_000
print(f"Model parameters: {param_count_millions:.2f}M")

In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Causal-LM collator (mlm=False): builds `labels` from `input_ids` and pads each
# batch to its longest sequence (a no-op if the examples already share one length).
# NOTE(review): the collator excludes pad-token positions from the loss, and the
# pad token is set to EOS in the tokenizer cell, so EOS positions are presumably
# ignored during training as well -- confirm this is intended.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training Arguments
training_args = TrainingArguments(
    output_dir="./tiny-stories-model",
    num_train_epochs=1,              # 1 epoch is often enough for this dataset to see convergence
    per_device_train_batch_size=8,   # Reduced batch size to mitigate OutOfMemoryError
    save_steps=5000,                 # write a checkpoint every 5000 optimizer steps
    logging_steps=500,               # log the training loss every 500 steps
    learning_rate=5e-4,              # Slightly higher LR for small models
    weight_decay=0.01,
    push_to_hub=False,
    # Mixed precision only when a CUDA GPU is present; requesting fp16 on a
    # CPU-only runtime makes TrainingArguments raise instead of training.
    fp16=torch.cuda.is_available(),
)

# The validation split is registered for `trainer.evaluate()`; no periodic
# evaluation is configured in the arguments above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
)

In [None]:
import os

# Checkpoint to resume from. This path is specific to the runtime that produced
# it, so it is only used when the directory actually exists.
checkpoint_path = "/content/tiny-stories-model/checkpoint-55000"

if os.path.isdir(checkpoint_path):
    # Continue the interrupted run from the saved checkpoint.
    print(f"Resuming training from {checkpoint_path}")
    trainer.train(resume_from_checkpoint=checkpoint_path)
else:
    # No checkpoint on this machine (e.g. a fresh runtime): start from scratch
    # instead of failing on the missing directory.
    print(f"No checkpoint found at {checkpoint_path}; training from scratch")
    trainer.train()

In [None]:
# Move model to evaluation mode
model.eval()

# Prompt to continue; swap in the commented alternative for a typical TinyStories opening.
prompt = 'A cat performs a new trick for her friends but starts shivering.'

# prompt = "Once upon a time, there was a little girl named Lily."
# Encode the prompt and place the tensors on the same device as the model.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate
outputs = model.generate(
    **inputs,                             # pass input_ids together with the attention_mask
    max_length=200,                       # total length cap: prompt tokens + generated tokens
    do_sample=True,                       # sample rather than decode greedily
    temperature=0.7,
    top_k=50,
    # State the padding id explicitly: pad and EOS share one id here, so
    # generate() cannot infer it and would otherwise warn on every call.
    pad_token_id=tokenizer.eos_token_id,
)

# Decode the first (only) returned sequence, dropping special tokens such as EOS.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))