In [2]:
# CELL 1: Install necessary libraries for GPT-2 fine-tuning
# transformers: for model architecture, datasets: to load story data,


!pip install -q transformers datasets accelerate

In [3]:
# CELL 2: Importing Essential Modules
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset

# Print confirmation of successful imports
print("Libraries imported successfully.")

Libraries imported successfully.


In [4]:
# CELL 3: GPU Verification
# This checks if Google Colab has assigned a GPU (CUDA) for execution.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Execution will run on: {device}")

Execution will run on: cuda


In [5]:
# CELL 4: Setting Random Seeds for Reproducibility
# This is crucial for assignments so all students get consistent creative results.
import random
import numpy as np
import os

seed_value = 42

# 1. Set seed for Python's built-in random module
random.seed(seed_value)

# 2. Set seed for NumPy (used for data processing)
np.random.seed(seed_value)

# 3. Set seed for PyTorch
torch.manual_seed(seed_value)

# 4. Set seed for environmental hash
os.environ['PYTHONHASHSEED'] = str(seed_value)

# 5. Additional configuration for GPU (if available)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    # Ensure deterministic behavior for the backend
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

print(f"Random seed set to: {seed_value}")

Random seed set to: 42


In [7]:
# CELL 5: Defining the Model Name
# We are using 'distilgpt2', a faster and lighter version of GPT-2.
# This ensures a "long and proper execution" that is efficient for students.

model_name = "distilgpt2"

print(f"Target model for fine-tuning set to: {model_name}")

Target model for fine-tuning set to: distilgpt2


In [8]:
# CELL 6: Initializing the Tokenizer
# We load the tokenizer for 'distilgpt2' to convert text into token IDs.
# The tokenizer ensures our story data is formatted correctly for the model.

from transformers import GPT2Tokenizer

# Load the pre-trained tokenizer matching our model name
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Display tokenizer details to confirm successful loading
print(f"Tokenizer loaded for: {model_name}")
print(f"Vocabulary Size: {tokenizer.vocab_size}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]



vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Tokenizer loaded for: distilgpt2
Vocabulary Size: 50257


In [9]:
# CELL 7: Loading the Pre-trained DistilGPT2 Model
# We load the weights for our chosen model and move it to our
# designated device (GPU or CPU) for processing.

from transformers import GPT2LMHeadModel

# Load the pre-trained model weights
model = GPT2LMHeadModel.from_pretrained(model_name)

# Move the model to the device identified in Cell 4
model.to(device)

print(f"Model {model_name} loaded and moved to {device} successfully.")

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/76 [00:00<?, ?it/s]

GPT2LMHeadModel LOAD REPORT from: distilgpt2
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
transformer.h.{0, 1, 2, 3, 4, 5}.attn.bias | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Model distilgpt2 loaded and moved to cuda successfully.


In [10]:
# CELL 8: Configuring Special Tokens and Padding
# GPT-2 does not have a pad_token by default. We set it to the eos_token
# so the model knows where a story ends and can handle batching.

# Set the padding token to the EOS token
tokenizer.pad_token = tokenizer.eos_token

# Configure the model to use the same padding token ID
model.config.pad_token_id = model.config.eos_token_id

print("Special tokens configured.")
print(f"Pad Token: {tokenizer.pad_token}")
print(f"Pad Token ID: {tokenizer.pad_token_id}")

Special tokens configured.
Pad Token: <|endoftext|>
Pad Token ID: 50256


In [11]:
# CELL 9: Downloading the TinyStories Dataset
# We load the 'train' split of the TinyStories dataset.
# We take 1% of the data to ensure the notebook remains responsive
# while still providing enough data for high-quality fine-tuning.

from datasets import load_dataset

# Load the dataset directly from the Hugging Face Hub
raw_datasets = load_dataset("roneneldan/TinyStories", split='train[:1%]')

print("Dataset downloaded successfully.")
print(f"Number of stories loaded: {len(raw_datasets)}")

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00004-2d5a1467fff108(…):   0%|          | 0.00/249M [00:00<?, ?B/s]

data/train-00001-of-00004-5852b56a2bd28f(…):   0%|          | 0.00/248M [00:00<?, ?B/s]

data/train-00002-of-00004-a26307300439e9(…):   0%|          | 0.00/246M [00:00<?, ?B/s]

data/train-00003-of-00004-d243063613e5a0(…):   0%|          | 0.00/248M [00:00<?, ?B/s]

data/validation-00000-of-00001-869c898b5(…):   0%|          | 0.00/9.99M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2119719 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/21990 [00:00<?, ? examples/s]

Dataset downloaded successfully.
Number of stories loaded: 21197


In [12]:
# CELL 10: Data Exploration
# Let's look at the first few stories to understand the structure
# of the data our model will be learning from.

print("--- Sample Story 1 ---")
print(raw_datasets[0]['text'])

print("\n--- Sample Story 2 ---")
print(raw_datasets[1]['text'])

--- Sample Story 1 ---
One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.

Lily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."

Together, they shared the needle and sewed the button on Lily's shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.

--- Sample Story 2 ---
Once upon a time, there was a little car named Beep. Beep loved to go fast and play in the sun. Beep was a healthy car because he always had good fuel. Good fuel made Beep happy and strong.

One day, Beep was driving in the park when he saw a big tree

In [13]:
# CELL 11: Defining the Tokenization Function
# This function will be mapped across our dataset to convert raw text
# into input_ids and attention_mask tensors.

def tokenize_function(examples):
    # We tokenize the 'text' field of our dataset.
    # truncation=True ensures we don't exceed the model's context window.
    return tokenizer(examples["text"], truncation=True)

print("Tokenization function defined.")

Tokenization function defined.


In [14]:
# CELL 12: Mapping the Tokenizer across the Dataset
# This step applies our tokenization function to every story in the dataset.
# 'batched=True' speeds up the process by handling multiple stories at once.

tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"]
)

print("Tokenization complete.")
print(f"Structure of tokenized data: {tokenized_datasets.column_names}")

Map:   0%|          | 0/21197 [00:00<?, ? examples/s]

Tokenization complete.
Structure of tokenized data: ['input_ids', 'attention_mask']


In [15]:
# CELL 13: Formatting the Dataset for PyTorch
# We cast the tokenized data into PyTorch tensors so the model
# can process them during the fine-tuning stage.

# Set the format to 'torch' and specify which columns to keep
tokenized_datasets.set_format(
    type='torch',
    columns=['input_ids', 'attention_mask']
)

print("Dataset format set to PyTorch tensors.")
# Verify the type of the first element to confirm the change
print(f"Type of first input_ids: {type(tokenized_datasets[0]['input_ids'])}")

Dataset format set to PyTorch tensors.
Type of first input_ids: <class 'torch.Tensor'>


In [16]:
# CELL 14: Grouping Stories into Blocks
# First, we reset the format to 'python' so the function can handle lists
tokenized_datasets.reset_format()

block_size = 128

def group_texts(examples):
    # Concatenate all sequences in the batch
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])

    # Drop the remainder to keep blocks uniform
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size

    # Split into chunks of block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }

    # Set labels for Causal Language Modeling
    result["labels"] = result["input_ids"].copy()
    return result

# Apply the mapping to the tokenized dataset
lm_datasets = tokenized_datasets.map(group_texts, batched=True)

# NOW set the format to PyTorch for the final training-ready dataset
lm_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

print(f"Grouping complete. New dataset size: {len(lm_datasets)} blocks.")

Map:   0%|          | 0/21197 [00:00<?, ? examples/s]

Grouping complete. New dataset size: 36781 blocks.


In [1]:
# Updated CELL 12 for GPU
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load model and move it to the GPU
model = AutoModelForCausalLM.from_config(config).to(device)

print(f"Model is now running on: {device.upper()}")

NameError: name 'AutoModelForCausalLM' is not defined

In [17]:
# CELL 15: Defining Training Arguments
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",          # Directory for model checkpoints
    eval_strategy="no",              # Updated keyword from evaluation_strategy
    learning_rate=5e-5,              # How fast the model adjusts weights
    per_device_train_batch_size=4,   # Small batch size to fit in memory
    num_train_epochs=1,              # One full pass through the dataset
    weight_decay=0.01,               # Regularization to prevent over-fitting
    logging_steps=100,               # Print progress every 100 steps
    save_total_limit=1,              # Keep only the latest model to save space
    push_to_hub=False,
    report_to="none"
)

print("Training arguments configured.")

Training arguments configured.


In [18]:
# CELL 16: Initializing the Trainer
from transformers import Trainer

# The Trainer handles the training loop, including batching and backpropagation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets,
)

print("Trainer initialized and ready for training.")

Trainer initialized and ready for training.


In [19]:
# CELL 17: Starting the Training Process
# This command triggers the training loop defined in the Trainer.
# Monitor the progress bar below; it will show the loss decreasing over time.

print("Starting training... (This will take a long time on CPU)")
trainer.train()

print("Training completed!")

Starting training... (This will take a long time on CPU)


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,2.72491
200,2.578532
300,2.515604
400,2.460284
500,2.431438
600,2.382959
700,2.384184
800,2.349761
900,2.340253
1000,2.338339


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Training completed!


In [20]:
# CELL 18: Saving the Final Model
# We save both the model and the tokenizer into a dedicated folder.
model_save_path = "./fine_tuned_tiny_stories"

# Save the model weights
model.save_pretrained(model_save_path)

# Save the tokenizer (essential for text generation)
tokenizer.save_pretrained(model_save_path)

print(f"Model and tokenizer successfully saved to {model_save_path}")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Model and tokenizer successfully saved to ./fine_tuned_tiny_stories


In [21]:
# CELL 19: Generating a Story
from transformers import pipeline

# Initialize the generation pipeline using your saved model
generator = pipeline(
    "text-generation",
    model=model_save_path,
    tokenizer=model_save_path,
    device=0 if torch.cuda.is_available() else -1
)

# Set a starting prompt
prompt = "Once upon a time, a little bird named Pip"

# Generate text
output = generator(
    prompt,
    max_length=100,
    num_return_sequences=1,
    temperature=0.7,  # Controls randomness (higher = more creative)
    truncation=True
)

print("\n--- GENERATED STORY ---")
print(output[0]['generated_text'])

Loading weights:   0%|          | 0/76 [00:00<?, ?it/s]

Passing `generation_config` together with generation-related arguments=({'max_length', 'num_return_sequences', 'temperature'}) is deprecated and will be removed in future versions. Please pass either a `generation_config` object OR all generation parameters explicitly, but not both.
Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



--- GENERATED STORY ---
Once upon a time, a little bird named Pip lived in a big tree branch. Pip loved to explore the tree and see the birds. One day, Pip saw a big tree with a long branch that was broken. Pip wanted to help the bird, so he hopped over to the branch and tried to get it back. But the branch was broken and the bird was lost. Pip was sad and didn't know what to do.

The next day, Pip went to the tree branch and saw a big, bright, shiny tree with many branches. It was so big that it looked like a bird. Pip thought it was a bird, so he hopped over to the branch and tried to get it back. But the bird was too big for him and he could not fly. He tried and tried, but the branch was too high for him. The bird was sad and couldn't reach Pip. Pip tried and tried, but the tree branch was too high for him. He tried and tried, but the branch was too high for him. The bird was too high for him to do.

Pip was sad and didn't know what to do. He looked around and saw a big, blue tree

In [22]:
# CELL 20: Project Completion
import torch
import gc

# Clear the model from GPU/RAM to free up resources
if 'model' in locals():
    del model
if 'trainer' in locals():
    del trainer

gc.collect()
torch.cuda.empty_cache()

print("Project Completed! Your model is saved in './fine_tuned_tiny_stories'.")
print("You can now download that folder to your local computer if you wish.")

Project Completed! Your model is saved in './fine_tuned_tiny_stories'.
You can now download that folder to your local computer if you wish.


In [23]:
# CELL 19: Generating a Story with Improved Settings
from transformers import pipeline

# Initialize the generation pipeline
generator = pipeline(
    "text-generation",
    model=model_save_path,
    tokenizer=model_save_path,
    device=0 if torch.cuda.is_available() else -1
)

# New prompt for Pip
prompt = "Once upon a time, a little bird named Pip"

# Generate text with repetition control
output = generator(
    prompt,
    max_new_tokens=150,      # Generate up to 150 new words
    temperature=0.8,         # Adds a bit of creativity
    repetition_penalty=1.2,  # Discourages repeating the same words
    no_repeat_ngram_size=2,  # Prevents any 2-word phrase from repeating
    do_sample=True,          # Allows for more varied word choices
    truncation=True
)

print("\n--- IMPROVED GENERATED STORY ---")
print(output[0]['generated_text'])

Loading weights:   0%|          | 0/76 [00:00<?, ?it/s]

Passing `generation_config` together with generation-related arguments=({'repetition_penalty', 'temperature', 'no_repeat_ngram_size', 'max_new_tokens', 'do_sample'}) is deprecated and will be removed in future versions. Please pass either a `generation_config` object OR all generation parameters explicitly, but not both.
Both `max_new_tokens` (=150) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



--- IMPROVED GENERATED STORY ---
Once upon a time, a little bird named Pip lived in an ordinary house. One day, while playing outside, she heard a noise. She stopped and looked around, but couldn't see anything. The sound made her feel sad. 
Pip thought about how strange it was that people would act like she loved them. So, just then something exciting happened. Little birds were flying by and coming to the tree. All of a sudden they heard different noises coming from the living room! The little girl was so happy and she jumped onto top-down and ran over to take a closer look. Suddenly one loud roar came out for everyone. Pip was scared and didnâ€™t know where it started because she was stuck in the attic. Finally getting up, as soon when he finally


In [24]:
# CELL 20: Project Completion and Cleanup
import torch
import gc

# 1. Delete the large objects from memory
if 'model' in locals():
    del model
if 'trainer' in locals():
    del trainer
if 'generator' in locals():
    del generator

# 2. Force Python to clear unused memory (Garbage Collection)
gc.collect()

# 3. Specifically clear the GPU cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("✅ Project Completed!")
print("✅ Model weights released from Memory.")
print(f"✅ Your model is safely stored in: {model_save_path}")

✅ Project Completed!
✅ Model weights released from Memory.
✅ Your model is safely stored in: ./fine_tuned_tiny_stories
