In [1]:
# I start by installing all the required libraries.
# - transformers: For loading the tokenizer.
# - datasets: For easily loading the COCO/Conceptual Captions dataset.
# - timm: A great library for computer vision models, used here for our ViT.
# - matplotlib: For plotting our loss curve later.
# The "-q" flag makes the installation quieter.

!pip install transformers datasets torch timm matplotlib -q

In [2]:
import torch
import timm
from datasets import load_dataset
from transformers import AutoTokenizer

# Reaching this line means none of the imports above raised an error.
print("All libraries imported successfully!")

All libraries imported successfully!


In [5]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

Using device: cpu


In [3]:
print("Loading a slice of the Conceptual Captions dataset...")
# Only the first 5,000 rows of the train split are requested via the slice
# syntax. The original note suggests "coco_captions" as an alternative
# dataset (not verified here).
dataset_id = "conceptual_captions"
split_spec = "train[:5000]"
dataset = load_dataset(dataset_id, split=split_spec)

# Show the dataset summary (features and row count), then its first record.
print("\nDataset loaded successfully!")
print(dataset)
print("\nExample entry:")
first_entry = dataset[0]
print(first_entry)

Loading a slice of the Conceptual Captions dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

unlabeled/train-00000-of-00002.parquet:   0%|          | 0.00/187M [00:00<?, ?B/s]

unlabeled/train-00001-of-00002.parquet:   0%|          | 0.00/187M [00:00<?, ?B/s]

unlabeled/validation-00000-of-00001.parq(…):   0%|          | 0.00/1.77M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3318333 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/15840 [00:00<?, ? examples/s]


Dataset loaded successfully!
Dataset({
    features: ['image_url', 'caption'],
    num_rows: 5000
})

Example entry:
{'image_url': 'http://lh6.ggpht.com/-IvRtNLNcG8o/TpFyrudaT6I/AAAAAAAAM6o/_11MuAAKalQ/IMG_3422.JPG?imgmax=800', 'caption': 'a very typical bus station'}


In [4]:
# Captions have to be turned into integer token IDs before a model can consume
# them, so this cell loads the pretrained GPT-2 tokenizer.
#
# Batching requires every sequence to be padded to a common length, which in
# turn requires a padding token. When the loaded tokenizer does not define
# one, the end-of-sequence (eos) token is reused as the pad token.

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Fall back to the eos token only when no pad token is defined.
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.pad_token = tokenizer.eos_token
    print("Tokenizer did not have a pad_token, setting it to eos_token.")

# Report the final padding token and the vocabulary size.
summary_lines = (
    "\nTokenizer loaded and configured:",
    f"  - Pad token: '{tokenizer.pad_token}' (ID: {tokenizer.pad_token_id})",
    f"  - Vocab size: {tokenizer.vocab_size}",
)
for line in summary_lines:
    print(line)


Loading tokenizer...


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Tokenizer did not have a pad_token, setting it to eos_token.

Tokenizer loaded and configured:
  - Pad token: '<|endoftext|>' (ID: 50256)
  - Vocab size: 50257


In [6]:
# Image side of the model: a pre-trained Vision Transformer (ViT) taken from
# the `timm` library.
#
# What happens below:
# 1. `pretrained=True` loads published weights for the chosen checkpoint
#    (the `in21k` tag in its name refers to ImageNet-21k), so the backbone
#    does not have to be trained from scratch.
# 2. `eval()` puts the model in evaluation mode, since it is not trained here.
# 3. Gradients are disabled on every parameter, which freezes the encoder:
#    its weights will not be updated during training, leaving only the AR
#    decoder (built in a later phase) to be trained. This keeps the smoke
#    test faster and more stable.

print("Loading pre-trained Vision Transformer (ViT)...")
# Base-sized ViT with 16x16 patches.
vit_checkpoint = 'vit_base_patch16_224.augreg_in21k'
vision_encoder = timm.create_model(vit_checkpoint, pretrained=True)
vision_encoder = vision_encoder.to(device)

# Evaluation mode, then freeze every parameter in one call.
vision_encoder.eval()
vision_encoder.requires_grad_(False)

print("\nVision encoder loaded and frozen successfully!")
architecture_name = vision_encoder.default_cfg['architecture']
print(f"Model: {architecture_name}")

# Uncomment to print the full module structure of the model.
# print(vision_encoder)

Loading pre-trained Vision Transformer (ViT)...


model.safetensors:   0%|          | 0.00/410M [00:00<?, ?B/s]


Vision encoder loaded and frozen successfully!
Model: vit_base_patch16_224


In [8]:
# Recap of what the earlier cells set up: device, dataset, tokenizer and
# vision encoder. Each entry is printed on its own line, in order.
phase0_report = [
    "\n\n--- Phase 0 Setup Complete! ---",
    "You now have:",
    f"1. A working environment on device: {device}",
    f"2. A dataset with {len(dataset)} examples.",
    f"3. A configured tokenizer with a vocab size of {tokenizer.vocab_size}.",
    "4. A powerful, frozen vision encoder ready to create image embeddings.",
    "\n Proceeding to Phase 1: Building the AR Decoder.",
]
for report_line in phase0_report:
    print(report_line)



--- Phase 0 Setup Complete! ---
You now have:
1. A working environment on device: cpu
2. A dataset with 5000 examples.
3. A configured tokenizer with a vocab size of 50257.
4. A powerful, frozen vision encoder ready to create image embeddings.

 Proceeding to Phase 1: Building the AR Decoder.
