In [None]:
# 🔧 Setup: run this cell first!
# Picks the compute device, reports the Python/PyTorch versions, and seeds the
# Python, NumPy and PyTorch random number generators.
# (Nothing is installed here; the imported packages must already be available.)

import random
import sys

import numpy as np
import torch

# One seed shared by all the random number generators seeded below.
SEED = 42

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if device.type == 'cuda':
    print(f"✅ GPU available: {torch.cuda.get_device_name(0)}")
    # total_memory is divided by 1e9, i.e. reported in decimal gigabytes.
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    print(
        "⚠️ No GPU detected. Some cells may run slowly.",
        "   Go to Runtime → Change runtime type → GPU",
        sep="\n",
    )

# Interpreter and library versions (leading newline separates them from the device report).
print(
    f"\n📦 Python {sys.version.split()[0]}",
    f"🔥 PyTorch {torch.__version__}",
    sep="\n",
)

# Seed Python, NumPy and PyTorch for reproducibility.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if device.type == 'cuda':
    # Seed the CUDA generators as well, only when a GPU is in use.
    torch.cuda.manual_seed_all(SEED)

print(f"🎲 Random seed set to {SEED}")

%matplotlib inline

# Multimodal Instruction Tuning -- Vizuara Notebook Series

Welcome to the Vizuara notebook series on **Multimodal Instruction Tuning**. These notebooks teach you how to build a LLaVA-style multimodal model from first principles.

## Learning Path

| # | Notebook | Topic | Time |
|---|----------|-------|------|
| 1 | `01_multimodal_projection.ipynb` | Building the bridge between the vision encoder and the language model | 45 min |
| 2 | `02_instruction_tuning_pipeline.ipynb` | Two-stage training: alignment + instruction tuning | 50 min |
| 3 | `03_cross_modal_attention.ipynb` | Visualizing how language models learn to see | 40 min |

## Prerequisites

- Basic understanding of Transformers (self-attention, embeddings)
- Familiarity with PyTorch (nn.Module, training loops)
- Understanding of Vision Transformers (helpful but not required)

## What You Will Build

By the end of this series, you will have:

1. **Built a multimodal projection layer** that bridges vision features into language model token space
2. **Implemented the two-stage training pipeline** used by LLaVA (feature alignment + instruction tuning)
3. **Visualized cross-modal attention** to understand how text tokens "see" image patches
4. **Trained a working visual question answering model** from scratch

## How to Use These Notebooks

Each notebook is self-contained and runs in Google Colab with a T4 GPU. Open each notebook in order and run all cells. Look for the TODO sections where you implement key components yourself.

In [None]:
# Index of the notebooks in this series: display title -> notebook filename.
# (Only titles and filenames are printed; no Colab URLs are generated here.)
notebooks = {
    "01 - Multimodal Projection": "01_multimodal_projection.ipynb",
    "02 - Instruction Tuning Pipeline": "02_instruction_tuning_pipeline.ipynb",
    "03 - Cross-Modal Attention": "03_cross_modal_attention.ipynb",
}

# Banner and rule, then one indented entry per notebook followed by a blank line.
print("Vizuara Notebook Series: Multimodal Instruction Tuning", "=" * 55, sep="\n")
for title, filename in notebooks.items():
    print(f"  {title}", f"    File: {filename}", "", sep="\n")