In [None]:
# 🔧 Setup: Run this cell first!
# Check GPU availability, report library versions, and seed the RNGs for reproducibility

import torch
import sys

# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
has_gpu = torch.cuda.is_available()
device = torch.device('cuda' if has_gpu else 'cpu')

if not has_gpu:
    # Colab-oriented hint on how to switch the runtime to a GPU.
    print("⚠️ No GPU detected. Some cells may run slowly.")
    print("   Go to Runtime → Change runtime type → GPU")
else:
    # Report the name and total memory (in decimal gigabytes) of GPU 0.
    print(f"✅ GPU available: {torch.cuda.get_device_name(0)}")
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"   Memory: {gpu_props.total_memory / 1e9:.1f} GB")

# Show interpreter and framework versions (first token of sys.version is the Python version).
python_version = sys.version.split()[0]
print(f"\n📦 Python {python_version}")
print(f"🔥 PyTorch {torch.__version__}")

# Set random seeds for reproducibility
import random
import numpy as np

SEED = 42

# Seed the Python, NumPy, and PyTorch random number generators with the same value.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)

# When a GPU is present, explicitly seed every CUDA device as well.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print(f"🎲 Random seed set to {SEED}")

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Multimodal Fusion Architectures — Notebook Series

*Vizuara AI*

## Welcome!

This series of 3 notebooks will take you from the fundamentals of multimodal fusion all the way to building real vision-language architectures (LLaVA, Flamingo) and training them with contrastive learning.

Each notebook is self-contained and can be run independently in Google Colab with a T4 GPU.

## Learning Path

| # | Notebook | What You Will Build | Time |
|---|----------|-------------------|------|
| 1 | **Fusion Strategies from First Principles** | Early, Late, and Cross-Attention fusion models trained on a synthetic multimodal task | 45 min |
| 2 | **Building LLaVA and Flamingo from Scratch** | Simplified LLaVA and Flamingo architectures for visual question answering on CIFAR-10 | 60 min |
| 3 | **Training Multimodal Models** | A mini-CLIP model trained with contrastive loss for zero-shot image retrieval | 50 min |

## Prerequisites

- Basic PyTorch (tensors, nn.Module, training loops)
- Understanding of Transformer self-attention (recommended but not required — we explain cross-attention from scratch)
- Familiarity with CNNs (convolutions, pooling)

## How to Use

1. Open each notebook in Google Colab
2. Run cells sequentially from top to bottom
3. Complete the TODO exercises (they have scaffolding and verification cells)
4. Reflect on the questions at the end of each notebook

In [None]:
# Closing pointer for the series landing page: emit both lines with one call.
print(
    "Ready to explore multimodal fusion architectures!",
    "Start with Notebook 01: Fusion Strategies from First Principles",
    sep="\n",
)