In [None]:
# 🔧 Setup: run this cell before anything else!
# Selects the compute device, reports library versions, and seeds every RNG.
# (Nothing is installed here; torch and numpy must already be available.)

import random
import sys

import numpy as np
import torch

SEED = 42

# Probe CUDA once and reuse the answer for device selection and seeding.
_cuda_ok = torch.cuda.is_available()
device = torch.device('cuda' if _cuda_ok else 'cpu')

if not _cuda_ok:
    print("⚠️ No GPU detected. Some cells may run slowly.")
    print("   Go to Runtime → Change runtime type → GPU")
else:
    print(f"✅ GPU available: {torch.cuda.get_device_name(0)}")
    # total_memory is divided by 1e9 so the figure is shown in GB.
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

print(f"\n📦 Python {sys.version.split()[0]}")
print(f"🔥 PyTorch {torch.__version__}")

# Reproducibility: seed Python, NumPy, and PyTorch with the same value.
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
if _cuda_ok:
    torch.cuda.manual_seed_all(SEED)

print(f"🎲 Random seed set to {SEED}")

# IPython line magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Contrastive Pretraining (CLIP-style) -- Notebook Series

*A Vizuara learning path for understanding CLIP from first principles*

## Learning Path

This series of 3 notebooks takes you from the foundations of contrastive learning to building and evaluating a complete CLIP model.

### Notebook 1: Contrastive Learning and the InfoNCE Loss
- **Estimated time:** 45 minutes
- **Prerequisites:** Basic PyTorch, linear algebra
- **What you will learn:**
  - The intuition behind contrastive learning
  - Cosine similarity and embedding spaces
  - The InfoNCE loss function from scratch
  - Training a contrastive encoder on synthetic data
- **Final output:** A trained contrastive encoder with visualized embedding clusters

### Notebook 2: Building CLIP from Scratch
- **Estimated time:** 60 minutes
- **Prerequisites:** Notebook 1, basic understanding of Transformers
- **What you will learn:**
  - Vision Transformer (ViT) image encoder
  - Transformer text encoder
  - The complete CLIP dual-encoder architecture
  - Training CLIP on CIFAR-10 with synthetic captions
- **Final output:** A trained Mini-CLIP model with t-SNE visualization

### Notebook 3: Zero-Shot Transfer and Evaluation
- **Estimated time:** 45 minutes
- **Prerequisites:** Notebooks 1 and 2
- **What you will learn:**
  - Zero-shot classification using text prompts
  - Prompt engineering and ensembling
  - Image retrieval with CLIP
  - Understanding CLIP's limitations
- **Final output:** Full evaluation with confusion matrix and retrieval demos

## How to Use These Notebooks

In [None]:
# Welcome banner: where the notebooks are meant to run and in what order.
# Empty strings stand in for the blank separator lines.
print(
    "Welcome to the Vizuara CLIP series!",
    "",
    "Each notebook is designed to run in Google Colab with a T4 GPU.",
    "Training times are under 10 minutes per notebook.",
    "",
    "Start with Notebook 1 and work through them in order.",
    "Each notebook builds on concepts from the previous one.",
    sep="\n",
)

## Key References

- Radford et al., "Learning Transferable Visual Models From Natural Language Supervision" (2021) -- The original CLIP paper
- OpenCLIP: https://github.com/mlfoundations/open_clip -- Open-source implementation
- Dosovitskiy et al., "An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" (2020) -- Vision Transformer (ViT)