# Phase 1: Static ASL Training & Ablations

### Imports

In [3]:
# Standard PyTorch + Torchvision stack
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Subset
from torchvision import datasets, transforms, models

# Seed every RNG seeded here (Python `random`, torch CPU, torch CUDA) with the
# same value so that repeated runs start from an identical state.
import random

SEED = 1337
for _seed_rng in (random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_rng(SEED)

# Note: For complete reproducibility, you may also need:
# torch.backends.cudnn.deterministic = True
# torch.backends.cudnn.benchmark = False

# Pick the GPU when CUDA is available, otherwise fall back to the CPU
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
DEVICE = torch.device(_device_name)
print('Using device:', DEVICE)

# Loading data
import numpy as np
import kagglehub
import os
from sklearn.model_selection import train_test_split

Using device: cuda


### ResNet-18

In [4]:
# Load ResNet-18 initialised from torchvision's IMAGENET1K_V1 pretrained weights.
# NOTE(review): `res18` is not referenced by any later cell visible here; the
# "Adapt ResNet-18 for ASL" cell builds its own `model` from the same weights.
# Confirm whether this extra copy is still needed.
res18 = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)

### Preprocessing

In [5]:
# ImageNet channel-wise statistics (computed over millions of images).
# Passed to transforms.Normalize in the preprocessing cell for both the
# training and the validation pipeline.
IMAGENET_MEAN = [0.485, 0.456, 0.406]  # Mean per channel (R, G, B)
IMAGENET_STD  = [0.229, 0.224, 0.225]  # Std dev per channel (R, G, B)

In [None]:
IMG_SIZE = 224
BATCH_SIZE = 64
# NOTE(review): this rebinds the SEED = 1337 defined in the imports cell. The
# RNGs were already seeded with 1337 there, so from this point SEED only drives
# the train/val split below. Consider a dedicated name (e.g. SPLIT_SEED).
SEED = 429
LIMIT_PER_CLASS = 5

# Download data (the returned location is joined with the train sub-folder)
path = kagglehub.dataset_download("grassknoted/asl-alphabet")
data_dir = os.path.join(path, "asl_alphabet_train", "asl_alphabet_train")

# Training transforms
train_tf = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)
])

# Validation transforms (currently identical to training: no augmentation)
val_tf = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD)
])

# Transform-free view of the dataset, used only to read labels/class names
raw_dataset = datasets.ImageFolder(root=data_dir)

# Keep the first LIMIT_PER_CLASS images of every class (in ImageFolder order,
# i.e. not a random sample of each class)
indices_to_use = []
targets = np.array(raw_dataset.targets)
classes = np.unique(targets)

for cls in classes:
    cls_indices = np.where(targets == cls)[0]
    indices_to_use.extend(cls_indices[:LIMIT_PER_CLASS])

# Labels of the selected images, needed for the stratified split
subset_labels = targets[indices_to_use]

# 80/20 split, stratified so every class appears in both partitions
train_idx, val_idx = train_test_split(
    indices_to_use,
    test_size=0.2,
    stratify=subset_labels,
    random_state=SEED
)
assert set(map(int, train_idx)).isdisjoint(map(int, val_idx)), \
    "train and validation indices overlap"

# One ImageFolder per transform pipeline. Each construction re-scans the image
# directory, so the dataset objects are built once and reused.
full_train_source = datasets.ImageFolder(root=data_dir, transform=train_tf)
full_val_source   = datasets.ImageFolder(root=data_dir, transform=val_tf)

# Aliases kept so that code referring to the older names keeps working
train_source = full_train_source
val_source   = full_val_source

# Final subsets
train_ds = Subset(full_train_source, train_idx)
val_ds   = Subset(full_val_source, val_idx)

# Subset does not expose class names, so attach them for convenience
train_ds.classes = full_train_source.classes
val_ds.classes = full_val_source.classes

# Dataloaders (pinned host memory only helps when batches are copied to a GPU)
PIN_MEMORY = torch.cuda.is_available()
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=PIN_MEMORY)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=PIN_MEMORY)

# Derived from the dataset instead of hard-coded, so the classifier head always
# matches the folders actually present on disk
NUM_CLASSES = len(raw_dataset.classes)
print(f' Dataset: {len(train_ds):,} train, {len(val_ds):,} val')
print(f' Classes: {train_ds.classes}')

Using Colab cache for faster access to the 'asl-alphabet' dataset.
 Dataset: 116 train, 29 val
 Classes: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'del', 'nothing', 'space']


### Adapt ResNet-18 for ASL

In [10]:
def _report_fc(heading, note):
    """Print the in/out feature sizes of the current `model.fc` under `heading`."""
    print(heading)
    print(f"  Input features: {model.fc.in_features}")
    print(f"  Output features: {model.fc.out_features} ({note})")


# Backbone initialised from the ImageNet-pretrained checkpoint
model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)

# Show the stock classifier before touching it
_report_fc(" Original FC layer:", "ImageNet classes")

# Swap in a fresh linear head: same input width as the stock head, one output
# per target class (NUM_CLASSES comes from the preprocessing cell)
model.fc = nn.Linear(model.fc.in_features, NUM_CLASSES)

_report_fc("\n New FC layer:", "our classes")

# Place the whole network on the selected device
model = model.to(DEVICE)

 Original FC layer:
  Input features: 512
  Output features: 1000 (ImageNet classes)

 New FC layer:
  Input features: 512
  Output features: 29 (our classes)


### Freezing and Unfreezing

In [None]:
def set_requires_grad(module: nn.Module, requires_grad: bool):
    """
    Toggle gradient tracking on every parameter reachable from `module`.

    Args:
        module: PyTorch module (layer, block, or entire model)
        requires_grad: True to make the parameters trainable, False to freeze them

    Prints one status line with the module's class name and parameter count.
    """
    n_params = 0
    for tensor in module.parameters():
        tensor.requires_grad = requires_grad
        n_params += tensor.numel()

    # Report what was changed
    if requires_grad:
        label = "UNFROZEN (trainable)"
    else:
        label = "FROZEN"
    print(f"  {module.__class__.__name__}: {n_params:,} parameters {label}")

 Freezing entire model...
  ResNet: 11,191,389 parameters FROZEN

 Unfreezing only the FC layer...
  Linear: 14,877 parameters UNFROZEN (trainable)

 Trainable: 14,877 / 11,191,389 parameters (0.132933%)


### Training and Evaluation Functions

In [13]:
# Default loss shared by train_one_epoch() and evaluate()
criterion = nn.CrossEntropyLoss()

def train_one_epoch(model, loader, optimizer, loss_fn=None, device=None):
    """
    Train `model` for one pass over `loader`.

    Args:
        model: network to optimise; switched to training mode by this call.
        loader: iterable yielding (images, labels) batches; must support len()
            for the progress message.
        optimizer: optimizer whose step() is called once per batch.
        loss_fn: optional loss callable taking (logits, labels). Defaults to the
            module-level `criterion`.
        device: optional device each batch is moved to. Defaults to the
            module-level `DEVICE`.

    Returns:
        tuple: (average_loss, accuracy), both averaged over samples.

    Raises:
        ValueError: if `loader` yields no samples.
    """
    # Fall back to the notebook-level defaults only when no override is given,
    # so the function can also be used without those globals being defined.
    loss_fn = criterion if loss_fn is None else loss_fn
    device = DEVICE if device is None else device

    # NOTE(review): train() switches every submodule to training mode, including
    # BatchNorm layers whose parameters were frozen via requires_grad=False;
    # their running statistics presumably still update. Confirm this is intended
    # for head-only fine-tuning.
    model.train()  # Enable dropout, batch norm training mode

    total_samples = 0
    correct_predictions = 0
    running_loss = 0.0

    for batch_idx, (images, labels) in enumerate(loader):
        # Move data to device (GPU/CPU)
        images, labels = images.to(device), labels.to(device)

        # Forward pass
        optimizer.zero_grad()  # Clear previous gradients
        logits = model(images)
        loss = loss_fn(logits, labels)

        # Backward pass
        loss.backward()  # Compute gradients
        optimizer.step()  # Update weights

        # Track metrics (loss is weighted by batch size so that the final
        # average is per sample even when the last batch is smaller)
        batch_size = images.size(0)
        batch_loss = loss.item()
        running_loss += batch_loss * batch_size
        predictions = logits.argmax(dim=1)
        correct_predictions += (predictions == labels).sum().item()
        total_samples += batch_size

        # Optional: Print progress
        if batch_idx % 100 == 0:
            print(f"    Batch {batch_idx}/{len(loader)}, "
                  f"Loss: {batch_loss:.4f}")

    if total_samples == 0:
        raise ValueError("train_one_epoch: loader yielded no samples")

    avg_loss = running_loss / total_samples
    accuracy = correct_predictions / total_samples
    return avg_loss, accuracy

@torch.no_grad()  # Decorator disables gradient computation
def evaluate(model, loader, loss_fn=None, device=None):
    """
    Evaluate `model` on a validation/test loader without updating weights.

    Args:
        model: network to evaluate; switched to eval mode by this call.
        loader: iterable yielding (images, labels) batches.
        loss_fn: optional loss callable taking (logits, labels). Defaults to the
            module-level `criterion`.
        device: optional device each batch is moved to. Defaults to the
            module-level `DEVICE`.

    Returns:
        tuple: (average_loss, accuracy), both averaged over samples.

    Raises:
        ValueError: if `loader` yields no samples.
    """
    # Fall back to the notebook-level defaults only when no override is given,
    # so the function can also be used without those globals being defined.
    loss_fn = criterion if loss_fn is None else loss_fn
    device = DEVICE if device is None else device

    model.eval()  # Disable dropout, batch norm eval mode

    total_samples = 0
    correct_predictions = 0
    running_loss = 0.0

    for images, labels in loader:
        images, labels = images.to(device), labels.to(device)

        # Forward pass only (no backward)
        logits = model(images)
        loss = loss_fn(logits, labels)

        # Track metrics (loss is weighted by batch size so that the final
        # average is per sample even when the last batch is smaller)
        batch_size = images.size(0)
        running_loss += loss.item() * batch_size
        predictions = logits.argmax(dim=1)
        correct_predictions += (predictions == labels).sum().item()
        total_samples += batch_size

    if total_samples == 0:
        raise ValueError("evaluate: loader yielded no samples")

    avg_loss = running_loss / total_samples
    accuracy = correct_predictions / total_samples
    return avg_loss, accuracy

### Phase 1.1: Head-Only Fine-Tuning

In [14]:
# Hyperparameters for Phase 1
EPOCHS_HEAD_ONLY = 3
LR_HEAD = 1e-3

print("\n" + "="*60)
print(" PHASE 1: HEAD-ONLY FINE-TUNING")
print("="*60)

# Step 1: Freeze entire model
print("\n Freezing all layers...")
set_requires_grad(model, False)

# Step 2: Unfreeze only the classifier head
print("\n Unfreezing classifier head...")
set_requires_grad(model.fc, True)

# Step 3: Create optimizer for ONLY trainable parameters.
# A list (rather than a one-shot filter() iterator) stays usable after the
# optimizer has consumed it, e.g. for the parameter count printed below.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.Adam(trainable_params, lr=LR_HEAD)

print(f"\n Optimizer setup:")
print(f"   Learning rate: {LR_HEAD}")
print(f"   Trainable params: {sum(p.numel() for p in trainable_params):,}")

# Step 4: Training loop
# NOTE: re-running this cell continues training the same `model` object; re-run
# the "Adapt ResNet-18 for ASL" cell first to start from the pretrained weights.
print("\n Training progress:")
print("-" * 60)

best_val_acc = 0.0
for epoch in range(1, EPOCHS_HEAD_ONLY + 1):
    print(f"\nEpoch {epoch}/{EPOCHS_HEAD_ONLY}")

    # Train
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer)

    # Validate
    val_loss, val_acc = evaluate(model, val_loader)

    # Track best model. The flag is computed BEFORE best_val_acc is updated so
    # that only a strict improvement is reported; comparing for equality after
    # the update would also tag epochs that merely tie the previous best.
    is_new_best = val_acc > best_val_acc
    if is_new_best:
        best_val_acc = val_acc
        # Optional: Save best model
        # torch.save(model.state_dict(), 'best_model_phase1.pth')

    print(f"   Train: Loss={train_loss:.4f}, Acc={train_acc:.3f}")
    print(f"   Val:   Loss={val_loss:.4f}, Acc={val_acc:.3f} "
          f"{' New best!' if is_new_best else ''}")

print("\n Phase 1 Complete!")
print(f"   Best validation accuracy: {best_val_acc:.3f}")


 PHASE 1: HEAD-ONLY FINE-TUNING

 Freezing all layers...
  ResNet: 11,191,389 parameters FROZEN

 Unfreezing classifier head...
  Linear: 14,877 parameters UNFROZEN (trainable)

 Optimizer setup:
   Learning rate: 0.001
   Trainable params: 14,877

 Training progress:
------------------------------------------------------------

Epoch 1/3
    Batch 0/2, Loss: 3.5293
   Train: Loss=3.5571, Acc=0.026
   Val:   Loss=3.3233, Acc=0.103  New best!

Epoch 2/3
    Batch 0/2, Loss: 3.3060
   Train: Loss=3.2733, Acc=0.086
   Val:   Loss=3.1990, Acc=0.103  New best!

Epoch 3/3
    Batch 0/2, Loss: 3.1617
   Train: Loss=3.0907, Acc=0.216
   Val:   Loss=3.0888, Acc=0.172  New best!

 Phase 1 Complete!
   Best validation accuracy: 0.172
