# Video Emotion Model Training (ResNet-18 Transfer Learning)

Train a facial emotion recognition model using transfer learning.

**Dataset**: FER2013 (35K grayscale face images, 48x48)

**Model**: ResNet-18 pretrained → fine-tuned for 4 emotions

**Expected Accuracy**: 75-80%

## 1. Setup & Dependencies

In [None]:
# Install the Kaggle CLI; it is only needed for the optional "Option A"
# dataset-download cell further down (quiet mode keeps the cell output short).
!pip install kaggle --quiet

In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import torchvision.models as models
from PIL import Image
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import matplotlib.pyplot as plt

# Select the compute device: CUDA when a GPU is visible, CPU otherwise
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

## 2. Download FER2013 Dataset

**Option A**: Upload `kaggle.json` (Kaggle API key)

**Option B**: Manually download from https://www.kaggle.com/datasets/msambare/fer2013

In [None]:
# Option A: Use Kaggle API (upload kaggle.json first)
# from google.colab import files
# files.upload()  # Upload kaggle.json

# !mkdir -p ~/.kaggle
# !mv kaggle.json ~/.kaggle/
# !chmod 600 ~/.kaggle/kaggle.json
# !kaggle datasets download -d msambare/fer2013
# !unzip -q fer2013.zip

In [None]:
# Option B: Upload manually from Google Drive or local disk.
# Assumes the dataset has already been uploaded and unzipped into the
# working directory, with one sub-folder per emotion:
# ./train/angry/, ./train/happy/, ./train/sad/, ./train/neutral/, etc.
# ./test/angry/, ./test/happy/, ./test/sad/, ./test/neutral/, etc.

TRAIN_DIR = './train'
TEST_DIR = './test'

# Report how many files each emotion folder holds so a bad upload is spotted
# before any training time is spent.
if os.path.isdir(TRAIN_DIR):
    print("Dataset found!")
    # Sorted for a stable listing across platforms. Stray files that sit next
    # to the class folders (e.g. .DS_Store, a leftover zip) are skipped,
    # because calling os.listdir() on them would raise NotADirectoryError.
    for emotion in sorted(os.listdir(TRAIN_DIR)):
        emotion_dir = os.path.join(TRAIN_DIR, emotion)
        if not os.path.isdir(emotion_dir):
            continue
        count = len(os.listdir(emotion_dir))
        print(f"  {emotion}: {count} images")
else:
    print("Dataset not found. Please upload FER2013 dataset.")

## 3. Define Emotion Mapping

FER2013 labels 7 emotions; these are mapped onto AEMER's 4 target emotions (disgust → angry, fear → sad, surprise → happy).

In [None]:
# FER2013 ships seven labels: angry, disgust, fear, happy, sad, surprise, neutral.
# AEMER only distinguishes four:  angry, happy, sad, neutral.
# The three extra labels are folded into a target class:
#   disgust -> angry, fear -> sad, surprise -> happy (treated as a positive emotion)
EMOTION_MAPPING = dict(
    angry='angry',
    disgust='angry',
    fear='sad',
    happy='happy',
    sad='sad',
    surprise='happy',
    neutral='neutral',
)

# Class order defines the integer label used by the model's output layer
TARGET_EMOTIONS = ['angry', 'happy', 'sad', 'neutral']
EMOTION_TO_IDX = dict(zip(TARGET_EMOTIONS, range(len(TARGET_EMOTIONS))))
IDX_TO_EMOTION = dict(enumerate(TARGET_EMOTIONS))

print("Emotion mapping:")
print(EMOTION_TO_IDX)

## 4. Create Dataset Class

In [None]:
class FER2013Dataset(Dataset):
    """Folder-based FER2013 dataset whose labels are collapsed to 4 classes.

    ``root_dir`` is expected to contain one sub-folder per FER2013 emotion.
    Folder names are lower-cased and looked up in ``EMOTION_MAPPING``; folders
    without a mapping are ignored. Each item is an ``(image, label)`` pair in
    which ``label`` is the class index taken from ``EMOTION_TO_IDX``.

    Args:
        root_dir: Directory holding the per-emotion sub-folders.
        transform: Optional callable applied to the RGB PIL image before it
            is returned.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``root_dir`` (or one of
            its emotion folders) cannot be listed.
    """

    # Only files with these extensions are indexed. Anything else that ends
    # up in an emotion folder (.DS_Store, Thumbs.db, ...) would otherwise
    # crash Image.open() in the middle of an epoch.
    IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tif', '.tiff', '.webp')

    def __init__(self, root_dir, transform=None):
        from collections import Counter

        self.root_dir = root_dir
        self.transform = transform
        self.images = []  # absolute or relative file paths, one per sample
        self.labels = []  # class index for the path at the same position

        # os.listdir() order is arbitrary, so both levels are sorted to make
        # the sample order (and therefore sample indices) reproducible.
        for emotion_folder in sorted(os.listdir(root_dir)):
            emotion_path = os.path.join(root_dir, emotion_folder)
            if not os.path.isdir(emotion_path):
                continue

            # Map the FER2013 folder name onto one of the target emotions
            mapped_emotion = EMOTION_MAPPING.get(emotion_folder.lower())
            if mapped_emotion is None:
                continue

            label = EMOTION_TO_IDX[mapped_emotion]

            for img_file in sorted(os.listdir(emotion_path)):
                if not img_file.lower().endswith(self.IMAGE_EXTENSIONS):
                    continue
                self.images.append(os.path.join(emotion_path, img_file))
                self.labels.append(label)

        print(f"Loaded {len(self.images)} images")

        # Print class distribution
        dist = Counter(self.labels)
        for idx, count in sorted(dist.items()):
            print(f"  {IDX_TO_EMOTION[idx]}: {count}")

    def __len__(self):
        """Return the number of indexed images."""
        return len(self.images)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, label)``."""
        img_path = self.images[idx]
        # convert() returns an independent copy, so the file handle can be
        # released right away instead of waiting for garbage collection.
        with Image.open(img_path) as img:
            image = img.convert('RGB')  # Convert grayscale to RGB for ResNet
        label = self.labels[idx]

        if self.transform:
            image = self.transform(image)

        return image, label

## 5. Data Transforms & Augmentation

In [None]:
# Channel statistics of ImageNet, which the pretrained backbone was trained with
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Training pipeline: upscale to the ResNet input size, augment the PIL image
# (flip, small rotation, brightness/contrast jitter, small shift), then
# convert to a normalized tensor.
train_transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomRotation(degrees=10),
        transforms.ColorJitter(brightness=0.2, contrast=0.2),
        transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
        transforms.ToTensor(),
        transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
)

# Validation/test pipeline: same resize and normalization, no augmentation
val_transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
)

In [None]:
# Build the datasets: augmented pipeline for training, plain pipeline for test
print("Loading training data...")
train_dataset = FER2013Dataset(root_dir=TRAIN_DIR, transform=train_transform)

print("\nLoading test data...")
test_dataset = FER2013Dataset(root_dir=TEST_DIR, transform=val_transform)

# Wrap both datasets in batched loaders; only the training stream is shuffled
BATCH_SIZE = 32

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=2,
)

print(f"\nTrain batches: {len(train_loader)}")
print(f"Test batches: {len(test_loader)}")

## 6. Define Model (ResNet-18 Transfer Learning)

In [None]:
class FacialEmotionResNet(nn.Module):
    """ResNet-18 backbone with a small MLP head for emotion classification.

    The first ``NUM_FROZEN_CHILDREN`` children of the torchvision ResNet
    (conv1, bn1, relu, maxpool, layer1, layer2) have ``requires_grad`` turned
    off; the remaining stages and the replacement head stay trainable.

    Args:
        num_classes: Number of output logits.
        pretrained: When true, the backbone starts from ImageNet weights.

    NOTE(review): ``requires_grad = False`` does not stop the frozen BatchNorm
    layers from updating their running statistics while the model is in
    ``train()`` mode - confirm whether that is intended.
    """

    # How many leading children of the ResNet are frozen
    NUM_FROZEN_CHILDREN = 6

    def __init__(self, num_classes=4, pretrained=True):
        super().__init__()

        # Load (optionally pretrained) ResNet-18
        self.resnet = self._build_backbone(pretrained)

        # Freeze the stem and the first two residual stages
        for child in list(self.resnet.children())[:self.NUM_FROZEN_CHILDREN]:
            for param in child.parameters():
                param.requires_grad = False

        # Replace final FC layer with deeper head
        num_features = self.resnet.fc.in_features
        self.resnet.fc = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(num_features, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes)
        )

    @staticmethod
    def _build_backbone(pretrained):
        """Create ResNet-18 using whichever weights API torchvision offers."""
        weights_enum = getattr(models, 'ResNet18_Weights', None)
        if weights_enum is None:
            # Older torchvision without the weights enum: legacy flag only
            return models.resnet18(pretrained=pretrained)
        # IMAGENET1K_V1 is the checkpoint the legacy pretrained=True selected
        weights = weights_enum.IMAGENET1K_V1 if pretrained else None
        return models.resnet18(weights=weights)

    def forward(self, x):
        """Return the class logits produced by the backbone for batch ``x``."""
        return self.resnet(x)


# Instantiate the classifier and move it onto the selected device
model = FacialEmotionResNet(num_classes=4, pretrained=True).to(device)

# Parameter budget: how much of the network is actually being fine-tuned
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
frozen_params = sum(p.numel() for p in model.parameters() if not p.requires_grad)
total_params = trainable_params + frozen_params
print(f"Total parameters: {total_params:,}")
print(f"Trainable: {trainable_params:,} ({trainable_params/total_params*100:.1f}%)")
print(f"Frozen: {frozen_params:,} ({frozen_params/total_params*100:.1f}%)")

## 7. Training Setup

In [None]:
# Loss function with class weights (handle imbalance)
from collections import Counter

num_classes = len(TARGET_EMOTIONS)
label_counts = Counter(train_dataset.labels)

# An empty class would make the inverse-frequency weight divide by zero
missing_classes = [IDX_TO_EMOTION[i] for i in range(num_classes) if label_counts[i] == 0]
if missing_classes:
    raise ValueError(f"No training images found for classes: {missing_classes}")

total = sum(label_counts.values())
# Inverse-frequency weights: rarer classes contribute more to the loss
class_weights = torch.tensor(
    [total / label_counts[i] for i in range(num_classes)],
    dtype=torch.float32,
    device=device,
)
class_weights = class_weights / class_weights.sum() * num_classes  # Normalize so the weights sum to num_classes

print("Class weights:", class_weights)

criterion = nn.CrossEntropyLoss(weight=class_weights)

# Optimizer with different learning rates.
# The two groups must be disjoint: the head is an nn.Sequential with two
# Linear layers (four tensors), so slicing the last two tensors off
# model.resnet.parameters() would leave the first Linear layer in both groups
# and the optimizer would reject the overlap with a ValueError.
head_params = list(model.resnet.fc.parameters())
head_param_ids = {id(p) for p in head_params}
backbone_params = [
    p for p in model.resnet.parameters()
    if id(p) not in head_param_ids and p.requires_grad  # frozen layers never receive gradients
]

optimizer = optim.AdamW([
    {'params': head_params, 'lr': 1e-3},      # New layers - higher LR
    {'params': backbone_params, 'lr': 1e-4},  # Pretrained - lower LR
], weight_decay=0.01)

# Learning rate scheduler: halve both LRs after 3 epochs without a lower monitored loss
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=3, factor=0.5)

## 8. Training Loop

In [None]:
def train_epoch(model, loader, criterion, optimizer, device):
    """Run one optimisation pass over ``loader``.

    Returns:
        Tuple ``(mean_batch_loss, accuracy_percent)`` where the loss is the
        sum of per-batch losses divided by the number of batches and the
        accuracy is the percentage of correctly classified samples.
    """
    model.train()
    running_loss = 0
    n_correct = 0
    n_seen = 0

    for batch_images, batch_labels in tqdm(loader, desc="Training"):
        batch_images = batch_images.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        logits = model(batch_images)
        batch_loss = criterion(logits, batch_labels)
        batch_loss.backward()

        # Cap the global gradient norm at 1.0 to guard against exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()

        running_loss += batch_loss.item()
        predicted = logits.max(1)[1]  # index of the largest logit per sample
        n_seen += batch_labels.size(0)
        n_correct += predicted.eq(batch_labels).sum().item()

    mean_loss = running_loss / len(loader)
    accuracy = 100. * n_correct / n_seen
    return mean_loss, accuracy


def validate(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` without updating any weights.

    Returns:
        Tuple ``(mean_batch_loss, accuracy_percent)`` computed the same way
        as in ``train_epoch``: summed batch losses over the number of
        batches, and the percentage of correctly classified samples.
    """
    model.eval()
    running_loss = 0
    n_correct = 0
    n_seen = 0

    # Gradients are not needed for evaluation
    with torch.no_grad():
        for batch_images, batch_labels in tqdm(loader, desc="Validating"):
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)

            logits = model(batch_images)
            running_loss += criterion(logits, batch_labels).item()

            predicted = logits.max(1)[1]  # index of the largest logit per sample
            n_seen += batch_labels.size(0)
            n_correct += predicted.eq(batch_labels).sum().item()

    mean_loss = running_loss / len(loader)
    accuracy = 100. * n_correct / n_seen
    return mean_loss, accuracy

In [None]:
# Training with early stopping
# NOTE(review): validation runs on test_loader, so model selection and early
# stopping see the same split that the evaluation cell later reports on; the
# final test metrics are therefore optimistic. Consider a held-out split of
# the training data for validation.
NUM_EPOCHS = 30
PATIENCE = 4
best_val_loss = float('inf')
# Initialised up front so the summary below cannot raise NameError when no
# epoch ever improves on best_val_loss (e.g. the validation loss is NaN).
best_val_acc = 0.0
patience_counter = 0
history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

print(f"Training for up to {NUM_EPOCHS} epochs (early stopping patience={PATIENCE})...")
print("="*60)

for epoch in range(NUM_EPOCHS):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
    val_loss, val_acc = validate(model, test_loader, criterion, device)

    # Update scheduler (it monitors the validation loss)
    scheduler.step(val_loss)

    # Save history
    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)

    # Early stopping based on val_loss (lower is better)
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_val_acc = val_acc
        # Checkpoint only the best epoch; later cells reload this file
        torch.save(model.state_dict(), 'video_model.pth')
        print(f"Epoch {epoch+1}: Train Loss: {train_loss:.4f} Acc: {train_acc:.2f}% | Val Loss: {val_loss:.4f} Acc: {val_acc:.2f}% \u2b50 BEST")
        patience_counter = 0
    else:
        patience_counter += 1
        print(f"Epoch {epoch+1}: Train Loss: {train_loss:.4f} Acc: {train_acc:.2f}% | Val Loss: {val_loss:.4f} Acc: {val_acc:.2f}% (no improvement {patience_counter}/{PATIENCE})")
        if patience_counter >= PATIENCE:
            print(f"\n\U0001f6d1 Early stopping at epoch {epoch+1}!")
            break

print("="*60)
print(f"Best Val Loss: {best_val_loss:.4f} | Best Val Acc: {best_val_acc:.2f}%")
print(f"Total epochs run: {len(history['train_loss'])}")

## 9. Visualize Training

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

# One panel per metric: (axis, training-curve key, validation-curve key, title)
panels = (
    (ax1, 'train_loss', 'val_loss', 'Loss'),
    (ax2, 'train_acc', 'val_acc', 'Accuracy'),
)
for axis, train_key, val_key, title in panels:
    axis.plot(history[train_key], label='Train')
    axis.plot(history[val_key], label='Validation')
    axis.set_title(title)
    axis.set_xlabel('Epoch')
    axis.legend()

# Save the figure to disk before showing it
plt.tight_layout()
plt.savefig('training_history.png')
plt.show()

## 10. Evaluate & Confusion Matrix

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import numpy as np

# Load best model. map_location lets a checkpoint written on a GPU be
# restored on whatever device this session is using.
model.load_state_dict(torch.load('video_model.pth', map_location=device))
model.eval()

all_preds = []
all_labels = []

with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        outputs = model(images)
        _, predicted = outputs.max(1)
        all_preds.extend(predicted.cpu().tolist())
        all_labels.extend(labels.tolist())

# Pin the label set so every report and matrix has one row/column per target
# emotion, even if a class never occurs in the labels or the predictions.
class_ids = list(range(len(TARGET_EMOTIONS)))

# Classification report
print("Classification Report:")
print(classification_report(all_labels, all_preds, labels=class_ids, target_names=TARGET_EMOTIONS))

# Confusion matrices (normalized + raw)
cm = confusion_matrix(all_labels, all_preds, labels=class_ids)
row_totals = cm.sum(axis=1, keepdims=True)
# Rows without any true samples stay at 0 instead of becoming NaN
cm_normalized = np.divide(
    cm, row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Left: Normalized (percentages)
sns.heatmap(cm_normalized, annot=True, fmt='.2%', cmap='Blues',
            xticklabels=TARGET_EMOTIONS, yticklabels=TARGET_EMOTIONS, ax=axes[0],
            vmin=0, vmax=1)
axes[0].set_xlabel('Predicted')
axes[0].set_ylabel('Actual')
axes[0].set_title('Normalized Confusion Matrix (Recall per Class)')

# Right: Raw counts
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=TARGET_EMOTIONS, yticklabels=TARGET_EMOTIONS, ax=axes[1])
axes[1].set_xlabel('Predicted')
axes[1].set_ylabel('Actual')
axes[1].set_title('Confusion Matrix (Raw Counts)')

plt.tight_layout()
plt.savefig('video_confusion_matrix.png', dpi=150)
plt.show()

# Per-class accuracy: the diagonal of the row-normalized matrix is the recall
print("\nPer-class accuracy (recall):")
for i, name in enumerate(TARGET_EMOTIONS):
    recall = cm_normalized[i, i]
    print(f"  {name}: {recall:.1%}")
print(f"  Overall: {np.trace(cm)/cm.sum():.1%}")

## 11. Test on Sample Image

In [None]:
def predict_emotion(model, image_path):
    """Classify one image file and print the per-class probabilities.

    The image is opened, converted to RGB, run through ``val_transform`` and
    fed to ``model`` as a batch of one.

    Returns:
        Tuple ``(predicted_emotion, confidence)``: the name of the most
        probable class and its softmax probability.
    """
    model.eval()

    # Same preprocessing as the evaluation pipeline, plus a batch dimension
    rgb_image = Image.open(image_path).convert('RGB')
    batch = val_transform(rgb_image).unsqueeze(0).to(device)

    with torch.no_grad():
        logits = model(batch)
        probabilities = torch.softmax(logits, dim=1)[0]
        predicted_idx = probabilities.argmax().item()
        confidence = probabilities[predicted_idx].item()

    predicted_emotion = IDX_TO_EMOTION[predicted_idx]

    print(f"Predicted: {predicted_emotion} ({confidence:.2%})")
    print("All probabilities:")
    class_probs = probabilities.tolist()
    for class_idx, emotion in enumerate(TARGET_EMOTIONS):
        print(f"  {emotion}: {class_probs[class_idx]:.2%}")

    return predicted_emotion, confidence

# Smoke-test the predictor on the first file indexed in the test set.
# The call is left as the cell's last bare expression so the notebook also
# displays the returned (emotion, confidence) tuple.
sample_img = test_dataset.images[0]
print(f"Testing on: {sample_img}")
predict_emotion(model, sample_img)

## 12. Download Model

In [None]:
# Check model size
model_size = os.path.getsize('video_model.pth') / (1024 * 1024)
print(f"Model size: {model_size:.2f} MB")

# Download. google.colab only exists inside a Colab runtime, so fall back to
# reporting where the checkpoint lives when the notebook runs anywhere else.
try:
    from google.colab import files
except ImportError:
    print(f"\nNot running in Colab; model saved at {os.path.abspath('video_model.pth')}")
else:
    files.download('video_model.pth')
    print("\n✅ Download video_model.pth and place in project/VideoModel/ folder")

## Summary

**Model**: ResNet-18 Transfer Learning

**Input**: 224x224 RGB image (face crop)

**Output**: 4 emotions (angry, happy, sad, neutral)

**Usage**:
1. Detect face in video frame (OpenCV)
2. Crop the face, convert it to 3-channel RGB (as done during training), and resize to 224x224
3. Normalize with ImageNet stats
4. Predict with model