In [1]:
import os
import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision
import torch.nn.functional as F
import torchvision.models as models
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.models import vgg16, VGG16_Weights
import random
import pickle
from tqdm import tqdm
from torch.utils.data import Dataset
import torchvision.ops.boxes as box_ops

In [2]:
# Paths to WIDER FACE annotations
wider_root = "/home/hkatti/scratch/datasets/WIDER"
train_annot = os.path.join(wider_root, "wider_face_split/wider_face_train_bbx_gt.txt")
val_annot = os.path.join(wider_root, "wider_face_split/wider_face_val_bbx_gt.txt")
wider_images_dir = os.path.join(wider_root, "WIDER_train/images")

In [3]:
# Configs:
# Configuration for helper funcs (matches paper's parameters)
TARGET_SHORT = 600
MAX_LONG = 1000
FLIP_PROB = 0.5  # 50% chance of horizontal flip

In [5]:
# loading wider annotations
def load_wider_annotations(annot_path, images_root):
    """Properly parses WIDER annotations and constructs full image paths"""
    with open(annot_path, 'r') as f:
        lines = f.readlines()
    
    annotations = []
    img_paths = []
    i = 0
    while i < len(lines):
        # Image path relative to WIDER_train/images or WIDER_val/images
        rel_path = lines[i].strip()
        full_path = os.path.join(images_root, rel_path)
        i += 1
        
        if i >= len(lines):
            break
            
        num_faces = int(lines[i].strip())
        i += 1
        
        # Read bounding boxes and attributes
        faces = []
        for j in range(num_faces):
            if i >= len(lines):
                break
                
            parts = lines[i].strip().split()
            i += 1
            
            # Ensure we have at least the bounding box coordinates
            if len(parts) < 4:
                continue
                
            # Create dictionary with all available attributes
            face = {'bbox': [int(parts[0]), int(parts[1]), int(parts[2]), int(parts[3])]}
            
            # Add attributes if available
            if len(parts) > 4:
                face['blur'] = int(parts[4]) if len(parts) > 4 else 0
                face['expression'] = int(parts[5]) if len(parts) > 5 else 0
                face['illumination'] = int(parts[6]) if len(parts) > 6 else 0
                face['invalid'] = int(parts[7]) if len(parts) > 7 else 0
                face['occlusion'] = int(parts[8]) if len(parts) > 8 else 0
                face['pose'] = int(parts[9]) if len(parts) > 9 else 0
            
            faces.append(face)
        
        img_paths.append(full_path)
        annotations.append(faces)
    
    return img_paths, annotations


In [None]:
# Load training data
train_imgs, train_anns = load_wider_annotations(train_annot, os.path.join(wider_root, "WIDER_train/images"))

# Load validation data (to be combined with training)
val_imgs, val_anns = load_wider_annotations(val_annot, os.path.join(wider_root, "WIDER_val/images"))

# Combine datasets as per paper methodology
all_images = train_imgs + val_imgs
all_annotations = train_anns + val_anns

print(f"Total training data: {len(all_images)} images, {sum(len(a) for a in all_annotations)} faces")


Total training data: 16106 images, 199132 faces


In [7]:
# computing difficulty
def compute_difficulty(ann):
    """
    Compute difficulty based on attributes in WIDER FACE annotation
    ann: Dictionary with facial attributes
    """
    difficulty = 0
    
    # Blur
    if ann.get('blur', 0) == 1:  # Normal blur
        difficulty += 0.5
    elif ann.get('blur', 0) == 2:  # Heavy blur
        difficulty += 1
        
    # Expression
    if ann.get('expression', 0) == 2:  # Extreme expression
        difficulty += 1
        
    # Illumination
    if ann.get('illumination', 0) == 2:  # Extreme illumination
        difficulty += 1
        
    # Occlusion
    if ann.get('occlusion', 0) == 1:  # Partial occlusion
        difficulty += 0.5
    elif ann.get('occlusion', 0) == 2:  # Heavy occlusion
        difficulty += 1
        
    # Pose
    if ann.get('pose', 0) == 1:  # Atypical pose
        difficulty += 1
        
    return difficulty


In [9]:
# Resizing image improved
def resize_image(image, annotations, target_short=600, max_long=1000):
    """
    Resize image and annotations while keeping aspect ratio.
    Shorter side is set to target_short, longer side capped at max_long.
    
    Args:
        image: Input image (numpy array)
        annotations: List of annotation dicts with 'bbox' keys
        target_short: Target size for shorter side (default 600)
        max_long: Maximum size for longer side (default 1000)
    
    Returns:
        Resized image, updated annotations
    """
    h, w = image.shape[:2]
    scale = target_short / min(h, w)
    
    # Check if scaled longer side exceeds maximum
    if max(h, w) * scale > max_long:
        scale = max_long / max(h, w)
    
    # Calculate new dimensions
    new_w = int(w * scale)
    new_h = int(h * scale)
    
    # Resize image using OpenCV
    resized = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
    
    # Update annotations
    updated_annots = []
    for ann in annotations:
        # Create copy to avoid modifying original
        new_ann = ann.copy()
        
        # Scale bounding box coordinates [x1, y1, w, h]
        x, y, w, h = ann['bbox']
        
        # Scale coordinates and ensure they're valid
        new_x = max(0, x * scale)
        new_y = max(0, y * scale)
        new_w = min(w * scale, new_w - new_x)
        new_h = min(h * scale, new_h - new_y)
        
        # Skip invalid boxes
        if new_w <= 0 or new_h <= 0:
            continue
            
        new_ann['bbox'] = [new_x, new_y, new_w, new_h]
        updated_annots.append(new_ann)
    
    return resized, updated_annots


In [11]:
# New horizontal flip function:
def horizontal_flip(image, annotations):
    """
    Args:
        image: numpy array (H, W, C)
        annotations: List of dicts with 'bbox' [x1, y1, w, h]
    
    Returns:
        Flipped image, updated annotations
    """
    flipped_img = cv2.flip(image, 1)  # 1 = horizontal flip
    img_w = image.shape[1]
    
    flipped_annots = []
    for ann in annotations:
        # Mirror bbox x-coordinate
        x, y, w, h = ann['bbox']
        new_x = img_w - (x + w)  # x' = image_width - (x + width)
        
        # Create new annotation with flipped bbox
        new_ann = ann.copy()
        new_ann['bbox'] = [new_x, y, w, h]
        flipped_annots.append(new_ann)
    
    return flipped_img, flipped_annots


In [12]:
# Generate the 12 RPN anchors
def generate_rpn_anchors():
    """
    Generate 12 anchors per spatial location as per paper:
    Sizes: 64x64, 128x128, 256x256, 512x512
    Ratios: 1:1, 1:2, 2:1
    """
    base_sizes = [64, 128, 256, 512]  # Anchor sizes
    ratios = [1.0, 0.5, 2.0]  # 1:1, 1:2, 2:1
    
    anchors = []
    for size in base_sizes:
        area = size ** 2
        for ratio in ratios:
            # Calculate width/height for each ratio
            w = np.sqrt(area / ratio)
            h = w * ratio
            
            # Base anchor at (0,0) - will be shifted during RPN
            anchors.append([0, 0, w, h])  # (x_center, y_center, width, height)
    
    return np.array(anchors, dtype=np.float32)


In [13]:
# NMS thresholds
def apply_nms(proposals, scores, iou_threshold=0.7, top_n=2000):
    """
    Apply NMS to select top 2000 region proposals.
    
    Args:
        proposals: List of region proposals in [x1,y1,x2,y2] format
        scores: Confidence scores for each proposal
        iou_threshold: IoU threshold for suppression (paper uses 0.7 for RPN)
        top_n: Maximum proposals to keep (paper: 2000)
    
    Returns:
        Indices of kept proposals
    """
    # Convert to (x,y,w,h) format for OpenCV NMSBoxes
    boxes = [[x1, y1, x2-x1, y2-y1] for x1,y1,x2,y2 in proposals]
    
    # OpenCV's NMS implementation (optimized)
    keep_indices = cv2.dnn.NMSBoxes(
        boxes, scores, 
        score_threshold=0.0,  # Keep all initially
        nms_threshold=iou_threshold,
        top_k=top_n
    )
    
    return keep_indices.flatten()  # Flatten from 2D to 1D array


In [14]:
# ROIs for fore and background
def sample_rois(proposals, gt_boxes, fg_ratio=0.25):
    """
    Sample RoIs maintaining 1:3 foreground:background ratio.
    
    Args:
        proposals: List of [x1,y1,x2,y2] region proposals (after NMS)
        gt_boxes: List of ground truth [x1,y1,x2,y2] boxes
        fg_ratio: Desired foreground ratio (1/4 = 1:3)
    
    Returns:
        Sampled indices, labels (1=foreground, 0=background)
    """
    # Compute IoU between proposals and GT boxes
    iou_matrix = np.zeros((len(proposals), len(gt_boxes)))
    for i, prop in enumerate(proposals):
        for j, gt in enumerate(gt_boxes):
            iou_matrix[i,j] = calculate_iou(prop, gt)
    
    # Assign labels: foreground (IoU > 0.5) else background
    max_iou = iou_matrix.max(axis=1)
    labels = (max_iou > 0.5).astype(np.int32)
    
    # Split indices
    fg_indices = np.where(labels == 1)[0]
    bg_indices = np.where(labels == 0)[0]
    
    # Sample to maintain 1:3 ratio
    num_fg = min(len(fg_indices), int(len(proposals)*fg_ratio))
    num_bg = min(len(bg_indices), 3*num_fg)
    
    # Random selection
    fg_selected = np.random.choice(fg_indices, num_fg, replace=False)
    bg_selected = np.random.choice(bg_indices, num_bg, replace=False)
    
    # Combine and return
    sampled_indices = np.concatenate([fg_selected, bg_selected])
    return sampled_indices, labels[sampled_indices]

# Helper function for IoU calculation
def calculate_iou(box_a, box_b):
    """Compute Intersection-over-Union for two boxes in [x1,y1,x2,y2] format"""
    x1 = max(box_a[0], box_b[0])
    y1 = max(box_a[1], box_b[1])
    x2 = min(box_a[2], box_b[2])
    y2 = min(box_a[3], box_b[3])
    
    inter_area = max(0, x2 - x1) * max(0, y2 - y1)
    area_a = (box_a[2]-box_a[0]) * (box_a[3]-box_a[1])
    area_b = (box_b[2]-box_b[0]) * (box_b[3]-box_b[1])
    
    return inter_area / (area_a + area_b - inter_area + 1e-6)


In [18]:
# sanitizing blocks
def sanitize_boxes(image, annotations):
    """
    Ensure valid bounding boxes after transformations
    Converts between formats if needed and ensures positive width/height
    """
    height, width = image.shape[:2]
    
    valid_annots = []
    for ann in annotations:
        # Extract bbox coordinates
        x, y, w, h = ann['bbox']
        
        # Ensure positive width and height
        if w <= 0 or h <= 0:
            continue
            
        # Clamp coordinates to image boundaries
        x = max(0, min(x, width-1))
        y = max(0, min(y, height-1))
        
        # Calculate bottom-right coordinates
        x_max = min(x + w, width-1)
        y_max = min(y + h, height-1)
        
        # Recalculate width and height
        new_w = x_max - x
        new_h = y_max - y
        
        # Skip invalid boxes
        if new_w <= 0 or new_h <= 0:
            continue
            
        # Update annotation
        new_ann = ann.copy()
        new_ann['bbox'] = [x, y, new_w, new_h]
        valid_annots.append(new_ann)
        
    return valid_annots



In [19]:
# best - more optimizations: batch processing
import os
import cv2
import numpy as np
import pickle
import gc
from tqdm import tqdm

# Batch processing configuration
BATCH_SIZE = 500
SAVE_INTERMEDIATE = True
OUTPUT_DIR = "/home/hkatti/scratch/processed_batches"
SAVE_FORMAT = "jpg"  # Options: "npy", "jpg", "memmap"

# Create output directory if saving intermediates
if SAVE_INTERMEDIATE and not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# Initialize statistics dictionary
stats = {
    "total_images": len(all_images),
    "loaded_images": 0,
    "filtered_images": 0,
    "resized_images": 0,
    "flipped_images": 0,
    "processed_images": 0,
    "error_images": 0
}

# Process in batches to manage memory
all_processed_data = []
num_batches = (len(all_images) + BATCH_SIZE - 1) // BATCH_SIZE

# Create overall progress bar for batches
batch_progress = tqdm(
    range(num_batches),
    desc="Processing batches",
    unit="batch",
    bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{percentage:3.0f}%]"
)

for batch_idx in batch_progress:
    # Calculate batch range
    start_idx = batch_idx * BATCH_SIZE
    end_idx = min(start_idx + BATCH_SIZE, len(all_images))
    
    # Get current batch
    batch_images = all_images[start_idx:end_idx]
    batch_annotations = all_annotations[start_idx:end_idx]
    
    # Create progress bar for current batch
    progress_bar = tqdm(
        zip(batch_images, batch_annotations),
        total=len(batch_images),
        desc=f"Batch {batch_idx+1}/{num_batches}",
        unit="img",
        leave=False,
        bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{percentage:3.0f}%]"
    )
    
    # Process current batch
    batch_processed_data = []
    for img_path, annotations in progress_bar:
        try:
            # 1. Load image
            image = cv2.imread(img_path)
            if image is None:
                stats["error_images"] += 1
                continue
            stats["loaded_images"] += 1
            
            # 2. Filter annotations (difficulty <=2 and image has <=1000 faces)
            valid_annots = []
            if len(annotations) <= 1000:
                for ann in annotations:
                    if compute_difficulty(ann) <= 2:
                        valid_annots.append(ann)
            
            if not valid_annots:
                continue
            stats["filtered_images"] += 1
            
            # 3. Resize image & annotations
            try:
                resized_img, resized_annots = resize_image(image, valid_annots, 
                                                         target_short=TARGET_SHORT,
                                                         max_long=MAX_LONG)
                resized_annots = sanitize_boxes(resized_img, resized_annots)
                if not resized_annots:
                    continue
                stats["resized_images"] += 1
            except Exception:
                stats["error_images"] += 1
                continue
            
            # 4. Apply horizontal flipping (50% chance)
            try:
                if random.random() < FLIP_PROB:
                    final_img, final_annots = horizontal_flip(resized_img, resized_annots)
                    stats["flipped_images"] += 1
                else:
                    final_img, final_annots = resized_img, resized_annots
            except Exception:
                stats["error_images"] += 1
                continue
            
            # 5. Prepare for model input
            batch_processed_data.append({
                'image': final_img.astype(np.float32) / 255.0,
                'annotations': final_annots,
                'path': img_path
            })
            stats["processed_images"] += 1
            
        except Exception:
            stats["error_images"] += 1
            continue
    
    # Update main progress bar with overall stats
    batch_progress.set_postfix(processed=f"{stats['processed_images']}/{stats['total_images']}")
    
    # Save the processed batch based on the selected format
    if SAVE_INTERMEDIATE:
        batch_dir = os.path.join(OUTPUT_DIR, f"batch_{batch_idx:03d}")
        if not os.path.exists(batch_dir):
            os.makedirs(batch_dir)
        
        if SAVE_FORMAT == "jpg":
            # Save as JPG images with metadata
            metadata = []
            for i, item in enumerate(batch_processed_data):
                # Convert normalized float32 back to uint8 for image saving
                img = (item['image'] * 255).astype(np.uint8)
                img_file = os.path.join(batch_dir, f"img_{i:05d}.jpg")
                cv2.imwrite(img_file, img)
                
                # Store metadata
                metadata.append({
                    'annotations': item['annotations'],
                    'path': item['path'],
                    'img_file': f"img_{i:05d}.jpg"
                })
            
            # Save metadata separately
            meta_file = os.path.join(batch_dir, "metadata.pkl")
            with open(meta_file, 'wb') as f:
                pickle.dump(metadata, f, protocol=4)
                
        elif SAVE_FORMAT == "npy":
            # Save each image and annotation separately
            for i, item in enumerate(batch_processed_data):
                img_file = os.path.join(batch_dir, f"img_{i:05d}.npy")
                np.save(img_file, item['image'])
                
                ann_file = os.path.join(batch_dir, f"ann_{i:05d}.pkl")
                with open(ann_file, 'wb') as f:
                    pickle.dump(item['annotations'], f, protocol=4)
            
            # Save paths
            with open(os.path.join(batch_dir, "paths.txt"), 'w') as f:
                for item in batch_processed_data:
                    f.write(f"{item['path']}\n")
                    
        elif SAVE_FORMAT == "memmap":
            # Create memmap file for images
            n_images = len(batch_processed_data)
            if n_images > 0:
                img_shape = batch_processed_data[0]['image'].shape
                images_file = os.path.join(batch_dir, "images.dat")
                fp = np.memmap(images_file, dtype='float32', mode='w+', 
                              shape=(n_images, *img_shape))
                
                # Write images to memmap file one by one
                for i, item in enumerate(batch_processed_data):
                    fp[i] = item['image']
                
                # Flush changes to disk
                fp.flush()
                del fp
                
                # Save annotations separately
                ann_file = os.path.join(batch_dir, "annotations.pkl")
                with open(ann_file, 'wb') as f:
                    pickle.dump([item['annotations'] for item in batch_processed_data], f, protocol=4)
                
                # Save paths
                with open(os.path.join(batch_dir, "paths.txt"), 'w') as f:
                    for item in batch_processed_data:
                        f.write(f"{item['path']}\n")
        
        print(f"\nSaved batch {batch_idx+1}/{num_batches} with {len(batch_processed_data)} images to {batch_dir}")
    
    # Clear batch data to free memory
    del batch_processed_data
    gc.collect()

# Print final statistics
print(f"\nProcessing complete: {stats['processed_images']}/{stats['total_images']} images successfully processed ({stats['processed_images']/stats['total_images']*100:.1f}%)")
print(f"Statistics:")
print(f"  - Total images: {stats['total_images']}")
print(f"  - Images loaded: {stats['loaded_images']} ({stats['loaded_images']/stats['total_images']*100:.1f}%)")
print(f"  - Images with valid annotations: {stats['filtered_images']} ({stats['filtered_images']/stats['loaded_images']*100:.1f}% of loaded)")
print(f"  - Images successfully resized: {stats['resized_images']}")
print(f"  - Images flipped: {stats['flipped_images']} ({stats['flipped_images']/stats['processed_images']*100:.1f}% of processed)")
print(f"  - Images with errors: {stats['error_images']} ({stats['error_images']/stats['total_images']*100:.1f}%)")
print(f"  - Final processed images: {stats['processed_images']} ({stats['processed_images']/stats['total_images']*100:.1f}%)")


Processing batches:   0%|          | 0/33 [  0%]

Processing batches:   0%|          | 0/33 [  0%]


Saved batch 1/33 with 498 images to /home/hkatti/scratch/processed_batches/batch_000


Processing batches:   6%|▌         | 2/33 [  6%]


Saved batch 2/33 with 499 images to /home/hkatti/scratch/processed_batches/batch_001


Processing batches:   9%|▉         | 3/33 [  9%]


Saved batch 3/33 with 499 images to /home/hkatti/scratch/processed_batches/batch_002


Processing batches:  12%|█▏        | 4/33 [ 12%]


Saved batch 4/33 with 499 images to /home/hkatti/scratch/processed_batches/batch_003


Processing batches:  15%|█▌        | 5/33 [ 15%]


Saved batch 5/33 with 500 images to /home/hkatti/scratch/processed_batches/batch_004


Processing batches:  18%|█▊        | 6/33 [ 18%]


Saved batch 6/33 with 500 images to /home/hkatti/scratch/processed_batches/batch_005


Processing batches:  21%|██        | 7/33 [ 21%]


Saved batch 7/33 with 499 images to /home/hkatti/scratch/processed_batches/batch_006


Processing batches:  24%|██▍       | 8/33 [ 24%]


Saved batch 8/33 with 499 images to /home/hkatti/scratch/processed_batches/batch_007


Processing batches:  27%|██▋       | 9/33 [ 27%]


Saved batch 9/33 with 500 images to /home/hkatti/scratch/processed_batches/batch_008


Processing batches:  30%|███       | 10/33 [ 30%]


Saved batch 10/33 with 500 images to /home/hkatti/scratch/processed_batches/batch_009


Processing batches:  33%|███▎      | 11/33 [ 33%]


Saved batch 11/33 with 498 images to /home/hkatti/scratch/processed_batches/batch_010


Processing batches:  36%|███▋      | 12/33 [ 36%]


Saved batch 12/33 with 500 images to /home/hkatti/scratch/processed_batches/batch_011


Processing batches:  39%|███▉      | 13/33 [ 39%]


Saved batch 13/33 with 497 images to /home/hkatti/scratch/processed_batches/batch_012


Processing batches:  42%|████▏     | 14/33 [ 42%]


Saved batch 14/33 with 499 images to /home/hkatti/scratch/processed_batches/batch_013


Processing batches:  45%|████▌     | 15/33 [ 45%]


Saved batch 15/33 with 497 images to /home/hkatti/scratch/processed_batches/batch_014


Processing batches:  48%|████▊     | 16/33 [ 48%]


Saved batch 16/33 with 495 images to /home/hkatti/scratch/processed_batches/batch_015


Processing batches:  52%|█████▏    | 17/33 [ 52%]


Saved batch 17/33 with 497 images to /home/hkatti/scratch/processed_batches/batch_016


Processing batches:  55%|█████▍    | 18/33 [ 55%]


Saved batch 18/33 with 498 images to /home/hkatti/scratch/processed_batches/batch_017


Processing batches:  58%|█████▊    | 19/33 [ 58%]


Saved batch 19/33 with 498 images to /home/hkatti/scratch/processed_batches/batch_018


Processing batches:  61%|██████    | 20/33 [ 61%]


Saved batch 20/33 with 497 images to /home/hkatti/scratch/processed_batches/batch_019


Processing batches:  64%|██████▎   | 21/33 [ 64%]


Saved batch 21/33 with 500 images to /home/hkatti/scratch/processed_batches/batch_020


Processing batches:  67%|██████▋   | 22/33 [ 67%]


Saved batch 22/33 with 500 images to /home/hkatti/scratch/processed_batches/batch_021


Processing batches:  70%|██████▉   | 23/33 [ 70%]


Saved batch 23/33 with 499 images to /home/hkatti/scratch/processed_batches/batch_022


Processing batches:  73%|███████▎  | 24/33 [ 73%]


Saved batch 24/33 with 499 images to /home/hkatti/scratch/processed_batches/batch_023


Processing batches:  76%|███████▌  | 25/33 [ 76%]


Saved batch 25/33 with 499 images to /home/hkatti/scratch/processed_batches/batch_024


Processing batches:  79%|███████▉  | 26/33 [ 79%]


Saved batch 26/33 with 499 images to /home/hkatti/scratch/processed_batches/batch_025


Processing batches:  82%|████████▏ | 27/33 [ 82%]


Saved batch 27/33 with 500 images to /home/hkatti/scratch/processed_batches/batch_026


Processing batches:  85%|████████▍ | 28/33 [ 85%]


Saved batch 28/33 with 500 images to /home/hkatti/scratch/processed_batches/batch_027


Processing batches:  88%|████████▊ | 29/33 [ 88%]


Saved batch 29/33 with 498 images to /home/hkatti/scratch/processed_batches/batch_028


Processing batches:  91%|█████████ | 30/33 [ 91%]


Saved batch 30/33 with 499 images to /home/hkatti/scratch/processed_batches/batch_029


Processing batches:  94%|█████████▍| 31/33 [ 94%]


Saved batch 31/33 with 499 images to /home/hkatti/scratch/processed_batches/batch_030


Processing batches:  97%|█████████▋| 32/33 [ 97%]


Saved batch 32/33 with 499 images to /home/hkatti/scratch/processed_batches/batch_031


Processing batches: 100%|██████████| 33/33 [100%]


Saved batch 33/33 with 106 images to /home/hkatti/scratch/processed_batches/batch_032

Processing complete: 16066/16106 images successfully processed (99.8%)
Statistics:
  - Total images: 16106
  - Images loaded: 16106 (100.0%)
  - Images with valid annotations: 16075 (99.8% of loaded)
  - Images successfully resized: 16066
  - Images flipped: 8012 (49.9% of processed)
  - Images with errors: 0 (0.0%)
  - Final processed images: 16066 (99.8%)





In [20]:
# Training:

# Memory-efficient dataset
class WiderFaceDataset(Dataset):
    def __init__(self, data_dir, max_samples=None):
        self.data_dir = data_dir
        self.metadata = []
        
        # Load metadata from all batches
        batch_folders = [f for f in os.listdir(data_dir) if f.startswith("batch_") and os.path.isdir(os.path.join(data_dir, f))]
        
        for batch_folder in sorted(batch_folders):
            batch_path = os.path.join(data_dir, batch_folder)
            meta_file = os.path.join(batch_path, "metadata.pkl")
            
            if os.path.exists(meta_file):
                with open(meta_file, 'rb') as f:
                    batch_metadata = pickle.load(f)
                
                # Add batch directory to each item
                for item in batch_metadata:
                    item['batch_dir'] = batch_path
                    self.metadata.append(item)
                    
                    # Stop if we've reached max_samples
                    if max_samples is not None and len(self.metadata) >= max_samples:
                        break
            
            # Stop if we've reached max_samples
            if max_samples is not None and len(self.metadata) >= max_samples:
                break
                
        print(f"Dataset initialized with {len(self.metadata)} samples")
    
    def __len__(self):
        return len(self.metadata)
    
    def __getitem__(self, idx):
        item = self.metadata[idx]
        
        # Load image on demand
        img_file = os.path.join(item['batch_dir'], item['img_file'])
        image = cv2.imread(img_file)
        # Convert to RGB and normalize
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = image.astype(np.float32) / 255.0
        
        # Convert to tensor
        image = torch.from_numpy(image).permute(2, 0, 1)  # HWC -> CHW
        
        # Extract bounding boxes and convert from [x, y, w, h] to [x1, y1, x2, y2]
        boxes = []
        for ann in item['annotations']:
            x, y, w, h = ann['bbox']
            # Ensure positive width and height
            if w > 0 and h > 0:
                boxes.append([float(x), float(y), float(x + w), float(y + h)])
        
        # Create tensor from valid boxes
        if boxes:
            boxes_tensor = torch.tensor(boxes, dtype=torch.float32)
            labels = torch.ones(len(boxes), dtype=torch.int64)  # All faces are class 1
        else:
            # Create dummy box if no valid boxes (will be filtered out during training)
            boxes_tensor = torch.tensor([[0.0, 0.0, 1.0, 1.0]], dtype=torch.float32)
            labels = torch.zeros(1, dtype=torch.int64)  # Background
        
        return image, {'boxes': boxes_tensor, 'labels': labels}


# Feature concatenation module as described in paper section 3.2
class BackboneWithFeatureConcatenation(nn.Module):
    def __init__(self, backbone):
        super(BackboneWithFeatureConcatenation, self).__init__()
        self.backbone = backbone
        self.out_channels = 512  # Must define this for Faster R-CNN
        
        # Define layers for L2 normalization and 1x1 convolution
        self.l2_norm = L2Norm()
        self.conv1x1 = nn.Conv2d(1280, 512, kernel_size=1)  # 1280 = 256+512+512 (channels from conv3_3, conv4_3, conv5_3)
        
        # Freeze early VGG layers
        for layer in list(self.backbone.children())[:10]:
            for param in layer.parameters():
                param.requires_grad = False

    def forward(self, x):
        features = {}
        
        # Store intermediate feature maps
        intermediate_features = {}
        
        # Extract features from different VGG layers
        for i, layer in enumerate(self.backbone):
            x = layer(x)
            if i == 16:  # conv3_3 output
                intermediate_features['conv3'] = self.l2_norm(x)
            elif i == 23:  # conv4_3 output
                intermediate_features['conv4'] = self.l2_norm(x)
            elif i == 30:  # conv5_3 output
                intermediate_features['conv5'] = self.l2_norm(x)
        
        # Get spatial dimensions of smallest feature map (conv5)
        target_size = intermediate_features['conv5'].shape[2:]
        
        # Resize all feature maps to match conv5's spatial dimensions
        resized_conv3 = F.interpolate(intermediate_features['conv3'], 
                                     size=target_size, 
                                     mode='bilinear', 
                                     align_corners=False)
        
        resized_conv4 = F.interpolate(intermediate_features['conv4'], 
                                     size=target_size, 
                                     mode='bilinear', 
                                     align_corners=False)
        
        # Concatenate feature maps along channel dimension
        combined = torch.cat([resized_conv3, resized_conv4, intermediate_features['conv5']], dim=1)
        
        # Apply 1x1 convolution to reduce channel dimension
        features['0'] = self.conv1x1(combined)
        
        return features

# L2 Normalization layer for feature concatenation
class L2Norm(nn.Module):
    def __init__(self, scale=20.0):
        super(L2Norm, self).__init__()
        self.scale = scale
        
    def forward(self, x):
        # L2 normalization along channel dimension
        norm = x.pow(2).sum(dim=1, keepdim=True).sqrt()
        return self.scale * x / (norm + 1e-10)

# Training function
def train_model(model, data_loader, num_epochs=10):
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    model.to(device)
    
    # Optimizer with learning rate from paper
    optimizer = optim.SGD(
        [p for p in model.parameters() if p.requires_grad],
        lr=0.0001,
        momentum=0.9,
        weight_decay=0.0005
    )
    
    # Learning rate scheduler (optional)
    lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer, step_size=3, gamma=0.1
    )
    
    # Training loop
    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0
        
        for images, targets in tqdm(data_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            # Move data to device
            images = [img.to(device) for img in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            
            # Forward pass
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())
            
            # Backward pass and optimize
            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            
            epoch_loss += losses.item()
        
        # Update learning rate
        lr_scheduler.step()
        
        # Print epoch statistics
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(data_loader):.4f}")
        
        # Save checkpoint after each epoch
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': epoch_loss,
        }, f"checkpoint_epoch_{epoch+1}.pth")
    
    return model

def create_faster_rcnn_model():
    # Load pre-trained VGG16
    vgg16 = models.vgg16(weights=VGG16_Weights.IMAGENET1K_V1)
    backbone = vgg16.features
    
    # Create modified backbone
    backbone_with_concat = BackboneWithFeatureConcatenation(backbone)
    
    # Anchor generator (12 anchors as per paper)
    anchor_sizes = ((64,), (128,), (256,), (512,))
    # aspect_ratios = ((1.0, 0.5, 2.0),) * len(anchor_sizes)
    aspect_ratios = ((1.0, 0.5, 2.0),) 
    anchor_generator = AnchorGenerator(
        sizes=anchor_sizes,
        aspect_ratios=aspect_ratios
    )
    
    # ROI aligner
    roi_pooler = torchvision.ops.MultiScaleRoIAlign(
        featmap_names=['0'],  # Now using single combined feature map
        output_size=7,
        sampling_ratio=2
    )
    
    # Create Faster R-CNN model
    model = FasterRCNN(
        backbone=backbone_with_concat,
        num_classes=2,  # Background + Face
        rpn_anchor_generator=anchor_generator,
        box_roi_pool=roi_pooler,
        rpn_pre_nms_top_n_train=6000,
        rpn_post_nms_top_n_train=2000,
        rpn_nms_thresh=0.7,
        box_score_thresh=0.05,
        box_nms_thresh=0.3,
        box_detections_per_img=100
    )
    
    return model

In [None]:
# Main execution
if __name__ == "__main__":
    # Create dataset without loading all images into memory
    dataset = WiderFaceDataset("/home/hkatti/scratch/processed_batches", max_samples=5000)  # Adjust sample count as needed
    
    data_loader = DataLoader(
        dataset,
        batch_size=1,
        shuffle=True,
        collate_fn=lambda batch: tuple(zip(*batch))  
    )
    
    # Create and train model
    model = create_faster_rcnn_model()
    trained_model = train_model(model, data_loader, num_epochs=5)  # Start with fewer epochs for testing
    
    # Save the trained model
    torch.save(trained_model.state_dict(), "faster_rcnn_face_detection.pth")
    print("Training complete and model saved!")

Dataset initialized with 5000 samples
Using device: cuda


Epoch 1/5:   9%|▉         | 474/5000 [00:43<06:23, 11.82it/s]

Epoch 1/5: 100%|██████████| 5000/5000 [07:28<00:00, 11.16it/s]


Epoch 1/5, Loss: 0.3580


Epoch 2/5: 100%|██████████| 5000/5000 [07:25<00:00, 11.22it/s]


Epoch 2/5, Loss: 0.2380


Epoch 3/5: 100%|██████████| 5000/5000 [07:22<00:00, 11.30it/s]


Epoch 3/5, Loss: 0.2010


Epoch 4/5: 100%|██████████| 5000/5000 [07:22<00:00, 11.30it/s]


Epoch 4/5, Loss: 0.1845


Epoch 5/5: 100%|██████████| 5000/5000 [07:21<00:00, 11.33it/s]


Epoch 5/5, Loss: 0.1817
Training complete and model saved!


First phase of training done:
1. Train on WIDER
2. Get the biggest mistakes for hard negative mining 
3. Below we fine tune on FDDB 

In [22]:
# Cell 14: Hard Negative Mining with Pre-trained Model
# Load the pre-trained model
def load_pretrained_model():
    # Create model architecture
    model = create_faster_rcnn_model()
    
    # Load saved weights
    model.load_state_dict(torch.load("faster_rcnn_face_detection.pth"))
    
    # Set to evaluation mode for inference
    model.eval()
    
    print("Pre-trained model loaded successfully!")
    return model

# Use the loaded model for hard negative mining
def hard_negative_mining(model, data_loader, confidence_threshold=0.8, iou_threshold=0.5):
    """
    Implements hard negative mining as described in the paper:
    - Run inference on training data
    - Collect regions with confidence > 0.8 but IoU < 0.5 with any ground truth
    - These are "hard negatives" (confident false positives)
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()  # Set to evaluation mode
    
    hard_negatives = []
    
    print("Running inference to find hard negatives...")
    with torch.no_grad():
        for images, targets in tqdm(data_loader, desc="Hard negative mining"):
            # Move to device
            images = [img.to(device) for img in images]
            targets_on_device = [{k: v.to(device) for k, v in t.items()} for t in targets]
            
            # Get predictions
            predictions = model(images)
            
            # Process each image in the batch
            for i, (prediction, target) in enumerate(zip(predictions, targets_on_device)):
                # Get boxes with high confidence
                high_conf_indices = torch.where(prediction['scores'] > confidence_threshold)[0]
                if len(high_conf_indices) == 0:
                    continue
                    
                high_conf_boxes = prediction['boxes'][high_conf_indices]
                high_conf_scores = prediction['scores'][high_conf_indices]
                
                # Get ground truth boxes
                gt_boxes = target['boxes']
                
                # Calculate IoU between predictions and ground truth
                hard_neg_indices = []
                for j, box in enumerate(high_conf_boxes):
                    # Calculate IoU with all ground truth boxes
                    ious = box_ops.box_iou(box.unsqueeze(0), gt_boxes)
                    max_iou = ious.max().item()
                    
                    # If max IoU is below threshold, this is a hard negative
                    if max_iou < iou_threshold:
                        hard_neg_indices.append(high_conf_indices[j])
                
                # If we found hard negatives, add them to our collection
                if hard_neg_indices:
                    hard_neg_boxes = prediction['boxes'][hard_neg_indices]
                    hard_neg_scores = prediction['scores'][hard_neg_indices]
                    
                    # Store the image and hard negative regions
                    hard_negatives.append({
                        'image': images[i].cpu(),
                        'hard_neg_boxes': hard_neg_boxes.cpu(),
                        'hard_neg_scores': hard_neg_scores.cpu()
                    })
    
    print(f"Found {len(hard_negatives)} images with hard negatives")
    return hard_negatives

# Create a dataset class for hard negatives
class HardNegativeDataset(Dataset):
    def __init__(self, original_dataset, hard_negatives):
        self.original_dataset = original_dataset
        self.hard_negatives = hard_negatives
        
    def __len__(self):
        return len(self.original_dataset) + len(self.hard_negatives)
    
    def __getitem__(self, idx):
        if idx < len(self.original_dataset):
            # Return original sample
            return self.original_dataset[idx]
        else:
            # Return hard negative sample
            hard_neg_idx = idx - len(self.original_dataset)
            hard_neg = self.hard_negatives[hard_neg_idx]
            
            # Create target with both ground truth and hard negatives
            image = hard_neg['image']
            
            # Create boxes tensor with hard negatives labeled as background (0)
            boxes = hard_neg['hard_neg_boxes']
            labels = torch.zeros(len(boxes), dtype=torch.int64)  # All hard negatives are background
            
            return image, {'boxes': boxes, 'labels': labels}

# Fine-tune with hard negatives
def fine_tune_with_hard_negatives(model, dataset, hard_negatives, num_epochs=5):
    # Create combined dataset
    combined_dataset = HardNegativeDataset(dataset, hard_negatives)
    
    # Create data loader
    data_loader = DataLoader(
        combined_dataset,
        batch_size=1,
        shuffle=True,
        collate_fn=lambda batch: tuple(zip(*batch))
    )
    
    # Fine-tune model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    
    # Optimizer with lower learning rate for fine-tuning
    optimizer = optim.SGD(
        [p for p in model.parameters() if p.requires_grad],
        lr=0.00005,  # Lower learning rate for fine-tuning
        momentum=0.9,
        weight_decay=0.0005
    )
    
    # Training loop
    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0
        
        for i, (images, targets) in enumerate(tqdm(data_loader, desc=f"Fine-tuning epoch {epoch+1}/{num_epochs}")):
            try:
                # Move data to device
                images = [img.to(device) for img in images]
                targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
                
                # Forward pass
                loss_dict = model(images, targets)
                losses = sum(loss for loss in loss_dict.values())
                
                # Backward pass and optimize
                optimizer.zero_grad()
                losses.backward()
                optimizer.step()
                
                epoch_loss += losses.item()
                
            except Exception as e:
                print(f"Error in batch {i}: {str(e)}")
                continue
        
        # Print epoch statistics
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(data_loader):.4f}")
    
    return model

# Execute hard negative mining and fine-tuning with pre-trained model
model = load_pretrained_model()
hard_negatives = hard_negative_mining(model, data_loader)
model = fine_tune_with_hard_negatives(model, dataset, hard_negatives)

# Save the fine-tuned model
torch.save(model.state_dict(), "faster_rcnn_face_detection_with_hard_negatives.pth")


  model.load_state_dict(torch.load("faster_rcnn_face_detection.pth"))


Pre-trained model loaded successfully!
Running inference to find hard negatives...


Hard negative mining: 100%|██████████| 5000/5000 [04:25<00:00, 18.82it/s]


Found 0 images with hard negatives


Fine-tuning epoch 1/5: 100%|██████████| 5000/5000 [07:20<00:00, 11.34it/s]


Epoch 1/5, Loss: 0.1803


Fine-tuning epoch 2/5: 100%|██████████| 5000/5000 [07:26<00:00, 11.20it/s]


Epoch 2/5, Loss: 0.1736


Fine-tuning epoch 3/5: 100%|██████████| 5000/5000 [07:25<00:00, 11.23it/s]


Epoch 3/5, Loss: 0.1683


Fine-tuning epoch 4/5: 100%|██████████| 5000/5000 [07:19<00:00, 11.37it/s]


Epoch 4/5, Loss: 0.1639


Fine-tuning epoch 5/5: 100%|██████████| 5000/5000 [07:20<00:00, 11.35it/s]


Epoch 5/5, Loss: 0.1606


In [23]:
# 15: FDDB Dataset Preparation
import os
import cv2
import numpy as np
from torch.utils.data import Dataset, DataLoader

class FDDBDataset(Dataset):
    def __init__(self, fddb_dir, fold_id=None, transform=None):
        """
        FDDB Dataset loader
        
        Args:
            fddb_dir: Root directory of FDDB dataset
            fold_id: If provided, only load this specific fold (1-10)
            transform: Optional transforms to apply
        """
        self.fddb_dir = fddb_dir
        self.transform = transform
        self.images = []
        self.annotations = []
        
        # FDDB directory structure
        self.images_dir = os.path.join(fddb_dir, "originalPics")
        
        # Load fold files
        fold_files = []
        if fold_id is not None:
            # Load specific fold
            fold_file = os.path.join(fddb_dir, "FDDB-folds", f"FDDB-fold-{fold_id:02d}.txt")
            fold_files = [fold_file]
        else:
            # Load all folds
            for i in range(1, 11):
                fold_file = os.path.join(fddb_dir, "FDDB-folds", f"FDDB-fold-{i:02d}.txt")
                fold_files.append(fold_file)
        
        # Load annotations
        for fold_file in fold_files:
            annotation_file = fold_file.replace(".txt", "-ellipseList.txt")
            self._load_fold(fold_file, annotation_file)
            
        print(f"Loaded {len(self.images)} images with {sum(len(a) for a in self.annotations)} faces from FDDB")
    
    def _load_fold(self, image_list_file, annotation_file):
        """Load images and annotations from a specific fold"""
        # Read image list
        with open(image_list_file, 'r') as f:
            image_paths = [line.strip() for line in f if line.strip()]
        
        # Read annotations
        with open(annotation_file, 'r') as f:
            lines = [line.strip() for line in f if line.strip()]
        
        i = 0
        while i < len(lines):
            # Get image path
            img_path = lines[i]
            if img_path not in image_paths:
                i += 1
                continue
                
            # Get number of faces
            i += 1
            if i >= len(lines):
                break
                
            num_faces = int(lines[i])
            i += 1
            
            # Read face annotations
            faces = []
            for j in range(num_faces):
                if i >= len(lines):
                    break
                    
                # FDDB annotations are in ellipse format
                # major_axis_radius minor_axis_radius angle center_x center_y 1
                ellipse = list(map(float, lines[i].split()))
                i += 1
                
                # Convert ellipse to bounding box
                major_axis = ellipse[0]
                minor_axis = ellipse[1]
                angle = ellipse[2]
                center_x = ellipse[3]
                center_y = ellipse[4]
                
                # Calculate bounding box (approximation)
                w = 2 * major_axis
                h = 2 * minor_axis
                x = center_x - w/2
                y = center_y - h/2
                
                faces.append({'bbox': [x, y, w, h]})
            
            # Add image and annotations
            full_img_path = os.path.join(self.images_dir, img_path + ".jpg")
            if os.path.exists(full_img_path):
                self.images.append(full_img_path)
                self.annotations.append(faces)
    
    def __len__(self):
        return len(self.images)
    
    def __getitem__(self, idx):
        # Load image
        img_path = self.images[idx]
        image = cv2.imread(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
        # Get annotations
        annotations = self.annotations[idx]
        
        # Apply transforms if any
        if self.transform:
            transformed = self.transform(image=image, bboxes=[ann['bbox'] for ann in annotations])
            image = transformed['image']
            boxes = transformed['bboxes']
            
            # Update annotations
            for i, box in enumerate(boxes):
                annotations[i]['bbox'] = box
        
        # Convert to tensor
        image = torch.from_numpy(image.astype(np.float32) / 255.0).permute(2, 0, 1)
        
        # Convert bounding boxes to [x1, y1, x2, y2] format
        boxes = []
        for ann in annotations:
            x, y, w, h = ann['bbox']
            boxes.append([float(x), float(y), float(x + w), float(y + h)])
        
        boxes_tensor = torch.tensor(boxes, dtype=torch.float32) if boxes else torch.zeros((0, 4), dtype=torch.float32)
        labels = torch.ones(len(boxes), dtype=torch.int64)  # All faces are class 1
        
        return image, {'boxes': boxes_tensor, 'labels': labels}

# Create FDDB dataset
fddb_dir = "/home/hkatti/scratch/datasets/FDDB"  # Update with your FDDB dataset path
fddb_dataset = FDDBDataset(fddb_dir)

# Create data loader for FDDB
fddb_loader = DataLoader(
    fddb_dataset,
    batch_size=1,
    shuffle=True,
    collate_fn=lambda batch: tuple(zip(*batch))
)


Loaded 2845 images with 5171 faces from FDDB


In [24]:
# Cell 16: Multi-scale Training for FDDB Fine-tuning (without Albumentations)
class MultiScaleTransform:
    def __init__(self, scales=[480, 600, 750]):
        self.scales = scales
        
    def __call__(self, image, annotations):
        # Randomly select a scale
        scale = random.choice(self.scales)
        
        # Calculate the scaling factor
        h, w = image.shape[:2]
        scale_factor = scale / min(h, w)
        
        # Cap the longer side if needed (as in the paper)
        if max(h, w) * scale_factor > 1000:
            scale_factor = 1000 / max(h, w)
        
        # Calculate new dimensions
        new_h = int(h * scale_factor)
        new_w = int(w * scale_factor)
        
        # Resize image
        resized_img = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
        
        # Scale annotations
        resized_annotations = []
        for ann in annotations:
            x, y, w, h = ann['bbox']
            new_ann = ann.copy()
            new_ann['bbox'] = [x * scale_factor, y * scale_factor, 
                              w * scale_factor, h * scale_factor]
            resized_annotations.append(new_ann)
        
        return resized_img, resized_annotations

# Create FDDB dataset with multi-scale augmentation
class FDDBMultiScaleDataset(Dataset):
    def __init__(self, fddb_dir, scales=[480, 600, 750]):
        self.fddb_dataset = FDDBDataset(fddb_dir)
        self.multi_scale_transform = MultiScaleTransform(scales)
        
    def __len__(self):
        return len(self.fddb_dataset)
    
    def __getitem__(self, idx):
        # Get original image and annotations
        img_path = self.fddb_dataset.images[idx]
        image = cv2.imread(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        annotations = self.fddb_dataset.annotations[idx]
        
        # Apply multi-scale transform
        resized_img, resized_annotations = self.multi_scale_transform(image, annotations)
        
        # Convert to tensor
        image_tensor = torch.from_numpy(resized_img.astype(np.float32) / 255.0).permute(2, 0, 1)
        
        # Convert bounding boxes to [x1, y1, x2, y2] format
        boxes = []
        for ann in resized_annotations:
            x, y, w, h = ann['bbox']
            # Ensure positive width and height
            if w > 0 and h > 0:
                boxes.append([float(x), float(y), float(x + w), float(y + h)])
        
        boxes_tensor = torch.tensor(boxes, dtype=torch.float32) if boxes else torch.zeros((0, 4), dtype=torch.float32)
        labels = torch.ones(len(boxes), dtype=torch.int64)  # All faces are class 1
        
        return image_tensor, {'boxes': boxes_tensor, 'labels': labels}

# Create FDDB dataset with multi-scale augmentation
fddb_multi_scale_dataset = FDDBMultiScaleDataset(fddb_dir)

# Create data loader for FDDB with multi-scale augmentation
fddb_multi_scale_loader = DataLoader(
    fddb_multi_scale_dataset,
    batch_size=1,
    shuffle=True,
    collate_fn=lambda batch: tuple(zip(*batch))
)


Loaded 2845 images with 5171 faces from FDDB


In [25]:
# Cell 17: Fine-tuning on FDDB
def fine_tune_on_fddb(model, data_loader, num_iterations=40000, learning_rate=0.001):
    """
    Fine-tune the model on FDDB dataset as described in the paper:
    - 40,000 iterations
    - Fixed learning rate of 0.001
    - Multi-scale training
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    model.to(device)
    
    # Optimizer with learning rate from paper
    optimizer = optim.SGD(
        [p for p in model.parameters() if p.requires_grad],
        lr=learning_rate,
        momentum=0.9,
        weight_decay=0.0005
    )
    
    # Training loop
    model.train()
    iteration = 0
    epoch = 0
    
    # Create an infinite data loader by cycling through the dataset
    data_iter = iter(data_loader)
    
    pbar = tqdm(total=num_iterations, desc="Fine-tuning on FDDB")
    
    while iteration < num_iterations:
        try:
            # Get next batch
            try:
                images, targets = next(data_iter)
            except StopIteration:
                # Restart the iterator when it's exhausted
                data_iter = iter(data_loader)
                epoch += 1
                print(f"Starting epoch {epoch}")
                images, targets = next(data_iter)
            
            # Move data to device
            images = [img.to(device) for img in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            
            # Forward pass
            loss_dict = model(images, targets)
            losses = sum(loss for loss in loss_dict.values())
            
            # Backward pass and optimize
            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            
            # Update progress
            iteration += 1
            pbar.update(1)
            pbar.set_postfix(loss=losses.item())
            
            # Save checkpoint every 5000 iterations
            if iteration % 5000 == 0:
                torch.save({
                    'iteration': iteration,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': losses.item(),
                }, f"fddb_checkpoint_iter_{iteration}.pth")
                
        except Exception as e:
            print(f"Error in iteration {iteration}: {str(e)}")
            continue
    
    pbar.close()
    
    # Save final model
    torch.save(model.state_dict(), "faster_rcnn_face_detection_fddb.pth")
    return model

# Fine-tune the model on FDDB
model = fine_tune_on_fddb(model, fddb_multi_scale_loader)


Using device: cuda


Fine-tuning on FDDB:   7%|▋         | 2848/40000 [04:32<58:21, 10.61it/s, loss=0.237]   

Starting epoch 1


Fine-tuning on FDDB:  14%|█▍        | 5692/40000 [09:05<49:50, 11.47it/s, loss=0.172]   

Starting epoch 2


Fine-tuning on FDDB:  21%|██▏       | 8537/40000 [13:35<45:22, 11.56it/s, loss=0.0606]  

Starting epoch 3


Fine-tuning on FDDB:  28%|██▊       | 11381/40000 [18:06<38:06, 12.52it/s, loss=0.426]  

Starting epoch 4


Fine-tuning on FDDB:  36%|███▌      | 14228/40000 [22:41<36:30, 11.76it/s, loss=0.49]  

Starting epoch 5


Fine-tuning on FDDB:  43%|████▎     | 17071/40000 [27:10<33:11, 11.51it/s, loss=0.0736]  

Starting epoch 6


Fine-tuning on FDDB:  50%|████▉     | 19917/40000 [31:40<33:44,  9.92it/s, loss=0.932] 

Starting epoch 7


Fine-tuning on FDDB:  57%|█████▋    | 22761/40000 [36:10<26:19, 10.91it/s, loss=0.268] 

Starting epoch 8


Fine-tuning on FDDB:  64%|██████▍   | 25608/40000 [40:39<23:41, 10.12it/s, loss=0.13]  

Starting epoch 9


Fine-tuning on FDDB:  71%|███████   | 28453/40000 [45:05<16:02, 12.00it/s, loss=0.12]  

Starting epoch 10


Fine-tuning on FDDB:  78%|███████▊  | 31296/40000 [49:33<11:19, 12.80it/s, loss=0.0704]

Starting epoch 11


Fine-tuning on FDDB:  85%|████████▌ | 34142/40000 [54:01<09:21, 10.43it/s, loss=0.0666]

Starting epoch 12


Fine-tuning on FDDB:  92%|█████████▏| 36986/40000 [58:24<04:16, 11.73it/s, loss=0.396] 

Starting epoch 13


Fine-tuning on FDDB: 100%|█████████▉| 39831/40000 [1:02:52<00:15, 11.06it/s, loss=0.248] 

Starting epoch 14


Fine-tuning on FDDB: 100%|██████████| 40000/40000 [1:03:08<00:00, 10.56it/s, loss=0.0802]


In [27]:
# # Impossible to know which image is saved - name not saved with folder etc. 
# # Cell 18: FDDB Evaluation 
# def evaluate_on_fddb(model, fddb_dir, output_dir="fddb_results"):
#     """
#     Evaluate the model on FDDB dataset following the paper's methodology
#     """
#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     model.to(device)
#     model.eval()
    
#     # Create output directory
#     os.makedirs(output_dir, exist_ok=True)
    
#     # Process each fold
#     for fold_id in range(1, 11):
#         # Create dataset for this fold
#         fold_dataset = FDDBDataset(fddb_dir, fold_id=fold_id)
#         fold_loader = DataLoader(
#             fold_dataset,
#             batch_size=1,
#             shuffle=False,
#             collate_fn=lambda batch: tuple(zip(*batch))
#         )
        
#         # Create output file for this fold
#         det_file = os.path.join(output_dir, f"fold-{fold_id:02d}-detections.txt")
        
#         with open(det_file, 'w') as f_det:
#             with torch.no_grad():
#                 for i, (images, targets) in enumerate(tqdm(fold_loader, desc=f"Evaluating fold {fold_id}")):
#                     # Get image path directly from the dataset
#                     img_path = fold_dataset.images[i]  # Use the batch index directly
#                     img_name = os.path.splitext(os.path.basename(img_path))[0]
                    
#                     # Move to device
#                     images = [img.to(device) for img in images]
                    
#                     # Multi-scale testing (optional, as in the paper)
#                     # Test with 3 scales and average results
#                     scales = [480, 600, 750]
#                     all_predictions = []
                    
#                     for scale in scales:
#                         # Resize image
#                         orig_img = images[0].cpu().permute(1, 2, 0).numpy() * 255
#                         h, w = orig_img.shape[:2]
#                         scale_factor = scale / min(h, w)
#                         if max(h, w) * scale_factor > 1000:
#                             scale_factor = 1000 / max(h, w)
                        
#                         new_h = int(h * scale_factor)
#                         new_w = int(w * scale_factor)
                        
#                         resized_img = cv2.resize(orig_img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
#                         resized_tensor = torch.from_numpy(resized_img.astype(np.float32) / 255.0).permute(2, 0, 1).to(device)
                        
#                         # Get predictions
#                         predictions = model([resized_tensor])
                        
#                         # Scale back predictions to original size
#                         boxes = predictions[0]['boxes'].cpu() / scale_factor
#                         scores = predictions[0]['scores'].cpu()
                        
#                         # Store predictions
#                         all_predictions.append((boxes, scores))
                    
#                     # Combine predictions from different scales
#                     all_boxes = torch.cat([p[0] for p in all_predictions])
#                     all_scores = torch.cat([p[1] for p in all_predictions])
                    
#                     # Apply confidence threshold
#                     conf_mask = all_scores > 0.8
#                     boxes = all_boxes[conf_mask]
#                     scores = all_scores[conf_mask]
                    
#                     # Apply NMS
#                     keep_indices = torchvision.ops.nms(boxes, scores, 0.3)
#                     boxes = boxes[keep_indices]
#                     scores = scores[keep_indices]
                    
#                     # Limit to top 100 detections
#                     if len(scores) > 100:
#                         top_indices = torch.argsort(scores, descending=True)[:100]
#                         boxes = boxes[top_indices]
#                         scores = scores[top_indices]
                    
#                     # Write detections to file
#                     f_det.write(f"{img_name}\n")
#                     f_det.write(f"{len(boxes)}\n")
                    
#                     for i in range(len(boxes)):
#                         # Convert to FDDB format (x, y, w, h, score)
#                         x1, y1, x2, y2 = boxes[i].tolist()
#                         w = x2 - x1
#                         h = y2 - y1
#                         score = scores[i].item()
#                         f_det.write(f"{x1} {y1} {w} {h} {score}\n")
    
#     print(f"Evaluation complete. Results saved to {output_dir}")
#     return output_dir

# # Evaluate the model on FDDB
# results_dir = evaluate_on_fddb(model, fddb_dir)


In [28]:
def evaluate_on_fddb(model, fddb_dir, output_dir="fddb_results"):
    """
    Evaluate the model on FDDB dataset following the paper's methodology
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()
    
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    
    # Process each fold
    for fold_id in range(1, 11):
        # Create dataset for this fold
        fold_dataset = FDDBDataset(fddb_dir, fold_id=fold_id)
        fold_loader = DataLoader(
            fold_dataset,
            batch_size=1,
            shuffle=False,
            collate_fn=lambda batch: tuple(zip(*batch))
        )
        
        # Create output file for this fold
        det_file = os.path.join(output_dir, f"fold-{fold_id:02d}-detections.txt")
        
        with open(det_file, 'w') as f_det:
            with torch.no_grad():
                for i, (images, targets) in enumerate(tqdm(fold_loader, desc=f"Evaluating fold {fold_id}")):
                    # Get image path directly from the dataset
                    img_path = fold_dataset.images[i]  # Use the batch index directly
                    
                    # Extract relative path without extension
                    # Remove images_dir prefix and .jpg extension
                    images_dir = os.path.join(fddb_dir, "originalPics")
                    rel_path = img_path.replace(images_dir + os.path.sep, "").replace(".jpg", "")
                    
                    # Move to device
                    images = [img.to(device) for img in images]
                    
                    # Multi-scale testing (optional, as in the paper)
                    # Test with 3 scales and average results
                    scales = [480, 600, 750]
                    all_predictions = []
                    
                    for scale in scales:
                        # Resize image
                        orig_img = images[0].cpu().permute(1, 2, 0).numpy() * 255
                        h, w = orig_img.shape[:2]
                        scale_factor = scale / min(h, w)
                        if max(h, w) * scale_factor > 1000:
                            scale_factor = 1000 / max(h, w)
                        
                        new_h = int(h * scale_factor)
                        new_w = int(w * scale_factor)
                        
                        resized_img = cv2.resize(orig_img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
                        resized_tensor = torch.from_numpy(resized_img.astype(np.float32) / 255.0).permute(2, 0, 1).to(device)
                        
                        # Get predictions
                        predictions = model([resized_tensor])
                        
                        # Scale back predictions to original size
                        boxes = predictions[0]['boxes'].cpu() / scale_factor
                        scores = predictions[0]['scores'].cpu()
                        
                        # Store predictions
                        all_predictions.append((boxes, scores))
                    
                    # Combine predictions from different scales
                    all_boxes = torch.cat([p[0] for p in all_predictions])
                    all_scores = torch.cat([p[1] for p in all_predictions])
                    
                    # Apply confidence threshold
                    conf_mask = all_scores > 0.8
                    boxes = all_boxes[conf_mask]
                    scores = all_scores[conf_mask]
                    
                    # Apply NMS
                    keep_indices = torchvision.ops.nms(boxes, scores, 0.3)
                    boxes = boxes[keep_indices]
                    scores = scores[keep_indices]
                    
                    # Limit to top 100 detections
                    if len(scores) > 100:
                        top_indices = torch.argsort(scores, descending=True)[:100]
                        boxes = boxes[top_indices]
                        scores = scores[top_indices]
                    
                    # Write detections to file with full relative path
                    f_det.write(f"{rel_path}\n")
                    f_det.write(f"{len(boxes)}\n")
                    
                    for i in range(len(boxes)):
                        # Convert to FDDB format (x, y, w, h, score)
                        x1, y1, x2, y2 = boxes[i].tolist()
                        w = x2 - x1
                        h = y2 - y1
                        score = scores[i].item()
                        f_det.write(f"{x1} {y1} {w} {h} {score}\n")
    
    print(f"Evaluation complete. Results saved to {output_dir}")
    return output_dir


In [29]:
# Evaluate the model on FDDB
results_dir = evaluate_on_fddb(model, fddb_dir)

Loaded 290 images with 515 faces from FDDB


Evaluating fold 1: 100%|██████████| 290/290 [00:40<00:00,  7.20it/s]


Loaded 285 images with 519 faces from FDDB


Evaluating fold 2: 100%|██████████| 285/285 [00:39<00:00,  7.17it/s]


Loaded 274 images with 517 faces from FDDB


Evaluating fold 3: 100%|██████████| 274/274 [00:37<00:00,  7.21it/s]


Loaded 302 images with 517 faces from FDDB


Evaluating fold 4: 100%|██████████| 302/302 [00:42<00:00,  7.16it/s]


Loaded 298 images with 514 faces from FDDB


Evaluating fold 5: 100%|██████████| 298/298 [00:40<00:00,  7.27it/s]


Loaded 302 images with 518 faces from FDDB


Evaluating fold 6: 100%|██████████| 302/302 [00:42<00:00,  7.14it/s]


Loaded 279 images with 518 faces from FDDB


Evaluating fold 7: 100%|██████████| 279/279 [00:38<00:00,  7.25it/s]


Loaded 276 images with 518 faces from FDDB


Evaluating fold 8: 100%|██████████| 276/276 [00:38<00:00,  7.16it/s]


Loaded 259 images with 514 faces from FDDB


Evaluating fold 9: 100%|██████████| 259/259 [00:36<00:00,  7.05it/s]


Loaded 280 images with 521 faces from FDDB


Evaluating fold 10: 100%|██████████| 280/280 [00:39<00:00,  7.08it/s]

Evaluation complete. Results saved to fddb_results



