COMPUTER VISION - OBJECT DETECTION PROJECT TEMPLATE
====================================================
Use Cases: Object Detection, Instance Segmentation, Face Detection

# 1. PROJECT SETUP & ENVIRONMENT

## 1.1 Import Libraries

In [None]:
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Deep Learning
import torch
import torch.nn as nn
import torchvision
from torchvision import transforms, models
from torch.utils.data import Dataset, DataLoader
import albumentations as A
from albumentations.pytorch import ToTensorV2

# Detection specific
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.ops import box_iou

In [None]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

## 1.2 Configuration

In [None]:
# Central experiment configuration, read by the cells below.
CONFIG = {
    'data_dir': './data',            # root folder holding images/ and annotations/
    'img_size': (640, 640),          # (height, width) fed to the resize transforms
    'batch_size': 8,
    'num_epochs': 50,
    'learning_rate': 0.001,
    'num_classes': 10,  # Including background
    'num_workers': 4,
    'save_dir': './models',
    'confidence_threshold': 0.5,     # minimum score for a detection to be kept
    'iou_threshold': 0.5,
    'random_seed': 42
}

# Set random seeds
# Python's built-in RNG is seeded as well, so that any code drawing from the
# `random` module is reproducible together with torch and NumPy.
import random

random.seed(CONFIG['random_seed'])
torch.manual_seed(CONFIG['random_seed'])
np.random.seed(CONFIG['random_seed'])

# 2. DATA LOADING & EXPLORATION

## 2.1 Custom Dataset Class

In [None]:
class ObjectDetectionDataset(Dataset):
    """Dataset of images paired with JSON bounding-box annotations.

    Images are read from ``<root_dir>/images/<mode>/*.jpg`` and annotations
    from ``<root_dir>/annotations/<mode>/*.json``. Both listings are sorted
    and paired by position.
    NOTE(review): pairing relies on the two sorted listings lining up (e.g.
    identical file stems) -- confirm against the dataset layout.

    Each annotation file must contain an ``objects`` list whose entries have a
    ``bbox`` (``[xmin, ymin, xmax, ymax]``) and a ``label``.

    Args:
        root_dir: Dataset root directory.
        transforms: Optional callable invoked as
            ``transforms(image=..., bboxes=..., labels=...)`` that returns a
            mapping with ``image``, ``bboxes`` and ``labels`` keys.
        mode: Sub-directory name of the split to load (e.g. ``'train'``).

    Raises:
        ValueError: If the number of images and annotation files differ.
    """

    def __init__(self, root_dir, transforms=None, mode='train'):
        self.root_dir = Path(root_dir)
        self.transforms = transforms
        self.mode = mode

        # Load image paths and annotations
        self.images = sorted((self.root_dir / 'images' / mode).glob('*.jpg'))
        self.annotations = sorted((self.root_dir / 'annotations' / mode).glob('*.json'))

        # Explicit raise instead of ``assert`` so the check survives ``python -O``.
        if len(self.images) != len(self.annotations):
            raise ValueError("Mismatch between images and annotations")

    def __len__(self):
        """Return the number of image/annotation pairs."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return ``(image, target)`` for sample ``idx``.

        ``target`` is a dict with ``boxes`` (float32, shape ``(N, 4)``),
        ``labels`` (int64, shape ``(N,)``) and ``image_id``. Without
        ``transforms`` the image is the RGB array returned by OpenCV;
        otherwise it is whatever the transform pipeline produces.

        Raises:
            OSError: If the image file cannot be read or decoded.
        """
        # Load image
        img_path = str(self.images[idx])
        image = cv2.imread(img_path)
        if image is None:
            # cv2.imread signals failure by returning None rather than raising.
            raise OSError(f"Failed to read image: {img_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Load annotations
        with open(self.annotations[idx], 'r', encoding='utf-8') as f:
            annot = json.load(f)

        objects = annot['objects']
        boxes = [obj['bbox'] for obj in objects]  # [xmin, ymin, xmax, ymax]
        labels = [obj['label'] for obj in objects]

        if self.transforms:
            # Hand the pipeline plain Python lists, and read back BOTH boxes
            # and labels: a transform may drop boxes, and the labels must stay
            # aligned with the boxes that survive.
            transformed = self.transforms(image=image, bboxes=boxes, labels=labels)
            image = transformed['image']
            boxes = transformed['bboxes']
            labels = transformed['labels']

        target = {
            # reshape keeps an object-free image at shape (0, 4) instead of (0,)
            'boxes': torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4),
            'labels': torch.as_tensor(labels, dtype=torch.int64),
            'image_id': torch.tensor([idx])
        }

        return image, target

## 2.2 Data Exploration

In [None]:
# Load sample data for exploration
sample_dataset = ObjectDetectionDataset(CONFIG['data_dir'], mode='train')

print(f"Total training samples: {len(sample_dataset)}")


# Visualize sample images with bounding boxes
def visualize_sample(dataset, num_samples=4):
    """Plot the first ``num_samples`` items of ``dataset`` with their boxes.

    Args:
        dataset: Indexable dataset returning ``(image, target)`` where
            ``target`` holds ``boxes`` and ``labels`` tensors.
        num_samples: Number of items to draw. Clamped to ``len(dataset)``;
            nothing is drawn when the result is zero.
    """
    num_samples = min(num_samples, len(dataset))
    if num_samples <= 0:
        return

    # Two columns and as many rows as needed; the default of 4 samples gives
    # the usual 2x2 grid at 15x15 inches.
    n_cols = 2
    n_rows = (num_samples + n_cols - 1) // n_cols
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 7.5 * n_rows), squeeze=False)
    axes = axes.ravel()

    for i in range(num_samples):
        image, target = dataset[i]

        if torch.is_tensor(image):
            # CHW tensor -> HWC array for imshow
            image = image.permute(1, 2, 0).numpy()

        axes[i].imshow(image)

        # Draw bounding boxes. tolist() yields plain numbers, so the caption
        # reads "Class 3" rather than "Class tensor(3)".
        for box, label in zip(target['boxes'].tolist(), target['labels'].tolist()):
            x1, y1, x2, y2 = box
            rect = plt.Rectangle((x1, y1), x2 - x1, y2 - y1,
                                 fill=False, color='red', linewidth=2)
            axes[i].add_patch(rect)
            axes[i].text(x1, y1 - 5, f'Class {label}',
                         color='red', fontsize=10, weight='bold')

        axes[i].set_title(f'Sample {i+1}')
        axes[i].axis('off')

    # Hide grid cells left over when num_samples is odd.
    for unused_ax in axes[num_samples:]:
        unused_ax.axis('off')

    plt.tight_layout()
    plt.show()


visualize_sample(sample_dataset)

In [None]:
# Analyze class distribution
# Walk the whole sample dataset once, collecting every object's label and the
# pixel area of its bounding box.
all_labels = []
box_areas = []

for i in range(len(sample_dataset)):
    _, target = sample_dataset[i]
    all_labels.extend(target['labels'].tolist())

    # reshape(-1, 4) keeps column indexing valid for images without objects,
    # whose box tensor would otherwise be 1-D and make boxes[:, 2] raise.
    boxes = target['boxes'].reshape(-1, 4)
    areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    box_areas.extend(areas.tolist())

# Plot class distribution
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
pd.Series(all_labels).value_counts().plot(kind='bar')
plt.title('Class Distribution')
plt.xlabel('Class ID')
plt.ylabel('Count')

# Plot bounding-box area histogram
plt.subplot(1, 2, 2)
plt.hist(box_areas, bins=50, edgecolor='black')
plt.title('Bounding Box Area Distribution')
plt.xlabel('Area (pixels)')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

# 3. DATA AUGMENTATION & PREPROCESSING

In [None]:
# Training pipeline: random crop/flip/photometric jitter, then ImageNet-style
# normalization and conversion to a tensor. Boxes use pascal_voc
# ([xmin, ymin, xmax, ymax]) coordinates and travel with the 'labels' field.
train_transforms = A.Compose(
    transforms=[
        A.RandomResizedCrop(
            height=CONFIG['img_size'][0],
            width=CONFIG['img_size'][1],
            scale=(0.8, 1.0),
        ),
        A.HorizontalFlip(p=0.5),
        A.RandomBrightnessContrast(p=0.2),
        A.ColorJitter(p=0.2),
        A.Blur(blur_limit=3, p=0.1),
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2(),
    ],
    bbox_params=A.BboxParams(format='pascal_voc', label_fields=['labels']),
)

# Validation pipeline: deterministic resize plus the same normalization.
val_transforms = A.Compose(
    transforms=[
        A.Resize(height=CONFIG['img_size'][0], width=CONFIG['img_size'][1]),
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2(),
    ],
    bbox_params=A.BboxParams(format='pascal_voc', label_fields=['labels']),
)

# 4. DATALOADER SETUP

In [None]:
def collate_fn(batch):
    """Transpose a batch of samples into per-field tuples.

    A list of ``(image, target)`` pairs becomes ``(images, targets)``; nothing
    is stacked, so samples may differ in size. An empty batch gives ``()``.
    """
    per_field = zip(*batch)
    return tuple(per_field)

# Build the train/val datasets, each with its own transform pipeline.
train_dataset = ObjectDetectionDataset(
    CONFIG['data_dir'],
    transforms=train_transforms,
    mode='train',
)
val_dataset = ObjectDetectionDataset(
    CONFIG['data_dir'],
    transforms=val_transforms,
    mode='val',
)

# Wrap them in loaders; only the training loader shuffles.
train_loader = DataLoader(
    dataset=train_dataset,
    collate_fn=collate_fn,
    num_workers=CONFIG['num_workers'],
    batch_size=CONFIG['batch_size'],
    shuffle=True,
)

val_loader = DataLoader(
    dataset=val_dataset,
    collate_fn=collate_fn,
    num_workers=CONFIG['num_workers'],
    batch_size=CONFIG['batch_size'],
    shuffle=False,
)

print(f"Training batches: {len(train_loader)}")
print(f"Validation batches: {len(val_loader)}")

# 5. MODEL ARCHITECTURE

## 5.1 Load Pretrained Model

In [None]:
def get_model(num_classes):
    """Build a pretrained Faster R-CNN (ResNet50-FPN) with a new box head.

    Args:
        num_classes: Number of output classes of the replacement predictor
            (the configuration counts the background class in this number).

    Returns:
        The detection model with its box predictor replaced by a freshly
        initialised ``FastRCNNPredictor``.
    """
    detection = torchvision.models.detection
    if hasattr(detection, 'FasterRCNN_ResNet50_FPN_Weights'):
        # Newer torchvision: ``pretrained=`` is deprecated in favour of the
        # ``weights=`` argument.
        model = fasterrcnn_resnet50_fpn(
            weights=detection.FasterRCNN_ResNet50_FPN_Weights.DEFAULT
        )
    else:
        # Older torchvision without the weights enum.
        model = fasterrcnn_resnet50_fpn(pretrained=True)

    # Replace classifier head
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = detection.faster_rcnn.FastRCNNPredictor(
        in_features, num_classes
    )

    return model


model = get_model(CONFIG['num_classes']).to(device)
print("Model loaded successfully")

## 5.2 Alternative: YOLOv5 (Ultralytics)

In [None]:
# # Install: pip install ultralytics
# from ultralytics import YOLO
# 
# # Load pretrained YOLOv5
# model = YOLO('yolov5s.pt')
# 
# # Train
# results = model.train(data='data.yaml', epochs=CONFIG['num_epochs'], imgsz=CONFIG['img_size'][0])

# 6. TRAINING SETUP

In [None]:
# SGD with momentum over all model parameters.
optimizer = torch.optim.SGD(
    params=model.parameters(),
    lr=CONFIG['learning_rate'],
    weight_decay=0.0005,
    momentum=0.9,
)

# Step schedule: multiply the learning rate by 0.1 every 3 scheduler steps.
# NOTE(review): the training loop steps this once per epoch, so over the
# configured 50 epochs the rate shrinks by 10x sixteen times -- confirm that
# this decay is intended.
lr_scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer=optimizer,
    gamma=0.1,
    step_size=3,
)

## 6.1 Training Loop

In [None]:
def train_one_epoch(model, optimizer, data_loader, device, epoch):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Detection model that returns a dict of losses when called as
            ``model(images, targets)`` in training mode.
        optimizer: Optimizer updating the model parameters.
        data_loader: Iterable yielding ``(images, targets)`` batches, where
            ``targets`` is a sequence of dicts of tensors.
        device: Device the batch is moved to.
        epoch: Epoch number, used only for the progress-bar label.

    Returns:
        Mean summed loss per batch, or ``0.0`` if the loader yielded nothing.

    Raises:
        RuntimeError: If the summed loss becomes NaN or infinite.
    """
    import math

    model.train()
    total_loss = 0.0
    num_batches = 0

    progress_bar = tqdm(data_loader, desc=f'Epoch {epoch}')

    for images, targets in progress_bar:
        images = [image.to(device) for image in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        # Forward pass: the model returns one loss tensor per component.
        loss_dict = model(images, targets)
        losses = sum(loss_dict.values())
        loss_value = losses.item()

        # Stop before a NaN/inf loss is back-propagated into the weights.
        if not math.isfinite(loss_value):
            components = {k: v.item() for k, v in loss_dict.items()}
            raise RuntimeError(
                f"Non-finite loss {loss_value} in epoch {epoch}: {components}"
            )

        # Backward pass
        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        total_loss += loss_value
        num_batches += 1

        progress_bar.set_postfix({'loss': loss_value})

    # Count batches ourselves so an empty loader does not divide by zero.
    return total_loss / num_batches if num_batches else 0.0

## 6.2 Validation Function

In [None]:
@torch.no_grad()
def evaluate(model, data_loader, device):
    """Run inference over ``data_loader`` and collect results on the CPU.

    The model is switched to eval mode. Returns ``(all_predictions,
    all_targets)``: two flat lists with one dict of CPU tensors per image.
    """
    model.eval()

    all_predictions, all_targets = [], []

    for images, targets in tqdm(data_loader, desc='Validating'):
        batch = [img.to(device) for img in images]
        outputs = model(batch)

        # Move everything to the CPU so results do not pile up on the device.
        for output in outputs:
            all_predictions.append({key: value.cpu() for key, value in output.items()})
        for truth in targets:
            all_targets.append({key: value.cpu() for key, value in truth.items()})

    return all_predictions, all_targets

# 7. MODEL TRAINING

In [None]:
# Training history
# NOTE: 'val_mAP' stays empty until a metric is computed in the loop below.
history = {
    'train_loss': [],
    'val_mAP': []
}

best_mAP = 0.0

for epoch in range(CONFIG['num_epochs']):
    print(f"\nEpoch {epoch+1}/{CONFIG['num_epochs']}")

    # Train
    # Pass the 1-based epoch so the progress-bar label matches the header
    # printed above.
    train_loss = train_one_epoch(model, optimizer, train_loader, device, epoch + 1)
    history['train_loss'].append(train_loss)

    # Validate
    predictions, targets = evaluate(model, val_loader, device)

    # Calculate mAP (simplified)
    # In practice, use COCO evaluation metrics

    # Update learning rate (once per epoch)
    lr_scheduler.step()

    # Save best model
    # if val_mAP > best_mAP:
    #     best_mAP = val_mAP
    #     torch.save(model.state_dict(), f"{CONFIG['save_dir']}/best_model.pth")

    print(f"Train Loss: {train_loss:.4f}")

# 8. EVALUATION METRICS

In [None]:
def calculate_iou(box1, box2):
    """Return the intersection-over-union of two boxes.

    Both boxes are indexable as ``[xmin, ymin, xmax, ymax]``. The result is
    ``0`` when the union has no positive area.
    """
    # Corners of the overlap rectangle.
    left = max(box1[0], box2[0])
    top = max(box1[1], box2[1])
    right = min(box1[2], box2[2])
    bottom = min(box1[3], box2[3])

    # Clamp at zero so disjoint boxes contribute no overlap.
    overlap_w = max(0, right - left)
    overlap_h = max(0, bottom - top)
    intersection = overlap_w * overlap_h

    def _area(box):
        return (box[2] - box[0]) * (box[3] - box[1])

    union = _area(box1) + _area(box2) - intersection

    if union > 0:
        return intersection / union
    return 0

def calculate_map(predictions, targets, iou_threshold=0.5):
    """Calculate mean Average Precision at a single IoU threshold.

    Simplified mAP calculation; for production, use pycocotools.

    Predictions scoring at or below ``CONFIG['confidence_threshold']`` are
    discarded. Within each image and class, detections are visited in order
    of decreasing score; each is matched to the not-yet-matched ground-truth
    box of the same class with the highest IoU, and counts as a true positive
    when that IoU is at least ``iou_threshold``. Per-class AP is the area
    under the interpolated precision-recall curve, pooled over all images.
    The result is the mean AP over classes that have at least one
    ground-truth box.

    Args:
        predictions: One dict per image with ``boxes`` (N, 4), ``scores``
            (N,) and ``labels`` (N,) tensors.
        targets: One dict per image with ``boxes`` and ``labels`` tensors.
        iou_threshold: Minimum IoU for a detection to match a ground truth.

    Returns:
        The mean AP as a float, or ``0.0`` when there is no ground truth.
    """
    detections = {}  # class id -> list of (score, is_true_positive)
    gt_counts = {}   # class id -> number of ground-truth boxes

    for pred, target in zip(predictions, targets):
        pred_boxes = pred['boxes'].detach().cpu()
        pred_scores = pred['scores'].detach().cpu()
        pred_labels = pred['labels'].detach().cpu()

        # reshape keeps an object-free target at shape (0, 4)
        target_boxes = target['boxes'].detach().cpu().reshape(-1, 4)
        target_labels = target['labels'].detach().cpu()

        # Filter by confidence threshold
        mask = pred_scores > CONFIG['confidence_threshold']
        pred_boxes = pred_boxes[mask]
        pred_labels = pred_labels[mask]
        pred_scores = pred_scores[mask]

        for cls in target_labels.unique().tolist():
            gt_counts[cls] = gt_counts.get(cls, 0) + int((target_labels == cls).sum())

        for cls in pred_labels.unique().tolist():
            cls_mask = pred_labels == cls
            order = torch.argsort(pred_scores[cls_mask], descending=True)
            cls_scores = pred_scores[cls_mask][order]
            cls_boxes = pred_boxes[cls_mask][order]

            gt_boxes = target_boxes[target_labels == cls]
            records = detections.setdefault(cls, [])

            if len(gt_boxes) == 0:
                # No ground truth of this class here: all false positives.
                records.extend((score, False) for score in cls_scores.tolist())
                continue

            ious = box_iou(cls_boxes, gt_boxes)
            matched = torch.zeros(len(gt_boxes), dtype=torch.bool)
            for det_idx, score in enumerate(cls_scores.tolist()):
                candidate = ious[det_idx].clone()
                candidate[matched] = -1.0  # each ground truth is used once
                best_iou, best_gt = candidate.max(dim=0)
                is_tp = best_iou.item() >= iou_threshold
                if is_tp:
                    matched[best_gt] = True
                records.append((score, is_tp))

    aps = []
    for cls, n_gt in gt_counts.items():
        records = sorted(detections.get(cls, []), key=lambda r: r[0], reverse=True)
        if not records:
            # Ground truth exists but nothing was detected for this class.
            aps.append(0.0)
            continue

        tp = np.cumsum([1 if hit else 0 for _, hit in records])
        fp = np.cumsum([0 if hit else 1 for _, hit in records])
        recall = tp / n_gt
        precision = tp / (tp + fp)

        # Make precision monotonically non-increasing, then integrate over
        # the recall steps.
        mrec = np.concatenate(([0.0], recall, [1.0]))
        mpre = np.concatenate(([0.0], precision, [0.0]))
        mpre = np.maximum.accumulate(mpre[::-1])[::-1]
        steps = np.where(mrec[1:] != mrec[:-1])[0]
        aps.append(float(np.sum((mrec[steps + 1] - mrec[steps]) * mpre[steps + 1])))

    return float(np.mean(aps)) if aps else 0.0

# 9. VISUALIZATION & INFERENCE

In [None]:
def visualize_predictions(model, dataset, num_samples=4):
    """Visualize model predictions on the first ``num_samples`` items.

    Args:
        model: Detection model; it is switched to eval mode.
        dataset: Indexable dataset returning ``(image, target)`` where
            ``image`` is a CHW tensor normalized with the mean/std used by
            the transform pipelines.
        num_samples: Number of items to draw. Clamped to ``len(dataset)``;
            nothing is drawn when the result is zero.

    Only detections scoring above ``CONFIG['confidence_threshold']`` are drawn.
    """
    model.eval()

    num_samples = min(num_samples, len(dataset))
    if num_samples <= 0:
        return

    # Two columns and as many rows as needed; the default of 4 samples gives
    # the usual 2x2 grid at 15x15 inches.
    n_cols = 2
    n_rows = (num_samples + n_cols - 1) // n_cols
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 7.5 * n_rows), squeeze=False)
    axes = axes.ravel()

    for i in range(num_samples):
        image, target = dataset[i]

        # Make prediction
        with torch.no_grad():
            prediction = model([image.to(device)])[0]

        # Convert image back to numpy (HWC) and undo the normalization
        img = image.permute(1, 2, 0).cpu().numpy()
        img = (img * [0.229, 0.224, 0.225]) + [0.485, 0.456, 0.406]
        img = np.clip(img, 0, 1)

        axes[i].imshow(img)

        # Draw predictions
        boxes = prediction['boxes'].cpu().numpy()
        scores = prediction['scores'].cpu().numpy()
        labels = prediction['labels'].cpu().numpy()

        for box, score, label in zip(boxes, scores, labels):
            if score > CONFIG['confidence_threshold']:
                x1, y1, x2, y2 = box
                rect = plt.Rectangle((x1, y1), x2 - x1, y2 - y1,
                                     fill=False, color='red', linewidth=2)
                axes[i].add_patch(rect)
                axes[i].text(x1, y1 - 5, f'Class {label}: {score:.2f}',
                             color='red', fontsize=10, weight='bold',
                             bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

        axes[i].set_title(f'Prediction {i+1}')
        axes[i].axis('off')

    # Hide grid cells left over when num_samples is odd.
    for unused_ax in axes[num_samples:]:
        unused_ax.axis('off')

    plt.tight_layout()
    plt.show()


visualize_predictions(model, val_dataset)

# 10. INFERENCE PIPELINE

In [None]:
def detect_objects(image_path, model, device, confidence_threshold=0.5):
    """Run object detection on a single image.

    Args:
        image_path: Path of the image file to load.
        model: Detection model; it is switched to eval mode.
        device: Device used for inference.
        confidence_threshold: Detections scoring at or below this value are
            dropped.

    Returns:
        ``(image, results)``: the original RGB image array and a dict with
        ``boxes``, ``scores`` and ``labels`` NumPy arrays. Boxes are given in
        the pixel coordinates of the returned (original-size) image.

    Raises:
        OSError: If the image cannot be read or decoded.
    """
    model.eval()

    # Load and preprocess image
    image = cv2.imread(str(image_path))
    if image is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise OSError(f"Failed to read image: {image_path}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    orig_h, orig_w = image.shape[:2]

    # val_transforms is composed with bbox_params (label_fields=['labels']),
    # so the bbox and label fields must be supplied even though inference has
    # no annotations.
    transformed = val_transforms(image=image, bboxes=[], labels=[])
    input_tensor = transformed['image'].to(device)
    in_h, in_w = input_tensor.shape[-2:]

    # Make prediction (the model takes a list of CHW tensors)
    with torch.no_grad():
        prediction = model([input_tensor])[0]

    # Filter by confidence
    mask = prediction['scores'] > confidence_threshold
    boxes = prediction['boxes'][mask].cpu().numpy()
    scores = prediction['scores'][mask].cpu().numpy()
    labels = prediction['labels'][mask].cpu().numpy()

    # Predictions refer to the resized network input; map them back onto the
    # original image that is returned to the caller.
    scale_x = orig_w / in_w
    scale_y = orig_h / in_h
    boxes = boxes * np.array([scale_x, scale_y, scale_x, scale_y], dtype=boxes.dtype)

    results = {
        'boxes': boxes,
        'scores': scores,
        'labels': labels
    }

    return image, results

# Test inference
# image, results = detect_objects('test_image.jpg', model, device)

# 11. MODEL EXPORT & DEPLOYMENT

In [None]:
# Save model
# Create the target directory first: torch.save does not create missing
# parent directories.
os.makedirs(CONFIG['save_dir'], exist_ok=True)
torch.save({
    'model_state_dict': model.state_dict(),
    'config': CONFIG
}, f"{CONFIG['save_dir']}/final_model.pth")

# Export to ONNX
# dummy_input = torch.randn(1, 3, CONFIG['img_size'][0], CONFIG['img_size'][1]).to(device)
# torch.onnx.export(model, dummy_input, "model.onnx", 
#                   export_params=True, opset_version=11)

# 12. CONCLUSIONS & NEXT STEPS

## Summary:
- Dataset: X images, Y classes
- Architecture: Faster R-CNN with ResNet50-FPN backbone
- Final mAP: X.XX

## Next Steps:
- [ ] Experiment with different architectures (YOLOv8, EfficientDet)
- [ ] Implement test-time augmentation
- [ ] Optimize model with quantization
- [ ] Deploy with TensorRT/ONNX Runtime
- [ ] Implement real-time video detection
- [ ] Add tracking capabilities