# Face Detection Training and Inference
A complete pipeline for training a YOLOv8 face detector on the WIDER FACE dataset and generating face embeddings.

In [None]:
!pip install -q --upgrade ultralytics facenet-pytorch

In [None]:
import os
import json
import cv2
import torch
import numpy as np
from PIL import Image
from pathlib import Path
import shutil
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import yaml
from facenet_pytorch import InceptionResnetV1
import warnings
warnings.filterwarnings('ignore')

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

## Parse WIDER FACE Annotations

In [None]:
def _parse_wider_box(row):
    """Return ``[x, y, w, h]`` parsed from one box row, or ``None`` if unusable.

    Only the first four whitespace-separated fields are read. Rows with fewer
    than four fields, non-integer values, or a non-positive width/height are
    rejected.
    """
    fields = row.split()
    if len(fields) < 4:
        return None
    try:
        x, y, w, h = map(int, fields[:4])
    except ValueError:
        return None
    if w <= 0 or h <= 0:
        return None
    return [x, y, w, h]


def _is_numeric_row(row):
    """Tell whether *row* has at least four fields and the first four are integers."""
    fields = row.split()
    if len(fields) < 4:
        return False
    try:
        for field in fields[:4]:
            int(field)
    except ValueError:
        return False
    return True


def parse_wider_annotations(annotation_file, img_base_path):
    """Parse a WIDER FACE style annotation file.

    The file is read as repeated records of: an image path line, a face-count
    line, then one row per face whose first four fields are ``x y w h``.

    Args:
        annotation_file: Path to the ground-truth text file.
        img_base_path: Directory that the image paths in the file are
            relative to.

    Returns:
        list[dict]: One ``{'image': full_path, 'boxes': [[x, y, w, h], ...]}``
        entry per record that has at least one box with positive width and
        height and whose image exists on disk.
    """
    with open(annotation_file, 'r') as f:
        lines = [line.strip() for line in f]

    annotations = []
    total = len(lines)
    idx = 0
    while idx < total:
        img_path = lines[idx]
        idx += 1

        # A blank line can never be an image path; skip it instead of letting
        # it join to the base directory (which always exists).
        if not img_path:
            continue
        if idx >= total:
            break

        try:
            num_faces = int(lines[idx])
        except ValueError:
            # Not a count line: resynchronise by re-reading it as an image path.
            continue
        idx += 1

        if num_faces <= 0:
            # Records without faces may still carry one all-zero placeholder
            # row; consume it explicitly so it is not mistaken for a path.
            if idx < total and _is_numeric_row(lines[idx]):
                idx += 1
            continue

        # Slicing stops at end-of-file, so a truncated record is tolerated.
        rows = lines[idx:idx + num_faces]
        idx += len(rows)

        boxes = [box for box in map(_parse_wider_box, rows) if box is not None]
        if not boxes:
            continue

        full_img_path = os.path.join(img_base_path, img_path)
        if os.path.exists(full_img_path):
            annotations.append({
                'image': full_img_path,
                'boxes': boxes
            })

    return annotations

# Parse the ground-truth files of both splits; only records whose image exists
# and that contain at least one usable box are kept.
train_annotations = parse_wider_annotations(
    annotation_file='/kaggle/input/datasets/rahulftz/face-detection-dataset0906/wider_face_annotations/wider_face_split/wider_face_train_bbx_gt.txt',
    img_base_path='/kaggle/input/datasets/rahulftz/face-detection-dataset0906/WIDER_train/WIDER_train/images',
)

val_annotations = parse_wider_annotations(
    annotation_file='/kaggle/input/datasets/rahulftz/face-detection-dataset0906/wider_face_annotations/wider_face_split/wider_face_val_bbx_gt.txt',
    img_base_path='/kaggle/input/datasets/rahulftz/face-detection-dataset0906/WIDER_val/WIDER_val/images',
)

print(f'Training samples: {len(train_annotations)}')
print(f'Validation samples: {len(val_annotations)}')

## Convert to YOLO Format

In [None]:
def convert_to_yolo_format(annotations, output_dir, split='train'):
    """Write images and YOLO label files for a list of parsed annotations.

    Each readable image is re-saved as ``<split>_<index>.jpg`` under
    ``<output_dir>/images/<split>`` with a matching ``.txt`` label file under
    ``<output_dir>/labels/<split>``. Every label line is
    ``0 x_center y_center width height`` normalised by the image size.

    Args:
        annotations: Iterable of ``{'image': path, 'boxes': [[x, y, w, h], ...]}``
            dicts with pixel-space, top-left-anchored boxes.
        output_dir: Root directory of the YOLO dataset.
        split: Sub-folder / filename prefix (for example ``'train'``).

    Returns:
        list[str]: File names of the images that were written.
    """
    img_dir = os.path.join(output_dir, 'images', split)
    label_dir = os.path.join(output_dir, 'labels', split)
    os.makedirs(img_dir, exist_ok=True)
    os.makedirs(label_dir, exist_ok=True)

    valid_samples = []
    skipped = 0

    for idx, ann in enumerate(annotations):
        try:
            img = cv2.imread(ann['image'])
            if img is None:
                skipped += 1
                continue

            h, w = img.shape[:2]

            # Build all label lines before touching the disk so a bad record
            # cannot leave an image without its label file.
            label_lines = []
            for x, y, bw, bh in ann['boxes']:
                # Clip the box corners to the image first. Clamping the centre
                # and the size independently distorts boxes that cross the
                # image border.
                x1, y1 = max(0, x), max(0, y)
                x2, y2 = min(w, x + bw), min(h, y + bh)
                if x2 <= x1 or y2 <= y1:
                    continue  # box lies entirely outside the image

                x_center = (x1 + x2) / 2 / w
                y_center = (y1 + y2) / 2 / h
                box_w = (x2 - x1) / w
                box_h = (y2 - y1) / h
                label_lines.append(
                    f'0 {x_center:.6f} {y_center:.6f} {box_w:.6f} {box_h:.6f}\n'
                )

            stem = f'{split}_{idx:06d}'
            new_img_name = f'{stem}.jpg'
            # cv2.imwrite reports failure through its return value.
            if not cv2.imwrite(os.path.join(img_dir, new_img_name), img):
                skipped += 1
                continue

            with open(os.path.join(label_dir, f'{stem}.txt'), 'w') as f:
                f.writelines(label_lines)

            valid_samples.append(new_img_name)
        except Exception as e:
            # Conversion is best-effort: report the sample and carry on.
            skipped += 1
            print(f'Skipping {split} sample {idx}: {e!r}')
            continue

    if skipped:
        print(f'Skipped {skipped} {split} samples that could not be converted')

    return valid_samples

# Root folder that receives the YOLO-formatted copy of the dataset.
yolo_dataset_path = '/kaggle/working/yolo_dataset'
os.makedirs(yolo_dataset_path, exist_ok=True)

# Only the first 5000 training and first 1000 validation records are converted.
print('Converting training set...')
train_samples = convert_to_yolo_format(
    annotations=train_annotations[:5000],
    output_dir=yolo_dataset_path,
    split='train',
)
print(f'Converted {len(train_samples)} training images')

print('Converting validation set...')
val_samples = convert_to_yolo_format(
    annotations=val_annotations[:1000],
    output_dir=yolo_dataset_path,
    split='val',
)
print(f'Converted {len(val_samples)} validation images')

## Create YOLO Configuration

In [None]:
# Dataset description consumed by the trainer: dataset root, image folders
# relative to that root, and the single 'face' class.
yaml_config = dict(
    path=yolo_dataset_path,
    train='images/train',
    val='images/val',
    names={0: 'face'},
    nc=1,
)

yaml_path = os.path.join(yolo_dataset_path, 'data.yaml')
with open(yaml_path, 'w') as f:
    yaml.dump(yaml_config, f)

print(f'YOLO config saved to {yaml_path}')

## Train YOLOv8 Face Detector

In [None]:
from ultralytics import YOLO

# Start from the pretrained nano checkpoint and fine-tune it on the face data.
model = YOLO('yolov8n.pt')

results = model.train(
    data=yaml_path,
    epochs=50,
    imgsz=640,
    batch=16,
    name='face_detector',
    project='/kaggle/working',
    # Reuse /kaggle/working/face_detector on re-runs. Without this the run
    # directory is auto-incremented (face_detector2, ...) and the fixed
    # weights path read after training would point at a stale or missing file.
    exist_ok=True,
    device=0 if torch.cuda.is_available() else 'cpu',
    patience=10,
    save=True,
    plots=True,
    verbose=True
)

print('Training completed!')

## Save Trained Model

In [None]:
# Copy the best checkpoint of the training run to a stable top-level location.
best_model_path = '/kaggle/working/face_detector/weights/best.pt'
output_model_path = '/kaggle/working/face_detector_best.pt'

if not os.path.exists(best_model_path):
    print('Best model not found')
else:
    shutil.copy(best_model_path, output_model_path)
    print(f'Model saved to {output_model_path}')

## Load Trained Model for Inference

In [None]:
# Prefer the fine-tuned weights. Fall back to the base checkpoint only so the
# rest of the notebook can still run, and say so, because that checkpoint has
# not been fine-tuned on the face data.
if os.path.exists(output_model_path):
    detector_weights = output_model_path
else:
    detector_weights = 'yolov8n.pt'
    print(f'Warning: {output_model_path} not found; '
          'falling back to the base yolov8n.pt checkpoint (not fine-tuned on faces)')

face_detector = YOLO(detector_weights)
print('Face detector loaded')

# Embedding network in eval mode on the selected device.
embedding_model = InceptionResnetV1(pretrained='vggface2').eval().to(device)
print('Embedding model loaded')

## Face Detection and Embedding Generation

In [None]:
def detect_and_embed_faces(image_path, output_dir='/kaggle/working/detected_faces'):
    """
    Detect faces, crop them, and generate embeddings

    Args:
        image_path: Path to input image
        output_dir: Directory to save cropped faces and ``embeddings.json``

    Returns:
        dict: ``num_faces``, ``face_paths`` and ``embeddings`` (one
        L2-normalised list of floats per saved face). When the image cannot
        be read the dict additionally carries an ``error`` message and the
        other fields are empty, so callers can always read ``num_faces``.
    """
    os.makedirs(output_dir, exist_ok=True)

    img = cv2.imread(image_path)
    if img is None:
        return {
            'error': 'Failed to load image',
            'num_faces': 0,
            'face_paths': [],
            'embeddings': []
        }

    # Ultralytics interprets NumPy inputs as BGR, which is what cv2.imread
    # returns, so the detector gets the original array. The RGB copy is only
    # for PIL and the embedding network.
    results = face_detector(img, conf=0.5)
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img_h, img_w = img.shape[:2]

    embeddings = []
    face_paths = []

    transform = transforms.Compose([
        transforms.Resize((160, 160)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    ])

    for idx, result in enumerate(results):
        for box_idx, box in enumerate(result.boxes):
            x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())

            # Keep the crop inside the image.
            x1, y1 = max(0, x1), max(0, y1)
            x2, y2 = min(img_w, x2), min(img_h, y2)
            if x2 <= x1 or y2 <= y1:
                continue

            face_crop = img_rgb[y1:y2, x1:x2]

            face_path = os.path.join(output_dir, f'face_{idx}_{box_idx}.jpg')
            cv2.imwrite(face_path, cv2.cvtColor(face_crop, cv2.COLOR_RGB2BGR))
            face_paths.append(face_path)

            face_pil = Image.fromarray(face_crop)
            face_tensor = transform(face_pil).unsqueeze(0).to(device)

            with torch.no_grad():
                embedding = embedding_model(face_tensor)
            embedding = embedding.cpu().numpy().flatten()

            # Guard against a zero vector so normalisation cannot produce NaNs.
            norm = np.linalg.norm(embedding)
            if norm > 0:
                embedding = embedding / norm
            embeddings.append(embedding.tolist())

    results_dict = {
        'num_faces': len(face_paths),
        'face_paths': face_paths,
        'embeddings': embeddings
    }

    embeddings_file = os.path.join(output_dir, 'embeddings.json')
    with open(embeddings_file, 'w') as f:
        json.dump(results_dict, f, indent=2)

    print(f'Detected {len(face_paths)} faces')
    print(f'Cropped faces saved to {output_dir}')
    print(f'Embeddings saved to {embeddings_file}')

    return results_dict

## Example Usage

In [None]:
test_image_dirs = [
    '/kaggle/input/wider-face/WIDER_val/WIDER_val/images/0--Parade',
    '/kaggle/input/wider-face/WIDER_val/WIDER_val/images/1--Handshaking',
    '/kaggle/input/wider-face/WIDER_val/WIDER_val/images/12--Group',
    # Same categories under the dataset root that the annotation-parsing cell
    # reads from, tried when the paths above are not mounted.
    '/kaggle/input/datasets/rahulftz/face-detection-dataset0906/WIDER_val/WIDER_val/images/0--Parade',
    '/kaggle/input/datasets/rahulftz/face-detection-dataset0906/WIDER_val/WIDER_val/images/1--Handshaking',
    '/kaggle/input/datasets/rahulftz/face-detection-dataset0906/WIDER_val/WIDER_val/images/12--Group'
]

# Always bind test_images so the cells below never hit a NameError when none
# of the candidate directories exist.
test_images = []
for test_dir in test_image_dirs:
    if not os.path.isdir(test_dir):
        continue
    # Sorted for a reproducible pick; three images because the batch demo
    # further down slices test_images[:3].
    test_images = [
        os.path.join(test_dir, name)
        for name in sorted(os.listdir(test_dir))
        if name.endswith('.jpg')
    ][:3]
    if test_images:
        break

if test_images:
    test_image = test_images[0]
    print(f'Testing on: {test_image}')

    results = detect_and_embed_faces(test_image)

    if 'error' in results:
        # The helper reports unreadable images through an 'error' entry.
        print(f"Detection failed: {results['error']}")
    else:
        print(f"\nResults:")
        print(f"Number of faces detected: {results['num_faces']}")
        print(f"Embedding dimension: {len(results['embeddings'][0]) if results['embeddings'] else 0}")
        print(f"First embedding (truncated): {results['embeddings'][0][:10] if results['embeddings'] else None}")
else:
    print('No test images found')

## Visualize Detection Results

In [None]:
import matplotlib.pyplot as plt

def visualize_detections(image_path, output_path='/kaggle/working/detection_result.jpg'):
    """Draw detected face boxes with confidences, then save and show the figure.

    Args:
        image_path: Path to the image to run the detector on.
        output_path: Where the rendered figure is saved.

    Returns:
        None. Prints a message and returns early if the image cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        print(f'Failed to load image: {image_path}')
        return

    # Ultralytics interprets NumPy inputs as BGR, so detect on the array as
    # loaded and keep the RGB copy for drawing with matplotlib.
    results = face_detector(img, conf=0.5)
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Count across all results instead of reading the loop variable afterwards.
    num_faces = 0
    for result in results:
        for box in result.boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())
            cv2.rectangle(img_rgb, (x1, y1), (x2, y2), (0, 255, 0), 2)
            conf = float(box.conf[0])
            # Keep the label on-canvas for boxes that touch the top edge.
            cv2.putText(img_rgb, f'{conf:.2f}', (x1, max(y1 - 10, 15)),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
            num_faces += 1

    plt.figure(figsize=(12, 8))
    plt.imshow(img_rgb)
    plt.axis('off')
    plt.title(f'Detected Faces: {num_faces}')
    plt.tight_layout()
    plt.savefig(output_path, bbox_inches='tight', dpi=150)
    plt.show()

    print(f'Visualization saved to {output_path}')

# Render the detections for the first sample image, when one was found.
if len(test_images) != 0:
    visualize_detections(image_path=test_images[0])

## Batch Processing Function

In [None]:
def process_image_batch(image_paths, output_base_dir='/kaggle/working/batch_results'):
    """Process multiple images and generate embeddings for all faces.

    Each image gets its own ``image_<index>`` sub-directory under
    *output_base_dir*; a ``batch_summary.json`` with every per-image result is
    written next to them.

    Args:
        image_paths: Sequence of image file paths.
        output_base_dir: Directory that receives per-image folders and the
            summary file.

    Returns:
        list[dict]: One ``{'image_path': ..., 'results': ...}`` entry per input.
    """
    os.makedirs(output_base_dir, exist_ok=True)

    all_results = []

    for idx, img_path in enumerate(image_paths):
        print(f'Processing {idx+1}/{len(image_paths)}: {img_path}')

        output_dir = os.path.join(output_base_dir, f'image_{idx:04d}')
        results = detect_and_embed_faces(img_path, output_dir)
        if 'error' in results:
            print(f"  Skipped: {results['error']}")

        all_results.append({
            'image_path': img_path,
            'results': results
        })

    summary_file = os.path.join(output_base_dir, 'batch_summary.json')
    with open(summary_file, 'w') as f:
        json.dump(all_results, f, indent=2)

    # A failed image may report only an 'error' entry; count it as zero faces
    # rather than raising KeyError after all the work is done.
    total_faces = sum(r['results'].get('num_faces', 0) for r in all_results)
    print(f'\nBatch processing completed!')
    print(f'Total images processed: {len(image_paths)}')
    print(f'Total faces detected: {total_faces}')
    print(f'Summary saved to {summary_file}')

    return all_results

# Run the batch pipeline on at most three of the sample images.
if test_images:
    batch_results = process_image_batch(image_paths=test_images[:3])

## Export Functions for Reuse

In [None]:
class FaceDetectionSystem:
    """Complete face detection and embedding system.

    Bundles a YOLO detector loaded from ``detector_path`` with an
    ``InceptionResnetV1`` embedding network so that detection, cropping and
    embedding can be reused without the notebook's module-level globals.
    """

    def __init__(self, detector_path, device='cuda'):
        """Load both models.

        Args:
            detector_path: Weights file passed to ``YOLO``.
            device: Requested torch device; replaced by ``'cpu'`` whenever
                CUDA is not available.
        """
        self.device = torch.device(device if torch.cuda.is_available() else 'cpu')
        self.face_detector = YOLO(detector_path)
        self.embedding_model = InceptionResnetV1(pretrained='vggface2').eval().to(self.device)

        # Resize crops to 160x160 and scale pixel values to [-1, 1].
        self.transform = transforms.Compose([
            transforms.Resize((160, 160)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
        ])

        print(f'System initialized on {self.device}')

    def detect_faces(self, image_path, conf_threshold=0.5):
        """Detect faces in image.

        Args:
            image_path: Path to the image file.
            conf_threshold: Minimum detector confidence.

        Returns:
            tuple: ``(detections, img_rgb)`` where ``detections`` is a list of
            ``{'bbox': [x1, y1, x2, y2], 'confidence': float}`` dicts with
            boxes clamped to the image, and ``img_rgb`` is the RGB image array.

        Raises:
            ValueError: If the image cannot be read.
        """
        img = cv2.imread(image_path)
        if img is None:
            raise ValueError(f'Failed to load image: {image_path}')

        # Ultralytics interprets NumPy inputs as BGR, so the detector gets the
        # array as loaded; the RGB copy is returned for cropping.
        results = self.face_detector(img, conf=conf_threshold)
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img_h, img_w = img.shape[:2]

        detections = []
        for result in results:
            for box in result.boxes:
                x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())
                # Clamp so that later slicing can never wrap around through
                # a negative index.
                x1, y1 = max(0, x1), max(0, y1)
                x2, y2 = min(img_w, x2), min(img_h, y2)
                conf = float(box.conf[0])
                detections.append({
                    'bbox': [x1, y1, x2, y2],
                    'confidence': conf
                })

        return detections, img_rgb

    def generate_embedding(self, face_crop):
        """Generate an L2-normalised embedding for an RGB face crop array."""
        face_pil = Image.fromarray(face_crop)
        face_tensor = self.transform(face_pil).unsqueeze(0).to(self.device)

        with torch.no_grad():
            embedding = self.embedding_model(face_tensor)
        embedding = embedding.cpu().numpy().flatten()

        # Guard against a zero vector so normalisation cannot produce NaNs.
        norm = np.linalg.norm(embedding)
        if norm > 0:
            embedding = embedding / norm

        return embedding

    def process_image(self, image_path, output_dir=None):
        """Complete pipeline: detect faces and generate embeddings.

        Args:
            image_path: Path to the image file.
            output_dir: Optional directory; when given, each face crop is
                saved there as ``face_<index>.jpg``.

        Returns:
            dict: ``image_path``, ``num_faces`` (number of detections) and
            ``faces``, a list with ``bbox``, ``confidence``, ``embedding`` and,
            when saved, ``saved_path`` for every non-empty crop.
        """
        detections, img_rgb = self.detect_faces(image_path)

        results = {
            'image_path': image_path,
            'num_faces': len(detections),
            'faces': []
        }

        for idx, det in enumerate(detections):
            x1, y1, x2, y2 = det['bbox']
            face_crop = img_rgb[y1:y2, x1:x2]

            # Degenerate boxes give an empty crop; there is nothing to embed.
            if face_crop.size == 0:
                continue

            embedding = self.generate_embedding(face_crop)

            face_data = {
                'bbox': det['bbox'],
                'confidence': det['confidence'],
                'embedding': embedding.tolist()
            }

            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
                face_path = os.path.join(output_dir, f'face_{idx}.jpg')
                cv2.imwrite(face_path, cv2.cvtColor(face_crop, cv2.COLOR_RGB2BGR))
                face_data['saved_path'] = face_path

            results['faces'].append(face_data)

        return results

# Prefer the fine-tuned weights; say so when falling back to the base
# checkpoint, which has not been fine-tuned on the face data.
if os.path.exists(output_model_path):
    system_weights = output_model_path
else:
    system_weights = 'yolov8n.pt'
    print(f'Warning: {output_model_path} not found; '
          'falling back to the base yolov8n.pt checkpoint (not fine-tuned on faces)')

system = FaceDetectionSystem(system_weights)

if test_images:
    demo_results = system.process_image(test_images[0], '/kaggle/working/system_output')
    print(f"System detected {demo_results['num_faces']} faces")
    print(f"Embedding dimensions: {len(demo_results['faces'][0]['embedding']) if demo_results['faces'] else 0}")