# Setup and imports

In [1]:
import os
from pathlib import Path
import xml.etree.ElementTree as ET

import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from sklearn.metrics import classification_report, confusion_matrix

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
import torchvision.models as models

In [2]:
DATA_DIR = Path("datasets")
IMAGE_DIR = DATA_DIR / "images"
ANN_DIR = DATA_DIR / "annotations"

IMG_SIZE = 128
BATCH_SIZE = 32
DEVICE = torch.device("cuda" if torch.cuda.is_available else "cpu")

# Parse XML : Get crops and labels
We'll first parse the XML files to extract the bounding box coordinates and labels for each image.

## Label mapping

In [3]:
label_map = {
    "with_mask": 1, # mask
    "without_mask": 0,  #  no mask
    "mask_weard_incorrect": 0   # treat incorrect as no mask
}

## Parse annotations and prepare dataset

In [4]:
def parse_annotation(xml_path):
    """Parse a single XML annotation file."""
    tree = ET.parse(xml_path)
    root = tree.getroot()
    filename = root.find("filename").text # type: ignore
    objects = []
    for obj in root.findall("object"):
        name = obj.find("name").text # type: ignore
        bbox = obj.find("bndbox")
        xmin = int(bbox.find("xmin").text) # type: ignore
        ymin = int(bbox.find("ymin").text)   # type: ignore
        xmax = int(bbox.find("xmax").text)
        ymax = int(bbox.find("ymax").text)
        objects.append({"label": name, "bbox": (xmin, ymin, xmax, ymax)})
    
    return filename, objects



In [5]:
def build_crops_dataset(ann_dir, img_dir):
    crops = []
    labels = []
    xml_files = list(ann_dir.glob("*.xml"))
    print(f"Found {len(xml_files)} annotation files")

    for xml_file in xml_files:
        filename, objects = parse_annotation(xml_file)
        img_path = img_dir / filename
        if not img_path.exists():
            print(f"Missing image for {filename}, skipping")
            continue

        img = Image.open(img_path).convert('RGB')
        W, H = img.size

        for obj in objects:
            label_name = obj["label"]
            if label_name not in label_map:
                # Skip unknown labels
                continue

            xmin, ymin, xmax, ymax = obj["bbox"]
            # Clip to image bounds just in case
            xmin = max(0, xmin); ymin=max(0, ymin)
            xmax = min(W, xmax); ymax = min(H, ymax)
            if xmax <= xmin or ymax <= ymin:
                continue

            crop = img.crop((xmin, ymin, xmax, ymax))   # (left, upper, right, low)
            crops.append(crop)
            labels.append(label_map[label_name])
        
    print("Total face crops:", len(labels))
    return crops, np.array(labels, dtype=np.int64)
    
crops, labels = build_crops_dataset(ANN_DIR, IMAGE_DIR)
np.bincount(labels)

        

Found 853 annotation files
Total face crops: 3949


array([ 717, 3232])

# Train / val / test split
We'll keep crops as PIL images and split label indices into train, val, and test sets.

In [6]:
idx = np.arange(len(labels))
X_train_idx, X_temp_idx, y_train, y_temp = train_test_split(
    idx, labels, test_size=0.3, stratify=labels, random_state=42
)

X_val_idx, X_test_idx, y_val, y_test = train_test_split(
    X_temp_idx, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

len(X_train_idx), len(X_val_idx), len(X_test_idx)

(2764, 592, 593)

# Dataset and transforms

In [7]:
train_transform = T.Compose([
    T.Resize((IMG_SIZE, IMG_SIZE)),
    T.RandomHorizontalFlip(),
    T.RandomRotation(15),
    T.ToTensor(),
    T.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

test_transform = T.Compose([
    T.Resize((IMG_SIZE, IMG_SIZE)),
    T.ToTensor(),
    T.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

In [8]:
class MaskCropsDataset(Dataset):
    def __init__(self, crops, labels, indices, transform=None):
        self.crops = crops
        self.labels = labels
        self.indices = indices
        self.transform = transform
    
    def __len__(self):
        return len(self.indices)
    
    def __getitem__(self, index):
        idx = self.indices[index]
        img = self.crops[index] # PIL Image
        label = int(self.labels[index])
        if self.transform is not None:
            img = self.transform(img)
        return img, label

train_set = MaskCropsDataset(crops, labels, X_train_idx, transform=train_transform)
val_set = MaskCropsDataset(crops, labels, X_val_idx, transform=test_transform)
test_set = MaskCropsDataset(crops, labels, X_test_idx, transform=test_transform)

train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)
test_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

# Define a CNN model

In [9]:
import torch.nn as nn
import  torch.nn.functional as F

class MaskCNN(nn.Module):
    def __init__(self, num_classes=1):  # 1 output for binary(logit)
        super().__init__()
        # Convolutional layer 1
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        # Convolutional layer 2
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        # Convolutional layer 3
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)

        self.pool = nn.MaxPool2d(2, 2)
        self.dropout = nn.Dropout(0.25)

        # After 3 times 2x2 pooling on 128x128
        # 128 -> 64 -> 32 -> 16
        # Feature map: 128 channels of size 16x16
        self.fc1 = nn.Linear(128 * 16 * 16, 512)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))    # 64 x 64 x 32
        x = self.pool(F.relu(self.conv2(x)))    # 32 x 32 x 64
        x = self.pool(F.relu(self.conv3(x)))    # 16 x 16 x 128

        x = x.view(x.size(0), -1)   # Flatten
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.fc2(x)  # No activation here, will use

        return x


In [10]:
model = MaskCNN().to(DEVICE)
model

MaskCNN(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout(p=0.25, inplace=False)
  (fc1): Linear(in_features=32768, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=1, bias=True)
)

# Train the model

In [None]:
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
best_val = float("inf")
EPOCHS = 25

print("Starting training...")
for epoch in range(EPOCHS):
    model.train()
    for imgs, labels in tqdm(train_loader, desc=f"Train {epoch+1}/{EPOCHS}"):
        imgs, labels = imgs.to(DEVICE), labels.float().unsqueeze(1).to(DEVICE)
        optimizer.zero_grad()
        logits = model(imgs)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {loss.item():.4f}")


    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for imgs, labels in tqdm(val_loader, desc=f"Val {epoch+1}/{EPOCHS}"):
            imgs, labels = imgs.to(DEVICE), labels.float().unsqueeze(1).to(DEVICE)
            val_loss += criterion(model(imgs), labels).item()

        val_loss /= len(val_loader)
        print(f"Epoch {epoch+1}/{EPOCHS}, Val Loss: {val_loss:.4f}")
        if val_loss < best_val:
            best_val = val_loss
            print(f'Best model saved with val_loss={val_loss:.4f}.')
            torch.save(model.state_dict(), 'models/maskcnn.pt')

Starting training...


Train 1/25:   0%|          | 0/87 [00:00<?, ?it/s]