# Face Mask Detection System

This is the code for training a face mask detection system using Pytorch. The model is trained on a dataset of images containing people with and without face masks. The goal is to classify whether a person is wearing a mask or not. 

## Import libraries


In [4]:
import os # For file and directory operations
import time # For time operations
import cv2 # For image processing
import gc # Force garbage collection
import kagglehub # For Kaggle API (Dataset download)
import xml.etree.ElementTree as et # For parsing XML files
from tqdm import tqdm # For progress bar
from sklearn.model_selection import train_test_split # For splitting the dataset
import albumentations as A  # Library for advanced augmentations
import numpy as  np # For numerical operations
import pandas as pd # For data manipulation
import torch # For Machine Learning
import torch.nn as nn # For neural networks
import torch.optim as optim # For optimization
from torch.utils.data import Dataset, DataLoader # For data loading
from torchvision import transforms # For data transformations
from torchvision.models import mobilenet_v2, MobileNet_V2_Weights # For pre-trained models
from sklearn.metrics import classification_report # For evaluation metrics
import matplotlib.pyplot as plt # For plotting 

## Check the libraries are installed

In [5]:
print("OS Name:", os.name)
print("OpenCV Version:", cv2.__version__)
print("KaggleHub Version:", kagglehub.__version__)
print("Albumentations Version:", A.__version__)
print("NumPy Version:", np.__version__)
print("Pandas Version:", pd.__version__)
print("Torch Version:", torch.__version__)

OS Name: nt
OpenCV Version: 4.11.0
KaggleHub Version: 0.3.12
Albumentations Version: 2.0.6
NumPy Version: 2.2.5
Pandas Version: 2.2.3
Torch Version: 2.7.0+cu126


## Download the Dataset

In [21]:
dataset_path = kagglehub.dataset_download("andrewmvd/face-mask-detection")
print("Path of the Dataset", dataset_path)

Path of the Dataset C:\Users\basee\.cache\kagglehub\datasets\andrewmvd\face-mask-detection\versions\1


## Parse XML Annotations to CSV from the Dataset

In [20]:
def parse_xml_to_csv(xml_folder, output_csv="annotations.csv"):
    data = []
    
    # Loop through all XML files in the specified folder
    for xml_file in tqdm(os.listdir(xml_folder), desc="Parsing XML files"): # Using tqdm for progress bar
        if not xml_file.endswith('.xml'): # Skip non-XML files
            continue
        
        xml_path = os.path.join(xml_folder, xml_file) # Construct full path
        tree = et.parse(xml_path) # Parse the XML file
        root = tree.getroot() # Get the root element
        
        size = root.find('size') # Find the size element
        width = int(size.find('width').text) # Extract width
        height = int(size.find('height').text) # Extract height
        
        # Extract each object (face) in the image
        for obj in root.findall('object'):
            label = obj.find('name').text # Extract label
            bbox = obj.find('bndbox') # Extract bounding box
            xmin = int(bbox.find('xmin').text) 
            ymin = int(bbox.find('ymin').text)
            xmax = int(bbox.find('xmax').text)
            ymax = int(bbox.find('ymax').text)
            
            x_center = (xmin + xmax) / (2 * width)
            y_center = (ymin + ymax) / (2 * height)

            bbox_width = (xmax - xmin) / width
            bbox_height = (ymax - ymin) / height
            
            # Append data to the list
            data.append([
                xml_file.replace('.xml', '.png'), # Replace XML extension with PNG (Image names and XML names are the same)
                label,
                xmin, ymin, xmax, ymax,
                x_center, y_center, bbox_width, bbox_height,
                width, height
            ])
    
    # Create a DataFrame from the list
    columns = [
        'image_name', 'label', 
        'xmin', 'ymin', 'xmax', 'ymax', 
        'x_center', 'y_center', 'width_norm', 'height_norm',
        'img_width', 'img_height'
    ]
    dataframe = pd.DataFrame(data, columns=columns)
    
    dataframe.to_csv(output_csv, index=False) # Save DataFrame to CSV

    print(f"Annotations saved to {output_csv}.")

    return dataframe

# Main script
# Dataset path
xml_folder = r"C:\Users\basee\.cache\kagglehub\datasets\andrewmvd\face-mask-detection\versions\1\annotations"
dataframe = parse_xml_to_csv(xml_folder)

# Display the first few rows of the DataFrame
print("\nFirst few rows of the DataFrame:")
print(dataframe.head())

Parsing XML files: 100%|██████████| 853/853 [00:10<00:00, 80.44it/s]

Annotations saved to annotations.csv.

First few rows of the DataFrame:
          image_name         label  xmin  ymin  xmax  ymax  x_center  \
0  maksssksksss0.png  without_mask    79   105   109   142  0.183594   
1  maksssksksss0.png     with_mask   185   100   226   144  0.401367   
2  maksssksksss0.png  without_mask   325    90   360   141  0.668945   
3  maksssksksss1.png     with_mask   321    34   354    69  0.843750   
4  maksssksksss1.png     with_mask   224    38   261    73  0.606250   

   y_center  width_norm  height_norm  img_width  img_height  
0  0.337432    0.058594     0.101093        512         366  
1  0.333333    0.080078     0.120219        512         366  
2  0.315574    0.068359     0.139344        512         366  
3  0.330128    0.082500     0.224359        400         156  
4  0.355769    0.092500     0.224359        400         156  





## Load and Preprocess Images

In [7]:
INPUT_CSV = "annotations.csv"  # Path to the CSV file with annotations
IMAGE_DIR = r"C:\Users\basee\.cache\kagglehub\datasets\andrewmvd\face-mask-detection\versions\1\images" # Directory containing images

OUTPUT_DIR = "processed_data/" # Directory to save processed data
TARGET_SIZE = (224, 224) # Target size for resizing images

# Augmentation pipeline for training data
augmenter = A.Compose([
    A.HorizontalFlip(p=0.5),
    A.RandomBrightnessContrast(p=0.2),
    A.Rotate(limit=20, p=0.3),
], bbox_params=A.BboxParams(format='pascal_voc', label_fields=['class_labels'])) # Using Pascal VOC format for bounding boxes

# Function to load and preprocess images
def load_and_preprocess_image(image_path, target_size):
    img = cv2.imread(image_path) # Read image using OpenCV
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # Convert to RGB
    img = cv2.resize(img, target_size) # Resize to target size
    return img

# Function to normalize bounding box coordinates
def normalize_bbox(bbox, orig_size, target_size):
    xmin, ymin, xmax, ymax = bbox # Unpack bounding box coordinates
    orig_w, orig_h = orig_size # Original image size
    
    # Scale factors
    w_scale = target_size[0] / orig_w
    h_scale = target_size[1] / orig_h
    
    # Normalize bounding box coordinates
    new_xmin = int(xmin * w_scale)
    new_ymin = int(ymin * h_scale)
    new_xmax = int(xmax * w_scale)
    new_ymax = int(ymax * h_scale)
    
    return [new_xmin, new_ymin, new_xmax, new_ymax]

# Function to process the dataset
def process_dataset(df, augment=False):
    processed_data = []
    
    # Iterate through each row in the DataFrame
    for _, row in tqdm(df.iterrows(), total=len(df)):
        img_path = os.path.join(IMAGE_DIR, row['image_name']) # Construct full image path
        original_size = (row['img_width'], row['img_height']) # Original image size
        bbox = [row['xmin'], row['ymin'], row['xmax'], row['ymax']] # Bounding box coordinates
        
        img = load_and_preprocess_image(img_path, TARGET_SIZE) # Load and preprocess image
        
        new_bbox = normalize_bbox(bbox, original_size, TARGET_SIZE) # Normalize bounding box coordinates
        
        # If augmenting, apply augmentations
        if augment:
            augmented = augmenter(
                image=img,
                bboxes=[new_bbox],
                class_labels=[row['label']]
            )
            img = augmented['image']
            new_bbox = augmented['bboxes'][0] if augmented['bboxes'] else new_bbox 
        
        img = img / 255.0 # Normalize image 
        
        # Append processed data
        processed_data.append({
            'image': img,
            'bbox': new_bbox,
            'label': 1 if row['label'] == 'with_mask' else 0,  # Binary encoding
            'original_image': row['image_name']
        })
    
    return processed_data

# Main script
# Check if the output directory exists, if not create it
os.makedirs(OUTPUT_DIR, exist_ok=True)
    
# Load the CSV file
df = pd.read_csv(INPUT_CSV)
 
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42) # Split into train and test sets
train_df, validation_df = train_test_split(train_df, test_size=0.25, random_state=42) # Split train set into train and validation sets

# Process training dataset     
print("Processing Training set...")
train_data = process_dataset(train_df, augment=True)

# Process validation dataset 
print("\nProcessing Validation set...")
validation_data = process_dataset(validation_df)

# Process test dataset
print("\nProcessing Test set...")
test_data = process_dataset(test_df)
    
# Save processed training data to .npz files
print("\nSaving Training data...")
np.savez_compressed(
    os.path.join(OUTPUT_DIR, "train.npz"),
    images=np.array([x['image'] for x in train_data]),
    bboxes=np.array([x['bbox'] for x in train_data]),
    labels=np.array([x['label'] for x in train_data])
)
print("Training data saved.")

# Save processed validation data to .npz files
print("\nSaving Validation data...")
np.savez_compressed(
    os.path.join(OUTPUT_DIR, "validation.npz"),
    images=np.array([x['image'] for x in validation_data]),
    bboxes=np.array([x['bbox'] for x in validation_data]),
    labels=np.array([x['label'] for x in validation_data])
)
print("Validation data saved.")

# Save processed test data to .npz files
print("\nSaving Test data...")
np.savez_compressed(
    os.path.join(OUTPUT_DIR, "test.npz"),
    images=np.array([x['image'] for x in test_data]),
    bboxes=np.array([x['bbox'] for x in test_data]),
    labels=np.array([x['label'] for x in test_data])
)
print("Test data saved.")

del train_data, validation_df, test_data # Delete variables to free up memory
gc.collect() # Force garbage collection for memory cleanup
print("\nMemory cleaned up.")
    
print(f"\nPreprocessing complete! Data saved to {OUTPUT_DIR}.")

Processing Training set...


100%|██████████| 2442/2442 [00:23<00:00, 104.59it/s]



Processing Validation set...


100%|██████████| 815/815 [00:07<00:00, 115.61it/s]



Processing Test set...


100%|██████████| 815/815 [00:07<00:00, 115.61it/s]



Saving Training data...
Training data saved.

Saving Validation data...
Validation data saved.

Saving Test data...
Test data saved.

Memory cleaned up.

Preprocessing complete! Data saved to processed_data/.


## Verify GPU

In [6]:
print(f"GPU Available: {torch.cuda.is_available()}")
print(f"GPU Name: {torch.cuda.get_device_name(0)}")

GPU Available: True
GPU Name: NVIDIA GeForce RTX 3050 Ti Laptop GPU


## Model Building & Training

In [None]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Use GPU if available
BATCH_SIZE = 32  # Batch size for training 
EPOCHS = 50  # Number of epochs for training
LR = 0.001  # Learning rate for optimizer
IMAGE_SIZE = (224, 224)  # Image size for model input

torch.multiprocessing.set_sharing_strategy('file_system') # Set sharing strategy for multiprocessing

# Custom Dataset Class
class MaskDataset(Dataset):
    # Initialize the dataset
    def __init__(self, npz_file, transform=None):
        data = np.load(npz_file) # Load the .npz file
        
        # Load images and labels
        self.images = data['images']
        self.labels = data['labels']
        
        self.transform = transform # Transformations to be applied to the images

    # Get the length of the dataset
    def __len__(self):
        return len(self.images)

    # Get a single item from the dataset
    def __getitem__(self, idx):
        image = self.images[idx]
        label = self.labels[idx]

        # Apply transformations if any
        if self.transform: 
            image = self.transform(image)

        # Convert image to tensor
        image = image.float()

        return image, label

# Define transformations for training dataset
train_transform = transforms.Compose([
    transforms.ToTensor(), # Convert image to tensor
    transforms.RandomHorizontalFlip(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Define transformations for validation and test datasets
validation_and_test_transform = transforms.Compose([
    transforms.ToTensor(), # Convert image to tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load datasets
print("Loading Processed Data...")
train_dataset = MaskDataset(os.path.join('processed_data', 'train.npz'), transform=train_transform)
validation_dataset = MaskDataset(os.path.join('processed_data', 'validation.npz'), transform=validation_and_test_transform)
test_dataset = MaskDataset(os.path.join('processed_data', 'test.npz'), transform=validation_and_test_transform)
print("Data Loaded.")

# Create DataLoader for each dataset
print("\nCreating DataLoaders...")
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0, pin_memory=True)
validation_loader = DataLoader(validation_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0, pin_memory=True)
print("DataLoaders Created.")

# Model Definition
class MaskDetector(nn.Module):
    # Initialize the model
    def __init__(self):
        super(MaskDetector, self).__init__() # Call the parent constructor
        self.base_model = mobilenet_v2(weights=MobileNet_V2_Weights.IMAGENET1K_V1)
        self.base_model.features[0][0] = nn.Conv2d(3, 32, kernel_size=3, stride=2, padding=1, bias=False) # Modify the first convolutional layer

        # Freeze the base model parameters
        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(self.base_model.last_channel, 1),
            nn.Sigmoid()
        )

    # Forward pass
    def forward(self, x):
        x = self.base_model.features(x) # Pass through the base model
        x = nn.functional.adaptive_avg_pool2d(x, (1, 1)) # Adaptive average pooling
        x = torch.flatten(x, 1) # Flatten the output
        return self.classifier(x) # Pass through the classifier

# Move model to device
print("\nInitializing Model...")
model = MaskDetector().to(DEVICE)
print("Model Initialized.")

# Define loss function and optimizer
print("\nDefining Loss Function and Optimizer...")
criterion = nn.BCELoss() # Binary Cross-Entropy Loss
optimizer = optim.Adam(model.parameters(), lr=LR) # Adam optimizer
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2) # Learning rate scheduler
print("Loss Function and Optimizer Defined.")

# Training Function
def train_model():
    best_validation_loss = float('inf') # Initialize best validation loss

    # Training loop
    print("Starting Training...")
    for epoch in range(EPOCHS):
        model.train() # Set model to training mode
        train_loss = 0.0 # Initialize training loss

        # Iterate through training data
        for images, labels in train_loader: # Get batch of images and labels
            images = images.to(DEVICE, non_blocking=True) # Move images to device
            labels = labels.float().to(DEVICE, non_blocking=True) # Move labels to device

            optimizer.zero_grad() # Zero the gradients
            outputs = model(images) # Forward pass 
            loss = criterion(outputs.squeeze(), labels) # Compute loss
            loss.backward() # Backward pass
            optimizer.step() # Update weights

            # Update training loss 
            train_loss += loss.item() * images.size(0)

        validation_loss, validation_acc = evaluate(model, validation_loader) # Compute validation loss and accuracy
        scheduler.step(validation_loss) # Adjust learning rate based on validation loss

        train_loss = train_loss / len(train_loader.dataset) # Average training loss

        print(f'Epoch {epoch + 1}/{EPOCHS}')
        print(f'Train Loss: {train_loss:.4f} | Validation Loss: {validation_loss:.4f} | Validation Acc: {validation_acc:.4f}')

        # Save the model if validation loss improves
        if validation_loss < best_validation_loss:
            best_validation_loss = validation_loss
            torch.save(model.state_dict(), 'face_mask_detection_model.pth')
            print('Model saved!')
        print("\n")

    print("Model Trained Successfully!")

# Evaluation Function
def evaluate(model, data_loader):
    model.eval() # Set model to evaluation mode
    loss = 0.0
    correct = 0

    # Iterate through validation/test data
    with torch.no_grad(): # Disable gradient calculation
        # Iterate over batches
        for images, labels in data_loader: 
            images = images.to(DEVICE) # Move images to device
            labels = labels.float().to(DEVICE) # Move labels to device
            
            outputs = model(images) # Forward pass
            loss += criterion(outputs.squeeze(), labels).item() * images.size(0) # Compute loss
            preds = (outputs > 0.5).float() # Convert probabilities to binary predictions
            correct += (preds.squeeze() == labels).sum().item() # Count correct predictions

    # Compute average loss and accuracy
    avg_loss = loss / len(data_loader.dataset)
    accuracy = correct / len(data_loader.dataset)
    return avg_loss, accuracy

# Main script
# Define the device
print(f'\nUsing device: {DEVICE}')
print(f'GPU Name: {torch.cuda.get_device_name(0)}')

# Dataset statistics
print(f'\nTraining on {len(train_dataset)} samples')
print(f'Validating on {len(validation_dataset)} samples')

start_time = time.time() # Start time for training

print("\n")

train_model() # Train the model
print(f'Training completed in {(time.time() - start_time) / 60:.2f} minutes')

# Load best model and test
model.load_state_dict(torch.load('face_mask_detection_model.pth', weights_only=True))
test_loss, test_acc = evaluate(model, test_loader) # Evaluate on test set
print(f'\nTest Accuracy: {test_acc:.4f} | Test Loss: {test_loss:.4f}')

# Generate classification report
y_true, y_pred = [], []
# Iterate through test data
with torch.no_grad(): # Disable gradient calculation
    # Iterate over batches 
    for images, labels in test_loader:
        images = images.to(DEVICE) # Move images to device
        outputs = model(images) # Forward pass
        preds = (outputs > 0.5).float() # Convert probabilities to binary predictions
        y_true.extend(labels.tolist()) # Append true labels
        y_pred.extend(preds.cpu().squeeze().tolist()) # Append predicted labels

print('\nClassification Report:')
print(classification_report(y_true, y_pred, target_names=['No Mask', 'Mask']))

Loading Processed Data...
Data Loaded.

Creating DataLoaders...
DataLoaders Created.

Initializing Model...
Model Initialized.

Defining Loss Function and Optimizer...
Loss Function and Optimizer Defined.

Using device: cuda
GPU Name: NVIDIA GeForce RTX 3050 Ti Laptop GPU

Training on 2442 samples
Validating on 815 samples


Starting Training...
Epoch 1/20
Train Loss: 0.5178 | Validation Loss: 0.6096 | Validation Acc: 0.6834
Model saved!


Epoch 2/20
Train Loss: 0.4670 | Validation Loss: 0.5122 | Validation Acc: 0.7755
Model saved!


Epoch 3/20
Train Loss: 0.4627 | Validation Loss: 0.4769 | Validation Acc: 0.7939
Model saved!


Epoch 4/20
Train Loss: 0.4479 | Validation Loss: 0.4748 | Validation Acc: 0.7939
Model saved!


Epoch 5/20
Train Loss: 0.4343 | Validation Loss: 0.4676 | Validation Acc: 0.8000
Model saved!


Epoch 6/20
Train Loss: 0.4183 | Validation Loss: 0.4702 | Validation Acc: 0.8061


Epoch 7/20
Train Loss: 0.4131 | Validation Loss: 0.4797 | Validation Acc: 0.8061


Epoch 

## Loading the Trained Model

In [7]:
# Define the model for inference
class MaskDetector(torch.nn.Module):
    def __init__(self): # Initialize the model
        super(MaskDetector, self).__init__() # Call the parent constructor
        self.base_model = mobilenet_v2(pretrained=True)
        self.base_model.features[0][0] = torch.nn.Conv2d(3, 32, kernel_size=3, stride=2, padding=1, bias=False) # Modify the first convolutional layer
        
        # Freeze the base model parameters
        self.classifier = torch.nn.Sequential(
            torch.nn.Dropout(0.2),
            torch.nn.Linear(self.base_model.last_channel, 1),
            torch.nn.Sigmoid()
        )
    
    # Forward pass
    def forward(self, x):
        x = self.base_model.features(x) # Pass through the base model
        x = torch.nn.functional.adaptive_avg_pool2d(x, (1, 1)) # Adaptive average pooling
        x = torch.flatten(x, 1) # Flatten the output
        return self.classifier(x)

# Intialize the model to the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MaskDetector().to(device)

# Load the model weights
model.load_state_dict(torch.load('face_mask_detection_model.pth', map_location=device, weights_only=True))
model.eval() # Set model to evaluation mode

print("Model loaded successfully!")

# Define the transformation for inference
transform = transforms.Compose([
    transforms.ToTensor(), # Convert image to tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) # Normalize image
])

Model loaded successfully!


## Single Image Prediction

In [9]:
# Function to predict the mask status of a face in an image 
def predict(image_path): 
    img = cv2.imread(image_path) # Read image using OpenCV
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # Convert to RGB
    img = cv2.resize(img, (224, 224)) # Resize to target size
    img_tensor = transform(img).unsqueeze(0).to(device) # Convert to tensor and add batch dimension
    
    # Predict the mask status
    with torch.no_grad():
        output = model(img_tensor) # Forward pass
        probability = output.item() # Get probability

        label = "Mask" if probability > 0.5 else "No Mask" # Determine label based on probability
        confidence = max(probability, 1 - probability) # Get confidence score
    
    print(f"Prediction: {label} ({confidence:.2%} confidence)") # Print prediction and confidence

predict('test_images/test_face.jpg') # Test the prediction function with the given image 

Prediction: No Mask (70.75% confidence)


## Single Image Prediction with Visualization

In [None]:
# Function to predict the mask status of a face in an image
def predict(image_path):
    img = cv2.imread(image_path) # Read image using OpenCV
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # Convert to RGB
    img_resized = cv2.resize(img_rgb, (224, 224)) # Resize to target size
    img_tensor = transform(img_resized).unsqueeze(0).to(device) # Convert to tensor and add batch dimension
    
    # Predict the mask status
    with torch.no_grad():
        output = model(img_tensor) # Forward pass
        probability = output.item() # Get probability

        label = "Mask" if probability > 0.5 else "No Mask" # Determine label based on probability
        confidence = max(probability, 1 - probability) # Get confidence score

    # Show the image with label
    plt.figure(figsize=(5, 5))
    plt.imshow(img_rgb)
    plt.title(f"Prediction: {label} ({confidence:.2%} confidence)", fontsize=14)
    plt.axis('off')
    plt.show()

predict('test_images/test_face.jpg') # Test the prediction function with the given image 