Build & Train CNN Model from Scratch

**Goals:**
1. Define a simple CNN architecture for 3‑class mask classification  
2. Set up the loss (`CrossEntropyLoss`), the optimizer (`Adam`), and, optionally, an LR scheduler  
3. Write train/validation loops  
4. Track & plot loss/accuracy  
5. Save the best model


## Import Libraries and Check Device

In [18]:
import os
import time
import torch

import matplotlib.pyplot as plt
import numpy as np



# Use the GPU when PyTorch reports one as available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Report which GPU was selected (name of device 0 and the current device index).
    print(device)
    print(f"Using GPU: {torch.cuda.get_device_name(0)}"
          f" (ID: {torch.cuda.current_device()})")


cuda
Using GPU: NVIDIA GeForce RTX 3060 Laptop GPU (ID: 0)


In [19]:
import torch.nn as nn
import torch.optim as optim


class SimpleCNN(nn.Module):
    """Small three-stage convolutional classifier for square RGB images.

    The feature extractor is three ``Conv2d -> ReLU -> MaxPool2d(2)`` stages
    (32, 64 and 128 channels).  The classifier flattens the feature map and
    applies ``Linear -> ReLU -> Dropout(0.5) -> Linear``.

    Args:
        num_classes: Number of output logits.
        img_size: Side length in pixels of the (square) input images.  The
            first ``Linear`` layer is sized from this value, so inputs passed
            to ``forward`` must have this height and width.  Defaults to 224.

    Raises:
        ValueError: If ``img_size`` is smaller than 8, which would leave an
            empty feature map after the three pooling stages.
    """

    def __init__(self, num_classes=3, img_size=224):
        super().__init__()

        if img_size < 8:
            raise ValueError(
                f"img_size must be at least 8 (three 2x2 poolings), got {img_size}"
            )

        self.features = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )

        # The padded 3x3 convolutions keep height/width unchanged; each of the
        # three 2x2 max-pools halves them (rounding down), i.e. img_size // 8.
        side = img_size // 8

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128 * side * side, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        """Return the class logits for a batch ``x`` of input images."""
        x = self.features(x)
        x = self.classifier(x)
        return x
    
# Build the 3-class network, move its parameters to the selected device,
# and print the layer summary.
model = SimpleCNN(num_classes=3)
model = model.to(device)
print(model)
    
        


SimpleCNN(
  (features): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
    (8): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (classifier): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=100352, out_features=256, bias=True)
    (2): ReLU()
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=256, out_features=3, bias=True)
  )
)


In [20]:
# Multi-class classification loss, applied to the logits returned by the model.
criterion = nn.CrossEntropyLoss()
# Adam over every model parameter with a learning rate of 1e-3.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)



In [21]:
def train_one_epoch(model, loader, optimizer, criterion, device):
    """Run one optimisation pass over ``loader`` and return its average metrics.

    Args:
        model: Network to train; it is switched to training mode.
        loader: Iterable yielding ``(imgs, labels)`` tensor batches.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Loss callable taking ``(outputs, labels)``.  The per-batch
            loss is weighted by batch size, so the returned value is a
            per-sample average only if the criterion averages over the batch
            (the ``nn.CrossEntropyLoss`` default).
        device: Device the batches are moved to before the forward pass.

    Returns:
        tuple[float, float]: ``(epoch_loss, epoch_acc)`` - the sample-weighted
        mean loss and the fraction of samples whose arg-max prediction
        equals the label.

    Raises:
        ValueError: If ``loader`` yields no samples (previously this surfaced
            as an opaque ``ZeroDivisionError``).
    """
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for imgs, labels in loader:
        imgs, labels = imgs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(imgs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_size = labels.size(0)
        # Weight by batch size so a smaller final batch does not skew the mean.
        running_loss += loss.item() * batch_size
        correct += (outputs.argmax(dim=1) == labels).sum().item()
        total += batch_size

    if total == 0:
        raise ValueError("train_one_epoch received a loader with no samples")

    return running_loss / total, correct / total

def validate(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` without updating any parameters.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        loader: Iterable yielding ``(imgs, labels)`` tensor batches.
        criterion: Loss callable taking ``(outputs, labels)``.  The per-batch
            loss is weighted by batch size, so the returned value is a
            per-sample average only if the criterion averages over the batch
            (the ``nn.CrossEntropyLoss`` default).
        device: Device the batches are moved to before the forward pass.

    Returns:
        tuple[float, float]: ``(epoch_loss, epoch_acc)`` - the sample-weighted
        mean loss and the fraction of samples whose arg-max prediction
        equals the label.

    Raises:
        ValueError: If ``loader`` yields no samples (previously this surfaced
            as an opaque ``ZeroDivisionError``).
    """
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0

    # Gradients are not needed for evaluation; skip building the autograd graph.
    with torch.no_grad():
        for imgs, labels in loader:
            imgs, labels = imgs.to(device), labels.to(device)
            outputs = model(imgs)
            loss = criterion(outputs, labels)

            batch_size = labels.size(0)
            running_loss += loss.item() * batch_size
            correct += (outputs.argmax(dim=1) == labels).sum().item()
            total += batch_size

    if total == 0:
        raise ValueError("validate received a loader with no samples")

    return running_loss / total, correct / total


In [22]:
# — Step 3 setup (must run before your training loop) —

import os, torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
from PIL import Image
import xml.etree.ElementTree as ET

# Hyperparameters
# IMG_SIZE: side length (pixels) that the transforms in this cell resize images to.
IMG_SIZE    = 224
# BATCH_SIZE: number of samples per batch for the train/val/test DataLoaders.
BATCH_SIZE  = 8
# NOTE(review): NUM_WORKERS is not referenced by the DataLoaders in this cell,
# which pass num_workers=0 directly - confirm which value is intended.
NUM_WORKERS = 4

# (Re‑define your CLASS_MAP and collect_paths_and_labels if needed)
# Maps the class name found in an annotation file to its integer label.
CLASS_MAP = {"with_mask": 0, "mask_weared_incorrect": 1, "without_mask": 2}


def xml_to_label(xml_path):
    """Return the class index of the first ``<object>`` in an annotation file.

    Only the first ``object/name`` element is read; any further objects in
    the same file are ignored.  The name is stripped of surrounding
    whitespace and lower-cased before it is looked up in ``CLASS_MAP``.

    Args:
        xml_path: Path (or file object) of the XML annotation to parse.

    Returns:
        int: The ``CLASS_MAP`` value for the object's class name.

    Raises:
        ValueError: If the file contains no ``object/name`` element or the
            element has no text.
        KeyError: If the class name is not a key of ``CLASS_MAP``.
    """
    name_node = ET.parse(xml_path).find("object/name")
    if name_node is None or name_node.text is None:
        raise ValueError(f"No <object>/<name> entry found in annotation: {xml_path}")

    class_name = name_node.text.strip().lower()
    try:
        return CLASS_MAP[class_name]
    except KeyError:
        raise KeyError(
            f"Unknown class {class_name!r} in annotation: {xml_path}"
        ) from None

def collect_paths_and_labels(split):
    """Collect image paths and integer labels for one dataset split.

    Scans ``dataset/<split>/images`` for ``.png``/``.jpg`` files (extension
    matched case-insensitively) and keeps those that have a same-named
    ``.xml`` file in ``dataset/<split>/annotations``.  Images without an
    annotation are skipped silently.

    Args:
        split: Name of the split sub-directory, e.g. ``"train"``.

    Returns:
        tuple[list[str], list[int]]: Parallel lists of image paths and the
        labels returned by ``xml_to_label`` for their annotations, ordered
        by file name.
    """
    img_dir = f"dataset/{split}/images"
    ann_dir = f"dataset/{split}/annotations"
    paths, labels = [], []

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # sample order (and therefore any seeded shuffling) reproducible.
    for fn in sorted(os.listdir(img_dir)):
        stem, ext = os.path.splitext(fn)
        if ext.lower() not in (".png", ".jpg"):
            continue
        xml = os.path.join(ann_dir, stem + ".xml")
        if not os.path.exists(xml):
            continue
        paths.append(os.path.join(img_dir, fn))
        labels.append(xml_to_label(xml))

    return paths, labels

# Gather the (paths, labels) pair of every split, in train / val / test order.
(train_paths, train_labels), (val_paths, val_labels), (test_paths, test_labels) = (
    collect_paths_and_labels(split_name) for split_name in ("train", "val", "test")
)

# Transforms
# Per-channel mean / std used for normalisation
# (presumably the commonly quoted ImageNet statistics - confirm if it matters).
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]

# Training pipeline: resize, random flip / rotation augmentation,
# tensor conversion, then normalisation.
train_transform = T.Compose(
    [
        T.Resize((IMG_SIZE, IMG_SIZE)),
        T.RandomHorizontalFlip(),
        T.RandomRotation(15),
        T.ToTensor(),
        T.Normalize(_NORM_MEAN, _NORM_STD),
    ]
)

# Evaluation pipeline: the same steps without the random augmentation.
val_transform = T.Compose(
    [
        T.Resize((IMG_SIZE, IMG_SIZE)),
        T.ToTensor(),
        T.Normalize(_NORM_MEAN, _NORM_STD),
    ]
)

# Dataset
class MaskDataset(Dataset):
    """Dataset of ``(image, label)`` pairs read from image files on disk.

    Args:
        paths: Sequence of image file paths.
        labels: Sequence of labels, parallel to ``paths``.
        transform: Optional callable applied to the RGB ``PIL.Image`` before
            it is returned.

    Raises:
        ValueError: If ``paths`` and ``labels`` differ in length.
    """

    def __init__(self, paths, labels, transform=None):
        if len(paths) != len(labels):
            raise ValueError(
                f"paths and labels must have the same length "
                f"({len(paths)} != {len(labels)})"
            )
        self.paths = paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        """Return the number of samples."""
        return len(self.paths)

    def __getitem__(self, i):
        """Load sample ``i`` and return ``(image, label)``."""
        # convert() decodes the pixels into a new image, so the underlying
        # file can be closed right away instead of waiting for garbage collection.
        with Image.open(self.paths[i]) as raw:
            img = raw.convert("RGB")
        if self.transform is not None:
            img = self.transform(img)
        return img, self.labels[i]

# DataLoaders
# Options shared by all three loaders; only the dataset and shuffling differ.
_loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=0, pin_memory=True)

# Training data is reshuffled every epoch and uses the augmenting transform.
train_loader = DataLoader(
    MaskDataset(train_paths, train_labels, train_transform),
    shuffle=True,
    **_loader_kwargs,
)
# Validation and test data keep a fixed order and use the plain transform.
val_loader = DataLoader(
    MaskDataset(val_paths, val_labels, val_transform),
    shuffle=False,
    **_loader_kwargs,
)
test_loader = DataLoader(
    MaskDataset(test_paths, test_labels, val_transform),
    shuffle=False,
    **_loader_kwargs,
)

print(f"Train batches: {len(train_loader)}")


Train batches: 75


In [23]:
# Local import: a later cell rebinds the name `time` to the function
# (`from time import time`), which would break `time.time()` here if this
# cell is re-run afterwards.  perf_counter() is also the monotonic clock
# intended for measuring elapsed durations.
from time import perf_counter

EPOCHS = 15

# Per-epoch metrics, appended once per epoch (used for plotting afterwards).
history = {
    'train_loss': [], 'train_acc': [],
    'val_loss':   [], 'val_acc':   []
}

best_val_acc = 0.0
best_model_path = "best_mask_cnn.pth"
# Tracks whether this run has written a checkpoint yet, so that the first
# epoch is always saved even if its validation accuracy is exactly 0.0.
checkpoint_saved = False

for epoch in range(1, EPOCHS+1):
    epoch_start = perf_counter()
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion, device)
    val_loss,   val_acc   = validate(model, val_loader,   criterion, device)
    # Optional hook: if an LR scheduler is added, step it here,
    # e.g. scheduler.step(val_loss).

    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)

    # Save the weights whenever validation accuracy improves
    # (and unconditionally on the first epoch of the run).
    if not checkpoint_saved or val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), best_model_path)
        checkpoint_saved = True

    print(f"Epoch {epoch:02d}/{EPOCHS}  "
          f"Train: loss={train_loss:.4f}, acc={train_acc:.4f}  "
          f"Val:   loss={val_loss:.4f}, acc={val_acc:.4f}  "
          f"Time: {(perf_counter()-epoch_start):.1f}s")

print(f"\nBest validation accuracy: {best_val_acc:.4f}")


Epoch 01/15  Train: loss=0.7786, acc=0.7973  Val:   loss=0.6500, acc=0.7874  Time: 12.0s
Epoch 02/15  Train: loss=0.6257, acc=0.8141  Val:   loss=0.6568, acc=0.7874  Time: 13.3s
Epoch 03/15  Train: loss=0.5582, acc=0.8124  Val:   loss=0.6397, acc=0.7874  Time: 13.7s
Epoch 04/15  Train: loss=0.5923, acc=0.8141  Val:   loss=0.6310, acc=0.7874  Time: 12.0s
Epoch 05/15  Train: loss=0.5584, acc=0.8141  Val:   loss=0.6359, acc=0.7874  Time: 11.8s
Epoch 06/15  Train: loss=0.5852, acc=0.8124  Val:   loss=0.6744, acc=0.7874  Time: 11.9s
Epoch 07/15  Train: loss=0.5536, acc=0.8157  Val:   loss=0.7206, acc=0.7874  Time: 12.2s
Epoch 08/15  Train: loss=0.5566, acc=0.8124  Val:   loss=0.6710, acc=0.7953  Time: 12.2s
Epoch 09/15  Train: loss=0.5601, acc=0.8141  Val:   loss=0.6214, acc=0.8031  Time: 12.6s
Epoch 10/15  Train: loss=0.5133, acc=0.8208  Val:   loss=0.6408, acc=0.7953  Time: 12.2s
Epoch 11/15  Train: loss=0.5002, acc=0.8208  Val:   loss=0.6305, acc=0.8031  Time: 12.2s
Epoch 12/15  Train: l

In [24]:
import cv2
import torch
import torchvision.transforms as T
from PIL import Image
import numpy as np
from time import time

# 1. Settings
# Checkpoint file whose state dict is loaded into the model in this cell.
MODEL_PATH = "best_mask_cnn.pth"
# Side length (pixels) face crops are resized to; also sizes the model's first Linear layer.
IMG_SIZE   = 224
# Run on the GPU when one is available, otherwise on the CPU.
DEVICE     = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Display names, indexed by the predicted class id.
CLASS_NAMES = ["Mask", "Worn Incorrectly", "No Mask"]
# Box/label colours, indexed by the predicted class id.  They are passed to
# OpenCV drawing calls on BGR frames, so the tuples are in (B, G, R) order.
BOX_COLORS  = [(0,255,0), (0,165,255), (0,0,255)]  # green, orange, red

# 2. Load your trained model
class SimpleMaskCNN(torch.nn.Module):
    """Three-stage convolutional classifier used for mask inference.

    The layer layout (and therefore the state-dict keys) is three
    ``Conv2d -> ReLU -> MaxPool2d(2)`` stages with 32, 64 and 128 channels,
    followed by ``Flatten -> Linear -> ReLU -> Dropout(0.5) -> Linear``.

    Args:
        num_classes: Number of output logits.
        img_size: Side length in pixels of the (square) input images.  When
            ``None`` (the default) the module-level ``IMG_SIZE`` is used, as
            before.  Inputs passed to ``forward`` must match this size.
    """

    def __init__(self, num_classes=3, img_size=None):
        super().__init__()
        if img_size is None:
            img_size = IMG_SIZE

        nn = torch.nn
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(64, 128, 3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2),
        )

        # The padded 3x3 convolutions keep height/width unchanged; each of the
        # three 2x2 max-pools halves them (rounding down), i.e. img_size // 8.
        side = img_size // 8
        flattened = 128 * side * side

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(flattened, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        """Return the class logits for a batch ``x`` of input images."""
        x = self.features(x)
        return self.classifier(x)

# Build the network on the target device, restore the saved weights and
# switch to evaluation mode.
model = SimpleMaskCNN(num_classes=3)
model = model.to(DEVICE)
model.load_state_dict(
    torch.load(MODEL_PATH, map_location=DEVICE)
)
model.eval()

# 3. Preprocessing applied to each face crop: resize, tensor conversion and
#    normalisation (no random augmentation).
transform = T.Compose(
    [
        T.Resize(size=(IMG_SIZE, IMG_SIZE)),
        T.ToTensor(),
        T.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225]),
    ]
)

# 4. Frontal-face Haar cascade that ships with OpenCV, used for face detection.
face_cascade = cv2.CascadeClassifier(
    cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
)

# 5. Open webcam and classify every detected face until 'q' is pressed
# Local import: a monotonic clock for the FPS measurement.
from time import perf_counter

cap = cv2.VideoCapture(0)
if not cap.isOpened():
    # Without this message the loop below would just exit silently on the
    # first failed read.
    print("Could not open webcam (device index 0); no frames to process.")

fps_display_interval = 5  # seconds between FPS re-measurements
frame_count = 0
fps = None  # most recent measurement; None until the first interval has elapsed
start_time = perf_counter()

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Convert to grayscale for face detection
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(gray, 1.1, 4)

        for (x, y, w, h) in faces:
            # Crop & preprocess face (OpenCV frames are BGR; the model pipeline expects RGB)
            face_img = frame[y:y+h, x:x+w]
            pil_img = Image.fromarray(cv2.cvtColor(face_img, cv2.COLOR_BGR2RGB))
            inp = transform(pil_img).unsqueeze(0).to(DEVICE)

            # Predict
            with torch.no_grad():
                logits = model(inp)
                pred = logits.argmax(dim=1).item()

            # Draw box & label
            color = BOX_COLORS[pred]
            label = CLASS_NAMES[pred]
            cv2.rectangle(frame, (x, y), (x+w, y+h), color, 2)
            # Keep the label inside the frame when the face touches the top edge.
            label_origin = (int(x), max(int(y) - 10, 20))
            cv2.putText(frame, label, label_origin,
                        cv2.FONT_HERSHEY_SIMPLEX, 0.9, color, 2)

        # Re-measure FPS once per interval...
        frame_count += 1
        elapsed = perf_counter() - start_time
        if elapsed > fps_display_interval:
            fps = frame_count / elapsed
            start_time = perf_counter()
            frame_count = 0
        # ...but draw the latest value on every frame, so the overlay does
        # not flash for a single frame and disappear.
        if fps is not None:
            cv2.putText(frame, f"FPS: {fps:.1f}", (10,30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255,255,0), 2)

        # Show
        cv2.imshow("Real‑Time Face Mask Detection", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the window, even on an exception or
    # a notebook interrupt.
    cap.release()
    cv2.destroyAllWindows()


In [None]:
import os
import cv2
import torch
import xml.etree.ElementTree as ET
from torchvision import transforms
from PIL import Image

# ─── Settings ──────────────────────────────────────────────────────────────
# Side length (pixels) face crops are resized to; also sizes the model's first Linear layer.
IMG_SIZE   = 224
# NOTE(review): BATCH_SIZE is not referenced by the code visible in this cell - confirm it is needed.
BATCH_SIZE = 1
# Run on the GPU when one is available, otherwise on the CPU.
DEVICE     = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Checkpoint file whose state dict is loaded into the model in this cell.
MODEL_PATH = "best_mask_cnn.pth"

# Display names, indexed by the predicted class id.
CLASS_NAMES = ["Mask", "Incorrectly Worn", "No Mask"]
# Box/label colours, indexed by the predicted class id.  They are passed to
# OpenCV drawing calls on images read with cv2.imread, so the tuples are in
# (B, G, R) order.
BOX_COLORS  = [(0,255,0), (0,165,255), (0,0,255)]  # green, orange, red

# Directories holding the test images and their XML annotations.
TEST_IMG_DIR  = "dataset/test/images"
TEST_ANN_DIR  = "dataset/test/annotations"

# ─── Load Model ─────────────────────────────────────────────────────────────
class SimpleMaskCNN(torch.nn.Module):
    """Three-stage convolutional classifier used for mask inference.

    The layer layout (and therefore the state-dict keys) is three
    ``Conv2d -> ReLU -> MaxPool2d(2)`` stages with 32, 64 and 128 channels,
    followed by ``Flatten -> Linear -> ReLU -> Dropout(0.5) -> Linear``.

    Args:
        num_classes: Number of output logits.
        img_size: Side length in pixels of the (square) input images.  When
            ``None`` (the default) the module-level ``IMG_SIZE`` is used, as
            before.  Inputs passed to ``forward`` must match this size.
    """

    def __init__(self, num_classes=3, img_size=None):
        super().__init__()
        if img_size is None:
            img_size = IMG_SIZE

        nn = torch.nn
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(64, 128, 3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2),
        )

        # The padded 3x3 convolutions keep height/width unchanged; each of the
        # three 2x2 max-pools halves them (rounding down), i.e. img_size // 8.
        side = img_size // 8
        flat_dim = 128 * side * side

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(flat_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        """Return the class logits for a batch ``x`` of input images."""
        x = self.features(x)
        return self.classifier(x)

# Build the network on the target device, restore the saved weights and
# switch to evaluation mode.
model = SimpleMaskCNN(num_classes=3)
model = model.to(DEVICE)
model.load_state_dict(
    torch.load(MODEL_PATH, map_location=DEVICE)
)
model.eval()

# ─── Transforms ─────────────────────────────────────────────────────────────
# Preprocessing applied to each face crop: resize, tensor conversion and
# normalisation (no random augmentation).
transform = transforms.Compose(
    [
        transforms.Resize(size=(IMG_SIZE, IMG_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225]),
    ]
)

# ─── XML Parser ─────────────────────────────────────────────────────────────
def parse_annotation(xml_path):
    """Return the bounding box of every ``<object>`` in an annotation file.

    Args:
        xml_path: Path (or file object) of the XML annotation to parse.

    Returns:
        list[tuple[int, int, int, int]]: One ``(xmin, ymin, xmax, ymax)``
        tuple per ``<object>`` element, in document order, read from the
        object's ``<bndbox>`` child.
    """
    corner_tags = ('xmin', 'ymin', 'xmax', 'ymax')
    annotation = ET.parse(xml_path).getroot()

    boxes = []
    for obj in annotation.findall('object'):
        bndbox = obj.find('bndbox')
        # Read the four corner coordinates in (xmin, ymin, xmax, ymax) order.
        boxes.append(tuple(int(bndbox.find(tag).text) for tag in corner_tags))
    return boxes

# ─── Play Test Images as “Video” ───────────────────────────────────────────
# Show each annotated test image in turn, classifying every annotated face box.
try:
    for fname in sorted(os.listdir(TEST_IMG_DIR)):
        if not fname.lower().endswith((".png",".jpg")): continue
        img_path = os.path.join(TEST_IMG_DIR, fname)
        ann_path = os.path.join(TEST_ANN_DIR, fname.rsplit(".",1)[0] + ".xml")
        if not os.path.exists(ann_path): continue

        frame = cv2.imread(img_path)
        if frame is None:
            # cv2.imread returns None (it does not raise) when a file cannot be read.
            print(f"Skipping unreadable image: {img_path}")
            continue
        height, width = frame.shape[:2]

        for (xmin,ymin,xmax,ymax) in parse_annotation(ann_path):
            # Clamp the box to the image: negative indices would wrap around
            # in the slice below, and an empty crop cannot be colour-converted.
            x0, y0 = max(xmin, 0), max(ymin, 0)
            x1, y1 = min(xmax, width), min(ymax, height)
            if x1 <= x0 or y1 <= y0:
                continue

            # Crop & preprocess face (imread gives BGR; the model pipeline expects RGB)
            face = frame[y0:y1, x0:x1]
            pil   = Image.fromarray(cv2.cvtColor(face,cv2.COLOR_BGR2RGB))
            inp   = transform(pil).unsqueeze(0).to(DEVICE)
            with torch.no_grad():
                pred = model(inp).argmax(dim=1).item()

            color = BOX_COLORS[pred]
            label = CLASS_NAMES[pred]
            cv2.rectangle(frame, (xmin,ymin), (xmax,ymax), color, 2)
            # Keep the label inside the image when the box touches the top edge.
            cv2.putText(frame, label, (xmin, max(ymin-10, 20)),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.9, color, 2)

        cv2.imshow("Test Set Playback", frame)
        # wait 500ms between frames; press ESC to quit early
        if cv2.waitKey(500) & 0xFF == 27:
            break
finally:
    # Close the window even if playback is interrupted or raises.
    cv2.destroyAllWindows()


KeyboardInterrupt: 

: 