In [1]:
import os
from PIL import Image
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms
from torchvision.models import mobilenet_v2
from torch.utils.data import Dataset, DataLoader, random_split

# Dataset Class

In [27]:
class MultiTaskDataset(Dataset):
    def __init__(self, drowsy_dir, gesture_dir, transform=None):
        self.transform = transform

        self.drowsy_classes = sorted(os.listdir(drowsy_dir))
        self.gesture_classes = sorted(os.listdir(gesture_dir))

        self.drowsy_map = {c: i for i, c in enumerate(self.drowsy_classes)}
        self.gesture_map = {c: i for i, c in enumerate(self.gesture_classes)}

        self.drowsy_images = []
        self.gesture_images = []

        # Collect drowsy images
        for d_label in self.drowsy_classes:
            d_class_dir = os.path.join(drowsy_dir, d_label)
            for fname in os.listdir(d_class_dir):
                img_path = os.path.join(d_class_dir, fname)
                self.drowsy_images.append((img_path, self.drowsy_map[d_label]))

        # Collect gesture images
        for g_label in self.gesture_classes:
            g_class_dir = os.path.join(gesture_dir, g_label)
            for fname in os.listdir(g_class_dir):
                img_path = os.path.join(g_class_dir, fname)
                self.gesture_images.append((img_path, self.gesture_map[g_label]))

        # Truncate to the smallest dataset length
        self.length = min(len(self.drowsy_images), len(self.gesture_images))
        self.drowsy_images = self.drowsy_images[:self.length]
        self.gesture_images = self.gesture_images[:self.length]

        print(f"Paired {self.length} drowsy and gesture images.")

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        d_img_path, d_label = self.drowsy_images[idx]
        g_img_path, g_label = self.gesture_images[idx]

        # You can pick either image — we’ll use the drowsy one
        img = Image.open(d_img_path).convert("L")  # Grayscale

        if self.transform:
            img = self.transform(img)

        return img, d_label, g_label

# Transformation Pipeline

In [38]:
transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=1),
    transforms.Resize((128, 128)),
    transforms.ToTensor(),  # Converts to [1, H, W] for grayscale
])

# transforms.Grayscale(num_output_channels=1),  # Ensure it's 1 channel
#     transforms.Resize((224, 224)),
#     transforms.RandomHorizontalFlip(),
#     transforms.RandomRotation(degrees=15),
#     transforms.ToTensor(),  # Convert to Tensor (required for normalization)
#     transforms.Normalize(mean=[0.5], std=[0.5])

# Modified MobileNetV2 for Grayscale and Multi-task

In [39]:
class MultiTaskMobileNet(nn.Module):
    def __init__(self, num_drowsy_classes, num_gesture_classes):
        super().__init__()
        base_model = mobilenet_v2(pretrained=True)

        # Modify the first conv layer to accept 1 channel (grayscale)
        first_conv = base_model.features[0][0]
        new_conv = nn.Conv2d(1, first_conv.out_channels,
                             kernel_size=first_conv.kernel_size,
                             stride=first_conv.stride,
                             padding=first_conv.padding,
                             bias=False)
        # Copy weights by averaging over RGB channels
        with torch.no_grad():
            new_conv.weight[:] = first_conv.weight.mean(dim=1, keepdim=True)
        base_model.features[0][0] = new_conv

        self.features = base_model.features
        self.pool = nn.AdaptiveAvgPool2d((1, 1))
        self.dropout = nn.Dropout(0.2)

        in_features = base_model.last_channel
        self.drowsy_head = nn.Linear(in_features, num_drowsy_classes)
        self.gesture_head = nn.Linear(in_features, num_gesture_classes)

    def forward(self, x):
        x = self.features(x)
        x = self.pool(x).squeeze(-1).squeeze(-1)
        x = self.dropout(x)

        d_out = self.drowsy_head(x)
        g_out = self.gesture_head(x)
        return d_out, g_out

# Train/Test Split and DataLoader

In [40]:
drowsy_path = "drowsiness_data"
gesture_path = "gesture_data"

In [41]:
# Load full dataset
full_dataset = MultiTaskDataset(drowsy_path, gesture_path, transform=transform)


train_size = int(0.8 * len(full_dataset))
test_size = len(full_dataset) - train_size
train_dataset, test_dataset = random_split(full_dataset, [train_size, test_size])

# DataLoaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

Paired 8000 drowsy and gesture images.


# Training Loop Setup

In [42]:
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cuda")
print(device)

cuda


In [43]:
model = MultiTaskMobileNet(num_drowsy_classes=2, num_gesture_classes=len(os.listdir(gesture_path)))
model.to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [44]:
def evaluate(model, data_loader, device):
    model.eval()
    correct_drowsy = 0
    correct_gesture = 0
    total = 0
    total_loss = 0

    with torch.no_grad():
        for imgs, d_labels, g_labels in data_loader:
            imgs = imgs.to(device)
            d_labels = d_labels.to(device)
            g_labels = g_labels.to(device)

            d_out, g_out = model(imgs)

            loss_d = F.cross_entropy(d_out, d_labels)
            loss_g = F.cross_entropy(g_out, g_labels)
            loss = loss_d + loss_g
            total_loss += loss.item()

            _, d_pred = d_out.max(1)
            _, g_pred = g_out.max(1)

            correct_drowsy += (d_pred == d_labels).sum().item()
            correct_gesture += (g_pred == g_labels).sum().item()
            total += d_labels.size(0)

    val_d_acc = 100 * correct_drowsy / total
    val_g_acc = 100 * correct_gesture / total
    val_loss = total_loss / len(data_loader)

    return val_loss, val_d_acc, val_g_acc

In [45]:
num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    correct_drowsy = 0
    correct_gesture = 0
    total_samples = 0

    for imgs, d_labels, g_labels in train_loader:
        imgs = imgs.to(device)
        d_labels = d_labels.to(device)
        g_labels = g_labels.to(device)

        optimizer.zero_grad()
        d_out, g_out = model(imgs)

        loss_d = F.cross_entropy(d_out, d_labels)
        loss_g = F.cross_entropy(g_out, g_labels)
        loss = loss_d + loss_g

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        _, d_pred = torch.max(d_out, 1)
        _, g_pred = torch.max(g_out, 1)

        correct_drowsy += (d_pred == d_labels).sum().item()
        correct_gesture += (g_pred == g_labels).sum().item()
        total_samples += d_labels.size(0)

    train_d_acc = 100.0 * correct_drowsy / total_samples
    train_g_acc = 100.0 * correct_gesture / total_samples

    # 🔍 Evaluation (validation)
    val_loss, val_d_acc, val_g_acc = evaluate(model, test_loader, device)

    print(f"Epoch [{epoch+1}/{num_epochs}] "
          f"Train Loss: {total_loss:.4f} | Train Drowsy Acc: {train_d_acc:.2f}% | Train Gesture Acc: {train_g_acc:.2f}%")
    print(f"           ➤ Validation Loss: {val_loss:.4f} | Val Drowsy Acc: {val_d_acc:.2f}% | Val Gesture Acc: {val_g_acc:.2f}%\n")

Epoch [1/20] Train Loss: 104.9863 | Train Drowsy Acc: 88.47% | Train Gesture Acc: 88.45%
           ➤ Validation Loss: 0.2568 | Val Drowsy Acc: 95.06% | Val Gesture Acc: 95.19%
Epoch [2/20] Train Loss: 35.3711 | Train Drowsy Acc: 96.80% | Train Gesture Acc: 96.88%
           ➤ Validation Loss: 0.1679 | Val Drowsy Acc: 96.88% | Val Gesture Acc: 96.62%
Epoch [3/20] Train Loss: 17.2151 | Train Drowsy Acc: 98.64% | Train Gesture Acc: 98.55%
           ➤ Validation Loss: 0.1539 | Val Drowsy Acc: 97.12% | Val Gesture Acc: 97.06%
Epoch [4/20] Train Loss: 12.3648 | Train Drowsy Acc: 99.03% | Train Gesture Acc: 99.02%
           ➤ Validation Loss: 0.2044 | Val Drowsy Acc: 97.38% | Val Gesture Acc: 97.31%
Epoch [5/20] Train Loss: 10.7943 | Train Drowsy Acc: 99.05% | Train Gesture Acc: 98.98%
           ➤ Validation Loss: 0.2256 | Val Drowsy Acc: 97.00% | Val Gesture Acc: 97.00%
Epoch [6/20] Train Loss: 3.7705 | Train Drowsy Acc: 99.75% | Train Gesture Acc: 99.78%
           ➤ Validation Loss: 0.

In [46]:
torch.save(model.state_dict(), "models/model1_W.pth")

In [47]:
torch.save(model, "models/model1_full.pth")