In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, models, transforms
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import time
import copy
import cv2
import numpy as np
from playsound import playsound
import threading
from mtcnn import MTCNN
#from retinaface import RetinaFace



In [2]:
# Dataset root; it is expected to contain one sub-folder per split
# ('train' and 'test'), each laid out the way ImageFolder wants
# (one directory per class).
data_dir = r"C:\Users\phtac\OneDrive\Documents\Deep learning\prj2\data\data\processed"

# Per-channel mean / std applied after ToTensor(); these are the values
# conventionally paired with ImageNet-pretrained torchvision backbones.
norm_mean = [0.485, 0.456, 0.406]
norm_std = [0.229, 0.224, 0.225]

# Both splits are resized to 224x224 and normalized identically;
# only the training split gets random horizontal flips as augmentation.
data_transforms = {
    'train': transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(norm_mean, norm_std),
    ]),
    'test': transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(norm_mean, norm_std),
    ]),
}

# One dataset and one shuffled, batch-size-32 loader per split.
image_datasets = {}
dataloaders = {}
for split in ('train', 'test'):
    image_datasets[split] = datasets.ImageFolder(
        root=f'{data_dir}/{split}',
        transform=data_transforms[split],
    )
    dataloaders[split] = DataLoader(
        image_datasets[split],
        batch_size=32,
        shuffle=True,
    )

# Class names come from the training folder names (sorted by ImageFolder).
class_names = image_datasets['train'].classes

# Prefer the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print("Classes:", class_names)


Classes: ['eyeclose', 'normal', 'yawn']


In [3]:
# Start from a pretrained ResNet-18 and use it as a fixed feature extractor:
# every pretrained parameter is frozen so only the new head below is trained.
model = models.resnet18(weights='IMAGENET1K_V1')
model.requires_grad_(False)

# Swap the final fully-connected layer for a fresh one sized to our classes.
# The new layer is created after the freeze, so its parameters are trainable.
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, len(class_names))
model = model.to(device)

# Cross-entropy on the raw logits; Adam updates the classifier head only.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.fc.parameters(), lr=0.001)


In [10]:
# Train the classifier head for `num_epochs` epochs, evaluating after each one
# and keeping the weights that scored the highest accuracy on the 'test' split.
#
# NOTE(review): the 'test' split is used here to pick the best epoch, so the
# reported "Best acc" is an optimistic estimate; a separate validation split
# would be needed for an unbiased number.
# NOTE(review): model.train() also switches the frozen backbone's BatchNorm
# layers to training mode, so their running statistics keep updating even
# though their weights do not -- confirm this is intended.
num_epochs = 30
best_model_wts = copy.deepcopy(model.state_dict())
best_acc = 0.0

for epoch in range(num_epochs):
    print(f'Epoch {epoch+1}/{num_epochs}')
    for phase in ['train', 'test']:
        is_train = phase == 'train'
        # train(True) == train(), train(False) == eval()
        model.train(is_train)

        running_loss = 0.0
        running_corrects = 0

        for inputs, labels in dataloaders[phase]:
            inputs, labels = inputs.to(device), labels.to(device)

            # Autograd is only needed while training.
            with torch.set_grad_enabled(is_train):
                outputs = model(inputs)
                _, preds = torch.max(outputs, 1)
                loss = criterion(outputs, labels)
                if is_train:
                    # Gradients are only produced (and so only need clearing)
                    # in the training phase.
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

            # criterion returns the batch mean, so weight it by the batch size
            # to get a correct per-sample average at the end of the epoch.
            running_loss += loss.item() * inputs.size(0)
            # Accumulate as a plain int so epoch_acc / best_acc stay Python
            # floats instead of silently turning into 0-dim tensors.
            running_corrects += (preds == labels).sum().item()

        epoch_loss = running_loss / len(image_datasets[phase])
        epoch_acc = running_corrects / len(image_datasets[phase])

        print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

        if phase == 'test' and epoch_acc > best_acc:
            best_acc = epoch_acc
            best_model_wts = copy.deepcopy(model.state_dict())
            # Checkpoint immediately: if the run is interrupted part-way
            # through, the best weights so far are already on disk.
            torch.save(best_model_wts, 'yawn_resnet18.pth')

# Restore the best epoch and write it out (also covers the case where no
# epoch ever improved on the initial weights, so the file always exists).
model.load_state_dict(best_model_wts)
torch.save(model.state_dict(), 'yawn_resnet18.pth')
print(f'Best acc: {best_acc:.4f}')


Epoch 1/30
train Loss: 0.5533 Acc: 0.8231
test Loss: 0.3065 Acc: 0.9463
Epoch 2/30
train Loss: 0.2359 Acc: 0.9513
test Loss: 0.1865 Acc: 0.9691
Epoch 3/30
train Loss: 0.1718 Acc: 0.9637
test Loss: 0.1380 Acc: 0.9756
Epoch 4/30
train Loss: 0.1474 Acc: 0.9663
test Loss: 0.1313 Acc: 0.9739
Epoch 5/30
train Loss: 0.1262 Acc: 0.9699
test Loss: 0.0935 Acc: 0.9870
Epoch 6/30
train Loss: 0.1145 Acc: 0.9711
test Loss: 0.0845 Acc: 0.9821
Epoch 7/30
train Loss: 0.1083 Acc: 0.9690
test Loss: 0.0753 Acc: 0.9886
Epoch 8/30
train Loss: 0.0939 Acc: 0.9758
test Loss: 0.0643 Acc: 0.9902
Epoch 9/30
train Loss: 0.0925 Acc: 0.9749
test Loss: 0.0621 Acc: 0.9886
Epoch 10/30
train Loss: 0.0882 Acc: 0.9761
test Loss: 0.0574 Acc: 0.9870
Epoch 11/30
train Loss: 0.0802 Acc: 0.9787
test Loss: 0.0634 Acc: 0.9870
Epoch 12/30
train Loss: 0.0719 Acc: 0.9811
test Loss: 0.0521 Acc: 0.9902
Epoch 13/30
train Loss: 0.0681 Acc: 0.9811
test Loss: 0.0463 Acc: 0.9935
Epoch 14/30
train Loss: 0.0691 Acc: 0.9805
test Loss: 0.0466

In [16]:
# ---- Tunables ---------------------------------------------------------------
threshold = 0.7        # minimum softmax confidence for a prediction to count
smooth_frames = 5      # how many recent confidences are kept for averaging
Close_threshold = 5    # seconds of continuous "eyeclose" before the danger alarm

# ---- Model ------------------------------------------------------------------
# Reload the weights saved by the training cell onto whichever device is
# available, then put the network in inference mode.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.load_state_dict(torch.load('yawn_resnet18.pth', map_location=device))
model = model.to(device)
model.eval()

# ---- Face detector and preprocessing ----------------------------------------
detector = MTCNN()

# Same resize / normalize steps as the 'test' pipeline, without augmentation.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

# ---- Capture device and per-session state -----------------------------------
cap = cv2.VideoCapture(0)      # camera index 0

prev_label = "normal"          # last label accepted by the smoothing logic
conf_buffer = []               # recent confidences (at most smooth_frames kept)
yawn_count = 0                 # frames labelled "yawn" since the last reset
eyes_closed_start = None       # timestamp when the current eye closure began
alert_playing = False          # guards against re-triggering an alarm


def play_alert(file):
    """Play the sound at ``file`` without blocking the caller.

    The sound is handed to ``playsound`` on a daemon thread that is started
    immediately; this function returns as soon as the thread is launched.
    """
    worker = threading.Thread(target=playsound, args=(file,), daemon=True)
    worker.start()

# Real-time monitoring loop.
#
# For every webcam frame: detect faces, classify the most confident one,
# smooth the prediction, raise an alarm on repeated yawns or prolonged eye
# closure, and draw the result. Press 'q' in the preview window to stop.
#
# The whole loop sits in try/finally so the camera and the preview window are
# released even if a frame raises or the kernel is interrupted.
to_pil = transforms.ToPILImage()  # loop-invariant, build it once

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # OpenCV delivers BGR; the detector and the classifier are fed RGB.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = detector.detect_faces(rgb_frame)

        face_roi = None
        if results:
            # Keep only the detection the detector is most sure about.
            face = max(results, key=lambda r: r['confidence'])
            x, y, w, h = face['box']
            # Clamp the top-left corner in case the box starts off-frame.
            x, y = max(0, x), max(0, y)
            face_roi = rgb_frame[y:y+h, x:x+w]

        # A degenerate box yields an empty crop, which would crash the
        # resize/normalize pipeline; treat it the same as "no face".
        if face_roi is not None and face_roi.size > 0:
            img_pil = to_pil(face_roi)
            img_tensor = transform(img_pil).unsqueeze(0).to(device)

            with torch.no_grad():
                outputs = model(img_tensor)
                probs = torch.nn.functional.softmax(outputs, dim=1)[0]
                conf, pred_class = torch.max(probs, dim=0)

            conf_value = conf.item()
            label = class_names[pred_class.item()]

            # Smoothing: low-confidence frames keep the previous label;
            # confident frames go into a sliding window of size smooth_frames.
            # (This block used to be pasted twice, which pushed every
            # confidence into the buffer twice and halved the window.)
            # NOTE(review): only values >= threshold ever enter the buffer,
            # so the mean is > threshold unless every entry equals threshold
            # exactly -- the averaging is nearly a no-op.
            if conf_value >= threshold:
                conf_buffer.append(conf_value)
                if len(conf_buffer) > smooth_frames:
                    conf_buffer.pop(0)
                avg_conf = np.mean(conf_buffer)
                if avg_conf > threshold:
                    prev_label = label

            # Monotonic clock: elapsed-time checks must not jump if the
            # system wall clock is adjusted.
            curr_time = time.monotonic()
            if prev_label == "yawn":
                # A yawn interrupts any eye closure; drop the stale timer so
                # the next closure is measured from its own start.
                eyes_closed_start = None
                yawn_count += 1
                if yawn_count >= 10 and not alert_playing:
                    play_alert('Soft Alarm Sound Effect.mp3')
                    yawn_count = 0

            elif prev_label == "eyeclose":
                if eyes_closed_start is None:
                    eyes_closed_start = curr_time
                elif curr_time - eyes_closed_start >= Close_threshold and not alert_playing:
                    play_alert('Danger Alarm Sound Effect.mp3')
                    # Latch the flag so the alarm fires once per closure
                    # instead of spawning a new playback thread every frame;
                    # it is cleared when the label returns to normal.
                    alert_playing = True
            else:
                eyes_closed_start = None
                alert_playing = False
                yawn_count = 0

            cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)
            text = f"{prev_label} ({conf_value:.2f})"
            cv2.putText(frame, text, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX,
                        0.9, (0, 255, 0), 2, cv2.LINE_AA)

        else:
            cv2.putText(frame, "No face detected", (20, 50),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2)

        cv2.imshow("Yawning Detection (MTCNN)", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()