# Extracting the dataset from the GCP Bucket

In [None]:
import os
import zipfile
from google.cloud import storage

# Download and extract dataset from GCS
def download_and_extract_gcs(bucket_name, blob_path, destination_folder):
    """Download a zip archive from Google Cloud Storage and unpack it locally.

    Args:
        bucket_name: Name of the GCS bucket that holds the archive.
        blob_path: Path of the zip blob inside the bucket.
        destination_folder: Local directory to extract into; created if missing.

    The archive is staged as ``dataset.zip`` inside ``destination_folder`` and
    removed afterwards, including when the download or the extraction raises.
    Any exception from the download or from ``zipfile`` is propagated.
    """
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(blob_path)

    zip_path = os.path.join(destination_folder, "dataset.zip")
    os.makedirs(destination_folder, exist_ok=True)

    try:
        blob.download_to_filename(zip_path)
        print(f"Downloaded {blob_path} to {zip_path}.")

        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(destination_folder)
        print(f"Extracted dataset to {destination_folder}.")
    finally:
        # Clean up the staged archive even on failure so a partial or corrupt
        # dataset.zip is never left sitting next to the extracted data.
        if os.path.exists(zip_path):
            os.remove(zip_path)
            print(f"Removed temporary zip file: {zip_path}.")

# Placeholder GCS location of the dataset archive -- replace both values with
# your actual bucket name and blob path before running.
bucket_name = "Data"
blob_path = "eval/downloaded_file.zip"
# Relative to the current working directory. Later cells read from
# "/content/dataset", so this presumably assumes the notebook runs with
# /content as the working directory -- TODO confirm.
destination_folder = "./dataset"

# Download the archive, extract it into destination_folder, and delete the zip
download_and_extract_gcs(bucket_name, blob_path, destination_folder)

Downloaded eval/downloaded_file.zip to ./dataset/dataset.zip.
Extracted dataset to ./dataset.
Removed temporary zip file: ./dataset/dataset.zip.


## Exploratory Data Analysis

In [None]:
import os

def inspect_dataset_structure(data_dir):
    """Print how many images each class folder under ``data_dir`` contains.

    Every entry of ``data_dir`` is listed as a "class"; only entries that are
    directories are inspected further. For a class folder with subfolders, the
    image count of each subfolder is printed. For a class folder without
    subfolders, the images lying directly in it are counted instead.

    An image is any entry whose name ends in ``.jpg`` or ``.png``
    (case-sensitive). Nothing is returned; all results go to stdout.
    """
    def list_images(folder):
        # Names only, filtered by extension.
        return [entry for entry in os.listdir(folder) if entry.endswith(('.jpg', '.png'))]

    classes = os.listdir(data_dir)
    print(f"Classes found in {data_dir}: {classes}")

    for cls in classes:
        class_dir = os.path.join(data_dir, cls)
        if not os.path.isdir(class_dir):
            # Plain files at the top level are listed above but not inspected.
            continue

        print(f"\nInspecting class folder: {cls}")

        subfolder_paths = []
        for entry in os.listdir(class_dir):
            candidate = os.path.join(class_dir, entry)
            if os.path.isdir(candidate):
                subfolder_paths.append(candidate)

        if subfolder_paths:
            # One line per subfolder with its image count.
            for subfolder_path in subfolder_paths:
                image_files = list_images(subfolder_path)
                print(f"  Found {len(image_files)} images in subfolder {subfolder_path}.")
            continue

        # No subfolders: the images may sit directly inside the class folder.
        print(f"  No subfolders found. Checking for image files directly in the {cls} folder...")
        image_files = list_images(class_dir)
        if image_files:
            print(f"  Found {len(image_files)} images directly in the {cls} folder.")
        else:
            print(f"  No images found directly in {cls} folder.")

# Root folder to inspect (adjust if needed). Every immediate entry of this
# folder is reported as a "class" by inspect_dataset_structure.
data_dir = "/content/dataset/Image data"

# Print the per-class / per-subfolder image counts
inspect_dataset_structure(data_dir)


Classes found in /content/dataset/Image data: ['test', 'Yawning', 'train', 'Microsleep', 'val']

Inspecting class folder: test
  No subfolders found. Checking for image files directly in the test folder...
  No images found directly in test folder.

Inspecting class folder: Yawning
  Found 990 images in subfolder /content/dataset/Image data/Yawning/P1042773_720.
  Found 1320 images in subfolder /content/dataset/Image data/Yawning/P1042748_720.
  Found 865 images in subfolder /content/dataset/Image data/Yawning/P1042750_720.
  Found 1000 images in subfolder /content/dataset/Image data/Yawning/P1042798_720.
  Found 1490 images in subfolder /content/dataset/Image data/Yawning/P1042778_720.
  Found 1027 images in subfolder /content/dataset/Image data/Yawning/P1042780_720.
  Found 730 images in subfolder /content/dataset/Image data/Yawning/P1043062_720.
  Found 815 images in subfolder /content/dataset/Image data/Yawning/P1042771_720.
  Found 778 images in subfolder /content/dataset/Image da

## Dataset Balancing

In [None]:
import os
import shutil
import random
from sklearn.model_selection import train_test_split

def balance_and_split_dataset(yawning_dir, microsleep_dir, output_dir, val_test_split=0.2, seed=42):
    """Build a class-balanced train/val/test copy of the Yawning and Microsleep images.

    Args:
        yawning_dir: Directory whose immediate subfolders hold the Yawning images.
        microsleep_dir: Directory whose immediate subfolders hold the Microsleep images.
        output_dir: Root under which ``train/``, ``val/`` and ``test/`` are created,
            each containing one subfolder per class label.
        val_test_split: Fraction of the balanced data given to validation and,
            separately, to test (0.2 gives a 60/20/20 split).
        seed: Seed for the down-sampling and for both splits, so re-running the
            cell selects and places the same files.

    Raises:
        ValueError: If either class directory yields no ``.jpg``/``.png`` images.

    Only ``.jpg``/``.png`` files exactly one level below each class directory
    are used. Source files are copied, never moved or modified.
    """
    train_dir = os.path.join(output_dir, "train")
    val_dir = os.path.join(output_dir, "val")
    test_dir = os.path.join(output_dir, "test")
    for split_dir in (train_dir, val_dir, test_dir):
        os.makedirs(split_dir, exist_ok=True)

    def collect_images(class_dir):
        # Sorted listings keep the seeded sampling reproducible regardless of
        # the order os.listdir happens to return.
        image_paths = []
        for subfolder in sorted(os.listdir(class_dir)):
            subfolder_path = os.path.join(class_dir, subfolder)
            if not os.path.isdir(subfolder_path):
                continue
            for img in sorted(os.listdir(subfolder_path)):
                if img.endswith(('.jpg', '.png')):
                    image_paths.append(os.path.join(subfolder_path, img))
        return image_paths

    yawning_images = collect_images(yawning_dir)
    microsleep_images = collect_images(microsleep_dir)

    min_samples = min(len(yawning_images), len(microsleep_images))
    if min_samples == 0:
        raise ValueError(
            "Cannot balance the dataset: found "
            f"{len(yawning_images)} Yawning and {len(microsleep_images)} Microsleep images."
        )

    # Down-sample BOTH classes to the size of the smaller one; sampling only one
    # side leaves the data unbalanced whenever the other class is the larger.
    rng = random.Random(seed)
    yawning_images = rng.sample(yawning_images, min_samples)
    microsleep_images = rng.sample(microsleep_images, min_samples)

    all_images = yawning_images + microsleep_images
    all_labels = ['Yawning'] * len(yawning_images) + ['Microsleep'] * len(microsleep_images)

    # First carve off val+test together, then halve that remainder.
    # NOTE(review): the split is per image, so images from the same source
    # subfolder can land in different splits -- confirm this is acceptable
    # for the evaluation you intend.
    X_train, X_temp, y_train, y_temp = train_test_split(
        all_images, all_labels, test_size=val_test_split * 2, random_state=seed
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=seed
    )

    def copy_images(image_paths, labels, target_dir):
        for img_path, label in zip(image_paths, labels):
            class_dir = os.path.join(target_dir, label)
            os.makedirs(class_dir, exist_ok=True)
            # Files in different source subfolders can share a basename; prefix
            # the subfolder name so one copy cannot overwrite another.
            source_folder = os.path.basename(os.path.dirname(img_path))
            unique_name = f"{source_folder}_{os.path.basename(img_path)}"
            shutil.copy(img_path, os.path.join(class_dir, unique_name))

    copy_images(X_train, y_train, train_dir)
    copy_images(X_val, y_val, val_dir)
    copy_images(X_test, y_test, test_dir)

    print(f"Balanced and split dataset into {train_dir}, {val_dir}, and {test_dir}")

# Source folders for the two classes, and the root folder that will receive
# the balanced train/val/test copy
yawning_dir = "/content/dataset/Image data/Yawning"
microsleep_dir = "/content/dataset/Image data/Microsleep"
output_dir = "/content/dataset/balanced_data"

# Balance the classes and copy the images into train/val/test under output_dir
# (uses the function's default val_test_split of 0.2)
balance_and_split_dataset(yawning_dir, microsleep_dir, output_dir)

Balanced and split dataset into /content/dataset/balanced_data/train, /content/dataset/balanced_data/val, and /content/dataset/balanced_data/test


## Data Augmentation

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

def load_data_with_augmentation(train_dir, val_dir, test_dir, img_size=(224, 224), batch_size=32):
    """Create the train/val/test directory iterators.

    Args:
        train_dir: Directory of training images, one subfolder per class.
        val_dir: Directory of validation images, one subfolder per class.
        test_dir: Directory of test images, one subfolder per class.
        img_size: ``target_size`` passed to every ``flow_from_directory`` call.
        batch_size: Batch size passed to every ``flow_from_directory`` call.

    Returns:
        A ``(train, val, test)`` tuple of the objects returned by
        ``flow_from_directory``. The training one applies random augmentation
        and shuffles; the validation and test ones only rescale pixel values
        by 1/255 and do not shuffle.
    """
    # Options that are the same for all three splits.
    flow_options = dict(
        target_size=img_size,
        batch_size=batch_size,
        class_mode='categorical',
    )

    # Training images: rescale plus random geometric augmentation.
    augmenting_datagen = ImageDataGenerator(
        rescale=1.0/255.0,
        rotation_range=20,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest',
    )

    # Validation and test images: rescale only.
    plain_datagen = ImageDataGenerator(rescale=1.0/255.0)

    train_flow = augmenting_datagen.flow_from_directory(
        directory=train_dir, shuffle=True, **flow_options
    )
    val_flow = plain_datagen.flow_from_directory(
        directory=val_dir, shuffle=False, **flow_options
    )
    test_flow = plain_datagen.flow_from_directory(
        directory=test_dir, shuffle=False, **flow_options
    )
    return train_flow, val_flow, test_flow

# Split directories of the balanced dataset (the output of the balancing step)
train_dir = "/content/dataset/balanced_data/train"
val_dir = "/content/dataset/balanced_data/val"
test_dir = "/content/dataset/balanced_data/test"

# Build the three generators with the function defaults (224x224 images, batch size 32)
train_gen, val_gen, test_gen = load_data_with_augmentation(train_dir, val_dir, test_dir)

# Show the class-name -> class-index mapping reported by the training generator
print("Class Names:", train_gen.class_indices)

Found 5939 images belonging to 2 classes.
Found 4674 images belonging to 2 classes.
Found 4733 images belonging to 2 classes.
Class Names: {'Microsleep': 0, 'Yawning': 1}


## Model Building

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam

def build_model(input_shape=(224, 224, 3), num_classes=2):
    """Build and compile the CNN classifier.

    Architecture: four Conv2D(3x3, relu) -> BatchNormalization -> MaxPooling2D(2x2)
    -> Dropout blocks with 32, 64, 128 and 256 filters, followed by global
    average pooling, a 256-unit dense layer with dropout, and a softmax output.

    Args:
        input_shape: Shape of one input image as (height, width, channels).
        num_classes: Number of output units of the softmax layer.

    Returns:
        The compiled model (Adam with learning rate 0.001, categorical
        cross-entropy loss, accuracy metric).
    """
    # Imported here because the notebook's layer import line does not include Input.
    from tensorflow.keras.layers import Input

    model = Sequential([
        # Declare the input with an explicit Input layer rather than passing
        # input_shape to the first Conv2D, which newer Keras versions warn about.
        Input(shape=input_shape),

        # First convolutional block
        Conv2D(32, (3, 3), activation='relu'),
        BatchNormalization(),
        MaxPooling2D(pool_size=(2, 2)),
        Dropout(0.3),

        # Second convolutional block
        Conv2D(64, (3, 3), activation='relu'),
        BatchNormalization(),
        MaxPooling2D(pool_size=(2, 2)),
        Dropout(0.3),

        # Third convolutional block
        Conv2D(128, (3, 3), activation='relu'),
        BatchNormalization(),
        MaxPooling2D(pool_size=(2, 2)),
        Dropout(0.4),

        # Fourth convolutional block
        Conv2D(256, (3, 3), activation='relu'),
        BatchNormalization(),
        MaxPooling2D(pool_size=(2, 2)),
        Dropout(0.4),

        # Collapse each feature map to a single value
        GlobalAveragePooling2D(),

        # Fully connected head
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')  # One probability per class
    ])

    # Targets are expected one-hot encoded to match categorical_crossentropy
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    return model

# Build the model with one output unit per class found by the training
# generator, then print its layer-by-layer summary
model = build_model(input_shape=(224, 224, 3), num_classes=len(train_gen.class_indices))
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


## Model Training

In [None]:
# Training the model
# train_gen already yields batches (its batch size is fixed when the generator
# is created), so batch_size is intentionally not passed to fit().
history = model.fit(
    train_gen,  # Training data
    validation_data=val_gen,  # Validation data
    epochs=20,  # Number of epochs to train for
    verbose=1  # Print progress bar and details
)

# Save the trained model
model.save("balanced_model.h5")
print("Model saved as balanced_model.h5.")

Epoch 1/20
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 437ms/step - accuracy: 0.9055 - loss: 0.2497 - val_accuracy: 0.3579 - val_loss: 0.8749
Epoch 2/20
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 366ms/step - accuracy: 0.9601 - loss: 0.1319 - val_accuracy: 0.3415 - val_loss: 1.6420
Epoch 3/20
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 367ms/step - accuracy: 0.9666 - loss: 0.1021 - val_accuracy: 0.7037 - val_loss: 0.7583
Epoch 4/20
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 365ms/step - accuracy: 0.9770 - loss: 0.0683 - val_accuracy: 0.8611 - val_loss: 0.6117
Epoch 5/20
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 366ms/step - accuracy: 0.9811 - loss: 0.0522 - val_accuracy: 0.6635 - val_loss: 4.3192
Epoch 6/20
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 365ms/step - accuracy: 0.9817 - loss: 0.0588 - val_accuracy: 0.9450 - val_loss: 0.1683
Epoch 7/20



Model saved as balanced_model.h5.


## Model Evaluation

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Derive the class names from the training generator, ordered by class index,
# so the report labels cannot drift from the label encoding used for training
class_names = sorted(train_gen.class_indices, key=train_gen.class_indices.get)

# Evaluate the model on training data
def _collect_labels_and_predictions(model, data_gen):
    """Return aligned ``(y_true, y_pred)`` class-index arrays for ``data_gen``.

    Each batch is fetched once and its labels are taken from that same batch,
    so labels and predictions stay aligned even when the generator shuffles.
    Comparing ``model.predict(data_gen)`` against ``data_gen.classes`` is only
    valid for an unshuffled generator.
    """
    true_batches, pred_batches = [], []
    for batch_idx in range(len(data_gen)):
        batch = data_gen[batch_idx]
        batch_x, batch_y = batch[0], batch[1]

        batch_probs = np.asarray(model.predict_on_batch(batch_x))
        pred_batches.append(np.argmax(batch_probs, axis=1))

        batch_y = np.asarray(batch_y)
        # One-hot labels are reduced to class indices; 1-D labels are used as-is.
        if batch_y.ndim > 1:
            true_batches.append(np.argmax(batch_y, axis=1))
        else:
            true_batches.append(batch_y.astype(int))

    if not true_batches:
        return np.array([], dtype=int), np.array([], dtype=int)
    return np.concatenate(true_batches), np.concatenate(pred_batches)


def _evaluate_split(model, data_gen, class_names, title, split_name):
    """Evaluate ``model`` on ``data_gen`` and print/return the metrics.

    ``title`` and ``split_name`` are the capitalised and lower-case names of the
    split used in the printed messages (e.g. "Training" / "training").
    """
    loss, accuracy = model.evaluate(data_gen)
    print(f"{title} Accuracy: {accuracy:.2%}")
    print(f"{title} Loss: {loss:.4f}")

    y_true, y_pred = _collect_labels_and_predictions(model, data_gen)

    # Pin the label set so the report and matrix keep one row per class even
    # if a class happens to be absent from this split.
    label_ids = list(range(len(class_names)))

    report = classification_report(y_true, y_pred, labels=label_ids, target_names=class_names)
    print(f"{title} Classification Report:\n", report)

    cm = confusion_matrix(y_true, y_pred, labels=label_ids)
    print(f"{title} Confusion Matrix:\n", cm)

    misclassified_idx = np.where(y_true != y_pred)[0]
    print(f"Number of misclassified samples in {split_name}: {len(misclassified_idx)}")

    return report, cm, accuracy, loss, misclassified_idx


def evaluate_training_data(model, train_gen, class_names):
    """Evaluate the model on the training generator.

    Args:
        model: Compiled model providing ``evaluate`` and ``predict_on_batch``.
        train_gen: Batch generator supporting ``len()`` and integer indexing,
            where each batch is ``(inputs, labels, ...)``.
        class_names: Class names ordered by class index.

    Returns:
        ``(report, confusion_matrix, accuracy, loss, misclassified_idx)``.
        ``misclassified_idx`` holds positions in the order the samples were
        read from the generator. For a shuffling generator these are NOT
        indices into ``train_gen.classes`` / ``train_gen.filenames``.

    Note:
        If the generator applies random augmentation, the ``evaluate`` pass and
        the prediction pass see differently augmented images, so the reported
        accuracy and the classification report can differ slightly.
    """
    return _evaluate_split(model, train_gen, class_names, "Training", "training")

# Evaluate the model on the test data
def evaluate_model(model, test_gen, class_names):
    """Evaluate the model on the test generator.

    Args:
        model: Compiled model providing ``evaluate`` and ``predict_on_batch``.
        test_gen: Batch generator supporting ``len()`` and integer indexing,
            where each batch is ``(inputs, labels, ...)``.
        class_names: Class names ordered by class index.

    Returns:
        ``(report, confusion_matrix, accuracy, loss, misclassified_idx)``.
        ``misclassified_idx`` holds positions in the order the samples were
        read from the generator.
    """
    return _evaluate_split(model, test_gen, class_names, "Test", "test")

# Run both evaluations; each call prints its metrics and returns
# (report, confusion matrix, accuracy, loss, misclassified indices)
train_report, train_cm, train_accuracy, train_loss, misclassified_train = evaluate_training_data(model, train_gen, class_names)
test_report, test_cm, test_accuracy, test_loss, misclassified_test = evaluate_model(model, test_gen, class_names)

[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 357ms/step - accuracy: 0.8431 - loss: 0.5065
Training Accuracy: 84.53%
Training Loss: 0.5230
[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 347ms/step
Training Classification Report:
               precision    recall  f1-score   support

  Microsleep       0.60      0.46      0.52      3510
     Yawning       0.42      0.56      0.48      2429

    accuracy                           0.50      5939
   macro avg       0.51      0.51      0.50      5939
weighted avg       0.53      0.50      0.51      5939

Training Confusion Matrix:
 [[1628 1882]
 [1070 1359]]
Number of misclassified samples in training: 2952
[1m  5/148[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m5s[0m 37ms/step - accuracy: 0.9845 - loss: 0.0666

  self._warn_if_super_not_called()


[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 53ms/step - accuracy: 0.9627 - loss: 0.0942
Test Accuracy: 97.00%
Test Loss: 0.0719
[1m148/148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 41ms/step
Test Classification Report:
               precision    recall  f1-score   support

  Microsleep       0.99      0.96      0.98      3119
     Yawning       0.93      0.98      0.96      1614

    accuracy                           0.97      4733
   macro avg       0.96      0.97      0.97      4733
weighted avg       0.97      0.97      0.97      4733

Test Confusion Matrix:
 [[3009  110]
 [  32 1582]]
Number of misclassified samples in test: 142
