# Getting Dataset Information


Dataset Link : https://www.kaggle.com/datasets/ismailnasri20/driver-drowsiness-dataset-ddd/data


In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2
import os
import sys
from alive_progress import alive_bar
import time

import sklearn
from sklearn.model_selection import train_test_split

In [37]:
# Helper function to consolidate image paths
def consolidate_image_paths(input_path : str, subfolder_name: str) -> list[str]:
    """
    Lists every entry of `input_path/subfolder_name` as a joined path.

    Args:
        input_path: Base directory that contains the subfolder.
        subfolder_name: Name of the subfolder whose entries are listed.

    Returns:
        One path per directory entry, ordered by entry name.

    Raises:
        FileNotFoundError: If the subfolder does not exist (raised by os.listdir).
    """
    folder = os.path.join(input_path, subfolder_name)

    # os.listdir returns entries in arbitrary order; sorting keeps the path list
    # (and any seeded split or shuffle built on top of it) reproducible.
    return [os.path.join(folder, entry) for entry in sorted(os.listdir(folder))]

# Helper function to map image paths to labels
def map_image_paths_to_labels(image_paths: list[str], label: int) -> dict:
    """
    Builds a lookup that assigns the same label to every given image path.

    Args:
        image_paths: Paths to use as dictionary keys.
        label: Value stored for each path.

    Returns:
        A dict with one key per distinct path, all mapped to `label`.
    """
    return dict.fromkeys(image_paths, label)

In [38]:
# Start of Dataset 2 ============================================================================================================
base_path_2 = "./Datasets/Dataset_2/"

# Collect the relative image paths of each class and tag them: 1 = drowsy, 0 = non-drowsy
drowsy_paths = consolidate_image_paths(base_path_2, "Drowsy")
drowsy_labels = map_image_paths_to_labels(drowsy_paths, 1)

non_drowsy_paths = consolidate_image_paths(base_path_2, "Non Drowsy")
non_drowsy_labels = map_image_paths_to_labels(non_drowsy_paths, 0)

# Single path -> label lookup covering both classes
all_labels = dict(drowsy_labels)
all_labels.update(non_drowsy_labels)

# Report the dataset size and the class gap (drowsy count minus non-drowsy count)
print(
    f"Total Number of Images: {len(all_labels)}",
    f"Difference between Drowsy and Non-Drowsy: {len(drowsy_labels) - len(non_drowsy_labels)}",
    sep="\n",
)

# TODO : Add some visuals in seaborn to show the distribution of the labels
# End of Dataset 2 ============================================================================================================


Total Number of Images: 41793
Difference between Drowsy and Non-Drowsy: 2903


### Insights from Dataset 2

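- There are `2903` more drowsy images than non-drowsy images (22,348 drowsy vs. 19,445 non-drowsy out of 41,793).
- The dataset is therefore mildly imbalanced (roughly 53% drowsy / 47% non-drowsy).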
- To balance the dataset, we can consider several techniques:
    - Oversampling
    - Undersampling
    - Data Augmentation

# Data Preprocessing

### `Steps`:
1. Image Resizing
2. Data Splitting
3. Reshuffling
4. Undersampling (Majority Class)
5. Data Augmentation (for training data)
6. Data Normalization

### Preprocessing Steps Methodology
1. **Image Resizing**:
    - The images should be resized first to ensure all images are of the same dimensions.
    - The images are resized to `224x224` pixels.

2. **Data Splitting**:
    - Dataset should be split before any form of augmentation or sampling to ensure that the model is evaluated on unseen data.
    - Otherwise, augmented copies of a training image could end up in the validation or testing sets, leaking training data into the evaluation.
    - The dataset is split into `70%` training, `15%` validation and `15%` testing sets.

3. **Reshuffling**:
    - The dataset is reshuffled to ensure that the data is not ordered in any way.
    - This helps to prevent the model from learning any patterns in the data that may not be present in real-world scenarios.

4. **Undersampling**:
    - The majority class is undersampled to the number of images in the minority class.

5. **Data Augmentation**:
    - Data Augmentation is applied to the training set only to increase the variability of the training data. 
    - This helps to prevent overfitting and helps the model generalise to real-world scenarios.
    - Possible augmentations are:
        - Rotation
        - Horizontal Flip
        - Vertical Flip
        - Increasing the brightness

6. **Data Normalization**:
    - The pixel values are normalized to the range `[0, 1]` by dividing by `255`.
    

In [39]:
# Image Resizing. Default resized dim : 224x224
def resize_image(image_path : str, size : tuple[int, int] = (224, 224)) -> np.ndarray:
    """
    Load the image stored at `image_path` and scale it to `size`.

    Args:
        image_path: Location of the image file to read.
        size: Target size handed to cv2.resize; defaults to (224, 224).

    Returns:
        The resized image.

    Raises:
        ValueError: If cv2.imread returns None for `image_path`.
    """
    loaded = cv2.imread(image_path)
    if loaded is not None:
        return cv2.resize(loaded, size)

    # cv2.imread reports a failed read by returning None rather than raising
    raise ValueError(f"Error reading image from path: {image_path}")

# Function to save the resized image
def save_resized_image(image: np.ndarray, output_path: str) -> None:
    """
    Saves the resized image to the specified output path.

    Args:
        image: Image array to write.
        output_path: Destination file path; a missing parent directory is created.

    Raises:
        ValueError: If cv2.imwrite reports that the image was not written.
    """
    # A bare file name has no directory part and os.makedirs("") raises,
    # so only create the parent directory when there is one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # cv2.imwrite returns False on failure instead of raising; surface that with
    # the same exception type resize_image uses for unreadable images.
    if not cv2.imwrite(output_path, image):
        raise ValueError(f"Error writing image to path: {output_path}")

# Data Splitting
def split_data(X, y, val_size: float = 0.15, test_size: float = 0.15, random_state: int = 42, stratify: bool = False):
    """
    Splits samples and labels into training, validation and test sets.

    X and y are passed straight to sklearn's train_test_split, so they must be
    equal-length indexable collections (lists, numpy arrays, pandas objects).
    A path -> label dict has to be unpacked first, e.g.
    `split_data(list(d.keys()), list(d.values()))`.

    Args:
        X: Samples (e.g. image paths).
        y: Labels aligned with X.
        val_size: Fraction of the full data used for validation.
        test_size: Fraction of the full data used for testing.
        random_state: Seed used for both underlying splits.
        stratify: When True, both splits are stratified on the labels so the
            class proportions are kept in every subset. Defaults to False,
            which reproduces the plain random split.

    Returns:
        X_train, X_val, X_test, y_train, y_val, y_test

    Raises:
        ValueError: If the fractions are not positive or leave no training data.
    """
    holdout_size = val_size + test_size
    if val_size <= 0 or test_size <= 0 or holdout_size >= 1:
        raise ValueError(
            f"val_size and test_size must be positive and sum to less than 1, got {val_size} and {test_size}"
        )

    # First split: carve off the combined validation + test portion
    # (0.3 with the defaults).
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y,
        test_size=holdout_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )

    # Second split: divide the held-out portion between validation and test
    # (0.5 with the defaults, i.e. 15% / 15% of the full data).
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp,
        test_size=test_size / holdout_size,
        random_state=random_state,
        stratify=y_temp if stratify else None,
    )
    return X_train, X_val, X_test, y_train, y_val, y_test

# Shuffle Image Paths
def shuffle_paths(paths_dict : list[tuple]) -> list[tuple]:
    """
    Shuffles a list of tuples in place and returns that same list.

    Args:
        paths_dict: Mutable list of tuples to reorder (presumably (path, label)
            pairs; despite the name it must be a list, not a dict).

    Returns:
        The same list object, now in random order. The order depends on
        numpy's global random state; seed it beforehand for reproducibility.
    """
    # np.random.shuffle reorders its argument in place and returns None, so the
    # list itself (not the result of the call) has to be returned.
    np.random.shuffle(paths_dict)
    return paths_dict

In [40]:
# Image Resizing
output_folder_path = os.path.join(base_path_2, "Resized_Images")

# NOTE(review): every image is written into this one flat folder under its file name only,
# so the class subfolder is dropped; if a Drowsy and a Non Drowsy image ever share a file
# name, the later one overwrites the earlier one — confirm the names are unique.

# Paths that could not be processed, kept so the failures can be inspected afterwards
skipped_images = []

# Loop through all the images and resize them
with alive_bar(len(all_labels), title="Resizing Images", length=50, force_tty=True) as bar:
    for image_path in all_labels:
        try:
            # Resize the image
            resized_image = resize_image(image_path, (224, 224))

            # Create output file path
            output_file_name = os.path.basename(image_path)
            output_path = os.path.join(output_folder_path, output_file_name)

            # Save the resized image
            save_resized_image(resized_image, output_path)

        except ValueError as e:
            skipped_images.append(image_path)
            print(f"Skipping image {image_path}: {e}")

        # Advance once per image, skipped ones included, so the bar reaches its declared total
        bar()
        sys.stdout.flush()  # Force flushing the output buffer

if skipped_images:
    print(f"Skipped {len(skipped_images)} of {len(all_labels)} images")

Resizing Images |██████████████████████████████████████████████████| 41793/41793|████████▎                                         | ▆▄▂ 6934/41|████████████▊                                     | ▅▇▇ 10657/4|█████████████▉                                    | ▁▃▅ 11556/4|███████████████▍                                  | ▂▄▆ 12818/4|████████████████████████                          | ▆▄▂ 20024/4|█████████████████████████                         | ▂▄▆ 20901/4|██████████████████████████▋                       | ▄▆█ 22218/4|███████████████████████████▋                      | ▇▅▃ 23100/4|███████████████████████████▉                      | ▇▅▃ 23283/4|████████████████████████████▍                     | ▃▅▇ 23705/4|█████████████████████████████                     | ▁▃▅ 24244/4|█████████████████████████████▉                    | ▅▇▇ 24955/4|██████████████████████████████▏                   | ▇▇▅ 25134/4|██████████████████████████████▏                   | ▂▂▄ 25210/4|███████████████████████

In [41]:
# Splitting the data
