# **Project Name** - DeepFER: Facial Emotion Recognition Using Deep Learning



### Import Libraries

In [1]:
import warnings
# Silence every warning for the rest of the session so cell output stays short.
# NOTE(review): this also hides deprecation notices from the libraries used
# below — confirm that is intended.
warnings.filterwarnings("ignore")

In [2]:
# Import Libraries
import os
import random
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Conv2D, MaxPooling2D, Flatten, Dense,
                                     Dropout, BatchNormalization)
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

ModuleNotFoundError: No module named 'matplotlib'

### Dataset Loading

In [None]:
# Dataset locations: the untouched images live under `original_images`,
# split into a `train` and a `validation` sub-folder.
base_dataset_path = 'original_images'

train_data_dir, validation_data_dir = (
    os.path.join(base_dataset_path, split) for split in ('train', 'validation')
)

# Echo both paths (one per line) so a wrong working directory is spotted early.
print(train_data_dir, validation_data_dir, sep='\n')

### Dataset First View

#### Viewing images from Training Images Folder

In [None]:
# Dataset First Look: show up to five random samples from every class folder
# of the training split.
train_path = 'original_images/train'
# Keep only real class folders; a stray file (e.g. .DS_Store) would make the
# nested os.listdir() call below raise NotADirectoryError.
train_classes = [
    entry for entry in os.listdir(train_path)
    if os.path.isdir(os.path.join(train_path, entry))
]

for class_name in train_classes:
    class_dir = os.path.join(train_path, class_name)
    images_files = os.listdir(class_dir)

    print(f"5 random Images in the {class_name} folder out of {len(images_files)} total files")

    # Never ask for more samples than the folder holds.
    random_images = random.sample(images_files, min(5, len(images_files)))

    plt.figure(figsize=(15, 3))
    for index, image_name in enumerate(random_images):
        img_path = os.path.join(class_dir, image_name)

        plt.subplot(1, 5, index + 1)
        # Image.open() keeps the file handle open; the context manager
        # releases it as soon as the pixels have been handed to matplotlib.
        with Image.open(img_path) as img:
            plt.imshow(img)
        plt.title(class_name)
        plt.axis('off')
    plt.tight_layout()
    plt.show()
    print("*" * 100)

#### Viewing images from Validation Images Folder

In [None]:
# Validation Dataset First Look: show up to five random samples from every
# class folder of the validation split.
val_path = 'original_images/validation'
# Keep only real class folders; a stray file (e.g. .DS_Store) would make the
# nested os.listdir() call below raise NotADirectoryError.
val_classes = [
    entry for entry in os.listdir(val_path)
    if os.path.isdir(os.path.join(val_path, entry))
]

for class_name in val_classes:
    class_dir = os.path.join(val_path, class_name)
    images_files = os.listdir(class_dir)

    print(f"5 random Images in the {class_name} folder out of {len(images_files)} total files")

    # Never ask for more samples than the folder holds.
    random_images = random.sample(images_files, min(5, len(images_files)))

    plt.figure(figsize=(15, 3))
    for index, image_name in enumerate(random_images):
        img_path = os.path.join(class_dir, image_name)

        plt.subplot(1, 5, index + 1)
        # Image.open() keeps the file handle open; the context manager
        # releases it as soon as the pixels have been handed to matplotlib.
        with Image.open(img_path) as img:
            plt.imshow(img)
        plt.title(class_name)
        plt.axis('off')
    plt.tight_layout()
    plt.show()
    print("*" * 100)

### Images Count

In [None]:
# Walk the two-level layout <main_dir>/<split>/<class>/ and print how many
# entries each class folder holds, plus a total per split folder.
main_dir = 'original_images'
for split_name in os.listdir(main_dir):
    split_dir = os.path.join(main_dir, split_name)
    class_folders = os.listdir(split_dir)
    print("-" * 90)
    print(f"Folder inside {split_name}: {class_folders}")
    print(f"Total Classes in the {split_name} folder is: {len(class_folders)}")

    per_class_counts = []
    for class_folder in class_folders:
        n_images = len(os.listdir(os.path.join(split_dir, class_folder)))
        per_class_counts.append(n_images)
        print(f"Count of images inside {class_folder} : {n_images}")
    print()
    print(f"Total images in {split_name} Folder : {sum(per_class_counts)}")
    print()

### Dataset Information

#### Duplicate Images

In [None]:
import os
import hashlib

def md5_hasher(file_path, chunk_size=1024 * 1024):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is fed to the hash in ``chunk_size``-byte pieces so that memory
    use stays bounded regardless of file size. The digest is identical to
    hashing the whole content at once.

    Args:
        file_path: Path of the file to fingerprint.
        chunk_size: Number of bytes read per iteration.

    Returns:
        The 32-character lowercase hex digest.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    hasher = hashlib.md5()
    with open(file_path, 'rb') as f:
        # iter() with a sentinel stops at the empty bytes object read at EOF.
        for chunk in iter(lambda: f.read(chunk_size), b''):
            hasher.update(chunk)
    return hasher.hexdigest()

# Function to find duplicate images
def find_duplicates(folder_path):
    """Find byte-identical files anywhere below ``folder_path``.

    Every file is fingerprinted with ``md5_hasher``. The first file seen with
    a given digest is treated as the original; each later file with the same
    digest is reported as its duplicate.

    Directories and files are visited in sorted order, so the choice of
    "original" is reproducible across runs and across copies of the same tree
    instead of depending on the filesystem's listing order.

    Args:
        folder_path: Root folder to scan recursively.

    Returns:
        A list of ``(duplicate_path, original_path)`` tuples.
    """
    hashes = {}
    duplicates = []

    for root, dirs, files in os.walk(folder_path):
        dirs.sort()  # in place, so os.walk descends in a stable order
        for file in sorted(files):
            file_path = os.path.join(root, file)
            try:
                file_hash = md5_hasher(file_path)
            except OSError as e:
                # Unreadable file: report it and keep scanning the rest.
                print(f"Error: {e}")
                continue

            if file_hash in hashes:
                duplicates.append((file_path, hashes[file_hash]))
            else:
                hashes[file_hash] = file_path

    return duplicates

# Scan the untouched dataset; `dups` holds (duplicate_path, original_path) pairs.
dups = find_duplicates('original_images')
print(f"Total duplicate files found: {len(dups)}")

In [None]:
# Randomly sample up to 5 duplicate pairs and show each duplicate next to the
# file it was matched against.
random_images = random.sample(dups, min(5, len(dups)))

for dup_path, original_path in random_images:
    try:
        # Context managers close both file handles once the pair is drawn.
        with Image.open(dup_path) as dup_img, Image.open(original_path) as orig_img:
            plt.figure(figsize=(4, 2))

            plt.subplot(1, 2, 1)
            plt.imshow(dup_img)
            plt.title("Duplicate")
            plt.axis('off')

            plt.subplot(1, 2, 2)
            plt.imshow(orig_img)
            plt.title("Original")
            plt.axis('off')

            plt.suptitle("Duplicate vs Original", fontsize=14)
            plt.tight_layout()
            plt.show()

    except Exception as e:
        # Best-effort preview: report the pair that failed and move on.
        print(f"Could not open images: {dup_path}, {original_path} → {e}")

In [None]:
import os
import hashlib

# Function to calculate the MD5 hash of a file
def md5_hasher(file_path, chunk_size=1024 * 1024):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is fed to the hash in ``chunk_size``-byte pieces so that memory
    use stays bounded regardless of file size. The digest is identical to
    hashing the whole content at once.

    Args:
        file_path: Path of the file to fingerprint.
        chunk_size: Number of bytes read per iteration.

    Returns:
        The 32-character lowercase hex digest.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    hasher = hashlib.md5()
    with open(file_path, 'rb') as f:
        # iter() with a sentinel stops at the empty bytes object read at EOF.
        for chunk in iter(lambda: f.read(chunk_size), b''):
            hasher.update(chunk)
    return hasher.hexdigest()

# Function to find duplicates across all folders and classes
def find_duplicates_with_classes(folder_path):
    """Find byte-identical files below ``folder_path`` and record their classes.

    Works like a plain duplicate scan (first file with a given MD5 digest is
    the original, later ones are duplicates) but also stores the name of the
    folder that directly contains each file, which this notebook uses as the
    class label.

    Directories and files are visited in sorted order, so the choice of
    "original" is reproducible instead of depending on the filesystem's
    listing order.

    Args:
        folder_path: Root folder to scan recursively.

    Returns:
        A list of dicts with the keys ``duplicate_path``, ``duplicate_class``,
        ``original_path`` and ``original_class``.
    """
    hashes = {}
    duplicates = []

    for root, dirs, files in os.walk(folder_path):
        dirs.sort()  # in place, so os.walk descends in a stable order
        for file in sorted(files):
            file_path = os.path.join(root, file)
            try:
                file_hash = md5_hasher(file_path)
            except OSError as e:
                # Unreadable file: report it and keep scanning the rest.
                print(f"Error processing {file_path}: {e}")
                continue

            # The immediate parent folder name serves as the class label.
            class_name = os.path.basename(os.path.dirname(file_path))

            if file_hash in hashes:
                first_seen = hashes[file_hash]
                duplicates.append({
                    'duplicate_path': file_path,
                    'duplicate_class': class_name,
                    'original_path': first_seen['path'],
                    'original_class': first_seen['class']
                })
            else:
                hashes[file_hash] = {'path': file_path, 'class': class_name}

    return duplicates


# Scan the original dataset, then list only those duplicate pairs whose two
# copies sit in different class folders (i.e. conflicting labels).
duplicates = find_duplicates_with_classes('original_images')

print(f"Total duplicate files found: {len(duplicates)}\n")

duplicates_different_classes = 0
for dup in duplicates:
    # Same class on both sides: a harmless repeat, nothing to report.
    if dup['duplicate_class'] == dup['original_class']:
        continue
    duplicates_different_classes += 1
    print(f"Duplicate Image : {dup['duplicate_path']} (Class: {dup['duplicate_class']})")
    print(f"Original Image  : {dup['original_path']} (Class: {dup['original_class']})")

print()
print("-" * 90)
print("Total Number of duplicates with different classes :", duplicates_different_classes)

#### Corrupt Images

In [None]:
# Missing Values/Null Values Count
from PIL import Image

def check_missing_or_corrupt_images(directory):
    """Return the paths below ``directory`` that Pillow cannot open or verify.

    Each file is opened with ``Image.open`` and checked with ``verify()``;
    any exception raised by either step marks the file as corrupt. The number
    of flagged files is printed before returning.

    Args:
        directory: Root folder to scan recursively.

    Returns:
        A list with the path of every file that failed the check.
    """
    corrupt_images = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            path = os.path.join(root, file)
            try:
                # The context manager closes the file handle after the check.
                with Image.open(path) as img:
                    img.verify()
            except Exception:
                # `Exception` rather than a bare `except`, so Ctrl-C
                # (KeyboardInterrupt) still stops a long scan.
                corrupt_images.append(path)

    print(f"Corrupt or Unreadable Images: {len(corrupt_images)}")
    return corrupt_images

# Check both train and validation sets; each call prints its own count and
# returns the list of offending paths.
corrupt_train = check_missing_or_corrupt_images(train_data_dir)
corrupt_val = check_missing_or_corrupt_images(validation_data_dir)


##### Removing the duplicate images from the preprocessed folder, which is a copy of the original images folder. This ensures that the initial set of images remains untouched.

In [None]:
import os
import hashlib

def md5_hasher(file_path, chunk_size=1024 * 1024):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is fed to the hash in ``chunk_size``-byte pieces so that memory
    use stays bounded regardless of file size. The digest is identical to
    hashing the whole content at once.

    Args:
        file_path: Path of the file to fingerprint.
        chunk_size: Number of bytes read per iteration.

    Returns:
        The 32-character lowercase hex digest.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    hasher = hashlib.md5()
    with open(file_path, 'rb') as f:
        # iter() with a sentinel stops at the empty bytes object read at EOF.
        for chunk in iter(lambda: f.read(chunk_size), b''):
            hasher.update(chunk)
    return hasher.hexdigest()

# Function to find duplicate images
def find_duplicates(folder_path):
    """Find byte-identical files anywhere below ``folder_path``.

    Every file is fingerprinted with ``md5_hasher``. The first file seen with
    a given digest is treated as the original; each later file with the same
    digest is reported as its duplicate.

    Directories and files are visited in sorted order, so the choice of
    "original" — and therefore which copy a later clean-up step removes — is
    reproducible instead of depending on the filesystem's listing order.

    Args:
        folder_path: Root folder to scan recursively.

    Returns:
        A list of ``(duplicate_path, original_path)`` tuples.
    """
    hashes = {}
    duplicates = []

    for root, dirs, files in os.walk(folder_path):
        dirs.sort()  # in place, so os.walk descends in a stable order
        for file in sorted(files):
            file_path = os.path.join(root, file)
            try:
                file_hash = md5_hasher(file_path)
            except OSError as e:
                # Unreadable file: report it and keep scanning the rest.
                print(f"Error: {e}")
                continue

            if file_hash in hashes:
                duplicates.append((file_path, hashes[file_hash]))
            else:
                hashes[file_hash] = file_path

    return duplicates

# Scan the working copy; `dups` now refers to files under 'preprocessed_images'.
dups = find_duplicates('preprocessed_images')
print(f"Total duplicate files found: {len(dups)}")

In [None]:
# ✅ Delete duplicates
# Each entry of `dups` is (duplicate_path, original_path); only the duplicate
# (index 0) is removed, the first-seen copy is kept.
for dup in dups:
    duplicate_file = dup[0]

    # Try the removal directly instead of checking os.path.exists() first:
    # the file could vanish between the check and the delete.
    try:
        os.remove(duplicate_file)
    except FileNotFoundError:
        print(f"File already deleted or not found: {duplicate_file}")
    except OSError as e:
        print(f"Error deleting {duplicate_file}: {e}")
    else:
        print(f"✅ Deleted: {duplicate_file}")

print("\n✅ All duplicate images deletion check completed!")


##### Comparison before and after deleting the images

In [None]:
import os
import pandas as pd

# Rows collected by count_images(): one dict per (folder, class, phase),
# plus one 'Total' row per folder and phase.
data = []


def count_images(main_dir, phase):
    """Append image counts for every class folder of ``main_dir`` to ``data``.

    Expects the layout ``main_dir/<folder>/<class>/<files>``. For each class
    a row with the number of directory entries is appended; after the classes
    of a folder, a row with ``Class == 'Total'`` holding their sum follows.

    Args:
        main_dir: Dataset root to inspect.
        phase: Label stored in the ``Phase`` column of every appended row.

    Returns:
        None. Results are appended to the module-level ``data`` list.
    """
    for split_name in os.listdir(main_dir):
        split_dir = os.path.join(main_dir, split_name)
        running_total = 0

        for class_name in os.listdir(split_dir):
            n_images = len(os.listdir(os.path.join(split_dir, class_name)))
            running_total += n_images

            data.append({
                'Main Folder': split_name,
                'Class': class_name,
                'Images Count': n_images,
                'Phase': phase
            })

        # ✅ Summary row for the whole folder
        data.append({
            'Main Folder': split_name,
            'Class': 'Total',
            'Images Count': running_total,
            'Phase': phase
        })

# Record counts for the untouched dataset ('Before') and for the
# de-duplicated working copy ('After').
count_images('original_images', 'Before')
count_images('preprocessed_images', 'After')

df = pd.DataFrame(data)

# One row per (folder, class); the two phases become side-by-side columns.
comparison_df = pd.pivot_table(
    df,
    values='Images Count',
    index=['Main Folder', 'Class'],
    columns='Phase',
    fill_value=0,
)
comparison_df = comparison_df.reset_index()
comparison_df



##### Unique file types found inside the folders

In [None]:
# checking the unique file formats present in these folders
def check_unique_file_formats(folder_path):
    """Print the file extensions encountered while walking ``folder_path``.

    One report line (followed by a dashed separator) is printed for every
    directory visited. The extension list is never reset between directories,
    so each line shows all extensions seen so far in the walk, in first-seen
    order, not only those of that directory.

    Args:
        folder_path: Root folder to walk recursively.

    Returns:
        None. Output goes to stdout.
    """
    seen_extensions = []
    for current_dir, _subdirs, filenames in os.walk(folder_path):
        for filename in filenames:
            _stem, suffix = os.path.splitext(filename)
            if suffix in seen_extensions:
                continue
            seen_extensions.append(suffix)
        print(f"Unique files in {current_dir}: {seen_extensions}")
        print("-" * 80)

# Report the file extensions present in the untouched dataset.
folder_path = 'original_images'
check_unique_file_formats(folder_path)

##### Unique image dimensions (in pixels) found inside every folder

In [None]:
# checking the unique image sizes
def check_image_size(folder_path):
    """Print the distinct image dimensions encountered under ``folder_path``.

    Every file is opened with Pillow and its ``size`` attribute is collected.
    One report line (followed by a dashed separator) is printed for every
    directory visited; the list is never reset, so each line shows all sizes
    seen so far in the walk, in first-seen order.

    Args:
        folder_path: Root folder to walk recursively.

    Returns:
        None. Output goes to stdout.
    """
    seen_sizes = []
    for current_dir, _subdirs, filenames in os.walk(folder_path):
        for filename in filenames:
            # The handle is closed as soon as the size has been read.
            with Image.open(os.path.join(current_dir, filename)) as img:
                dimensions = img.size
            if dimensions in seen_sizes:
                continue
            seen_sizes.append(dimensions)

        print(f"Unique image size in {current_dir}: {seen_sizes}")
        print("-" * 80)

# Report the image dimensions present in the untouched dataset.
folder_path = 'original_images'
check_image_size(folder_path)

##### Unique image file sizes found inside the different folders

In [None]:
# Set your directory path
base_path = "original_images"

# One record per file: name, folder relative to base_path, size in KB.
file_data = []

for root, dirs, files in os.walk(base_path):
    for file in files:
        file_path = os.path.join(root, file)
        try:
            size_kb = os.path.getsize(file_path) / 1024
        except OSError as e:
            # Unreadable entry: report it and continue with the next file.
            print(f"Error reading {file_path}: {e}")
            continue
        file_data.append({
            "file_name": file,
            "folder": os.path.relpath(root, base_path),
            "size_kb": round(size_kb, 2)
        })

# Create DataFrame. Naming the columns explicitly keeps the frame usable
# (and sortable) even when the scan found no files at all.
df = pd.DataFrame(file_data, columns=["file_name", "folder", "size_kb"])

# Sort by size, largest first, to surface anomalies
df_sorted = df.sort_values(by="size_kb", ascending=False)
df_sorted

In [None]:
# Summarise the `df` size table: the distinct (rounded) sizes in KB and the
# number of files it contains.
print("Unique File Sizes (KB):")
print(df["size_kb"].unique())
print("Total Files Scanned:", len(df))


##### Algorithm to detect if there are actual faces in the images or not

In [None]:
from facenet_pytorch import MTCNN
from PIL import Image
import os
from tqdm import tqdm

# ✅ Initialize MTCNN face detector
mtcnn = MTCNN(keep_all=True)
folder_path = 'original_images'
# Paths of images where no face box was returned, plus images that raised.
no_face_images = []

for root, dirs, files in os.walk(folder_path):
    for file in tqdm(files):
        # Only image files are inspected; anything else is skipped.
        if not file.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue
        file_path = os.path.join(root, file)

        try:
            # convert() returns a new image, so the source file handle can be
            # closed before detection runs.
            with Image.open(file_path) as raw_img:
                img = raw_img.convert('RGB')
            boxes, _ = mtcnn.detect(img)
            if boxes is None:
                no_face_images.append(file_path)

        except Exception as e:
            # Images that cannot be processed are flagged as well.
            print(f"⚠️ Error processing {file_path}: {e}")
            no_face_images.append(file_path)

if not no_face_images:
    print("✅ All images have faces.")
else:
    print(f"Found {len(no_face_images)} images with NO faces:")

In [None]:
# Save the flagged paths as a one-column CSV ('FilePath') so the list can be
# reloaded later without repeating the detection pass.
pd.DataFrame(no_face_images, columns=['FilePath']).to_csv('no_face_images.csv', index=False)

In [None]:
# Load the saved paths back.
# NOTE: this rebinds `no_face_images` from a list of paths to a DataFrame
# with a single 'FilePath' column.
no_face_images = pd.read_csv('no_face_images.csv')
no_face_images.head()

In [None]:
# Show up to 30 randomly chosen flagged images in a 6 x 5 grid.
no_face_images_list = no_face_images['FilePath'].tolist()
random30 = random.sample(no_face_images_list, min(30, len(no_face_images_list)))
plt.figure(figsize=(15, 10))
for index, file_path in enumerate(random30):
    plt.subplot(6, 5, index + 1)
    # Close each file handle once its pixels have been handed to matplotlib;
    # otherwise up to 30 files stay open.
    with Image.open(file_path) as img:
        plt.imshow(img)
    plt.axis("off")
plt.show()

##  Data Visualization, Storytelling & Experimenting with charts : Understand the relationships between variables

In [None]:
# Chart - 1 visualization code
def plot_class_distribution(directory, title):
    """Draw a bar chart with the number of entries in each sub-folder.

    Args:
        directory: Folder whose immediate sub-folders are the emotion classes.
        title: Text shown as the chart title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    class_folders = os.listdir(directory)
    images_per_class = []
    for folder in class_folders:
        folder_entries = os.listdir(os.path.join(directory, folder))
        images_per_class.append(len(folder_entries))

    plt.figure(figsize=(8, 5))
    plt.bar(class_folders, images_per_class, color='skyblue')
    plt.title(title)
    plt.xlabel('Emotion Classes')
    plt.ylabel('Number of Images')
    plt.xticks(rotation=45)
    plt.grid(axis='y')
    plt.tight_layout()
    plt.show()

# One chart per split, to compare class balance between training and validation.
plot_class_distribution(train_data_dir, 'Training Set: Class Distribution')
plot_class_distribution(validation_data_dir, 'Validation Set: Class Distribution')


##  Building & Training our Deep Learning Model

In [None]:
import warnings
# Silence every warning while building and training the model.
# NOTE(review): this also hides deprecation notices from TensorFlow/Keras —
# confirm that is intended.
warnings.filterwarnings("ignore")

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Conv2D, MaxPooling2D, Flatten, Dense,
                                     Dropout, BatchNormalization, Rescaling,
                                     RandomFlip, RandomRotation, RandomZoom)
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

In [None]:
# Training configuration: dataset locations, checkpoint path and
# hyper-parameters.
train_dir = 'preprocessed_images/train'
val_dir = 'preprocessed_images/validation'
model_save_path = 'cnn_custom_first_model.keras'

# Kept in alphabetical order on purpose: image_dataset_from_directory with
# labels='inferred' assigns label indices to the class folders in
# alphanumeric order, so index i must map to the i-th name here. With
# 'Neutral' listed last, the Neutral/Sad/Surprise rows of the classification
# report and confusion matrix were mislabelled.
class_names = ['Angry', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad', 'Surprise']
IMG_SIZE = (48, 48)
BATCH_SIZE = 128
EPOCHS = 100

In [None]:
# Dataset locations (the de-duplicated working copy) and the checkpoint file.
# NOTE(review): these three assignments repeat the ones in the preceding cell.
train_dir = 'preprocessed_images/train'
val_dir = 'preprocessed_images/validation'
model_save_path = 'cnn_custom_first_model.keras'

In [None]:
# Kept in alphabetical order on purpose: image_dataset_from_directory with
# labels='inferred' assigns label indices to the class folders in
# alphanumeric order, so index i must map to the i-th name here. With
# 'Neutral' listed last, the Neutral/Sad/Surprise rows of the classification
# report and confusion matrix were mislabelled.
class_names = ['Angry', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad', 'Surprise']
IMG_SIZE = (48, 48)    # image height and width passed to the dataset loader
BATCH_SIZE = 128
EPOCHS = 100           # upper bound passed to model.fit

In [None]:
# ✅ Load datasets
# Images are read as single-channel 48x48 tensors with one-hot labels.
train_ds = tf.keras.utils.image_dataset_from_directory(
    train_dir,
    labels='inferred',
    label_mode='categorical',
    color_mode='grayscale',
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    shuffle=True
)

# Label index i corresponds to train_ds.class_names[i] (class folders in
# alphanumeric order). Use that as the single source of truth for report
# labels instead of a hand-written list, and capture it now: the dataset
# returned by prefetch() below no longer exposes `class_names`.
class_names = train_ds.class_names

val_ds = tf.keras.utils.image_dataset_from_directory(
    val_dir,
    labels='inferred',
    label_mode='categorical',
    # Force the same folder-to-index mapping as the training set; a mismatch
    # in class folders fails here rather than silently shifting labels.
    class_names=class_names,
    color_mode='grayscale',
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    # Keep a fixed order so predictions can be compared with labels later.
    shuffle=False
)

# ✅ Prefetch for performance
train_ds = train_ds.prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.prefetch(tf.data.AUTOTUNE)

# Preprocessing + augmentation stack used as the first stage of the model:
# pixel values are scaled by 1/255, then a random horizontal flip, a random
# rotation (factor 0.1) and a random zoom (factor 0.1) are applied.
# NOTE(review): the Random* layers are expected to be active only during
# training, while Rescaling always applies — confirm against the Keras docs.
data_augmentation = Sequential([
    Rescaling(1./255),
    RandomFlip("horizontal"),
    RandomRotation(0.1),
    RandomZoom(0.1),
])

In [None]:
def _conv_block(filters):
    """Return the layers of one convolutional stage.

    Two 3x3 'same'-padded ReLU convolutions with ``filters`` channels, each
    followed by batch normalization, then 2x2 max-pooling and 25% dropout.
    """
    return [
        Conv2D(filters, (3, 3), padding='same', activation='relu'),
        BatchNormalization(),
        Conv2D(filters, (3, 3), padding='same', activation='relu'),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
        Dropout(0.25),
    ]


# CNN for 48x48 single-channel inputs: the augmentation stack, three
# convolutional stages (64, 128, 256 filters), one dense layer and a 7-way
# softmax output.
model = Sequential([
    tf.keras.Input(shape=(48, 48, 1)),

    # ✅ Data Augmentation Layer
    data_augmentation,

    # ✅ Blocks 1-3
    *_conv_block(64),
    *_conv_block(128),
    *_conv_block(256),

    # ✅ Fully Connected Layer
    Flatten(),
    Dense(512, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),

    # ✅ Output Layer (one unit per emotion class, always computed in float32)
    Dense(7, activation='softmax', dtype='float32'),
])

model.summary()

In [None]:
# Adam + categorical cross-entropy, matching the one-hot labels produced by
# the dataset loader; accuracy is tracked as the metric.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Stop once val_loss has not improved for 10 epochs and roll back to the
# best weights seen.
early_stopping = EarlyStopping(monitor='val_loss',
                               patience=10,
                               restore_best_weights=True)

# Write the model to disk whenever val_accuracy reaches a new maximum.
# NOTE(review): this watches val_accuracy while early stopping watches
# val_loss, so the saved file and the restored weights can come from
# different epochs.
best_checkpoint = ModelCheckpoint(filepath=model_save_path,
                                  monitor='val_accuracy',
                                  mode='max',
                                  save_best_only=True,
                                  verbose=1)

callbacks = [early_stopping, best_checkpoint]

history = model.fit(train_ds,
                    validation_data=val_ds,
                    epochs=EPOCHS,
                    callbacks=callbacks)


In [None]:
# ✅ Predictions
# Collect true and predicted class indices over the whole validation set.
y_true = []
y_pred = []

for images, labels in val_ds:
    # Call the model directly instead of model.predict(): predict() is meant
    # for whole datasets and adds per-call overhead and a progress bar when
    # invoked once per batch. training=False keeps Dropout and
    # BatchNormalization in inference mode.
    preds = model(images, training=False)
    # Labels are one-hot and predictions are probabilities; argmax turns
    # both into class indices.
    y_true.extend(tf.argmax(labels, axis=1).numpy())
    y_pred.extend(tf.argmax(preds, axis=1).numpy())

# ✅ Classification Report
print("\n📋 Classification Report:\n")
# Passing `labels` keeps the report aligned with `class_names` even if some
# class never occurs in y_true or y_pred.
print(classification_report(y_true, y_pred,
                            labels=list(range(len(class_names))),
                            target_names=class_names, digits=4))


#### Explain the ML Model used and its performance using Evaluation metric Score Chart.

In [None]:
# Visualizing evaluation Metric Score chart
# ✅ Confusion Matrix: rows are the actual classes, columns the predicted ones.
cm = confusion_matrix(y_true, y_pred)

heatmap_options = dict(
    annot=True,          # write the count into every cell
    fmt='d',             # counts are integers
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)

plt.figure(figsize=(6, 4))
sns.heatmap(cm, **heatmap_options)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.show()