In [1]:
import os
import cv2
import numpy as np
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import shutil
import random
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras import layers, models
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split

In [2]:
# Dataset location. The default is the original local checkout; set the
# CELEBA_DIR environment variable to point the notebook at a different copy
# of the dataset without editing this cell.
CELEBA_DIR = os.environ.get(
    'CELEBA_DIR',
    '/Users/payalchavan/Documents/Data Mining_Assignments/celeba',
)
# Folder holding the aligned face images (*.jpg).
image_folder = os.path.join(CELEBA_DIR, 'img_align_celeba')
# Text file holding the per-image attribute rows.
attributes_file = os.path.join(CELEBA_DIR, 'list_attr_celeba.txt')

In [None]:
# Load attributes.
# The file is read as: one line that is skipped, one line of whitespace-separated
# attribute names, then one row per image of the form "<filename> <v1> <v2> ...".
attributes = {}
with open(attributes_file, 'r') as file:
    file.readline()  # Skip the header line (number of samples)
    attribute_names = file.readline().strip().split()  # Get the attribute names
    for line in file:
        parts = line.strip().split()
        if not parts:
            continue  # tolerate blank / trailing lines instead of raising IndexError
        # Values equal to 1 become 1; every other value becomes 0.
        attributes[parts[0]] = [1 if int(p) == 1 else 0 for p in parts[1:]]

# Find the index of the "Wearing a hat" attribute
# (raises ValueError if the column is missing from the header line).
Wearing_hat_index = attribute_names.index('Wearing_Hat')

# Load images and labels
images = []
labels = []
skipped_files = []  # files that could not be paired with a label or decoded

for filename in sorted(os.listdir(image_folder)):
    if not filename.lower().endswith('.jpg'):
        continue

    # An image without an attribute row has no label, so it cannot be used.
    if filename not in attributes:
        skipped_files.append(filename)
        continue

    # cv2.imread returns None (it does not raise) when a file cannot be decoded.
    img = cv2.imread(os.path.join(image_folder, filename))
    if img is None:
        skipped_files.append(filename)
        continue

    # Keep the resized image as uint8 here; converting each image to float
    # inside the loop would hold the whole dataset in float64 before the
    # final float32 cast.
    images.append(cv2.resize(img, (64, 64)))  # Resize to 64x64

    # Extract the "Wearing_hat_index" attribute as the label
    labels.append(attributes[filename][Wearing_hat_index])

if skipped_files:
    print(f"Skipped {len(skipped_files)} file(s) with no attribute row or unreadable image data")

if not images:
    raise RuntimeError(f"No usable .jpg images found in {image_folder!r}")

# Convert once to float32 and normalize pixel values to [0, 1] in place.
images = np.asarray(images, dtype='float32')
images /= 255.0
labels = np.array(labels, dtype='int')

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(images, labels, test_size=0.2, random_state=42)

In [None]:
# Build the CNN one layer at a time: two convolution + max-pooling stages,
# then a flattened dense head ending in a single sigmoid unit.
model = models.Sequential()

# Stage 1: 32 filters of 3x3 over the 64x64 three-channel input.
model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 3)))
model.add(layers.MaxPooling2D((2, 2)))

# Stage 2: 64 filters of 3x3.
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))

# Classifier head: the sigmoid unit emits one value in (0, 1) per image.
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.summary()

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 10 epochs, holding out 20% of the training arrays for validation.
# The returned History object keeps the per-epoch metrics.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=10,
)

In [None]:
# Checking the accuracy of the model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Accuracy of the model: {test_acc}")

The overall accuracy of a model is determined by the ratio of total correct predictions to the total number of predictions (both correct and incorrect). The formula for accuracy is as follows:
$$\text{Accuracy} = \frac{\text{Total correct predictions}}{\text{Total correct predictions} + \text{Total incorrect predictions}}$$
This accuracy value represents the model’s performance across the entire dataset or batch. It provides insight into how well the model generalizes to unseen examples. Remember that achieving high accuracy is essential, but it’s equally crucial to evaluate other metrics and consider the context of the problem domain.

In [None]:
# Make predictions
predictions = model.predict(X_test)
predicted_labels = (predictions > 0.5).astype(int).flatten()

# Determine the number of correct and incorrect predictions
correct_predictions = np.sum(predicted_labels == y_test)
incorrect_predictions = np.sum(predicted_labels != y_test)

print(f"Total correct predictions: {correct_predictions}")
print(f"Total incorrect predictions: {incorrect_predictions}")

In [None]:
# Extract the history data
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

In [None]:
# Plot training and validation accuracy
plt.figure(figsize=(8, 6))  # Set the figure size
plt.plot(epochs, acc, label='Training accuracy')
plt.plot(epochs, val_acc, label='Validation accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.figure(figsize=(8, 6))  # Set the figure size

# Plot training and validation loss
plt.plot(epochs, loss, label='Training loss')
plt.plot(epochs, val_loss, label='Validation loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

The trend of increasing accuracy during training epochs indicates effective learning from the training data. However, the gap between training and validation accuracy hints at potential overfitting, where the model learns specific patterns from the training set that may not apply well to new data.

The minor fluctuations in validation metrics in later epochs suggest the early stages of overfitting. This can be addressed by introducing techniques like dropout layers or regularization to prevent the model from memorizing the training data too closely.

The final test accuracy is a crucial metric indicating how well the model generalizes to new, unseen data. Achieving an accuracy of over 91% is commendable, showcasing the model's ability to extract meaningful features from face images to predict accurately whether the person is wearing a hat.

To enhance the model further, exploring data augmentation methods can diversify the training set, aiding the model in better generalization. Experimenting with different architectures or hyperparameters could also lead to performance enhancements.

The final test accuracy is a critical metric. Achieving an accuracy of over 91% is commendable. It demonstrates the model’s ability to extract meaningful features from face images and predict accurately whether the person is wearing a hat.
However, always validate the model’s performance on unseen data to ensure robustness.
### Further Model Enhancement

To enhance the model further:

- **Data Augmentation:** Explore data augmentation methods to diversify the training set. Augmented data can help the model generalize better.
- **Architecture and Hyperparameters:** Experiment with different architectures or hyperparameters. Fine-tuning these aspects could lead to performance improvements.