In [9]:
import numpy as np
import pandas as pd
import os
import cv2
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_predict, cross_val_score, KFold, StratifiedKFold, train_test_split
from eigenface_project import eigenface_project
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [2]:
# Directory paths
train_dir = os.path.expanduser('../raw_data/fer2013/train')
test_dir = os.path.expanduser('../raw_data/fer2013/test')

# Define a function to load the balanced data
def load_images_from_directory(directory):
    images = []
    labels = []
    for label in os.listdir(directory):
        label_path = os.path.join(directory, label)
        if os.path.isdir(label_path):
            for file in os.listdir(label_path):
                file_path = os.path.join(label_path, file)
                image = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
                images.append(image)
                labels.append(label)
    return images, labels

# Load training and testing images
train_images, train_labels = load_images_from_directory(train_dir)
test_images, test_labels = load_images_from_directory(test_dir)


print(f"Loaded {len(train_images)} training images.")
print(f"Loaded {len(test_images)} training images.")

Loaded 28709 training images.
Loaded 7178 training images.


In [3]:
# Check the train and test data
print({x: train_labels.count(x) for x in set(train_labels)})
print(np.array(train_images).shape)

print({x: test_labels.count(x) for x in set(test_labels)})
print(np.array(test_images).shape)

{'sad': 4830, 'fear': 4097, 'angry': 3995, 'disgust': 436, 'surprise': 3171, 'happy': 7215, 'neutral': 4965}
(28709, 48, 48)
{'sad': 1247, 'fear': 1024, 'angry': 958, 'disgust': 111, 'surprise': 831, 'happy': 1774, 'neutral': 1233}
(7178, 48, 48)


In [4]:
# set up X, y of train and test
train_Class = np.array(train_labels)
train_images = np.array(train_images)

test_Class = np.array(test_labels)
test_images = np.array(test_images)

In [5]:
# project the image data on the eigen vectors
train_images_project, selected_eigenvectors=eigenface_project(train_images)

test_images_centered = test_images - np.mean(test_images, axis=0)
test_images_project = np.dot(test_images_centered.reshape(test_images.shape[0], -1), selected_eigenvectors)

In [21]:
# fit multi-logistic Regression

def auto_logreg(X, 
                y, 
                random_state=2024, 
                cv=10
                ):
    # 1. Parameter grid
    param_grid = {
        "C": [0.01, 0.1, 1, 10, 100],
        "penalty": ["l1", "l2"]
    }
    
    # 2. Create base model
    logreg_model = LogisticRegression(
        multi_class='multinomial',
        solver='saga',
        max_iter=1000
    )
    
    # 3. Setup StratifiedKFold
    skf = StratifiedKFold(
        n_splits=cv,
        shuffle=True,
        random_state=random_state
    )
    
    # 4. Setup GridSearchCV
    grid_search = GridSearchCV(
        estimator=logreg_model,
        param_grid=param_grid,
        cv=skf,
        scoring='accuracy',
        n_jobs=-1, # use all the CPU cores
        verbose=2  # detailed output monitoring progress updates
    )
    
    # 5. Scale the features
    
    # 6. Fit the model
    grid_search.fit(X, y)

    # 7. Best model
    best_model = grid_search.best_estimator_
    
    # 8. Best params
    best_par = grid_search.best_params_
    
    # 9. Best CV score
    best_score = grid_search.best_score_
    
    # 10. Compile and return the results
    results = {
        'best_params': best_par,
        'best_score': best_score,
        'best_model': best_model
    }

    return results, grid_search

logreg_result, grid_search = auto_logreg(X=train_images_project, y=train_Class)

Fitting 10 folds for each of 10 candidates, totalling 100 fits




In [22]:
# loop through and print the results
for key, value in logreg_result.items():
    print(f"{key}: \n {value}")

best_params: 
 {'C': 0.01, 'penalty': 'l1'}
best_score: 
 0.35922547595381904
best_model: 
 LogisticRegression(C=0.01, max_iter=1000, multi_class='multinomial',
                   penalty='l1', solver='saga')


In [25]:
# work on the test data set
# fit with the best model
logreg = LogisticRegression(C=0.01, max_iter=1000, 
                            multi_class='multinomial', 
                            solver='saga',
                            penalty="l1")

logreg.fit(train_images_project, train_Class)
test_Class_pred = logreg.predict(test_images_project)
train_Class_pred = logreg.predict(train_images_project)



In [26]:
# train accuracy
print(accuracy_score(train_Class, train_Class_pred))
        
# classification_report
print(classification_report(train_Class, train_Class_pred))

0.36925702741300637
              precision    recall  f1-score   support

       angry       0.29      0.21      0.25      3995
     disgust       0.22      0.01      0.02       436
        fear       0.26      0.11      0.16      4097
       happy       0.49      0.60      0.54      7215
     neutral       0.36      0.33      0.34      4965
         sad       0.30      0.29      0.29      4830
    surprise       0.33      0.60      0.43      3171

    accuracy                           0.37     28709
   macro avg       0.32      0.31      0.29     28709
weighted avg       0.35      0.37      0.35     28709



In [27]:
# test accuracy
print(accuracy_score(test_Class, test_Class_pred))
        
# classification_report
print(classification_report(test_Class, test_Class_pred))

0.3722485371969908
              precision    recall  f1-score   support

       angry       0.24      0.17      0.20       958
     disgust       0.00      0.00      0.00       111
        fear       0.24      0.11      0.15      1024
       happy       0.49      0.63      0.55      1774
     neutral       0.36      0.34      0.35      1233
         sad       0.31      0.29      0.30      1247
    surprise       0.35      0.59      0.44       831

    accuracy                           0.37      7178
   macro avg       0.28      0.31      0.28      7178
weighted avg       0.34      0.37      0.35      7178

