In [None]:
import numpy as np
import pandas as pd
import os
import cv2
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_predict, cross_val_score, KFold, StratifiedKFold, train_test_split
from eigenface_project import eigenface_project
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from time import time, sleep

In [2]:
# Directory paths
train_dir = os.path.expanduser('../raw_data/fer2013/train')
test_dir = os.path.expanduser('../raw_data/fer2013/test')

# Define a function to load the balanced data
def load_images_from_directory(directory):
    images = []
    labels = []
    for label in os.listdir(directory):
        label_path = os.path.join(directory, label)
        if os.path.isdir(label_path):
            for file in os.listdir(label_path):
                file_path = os.path.join(label_path, file)
                image = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
                images.append(image)
                labels.append(label)
    return images, labels

# Load training and testing images
train_images, train_labels = load_images_from_directory(train_dir)
test_images, test_labels = load_images_from_directory(test_dir)


print(f"Loaded {len(train_images)} training images.")
print(f"Loaded {len(test_images)} training images.")

Loaded 28709 training images.
Loaded 7178 training images.


In [3]:
# Check the train and test data
print({x: train_labels.count(x) for x in set(train_labels)})
print(np.array(train_images).shape)

print({x: test_labels.count(x) for x in set(test_labels)})
print(np.array(test_images).shape)

{'surprise': 3171, 'angry': 3995, 'sad': 4830, 'happy': 7215, 'neutral': 4965, 'disgust': 436, 'fear': 4097}
(28709, 48, 48)
{'surprise': 831, 'angry': 958, 'sad': 1247, 'happy': 1774, 'neutral': 1233, 'disgust': 111, 'fear': 1024}
(7178, 48, 48)


In [4]:
# set up X, y of train and test
train_Class = np.array(train_labels)
train_images = np.array(train_images)

test_Class = np.array(test_labels)
test_images = np.array(test_images)

In [5]:
# project the image data on the eigen vectors
train_images_project, selected_eigenvectors=eigenface_project(train_images)

test_images_centered = test_images - np.mean(test_images, axis=0)
test_images_project = np.dot(test_images_centered.reshape(test_images.shape[0], -1), selected_eigenvectors)

In [None]:
# define a random forest function, and search the best parameters

def auto_random_forest(
    X,
    y,
    n_estimators=100,
    cv=10,
    random_state=2024
):
    """
    Implement Random Forest with different max_features settings and find optimal value
    
    Parameters:
    X, y: pandas DataFrame containing features and target
    test_size: proportion of data to use for testing in CV
    random_state: random state for reproducibility
    n_estimators: number of trees in forest
    cv: number of cross-validation folds
    """
    
    # number of features
    n_features = np.array(X).shape[1]
    
    # Create different max_features values to try
    # Include common options and some specific numbers
    max_features_options = [
        "log2",
        "sqrt",
        0.1,     # 10% of features
        0.2,     # 20% of features
    ]
    
    # Scale features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Create parameter grid
    param_grid = {
        'max_features': max_features_options,
        'min_samples_split': [2, 5, 10, 20, 30, 40, 50],
        # 'min_samples_leaf': [2, 5, 10, 20, 30, 40, 50]
        'max_depth': [3, 5, 7, 10, 20, 30]
    }
    
    # Initialize Random Forest
    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        random_state=random_state,
        max_samples=0.3
    )
    
    # 3. Setup StratifiedKFold
    skf = StratifiedKFold(
        n_splits=cv,
        shuffle=True,
        random_state=random_state,
        bootstrap=True
    )
    
    # Perform grid search
    grid_search = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        cv=skf,
        scoring='accuracy',
        n_jobs=-1, # use all the CPU cores
        verbose=2  # detailed output monitoring progress updates
    )
    
    # Fit grid search
    grid_search.fit(X_scaled, y)
    
    # Get best model, best params, and best score
    best_rf = grid_search.best_estimator_
    best_par = grid_search.best_params_
    best_score = grid_search.best_score_
    
    # Compile results
    results = {
        'best_params': best_par,
        'best_score': best_score,
        'best_model': best_rf
    }
    
    return results

In [None]:


# Grid search the best model
random_forest_result = auto_random_forest(X=train_images_project, y=train_Class)

Fitting 10 folds for each of 168 candidates, totalling 1680 fits


In [8]:
# loop through and print the results
for key, value in random_forest_result.items():
    print(f"{key}: \n {value}")

best_params: 
 {'max_depth': 20, 'max_features': 0.2, 'min_samples_split': 5}
best_score: 
 0.42700846018760236
best_model: 
 RandomForestClassifier(max_depth=20, max_features=0.2, min_samples_split=5,
                       random_state=2024)


In [26]:
# work on the test data set
# fit with the best model with 1000 trees
scaler = StandardScaler()
scaler.fit(train_images_project)
train_images_scaled = scaler.transform(train_images_project)
test_images_scaled = scaler.transform(test_images_project)
rf = RandomForestClassifier(max_depth=20, max_features=0.2, min_samples_split=5,
                       random_state=2024, max_samples=0.5, n_estimators=2000)
rf.fit(train_images_scaled, train_Class)
test_Class_pred = rf.predict(test_images_scaled)
train_Class_pred = rf.predict(train_images_scaled)

In [None]:
# feature importance
rf.feature_importances_

In [27]:
# train accuracy
print(accuracy_score(train_Class, train_Class_pred))
        
# classification_report
print(classification_report(train_Class, train_Class_pred))

0.9939391828346511
              precision    recall  f1-score   support

       angry       1.00      0.99      1.00      3995
     disgust       1.00      0.88      0.93       436
        fear       1.00      0.99      0.99      4097
       happy       0.98      1.00      0.99      7215
     neutral       1.00      1.00      1.00      4965
         sad       1.00      1.00      1.00      4830
    surprise       1.00      0.99      0.99      3171

    accuracy                           0.99     28709
   macro avg       1.00      0.98      0.99     28709
weighted avg       0.99      0.99      0.99     28709



In [28]:
# test accuracy
print(accuracy_score(test_Class, test_Class_pred))
        
# classification_report
print(classification_report(test_Class, test_Class_pred))

0.44413485650599055
              precision    recall  f1-score   support

       angry       0.80      0.14      0.23       958
     disgust       1.00      0.23      0.38       111
        fear       0.57      0.22      0.32      1024
       happy       0.38      0.87      0.53      1774
     neutral       0.45      0.31      0.36      1233
         sad       0.41      0.32      0.36      1247
    surprise       0.67      0.59      0.62       831

    accuracy                           0.44      7178
   macro avg       0.61      0.38      0.40      7178
weighted avg       0.52      0.44      0.41      7178

