In [1]:
import numpy as np
import pandas as pd
import os
import cv2
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_predict, cross_val_score, KFold, StratifiedKFold, train_test_split
from eigenface_project import eigenface_project
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [2]:
# Directory paths
train_dir = os.path.expanduser('../train_balanced')
test_dir = os.path.expanduser('../raw_data/fer2013/test')

# Define a function to load the balanced data
def load_images_from_directory(directory):
    images = []
    labels = []
    for label in os.listdir(directory):
        label_path = os.path.join(directory, label)
        if os.path.isdir(label_path):
            for file in os.listdir(label_path):
                file_path = os.path.join(label_path, file)
                image = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
                images.append(image)
                labels.append(label)
    return images, labels

# Load training and testing images
train_images, train_labels = load_images_from_directory(train_dir)
test_images, test_labels = load_images_from_directory(test_dir)


print(f"Loaded {len(train_images)} training images.")
print(f"Loaded {len(test_images)} training images.")

Loaded 34795 training images.
Loaded 7178 training images.


In [3]:
# Check the train and test data
print({x: train_labels.count(x) for x in set(train_labels)})
print(np.array(train_images).shape)

print({x: test_labels.count(x) for x in set(test_labels)})
print(np.array(test_images).shape)

{'disgust': 5000, 'fear': 5000, 'sad': 4830, 'happy': 5000, 'angry': 5000, 'neutral': 4965, 'surprise': 5000}
(34795, 48, 48)
{'disgust': 111, 'fear': 1024, 'sad': 1247, 'happy': 1774, 'angry': 958, 'neutral': 1233, 'surprise': 831}
(7178, 48, 48)


In [4]:
# set up X, y of train and test
train_Class = np.array(train_labels)
train_images = np.array(train_images)

test_Class = np.array(test_labels)
test_images = np.array(test_images)

In [5]:
# project the image data on the eigen vectors
train_images_project, selected_eigenvectors=eigenface_project(train_images)

test_images_centered = test_images - np.mean(test_images, axis=0)
test_images_project = np.dot(test_images_centered.reshape(test_images.shape[0], -1), selected_eigenvectors)

In [6]:
# define a random forest function, and search the best parameters

def auto_random_forest(
    X,
    y,
    n_estimators=100,
    cv=10,
    random_state=2024
):
    """
    Implement Random Forest with different max_features settings and find optimal value
    
    Parameters:
    X, y: pandas DataFrame containing features and target
    test_size: proportion of data to use for testing in CV
    random_state: random state for reproducibility
    n_estimators: number of trees in forest
    cv: number of cross-validation folds
    """
    
    # number of features
    n_features = np.array(X).shape[1]
    
    # Create different max_features values to try
    # Include common options and some specific numbers
    max_features_options = [
        "log2",
        "sqrt",
        0.1,     # 10% of features
        0.2,     # 20% of features
    ]
    
    # Scale features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Create parameter grid
    param_grid = {
        'max_features': max_features_options,
        'min_samples_split': [2, 5, 10, 20, 30, 40, 50],
        # 'min_samples_leaf': [2, 5, 10, 20, 30, 40, 50]
        'max_depth': [3, 5, 7, 10, 20, 30]
    }
    
    # Initialize Random Forest
    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        random_state=random_state
    )
    
    # 3. Setup StratifiedKFold
    skf = StratifiedKFold(
        n_splits=cv,
        shuffle=True,
        random_state=random_state
    )
    
    # Perform grid search
    grid_search = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        cv=skf,
        scoring='accuracy',
        n_jobs=-1, # use all the CPU cores
        verbose=2  # detailed output monitoring progress updates
    )
    
    # Fit grid search
    grid_search.fit(X_scaled, y)
    
    # Get best model, best params, and best score
    best_rf = grid_search.best_estimator_
    best_par = grid_search.best_params_
    best_score = grid_search.best_score_
    
    # Compile results
    results = {
        'best_params': best_par,
        'best_score': best_score,
        'best_model': best_rf
    }
    
    return results

In [7]:
# Grid search the best model
random_forest_result = auto_random_forest(X=train_images_project, y=train_Class)

Fitting 10 folds for each of 168 candidates, totalling 1680 fits


  _data = np.array(data, dtype=dtype, copy=copy,


In [8]:
# loop through and print the results
for key, value in random_forest_result.items():
    print(f"{key}: \n {value}")

best_params: 
 {'max_depth': 20, 'max_features': 0.1, 'min_samples_split': 10}
best_score: 
 0.40994386681335965
best_model: 
 RandomForestClassifier(max_depth=20, max_features=0.1, min_samples_split=10,
                       random_state=2024)


In [9]:
# work on the test data set
# fit with the best model with 500 trees
scaler = StandardScaler()
scaler.fit(train_images_project)
train_images_scaled = scaler.transform(train_images_project)
test_images_scaled = scaler.transform(test_images_project)
rf = RandomForestClassifier(max_depth=20, max_features=0.1, min_samples_split=10,
                       random_state=2024, n_estimators=2000)
rf.fit(train_images_scaled, train_Class)
test_Class_pred = rf.predict(test_images_scaled)
train_Class_pred = rf.predict(train_images_scaled)

In [10]:
# train accuracy
print(accuracy_score(train_Class, train_Class_pred))
        
# classification_report
print(classification_report(train_Class, train_Class_pred))

0.9981319155051013
              precision    recall  f1-score   support

       angry       1.00      1.00      1.00      5000
     disgust       1.00      1.00      1.00      5000
        fear       1.00      1.00      1.00      5000
       happy       1.00      1.00      1.00      5000
     neutral       1.00      1.00      1.00      4965
         sad       1.00      1.00      1.00      4830
    surprise       1.00      1.00      1.00      5000

    accuracy                           1.00     34795
   macro avg       1.00      1.00      1.00     34795
weighted avg       1.00      1.00      1.00     34795



In [11]:
# test accuracy
print(accuracy_score(test_Class, test_Class_pred))
        
# classification_report
print(classification_report(test_Class, test_Class_pred))

0.42811368069100025
              precision    recall  f1-score   support

       angry       0.39      0.30      0.34       958
     disgust       0.07      0.52      0.13       111
        fear       0.48      0.31      0.38      1024
       happy       0.56      0.54      0.55      1774
     neutral       0.41      0.41      0.41      1233
         sad       0.41      0.32      0.36      1247
    surprise       0.51      0.68      0.58       831

    accuracy                           0.43      7178
   macro avg       0.41      0.44      0.39      7178
weighted avg       0.46      0.43      0.44      7178

