In [16]:
import numpy as np
import pandas as pd
import os
import cv2
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_predict, cross_val_score, KFold, StratifiedKFold, train_test_split
from eigenface_project import eigenface_project
from sklearn.metrics import classification_report, accuracy_score

In [3]:
# Location of the balanced training set, given relative to the notebook's working directory
_train_dir_relative = '../train_balanced'
train_dir = os.path.expanduser(_train_dir_relative)

# Define a function to load the balanced data
def load_images_from_directory(directory):
    """
    Load grayscale images from a directory that has one sub-directory per class.

    Each sub-directory name is used as the label for every image inside it.
    Entries of `directory` that are not directories are ignored.

    Parameters:
    directory: path of the folder containing the per-class sub-directories

    Returns:
    (images, labels): two lists of equal length; images[i] is whatever
    cv2.imread returned for the i-th readable file and labels[i] is the name
    of the sub-directory it was found in.

    Files that cv2.imread cannot decode are skipped and reported in a single
    warning, so that images and labels never contain None entries.
    """
    import warnings

    images = []
    labels = []
    unreadable = []
    # Sort directory listings: os.listdir order is arbitrary, and a stable
    # sample order keeps downstream seeded splits reproducible.
    for label in sorted(os.listdir(directory)):
        label_path = os.path.join(directory, label)
        if not os.path.isdir(label_path):
            continue
        for file in sorted(os.listdir(label_path)):
            file_path = os.path.join(label_path, file)
            image = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
            # cv2.imread signals failure by returning None instead of raising
            if image is None:
                unreadable.append(file_path)
                continue
            images.append(image)
            labels.append(label)
    if unreadable:
        warnings.warn(
            f"Skipped {len(unreadable)} unreadable file(s), e.g. {unreadable[0]}"
        )
    return images, labels

# Read the training images and their class labels from the balanced directory
train_images, train_labels = load_images_from_directory(directory=train_dir)

print(f"Loaded {len(train_images)} training images.")

Loaded 34795 training images.


In [None]:
# Number of images per class label, followed by the shape of the stacked image array
label_counts = {}
for x in set(train_labels):
    label_counts[x] = train_labels.count(x)
print(label_counts)
print(np.array(train_images).shape)

(34795, 48, 48)


In [5]:
# Convert the loaded lists to numpy arrays: images become the feature source, labels the target
train_images = np.array(train_images)
Class = np.array(train_labels)

In [6]:
# Project the image data onto the eigenvectors via the project-local helper
# (presumably an eigenface/PCA projection; see the eigenface_project module to confirm)
train_images_project = eigenface_project(train_images)

In [32]:
# define a random forest function, and search the best parameters

def auto_random_forest(
    X,
    y,
    test_size=0.2,
    random_state=2024,
    n_estimators=100,
    cv=10,
    comparison_n_estimators=1000,
    n_jobs=-1
):
    """
    Tune a Random Forest by grid search and compare max_features settings.

    The data is split into stratified train/test parts and standardised with a
    scaler fitted on the training part only. A GridSearchCV over max_features,
    min_samples_split and min_samples_leaf is run on the training part, and the
    refitted best estimator is evaluated on the test part. Separately, one
    forest per max_features option (all other settings left at their defaults)
    is fitted and scored on both parts.

    Parameters:
    X: array-like of features, one row per sample
    y: array-like of class labels, one per row of X (used for stratification)
    test_size: proportion of data to use for testing
    random_state: random state for the split and for every forest
    n_estimators: number of trees in the forests fitted by the grid search
    cv: number of cross-validation folds
    comparison_n_estimators: number of trees in the forests fitted for the
        max_features comparison table (was a hard-coded 1000)
    n_jobs: number of parallel jobs for the grid search and for the comparison
        forests; -1 uses all CPU cores

    Returns:
    dict with the keys
        'best_params': best parameter combination found by the grid search
        'best_score': mean cross-validated accuracy of that combination
        'test_accuracy': accuracy of the best estimator on the test part
        'classification_report': text report for the best estimator on the test part
        'max_features_comparison': DataFrame with train/test accuracy per max_features option
        'best_model': the refitted best RandomForestClassifier

    Note: the best model expects features scaled like the training part; the
    fitted scaler is not returned. The comparison table reports test-part
    scores, so picking a setting from it would be selecting on the test set.
    """

    # Create different max_features values to try
    # Include common options and some specific numbers
    max_features_options = [
        "log2",
        "sqrt",
        0.1,     # 10% of features
    ]

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        stratify=y
    )

    # Scale features (statistics come from the training part only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Create parameter grid
    # NOTE(review): a node can only be split if both children keep at least
    # min_samples_leaf samples, so min_samples_split values up to
    # 2 * min_samples_leaf add no constraint; many of the 147 candidates
    # therefore describe the same model. The grid is left unchanged here.
    param_grid = {
        'max_features': max_features_options,
        'min_samples_split': [2, 5, 10, 20, 30, 40, 50],
        'min_samples_leaf': [2, 5, 10, 20, 30, 40, 50]
    }

    # Initialize Random Forest
    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        random_state=random_state
    )

    # Perform grid search
    grid_search = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        cv=cv,
        scoring='accuracy',
        n_jobs=n_jobs,
        verbose=2  # detailed output monitoring progress updates
    )

    # Fit grid search
    grid_search.fit(X_train_scaled, y_train)

    # Get best model (refitted on the whole training part by GridSearchCV)
    best_rf = grid_search.best_estimator_

    # Make predictions with best model
    y_pred = best_rf.predict(X_test_scaled)

    # Evaluate different max_features settings
    max_features_results = []
    for max_feat in max_features_options:
        rf_temp = RandomForestClassifier(
            n_estimators=comparison_n_estimators,
            max_features=max_feat,
            random_state=random_state,
            n_jobs=n_jobs  # these large forests were previously fitted on a single core
        )
        rf_temp.fit(X_train_scaled, y_train)
        train_score = rf_temp.score(X_train_scaled, y_train)
        test_score = rf_temp.score(X_test_scaled, y_test)

        # Convert max_feat to string for consistent display
        max_feat_str = str(max_feat)
        if isinstance(max_feat, float):
            max_feat_str = f"{max_feat:.1%} of features"

        max_features_results.append({
            'max_features': max_feat_str,
            'train_score': train_score,
            'test_score': test_score
        })

    # Compile results
    results = {
        'best_params': grid_search.best_params_,
        'best_score': grid_search.best_score_,
        'test_accuracy': accuracy_score(y_test, y_pred),
        'classification_report': classification_report(y_test, y_pred),
        'max_features_comparison': pd.DataFrame(max_features_results),
        'best_model': best_rf
    }

    return results

In [33]:
# Run the tuning on the projected features with the class labels as target (all other settings at defaults)
random_forest_result = auto_random_forest(train_images_project, Class)

Fitting 10 folds for each of 147 candidates, totalling 1470 fits


In [34]:
# Print every entry of the result dictionary, one after another
for key in random_forest_result:
    value = random_forest_result[key]
    print(f"{key}: \n {value}")

best_params: 
 {'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 20}
best_score: 
 0.3976860226911338
test_accuracy: 
 0.39991378071562006
classification_report: 
               precision    recall  f1-score   support

       angry       0.33      0.25      0.28      1000
     disgust       0.50      0.73      0.59      1000
        fear       0.34      0.22      0.27      1000
       happy       0.38      0.50      0.43      1000
     neutral       0.38      0.38      0.38       993
         sad       0.31      0.27      0.29       966
    surprise       0.48      0.45      0.46      1000

    accuracy                           0.40      6959
   macro avg       0.39      0.40      0.39      6959
weighted avg       0.39      0.40      0.39      6959

max_features_comparison: 
         max_features  train_score  test_score
0               log2     0.998743    0.445466
1               sqrt     0.998743    0.442736
2  10.0% of features     0.998743    0.450496
best_mode