In [14]:
import numpy as np
import pandas as pd
import os
import cv2
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_predict, cross_val_score, KFold, StratifiedKFold, train_test_split
from eigenface_project import eigenface_project
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [None]:
# Directory paths
train_dir = os.path.expanduser('../train_balanced')
test_dir = os.path.expanduser('../raw_data/fer2013/test')

# Define a function to load the balanced data
def load_images_from_directory(directory):
    images = []
    labels = []
    for label in os.listdir(directory):
        label_path = os.path.join(directory, label)
        if os.path.isdir(label_path):
            for file in os.listdir(label_path):
                file_path = os.path.join(label_path, file)
                image = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
                images.append(image)
                labels.append(label)
    return images, labels

# Load training and testing images
train_images, train_labels = load_images_from_directory(train_dir)
test_images, test_labels = load_images_from_directory(test_dir)


print(f"Loaded {len(train_images)} training images.")
print(f"Loaded {len(test_images)} testing images.")

Loaded 34795 training images.
Loaded 7178 training images.


In [16]:
# Check the train and test data
print({x: train_labels.count(x) for x in set(train_labels)})
print(np.array(train_images).shape)

print({x: test_labels.count(x) for x in set(test_labels)})
print(np.array(test_images).shape)

{'fear': 5000, 'neutral': 4965, 'disgust': 5000, 'happy': 5000, 'sad': 4830, 'surprise': 5000, 'angry': 5000}
(34795, 48, 48)
{'fear': 1024, 'neutral': 1233, 'disgust': 111, 'happy': 1774, 'sad': 1247, 'surprise': 831, 'angry': 958}
(7178, 48, 48)


In [17]:
# set up X, y of train and test
train_Class = np.array(train_labels)
train_images = np.array(train_images)

test_Class = np.array(test_labels)
test_images = np.array(test_images)

In [18]:
# project the image data on the eigen vectors
train_images_project, selected_eigenvectors=eigenface_project(train_images)

test_images_centered = test_images - np.mean(test_images, axis=0)
test_images_project = np.dot(test_images_centered.reshape(test_images.shape[0], -1), selected_eigenvectors)

In [None]:
# Assuming you have your data as X and y

def auto_xgboost(
    X,
    y,
    n_estimators=100,
    random_state=2024,
    cv=10
):
    # 1. Parameter grid
    param_grid = {
        'max_depth': [3, 5, 10, 20, 30],
        'learning_rate': [0.01, 0.1, 0.3],
        'min_child_weight': [1, 2, 5, 10, 20, 30, 40]
        # 'gamma': [0, 0.1],
        # 'subsample': [0.8, 1.0],
        # 'colsample_bytree': [0.8, 1.0]
    }
    # 2. Create base model
    xgb_model = XGBClassifier(
        objective='multi:softmax',
        n_estimators= n_estimators,
        # use_label_encoder=False,
        eval_metric='merror',
        random_state=random_state
    )

    # 3. Setup StratifiedKFold
    skf = StratifiedKFold(
        n_splits=cv,
        shuffle=True,
        random_state=random_state
    )

    # 4. Setup GridSearchCV
    grid_search = GridSearchCV(
        estimator=xgb_model,
        param_grid=param_grid,
        cv=skf,
        scoring='accuracy',
        n_jobs=-1, # use all the CPU cores
        verbose=2  # detailed output monitoring progress updates
    )

    # 5. Scale the features
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # 6. Fit the model
    grid_search.fit(X_scaled, y_encoded)

    # 7. Best model
    best_model = grid_search.best_estimator_
    
    # 8. Best params
    best_par = grid_search.best_params_
    
    # 9. Best CV score
    best_score = grid_search.best_score_

    # 10. Feature importance
    feature_importance = pd.DataFrame({
        'feature': [f'feature_{i}' for i in range(X.shape[1])],
        'importance': best_model.feature_importances_
    })
    feature_importance = feature_importance.sort_values('importance', ascending=False)
    
    # compile and return the results
    results = {
        'best_params': best_par,
        'best_score': best_score,
        'best_model': best_model,
        'feature_importance': feature_importance
    }

    return results, grid_search

In [22]:
XGB_result, grid_search = auto_xgboost(X=train_images_project, y=train_Class)

# loop through and print the results
for key, value in XGB_result.items():
    print(f"{key}: \n {value}")

Fitting 10 folds for each of 105 candidates, totalling 1050 fits
best_params: 
 {'learning_rate': 0.1, 'max_depth': 30, 'min_child_weight': 10}
best_score: 
 0.4404942710449891
best_model: 
 XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='merror',
              feature_types=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.1, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=30,
              max_leaves=None, min_child_weight=10, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=100,
              n_jobs=None, num_parallel_tree=None, objective='multi:softmax', ...)
feature_importance: 
          feature  impor

In [45]:
# work on the test data set
# fit with the best model with 1000 trees
scaler = StandardScaler()
scaler.fit(train_images_project)
train_images_scaled = scaler.transform(train_images_project)
test_images_scaled = scaler.transform(test_images_project)

# encode the classes
label_encoder = LabelEncoder()
label_encoder.fit(train_Class)
train_Class_encoded = label_encoder.transform(train_Class)
test_Class_encoded = label_encoder.transform(test_Class)
    
    
xgb = XGBClassifier(max_depth=30, learning_rate=0.01, min_child_weight=10,
                       random_state=2024, n_estimators=5000, objective='multi:softmax', eval_metric='merror')
xgb.fit(train_images_scaled, train_Class_encoded)
train_Class_pred = xgb.predict(train_images_scaled)
test_Class_pred = xgb.predict(test_images_scaled)

In [46]:
# train accuracy
print(accuracy_score(train_Class_encoded, train_Class_pred))
        
# classification_report
print(classification_report(train_Class_encoded, train_Class_pred))

0.9987354504957608
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5000
           1       1.00      1.00      1.00      5000
           2       1.00      1.00      1.00      5000
           3       1.00      1.00      1.00      5000
           4       1.00      1.00      1.00      4965
           5       1.00      1.00      1.00      4830
           6       1.00      1.00      1.00      5000

    accuracy                           1.00     34795
   macro avg       1.00      1.00      1.00     34795
weighted avg       1.00      1.00      1.00     34795



In [47]:
# train accuracy
print(accuracy_score(test_Class_encoded, test_Class_pred))
        
# classification_report
print(classification_report(test_Class_encoded, test_Class_pred))

0.4381443298969072
              precision    recall  f1-score   support

           0       0.32      0.34      0.33       958
           1       0.11      0.53      0.18       111
           2       0.41      0.33      0.37      1024
           3       0.63      0.52      0.57      1774
           4       0.42      0.42      0.42      1233
           5       0.38      0.32      0.35      1247
           6       0.55      0.69      0.61       831

    accuracy                           0.44      7178
   macro avg       0.40      0.45      0.40      7178
weighted avg       0.46      0.44      0.44      7178

