In [17]:
import numpy as np
import pandas as pd
import os
import cv2
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_predict, cross_val_score, KFold, StratifiedKFold, train_test_split
from eigenface_project import eigenface_project
from sklearn.metrics import classification_report, accuracy_score
# numbers of pixels of x and y axis
xPixel = 48  
yPixel = 48

In [2]:
# Load the raw data
# df = pd.read_csv("../row_data_to_csv/train_data/face_data_train.csv", header=0) # specify the first row as header

In [3]:
# Directory paths
train_dir = os.path.expanduser('../raw_data/fer2013/train')
test_dir = os.path.expanduser('../raw_data/fer2013/test')

# Define a function to load the balanced data
def load_images_from_directory(directory):
    images = []
    labels = []
    for label in os.listdir(directory):
        label_path = os.path.join(directory, label)
        if os.path.isdir(label_path):
            for file in os.listdir(label_path):
                file_path = os.path.join(label_path, file)
                image = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
                images.append(image)
                labels.append(label)
    return images, labels

# Load training and testing images
train_images, train_labels = load_images_from_directory(train_dir)
test_images, test_labels = load_images_from_directory(test_dir)


print(f"Loaded {len(train_images)} training images.")
print(f"Loaded {len(test_images)} training images.")

Loaded 28709 training images.
Loaded 7178 training images.


In [4]:
# Check the train and test data
{x: train_labels.count(x) for x in set(train_labels)}
print(np.array(train_images).shape)

print({x: test_labels.count(x) for x in set(test_labels)})
print(np.array(test_images).shape)

(28709, 48, 48)
{'happy': 1774, 'fear': 1024, 'surprise': 831, 'sad': 1247, 'angry': 958, 'disgust': 111, 'neutral': 1233}
(7178, 48, 48)


In [6]:
# set up X, y of train and test
train_Class = np.array(train_labels)
train_images = np.array(train_images)

test_Class = np.array(test_labels)
test_images = np.array(test_images)

In [7]:
# project the image data on the eigen vectors
train_images_project, selected_eigenvectors=eigenface_project(train_images)

test_images_centered = test_images - np.mean(test_images, axis=0)
test_images_project = np.dot(test_images_centered.reshape(test_images.shape[0], -1), selected_eigenvectors)

In [10]:
# Perform StratifiedKFold cross-validation on knn

# create pipeline object
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

# Define the parameter grid for n_neighbors
param_grid = {
    'knn__n_neighbors': range(1, 201)  # Test n_neighbors from 1 to 30
}

# stratified CV object
skcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=2024)

# Initialize GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=skcv, scoring='accuracy')

# Fit the grid search
grid_search.fit(train_images_project, train_Class)

# Print the results
# Print the best n_neighbors and the corresponding score
print("Best n_neighbors:", grid_search.best_params_['knn__n_neighbors'])
print("Best accuracy:", grid_search.best_score_)

Best n_neighbors: 1
Best accuracy: 0.38311971086571595


In [12]:
# Print the results for each n_neighbors
for i in range(1, 201):
    print(f"n_neighbors = {i}:")
    print(f"  Mean cross-validation score: {grid_search.cv_results_[f'mean_test_score'][i-1]:.4f}")
    # print(f"  Standard deviation: {grid_search.cv_results_[f'std_test_score'][i-1]:.4f}")

n_neighbors = 1:
  Mean cross-validation score: 0.3831
n_neighbors = 2:
  Mean cross-validation score: 0.3378
n_neighbors = 3:
  Mean cross-validation score: 0.3355
n_neighbors = 4:
  Mean cross-validation score: 0.3356
n_neighbors = 5:
  Mean cross-validation score: 0.3383
n_neighbors = 6:
  Mean cross-validation score: 0.3324
n_neighbors = 7:
  Mean cross-validation score: 0.3302
n_neighbors = 8:
  Mean cross-validation score: 0.3287
n_neighbors = 9:
  Mean cross-validation score: 0.3291
n_neighbors = 10:
  Mean cross-validation score: 0.3247
n_neighbors = 11:
  Mean cross-validation score: 0.3261
n_neighbors = 12:
  Mean cross-validation score: 0.3242
n_neighbors = 13:
  Mean cross-validation score: 0.3234
n_neighbors = 14:
  Mean cross-validation score: 0.3213
n_neighbors = 15:
  Mean cross-validation score: 0.3214
n_neighbors = 16:
  Mean cross-validation score: 0.3206
n_neighbors = 17:
  Mean cross-validation score: 0.3185
n_neighbors = 18:
  Mean cross-validation score: 0.3174
n

In [18]:
# return the precision, recall and f1-score in each class

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=grid_search.best_params_['knn__n_neighbors']))
])

# Get predictions of the testing data
pipe.fit(train_images_project, train_Class)
pred = pipe.predict(test_images_project)

# print test accuracy
print(accuracy_score(test_Class, pred))
        
#
# print detailed classification report
print(classification_report(test_Class, pred))


0.4020618556701031
              precision    recall  f1-score   support

       angry       0.37      0.31      0.34       958
     disgust       0.36      0.57      0.44       111
        fear       0.38      0.38      0.38      1024
       happy       0.50      0.41      0.45      1774
     neutral       0.30      0.45      0.36      1233
         sad       0.35      0.30      0.32      1247
    surprise       0.61      0.58      0.60       831

    accuracy                           0.40      7178
   macro avg       0.41      0.43      0.41      7178
weighted avg       0.41      0.40      0.40      7178

