In [1]:
import numpy as np
import pandas as pd
import os
import cv2
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_predict, cross_val_score, KFold, StratifiedKFold, train_test_split
from eigenface_project import eigenface_project
from sklearn.metrics import classification_report
# Number of pixels along the x and y axes of each image
xPixel = 48  
yPixel = 48

In [2]:
# Load the raw data from CSV (disabled: the images are read from the balanced image folders in the next cell instead)
# df = pd.read_csv("../row_data_to_csv/train_data/face_data_train.csv", header=0) # specify the first row as header

In [3]:
# Root folder of the balanced training set; each sub-folder name is used as the class label
train_dir = os.path.expanduser('../train_balanced')

# Define a function to load the balanced data
def load_images_from_directory(directory):
    """Load every readable image under ``directory`` as grayscale.

    ``directory`` is expected to contain one sub-directory per class; the
    sub-directory name becomes the label of every image inside it. Entries
    directly under ``directory`` that are not directories are ignored.

    Parameters
    ----------
    directory : str
        Path of the folder holding the per-class sub-directories.

    Returns
    -------
    images : list
        Decoded grayscale images as returned by ``cv2.imread``.
    labels : list of str
        Class label of each image, aligned index-by-index with ``images``.

    Notes
    -----
    Files that OpenCV cannot decode are skipped (and reported) instead of
    being stored as ``None``, which would otherwise break the later
    conversion of the image list into a single numeric array.
    """
    images = []
    labels = []
    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore any seeded CV split) reproducible across machines.
    for label in sorted(os.listdir(directory)):
        label_path = os.path.join(directory, label)
        if not os.path.isdir(label_path):
            continue
        for file in sorted(os.listdir(label_path)):
            file_path = os.path.join(label_path, file)
            image = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
            if image is None:
                # cv2.imread signals failure by returning None rather than raising.
                print(f"Skipping unreadable image file: {file_path}")
                continue
            images.append(image)
            labels.append(label)
    return images, labels

# Load the training images and their class labels
train_images, train_labels = load_images_from_directory(train_dir)

print(f"Loaded {len(train_images)} training images.")

Loaded 34795 training images.


In [4]:
# Number of images per class. A bare expression is only echoed when it is the
# last statement of a cell, so the counts are printed explicitly.
unique_labels, per_label_counts = np.unique(train_labels, return_counts=True)
label_counts = dict(zip(unique_labels.tolist(), per_label_counts.tolist()))
print(label_counts)

# Shape of the stacked image array: (n_images, height, width)
print(np.array(train_images).shape)

(34795, 48, 48)


In [5]:
# Convert the loaded images (X) and their labels (y) to NumPy arrays
train_images = np.array(train_images)
Class = np.array(train_labels)

In [None]:
# Project the image data onto the eigenvectors
train_images_project, selected_eigenvectors = eigenface_project(train_images)

# Fail fast if the projection is not aligned one-entry-per-image with the labels;
# otherwise the mismatch only surfaces later as a ValueError inside GridSearchCV.fit.
assert len(train_images_project) == len(Class), (
    f"projection has {len(train_images_project)} samples but there are {len(Class)} labels"
)

In [8]:
# Tune n_neighbors for KNN with stratified 10-fold cross-validation

# Pipeline: standardise the projected features, then classify with KNN.
# Keeping the scaler inside the pipeline means it is re-fit on each training fold.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

# Candidate values for n_neighbors: 1 to 200 inclusive
param_grid = {
    'knn__n_neighbors': range(1, 201)
}

# Stratified, shuffled folds with a fixed seed so the split is reproducible
skcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=2024)

# 200 candidates x 10 folds are independent fits, so run them on all available
# cores (n_jobs=-1); this changes the wall-clock time only, not the scores.
grid_search = GridSearchCV(pipeline, param_grid, cv=skcv, scoring='accuracy', n_jobs=-1)

# Fit the grid search on the projected images
grid_search.fit(train_images_project, Class)

# Report the best n_neighbors and its mean cross-validated accuracy
print("Best n_neighbors:", grid_search.best_params_['knn__n_neighbors'])
print("Best accuracy:", grid_search.best_score_)

ValueError: Found input variables with inconsistent numbers of samples: [2, 34795]

In [None]:
# Print the mean cross-validation score for each candidate n_neighbors.
# The candidates are read back from cv_results_ itself, so the labels stay
# correct even if the range in param_grid is changed.
cv_results = grid_search.cv_results_
for params, mean_score in zip(cv_results['params'], cv_results['mean_test_score']):
    print(f"n_neighbors = {params['knn__n_neighbors']}:")
    print(f"  Mean cross-validation score: {mean_score:.4f}")
# cv_results['std_test_score'] holds the matching per-candidate standard deviations if needed.

n_neighbors = 1:
  Mean cross-validation score: 0.3866
n_neighbors = 2:
  Mean cross-validation score: 0.3415
n_neighbors = 3:
  Mean cross-validation score: 0.3429
n_neighbors = 4:
  Mean cross-validation score: 0.3443
n_neighbors = 5:
  Mean cross-validation score: 0.3440
n_neighbors = 6:
  Mean cross-validation score: 0.3403
n_neighbors = 7:
  Mean cross-validation score: 0.3364
n_neighbors = 8:
  Mean cross-validation score: 0.3366
n_neighbors = 9:
  Mean cross-validation score: 0.3376
n_neighbors = 10:
  Mean cross-validation score: 0.3354
n_neighbors = 11:
  Mean cross-validation score: 0.3348
n_neighbors = 12:
  Mean cross-validation score: 0.3332
n_neighbors = 13:
  Mean cross-validation score: 0.3310
n_neighbors = 14:
  Mean cross-validation score: 0.3286
n_neighbors = 15:
  Mean cross-validation score: 0.3287
n_neighbors = 16:
  Mean cross-validation score: 0.3264
n_neighbors = 17:
  Mean cross-validation score: 0.3259
n_neighbors = 18:
  Mean cross-validation score: 0.3247
n

In [None]:
# Per-class precision, recall and f1-score for the n_neighbors chosen by the grid search

best_k = grid_search.best_params_['knn__n_neighbors']
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=best_k)),
])

# Out-of-fold predictions from the same stratified folds used for tuning
predictions = cross_val_predict(pipeline, train_images_project, y=Class, cv=skcv)

# Detailed per-class report
report = classification_report(y_true=Class, y_pred=predictions)
print(report)


              precision    recall  f1-score   support

       angry       0.34      0.26      0.30      5000
     disgust       0.50      0.66      0.57      5000
        fear       0.34      0.32      0.33      5000
       happy       0.40      0.31      0.34      5000
     neutral       0.29      0.44      0.35      4965
         sad       0.33      0.28      0.30      4830
    surprise       0.51      0.44      0.47      5000

    accuracy                           0.39     34795
   macro avg       0.39      0.39      0.38     34795
weighted avg       0.39      0.39      0.38     34795

