# 1. Import Library and Initialize Variables

In [52]:
from sklearn.metrics import accuracy_score, classification_report
from skimage.feature import graycomatrix, graycoprops
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import cv2
import os
import pandas as pd

# GLCM directions in radians: multiples of 45 degrees from 0 up to and including 180.
# NOTE(review): extract_features builds symmetric GLCMs, so the 180-degree entry
# presumably duplicates the 0-degree one - confirm before relying on all 30 features.
angles = [step * np.pi / 4 for step in range(5)]

# Odd neighbour counts tried by the k-NN sweep.
n_neighbors = list(range(1, 10, 2))

# Accumulators filled by the image-processing loop (one entry per usable image).
features, labels = [], []

# 2. Define Preprocessing and Feature Extraction Functions

In [53]:
def preprocess_image(image_path, target_size):
    """Load an image as grayscale and resize it so its longer side is target_size.

    Args:
        image_path: Path of the image file to read.
        target_size: Length, in pixels, given to the longer side of the image.

    Returns:
        The resized single-channel image produced by cv2.resize.

    Raises:
        ValueError: If the file cannot be read or decoded.
    """
    # cv2.imread signals failure by returning None instead of raising, which
    # would otherwise surface as an opaque error inside cvtColor.
    image = cv2.imread(image_path)
    if image is None:
        raise ValueError("Could not read image: {}".format(image_path))

    grayscale_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Pin the longer side to target_size and scale the other side by the same
    # ratio; clamp to 1 so extreme aspect ratios never truncate to 0 pixels.
    height, width = grayscale_image.shape[:2]
    if height > width:
        new_height = target_size
        new_width = max(1, int(width * (target_size / height)))
    else:
        new_width = target_size
        new_height = max(1, int(height * (target_size / width)))

    # cv2.resize takes the destination size as (width, height).
    return cv2.resize(grayscale_image, (new_width, new_height))

In [54]:
def extract_features(image):
    """Build a GLCM texture descriptor for a grayscale image.

    For every direction in the notebook-level ``angles`` list a gray-level
    co-occurrence matrix is computed at distance 1 (256 levels, symmetric,
    normalised), and six properties are read from it in this order:
    dissimilarity, correlation, homogeneity, contrast, ASM, energy.

    Args:
        image: Grayscale image passed straight to graycomatrix.

    Returns:
        A 1-D NumPy array with the per-angle property values concatenated
        in the order of ``angles``.
    """
    property_names = ('dissimilarity', 'correlation', 'homogeneity',
                      'contrast', 'ASM', 'energy')

    per_angle = []
    for theta in angles:
        glcm = graycomatrix(image, [1], [theta], levels=256, symmetric=True, normed=True)
        values = [graycoprops(glcm, name).ravel() for name in property_names]
        per_angle.append(np.concatenate(values))

    return np.concatenate(per_angle)

# 3. Read Image Data

In [55]:
parent_folder = "FacialExpression/"
subfolder_names = ["happy", "sad"]

# Build one small frame per class folder and concatenate them once at the end.
df_list = []
for subfolder in subfolder_names:
    subfolder_path = os.path.join(parent_folder, subfolder)
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore the seeded train/test split) reproducible.
    image_list = sorted(os.listdir(subfolder_path))
    # Store names without their extension; the folder name is the class label.
    image_names = [os.path.splitext(image)[0] for image in image_list]
    category = [subfolder] * len(image_names)
    image_df = pd.DataFrame(
        {"Image Name": image_names, "Category": category})
    df_list.append(image_df)
df = pd.concat(df_list, ignore_index=True)
print(df['Category'].value_counts())

happy    1537
sad      1463
Name: Category, dtype: int64


# 4. Process The Image
### a. Change Category Data (happy and sad) to Numeric Data (0 and 1)
### b. Resize Image So Its Longer Side Is 128 Pixels, Preserving Its Aspect Ratio
### c. Extract Features from Image using Feature Extraction Function with GLCM (Total 30 Features)
### d. Append Its Features and Label into List

In [56]:
# Start from empty accumulators so re-running this cell does not append
# a second copy of every sample.
features = []
labels = []

total_images = len(df)
for i, (imagePath, category_name) in enumerate(zip(df['Image Name'], df['Category'])):
    # Encode the class: happy -> 0, anything else (sad) -> 1.
    label = 0 if category_name == "happy" else 1
    path = os.path.join(parent_folder, category_name, imagePath + ".jpg")
    try:
        image = preprocess_image(path, 128)
        feat = extract_features(image)
    except Exception as exc:
        # Skip unusable images but report why, and let KeyboardInterrupt through.
        print("File corrupted: {} ({})".format(imagePath, exc))
    else:
        # Append both together so features and labels always stay aligned.
        features.append(feat)
        labels.append(label)

    # Show an update every 200 images and once more after the last image.
    if (i + 1) % 200 == 0 or i == total_images - 1:
        print("[INFO] processed {}/{}".format(i + 1, total_images))

[INFO] processed 10/3000
[INFO] processed 200/3000
[INFO] processed 400/3000
[INFO] processed 600/3000
[INFO] processed 800/3000
File corrupted: happy-0974
[INFO] processed 1000/3000
[INFO] processed 1200/3000
[INFO] processed 1400/3000
[INFO] processed 1600/3000
[INFO] processed 1800/3000
[INFO] processed 2000/3000
[INFO] processed 2200/3000
[INFO] processed 2400/3000
File corrupted: sad-0967
[INFO] processed 2600/3000
[INFO] processed 2800/3000
[INFO] processed 3000/3000


# 5. Split Data into Training and Testing Data (80% Training and 20% Testing)

In [57]:
# Hold out 20% of the extracted samples for testing; the fixed random_state
# makes the split repeatable for the same input order.
trainFeat, testFeat, trainLabels, testLabels = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

# 6. Train Model
### a. Iterate over the K values initialized earlier to find the best model
### b. Create Model with K Nearest Neighbor Classifier
### c. Train Model with Training Data
### d. Predict Label of Testing Data
### e. Calculate Accuracy of Model
### f. Save the Model with Highest Accuracy

In [58]:
# Sweep every candidate k, print its test-split metrics, and remember the
# classifier with the highest accuracy together with its k.
bestModel2, bestAcc2, k2 = None, 0.0, 0
for k in n_neighbors:
    print("[INFO] evaluating feature accuracy for k={}...".format(k))
    model = KNeighborsClassifier(n_neighbors=k).fit(trainFeat, trainLabels)
    pred_feat = model.predict(testFeat)
    acc = accuracy_score(testLabels, pred_feat)
    report = classification_report(testLabels, pred_feat, target_names=["happy", "sad"])

    print("[INFO] k-NN classifier: k={}".format(k))
    print("[INFO] feature accuracy: {:.2f}%".format(acc*100))
    print(report)

    # Strict comparison: on a tie the earlier k in the list is kept.
    if acc > bestAcc2:
        bestAcc2, bestModel2, k2 = acc, model, k

[INFO] evaluating feature accuracy for k=1...
[INFO] k-NN classifier: k=1
[INFO] feature accuracy: 48.17%
              precision    recall  f1-score   support

       happy       0.52      0.49      0.50       321
         sad       0.45      0.48      0.46       279

    accuracy                           0.48       600
   macro avg       0.48      0.48      0.48       600
weighted avg       0.48      0.48      0.48       600

[INFO] evaluating feature accuracy for k=3...
[INFO] k-NN classifier: k=3
[INFO] feature accuracy: 49.83%
              precision    recall  f1-score   support

       happy       0.53      0.50      0.52       321
         sad       0.46      0.50      0.48       279

    accuracy                           0.50       600
   macro avg       0.50      0.50      0.50       600
weighted avg       0.50      0.50      0.50       600

[INFO] evaluating feature accuracy for k=5...
[INFO] k-NN classifier: k=5
[INFO] feature accuracy: 53.00%
              precision    r

# 7. Export Model

In [59]:
import pickle

filename = "knn_model.sav"
# Persist the best-scoring classifier from the k sweep. `model` only holds
# whichever estimator was fitted last, which is not necessarily the best one.
# The context manager guarantees the file handle is flushed and closed.
with open(filename, 'wb') as model_file:
    pickle.dump(bestModel2, model_file)

# 8. If Necessary, Do Hyperparameter Tuning Based on the 2 Parameters Used in KNN
### a. n_neighbors 
### b. p (1 = manhattan_distance, 2 = euclidean_distance) for Minkowski Distance

In [60]:
from sklearn.model_selection import GridSearchCV

# Search space: odd neighbour counts 1..29 and the Minkowski power p
# (1 = Manhattan, 2 = Euclidean).
# NOTE: this rebinds the notebook-level `n_neighbors` used by the earlier k sweep.
n_neighbors = list(range(1, 30, 2))
p = [1, 2]
hyperparameters = {"n_neighbors": n_neighbors, "p": p}

# Grid search over a fresh k-NN estimator, using GridSearchCV's default settings.
knn_2 = KNeighborsClassifier()
clf = GridSearchCV(knn_2, hyperparameters)

# Run the search on the training split only.
best_model = clf.fit(trainFeat, trainLabels)

# Report the parameters of the best estimator found.
tuned_params = best_model.best_estimator_.get_params()
print('Best p:', tuned_params['p'])
print('Best n_neighbors:', tuned_params['n_neighbors'])

Best p: 1
Best n_neighbors: 23


In [73]:
# Tabulate the cross-validation results of the grid search.
results = pd.DataFrame(clf.cv_results_)
# Express the mean cross-validated score of each combination as a percentage.
results['accuracy'] = results['mean_test_score'] * 100
# Keep the sorted frame: sort_values returns a new object, so without the
# assignment `sorted_result` (and anything exported from it) stays unsorted.
sorted_result = (
    results[['param_n_neighbors', 'param_p', 'accuracy']]
    .sort_values(by='accuracy', ascending=False)
)
sorted_result

Unnamed: 0,param_n_neighbors,param_p,accuracy
22,23,1,55.505828
5,5,2,55.422147
24,25,1,55.297408
8,9,1,55.130045
6,7,1,55.088118
19,19,2,55.047147
25,25,2,55.046886
13,13,2,55.046625
18,19,1,55.046625
16,17,1,54.963553


In [71]:
# Write the grid-search accuracy table (n_neighbors, p, accuracy) to an Excel workbook.
sorted_result.to_excel("knn_gridsearch.xlsx")

In [77]:
# Re-evaluate a single configuration (k=5, p=2 i.e. Euclidean distance)
# on the held-out test split.
print("[INFO] evaluating feature accuracy for k={}...".format(5))
model = KNeighborsClassifier(n_neighbors=5, p=2).fit(trainFeat, trainLabels)
pred_feat = model.predict(testFeat)
acc = accuracy_score(testLabels, pred_feat)
report = classification_report(testLabels, pred_feat, target_names=["happy", "sad"])
print(
    "[INFO] k-NN classifier: k={}".format(5),
    "[INFO] feature accuracy: {:.2f}%".format(acc*100),
    report,
    sep="\n",
)

[INFO] evaluating feature accuracy for k=5...
[INFO] k-NN classifier: k=5
[INFO] feature accuracy: 53.00%
              precision    recall  f1-score   support

       happy       0.56      0.57      0.57       321
         sad       0.49      0.48      0.49       279

    accuracy                           0.53       600
   macro avg       0.53      0.53      0.53       600
weighted avg       0.53      0.53      0.53       600

