In [1]:
import cv2
import numpy as np
import pandas as pd
import os
from skimage.feature import hog, local_binary_pattern
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [3]:
# Loading the Dataset
train_dir = 'Dataset/Train'
test_dir = 'Dataset/Test'

print(os.listdir(train_dir))

def get_data(dir):
    images = []
    labels = []
    
    for cat in os.listdir(dir):
        for img_name in os.listdir(os.path.join(dir, cat)):
            img = cv2.imread(os.path.join(dir,cat,img_name), cv2.IMREAD_GRAYSCALE)
            images.append(img)
            labels.append(cat)
    
    return np.array(images), np.array(labels)

train_images, train_labels = get_data(train_dir)
test_images, test_labels = get_data(test_dir)

all_images = np.concatenate((train_images, test_images))
all_labels = np.concatenate((train_labels, test_labels))

print(all_images.shape, all_labels.shape)


['happy', 'sad', 'fear', 'surprise', 'neutral', 'angry', 'disgust']
(35887, 48, 48) (35887,)


In [4]:
print(np.unique(all_labels))
# describe the labels
print(pd.Series(all_labels).value_counts())

['angry' 'disgust' 'fear' 'happy' 'neutral' 'sad' 'surprise']
happy       8989
neutral     6198
sad         6077
fear        5121
angry       4953
surprise    4002
disgust      547
Name: count, dtype: int64


In [5]:
# # Convert flattened images into a list of strings for saving
# image_strings = [' '.join(map(str, img)) for img in all_images]

# # Create a DataFrame with image data as a single column and labels
# df = pd.DataFrame({'image_data': image_strings, 'label': all_labels})

# # Save to CSV
# df.to_csv('all_images_labels.csv', index=False)

In [6]:
# Saving the images and labels to a CSV file
def save_images_labels_to_csv(images, labels, output_csv):
    with open(output_csv, 'w') as f:
        # Write the header for image pixels and the label
        columns = ['label'] + [f'pixel_{i}' for i in range(images[0].size)] 
        f.write(','.join(columns) + '\n')
        
        # Process and write each image's flattened data
        for image, label in zip(images, labels):
            # Flatten the image to a 1D array
            flattened_image = image.flatten()
            # Convert the array to a string format suitable for CSV
            row = np.append(label, flattened_image)
            row_str = ','.join(map(str, row))
            # Write the row to the CSV
            f.write(row_str + '\n')
    
    print(f"Data saved to {output_csv}")

# Save all images and labels to a single CSV file
save_images_labels_to_csv(all_images, all_labels, 'all_images_labels.csv')

Data saved to all_images_labels.csv


In [7]:
# X_train, X_test, y_train, y_test = train_test_split(all_images, all_labels, test_size=0.2, random_state=42)

# print(f"Train data shape: {X_train.shape}")
# print(f"Test data shape: {X_test.shape}")

In [8]:
# Loading the images from the csv
image_size = 48  # Assuming 48x48 images

# Load the data from the CSV
def load_images_labels_from_csv(csv_file):
    df = pd.read_csv(csv_file)
    labels = df['label'].values
    # Drop the label column to retain only pixel data
    image_data = df.drop(columns=['label']).values
    # Reshape the image data back to original shape (48x48)
    images = [np.array(image, dtype=np.uint8).reshape((image_size, image_size)) for image in image_data]
    return images, labels

# Load images and labels
all_images, all_labels = load_images_labels_from_csv('all_images_labels.csv')

In [9]:
print(len(all_labels))

35887


In [10]:
# Extract HOG features
def extract_hog_features(image):
    return hog(image, pixels_per_cell=(8, 8), cells_per_block=(2, 2), block_norm='L2-Hys')

# Extract ORB features
def extract_orb_features(image, max_features=128):
    orb = cv2.ORB_create()
    _, descriptors = orb.detectAndCompute(image, None)
    if descriptors is None:
        return np.zeros(max_features * 32)
    if descriptors.shape[0] < max_features:
        padding = np.zeros((max_features - descriptors.shape[0], descriptors.shape[1]))
        descriptors = np.vstack((descriptors, padding))
    return descriptors[:max_features].flatten()

# Extract histogram features (for grayscale images)
def extract_histogram(image):
    # Compute the histogram with 256 bins (grayscale)
    hist = cv2.calcHist([image], [0], None, [256], [0, 256])
    # Normalize the histogram
    hist = cv2.normalize(hist, hist).flatten()
    return hist

# Combine selected features
def extract_combined_features(image):
    hog_features = extract_hog_features(image)
    orb_features = extract_orb_features(image)
    histogram = extract_histogram(image)
    
    return np.concatenate([hog_features, orb_features, histogram])


In [12]:
print(type(all_images[0][1]))

<class 'numpy.ndarray'>


In [13]:
def save_features_to_csv(features, labels, output_csv):
    # Combine features and labels
    data = [np.append(label, feature) for feature, label in zip(features, labels)]
    # Create a DataFrame
    df = pd.DataFrame(data)
    # Save to CSV
    df.to_csv(output_csv, index=False, header=False)
    print(f"Features saved to {output_csv}")


In [15]:
X_train, X_test, y_train, y_test = train_test_split(all_images, all_labels, test_size=0.2, random_state=42)

# print(f"Train data shape: {X_train.shape}")
# print(f"Test data shape: {X_test.shape}")

In [17]:
# Step 1: Extract features for training and testing images
X_train_features = [extract_combined_features(img) for img in X_train]
X_test_features = [extract_combined_features(img) for img in X_test]

# Step 2: Convert features to numpy arrays for scaling and PCA
# X_train_features = np.array(train_features)
# X_test_features = np.array(test_features)

print(type(X_train_features))
print(X_train_features[0].shape)  

# Step 3: Standardize the training features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_features)

# Step 4: Apply PCA to the scaled training features
pca = PCA(n_components=500)  # Adjust n_components as needed
X_train_pca = pca.fit_transform(X_train_scaled)

# Step 5: Save the processed training features to a CSV
save_features_to_csv(X_train_pca, y_train, 'train_features.csv')

# Step 6: Apply the same scaler and PCA to the test features
X_test_scaled = scaler.transform(X_test_features)
X_test_pca = pca.transform(X_test_scaled)

# Step 7: Save the processed testing features to a CSV
save_features_to_csv(X_test_pca, y_test, 'test_features.csv')

<class 'list'>
(5252,)
Features saved to train_features.csv
Features saved to test_features.csv


### Train the model

In [20]:
import pandas as pd
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split

# Load the preprocessed features and labels from CSV
train_df = pd.read_csv('train_features.csv', header=None)

# Separate features and labels
X_train = train_df.iloc[:, 1:].values  # Features
y_train = train_df.iloc[:, 0].values  # Labels (assuming label is in the first column)

# Load the preprocessed features and labels from CSV
test_df = pd.read_csv('test_features.csv', header=None)

# Separate features and labels
X_test = test_df.iloc[:, 1:].values  # Features
y_test = test_df.iloc[:, 0].values  # Labels (assuming label is in the first column)

# Split the loaded data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use LazyClassifier for a quick comparison of models
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Print out the model performance
print(models)

 97%|█████████▋| 30/31 [23:36<01:11, 71.80s/it]  

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033866 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 127500
[LightGBM] [Info] Number of data points in the train set: 28709, number of used features: 500
[LightGBM] [Info] Start training from score -1.975426
[LightGBM] [Info] Start training from score -4.191921
[LightGBM] [Info] Start training from score -1.946712
[LightGBM] [Info] Start training from score -1.386887
[LightGBM] [Info] Start training from score -1.758429
[LightGBM] [Info] Start training from score -1.777202
[LightGBM] [Info] Start training from score -2.187829


100%|██████████| 31/31 [24:02<00:00, 46.52s/it]

                               Accuracy  Balanced Accuracy ROC AUC  F1 Score  \
Model                                                                          
LGBMClassifier                     0.48               0.42    None      0.47   
SVC                                0.48               0.42    None      0.47   
NearestCentroid                    0.40               0.39    None      0.41   
QuadraticDiscriminantAnalysis      0.47               0.38    None      0.46   
LogisticRegression                 0.43               0.38    None      0.42   
LinearDiscriminantAnalysis         0.43               0.38    None      0.42   
LinearSVC                          0.43               0.37    None      0.40   
CalibratedClassifierCV             0.43               0.36    None      0.41   
BaggingClassifier                  0.37               0.34    None      0.36   
RidgeClassifier                    0.42               0.34    None      0.39   
RidgeClassifierCV                  0.42 


