In [4]:
import cv2
import numpy as np
import pandas as pd
import os
from skimage.feature import hog, local_binary_pattern
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import pickle

In [5]:
# Needed to process our augmented and oversampled dataset (Will combine ipynbs afterwards)
from PIL import Image, ImageEnhance, ImageOps
import os
import numpy as np
import random

# Define paths to the train and test directories
train_dir = 'Dataset/Train'
test_dir = 'Dataset/Test'

# Check if the directories exist
if not os.path.exists(train_dir):
    raise FileNotFoundError(f"Training directory {train_dir} not found.")
if not os.path.exists(test_dir):
    raise FileNotFoundError(f"Test directory {test_dir} not found.")

# Function to load images from a directory and its subdirectories
def load_images_from_directory(directory):
    images = {}
    for root, _, files in os.walk(directory):
        class_name = os.path.basename(root)
        images[class_name] = []
        for filename in files:
            if filename.endswith(".jpg") or filename.endswith(".png"):
                img_path = os.path.join(root, filename)
                with Image.open(img_path) as img:
                    img_array = np.array(img)
                    images[class_name].append(img_array)
    return images

# Load images from the train and test directories
train_images = load_images_from_directory(train_dir)
test_images = load_images_from_directory(test_dir)

# Print the number of images loaded for each class
for class_name, images in train_images.items():
    print(f"Loaded {len(images)} images from class {class_name} in {train_dir}")
for class_name, images in test_images.items():
    print(f"Loaded {len(images)} images from class {class_name} in {test_dir}")

if "Train" in train_images:train_images.pop("Train")
if "Test" in test_images:test_images.pop("Test")

print(train_images.keys())
print(test_images.keys())




def augment_image(image):
    # Convert numpy array to PIL Image
    pil_image = Image.fromarray(image)

    # Random rotation
    if random.random() > 0.5:
        angle = random.uniform(-30, 30)
        pil_image = pil_image.rotate(angle)

    # Random horizontal flip
    if random.random() > 0.5:
        pil_image = ImageOps.mirror(pil_image)

    # Random vertical flip
    if random.random() > 0.5:
        pil_image = ImageOps.flip(pil_image)

    # Random Gaussian noise
    if random.random() > 0.5:
        np_image = np.array(pil_image)
        mean = 0
        std = random.uniform(0, 25)
        gauss = np.random.normal(mean, std, np_image.shape).astype('uint8')
        np_image = np.clip(np_image + gauss, 0, 255)
        pil_image = Image.fromarray(np_image)

    # Convert PIL Image back to numpy array
    return np.array(pil_image)

# Apply augmentation to all images in the train_images dictionary
augmented_train_images = {}
for class_name, images in train_images.items():
    augmented_train_images[class_name] = [augment_image(image) for image in images]

# Print the number of augmented images for each class
for class_name, images in augmented_train_images.items():
    print(f"Augmented {len(images)} images for class {class_name}")

# Merge augmented images with original training images
for class_name, images in augmented_train_images.items():
    if class_name in train_images:
        train_images[class_name].extend(images)
    else:
        train_images[class_name] = images

# Count the number of images in each class for train and test datasets
train_counts = {class_name: len(images) for class_name, images in train_images.items()}
test_counts = {class_name: len(images) for class_name, images in test_images.items()}



# Step 1: Calculate the target number of images for each class (equal to the maximum class size)
max_class_size = max(train_counts.values())
print(f"Maximum class size: {max_class_size}")

# Step 2: Function to oversample a class by augmenting images
def oversample_class(images, target_size):
    augmented_images = []
    while len(images) + len(augmented_images) < target_size:
        # Augment the images to reach the target size
        image = random.choice(images)  # Randomly pick an image from the class
        augmented_image = augment_image(image)  # Augment the selected image
        augmented_images.append(augmented_image)
    return images + augmented_images

# Step 3: Oversample each class in the train_images dictionary
for class_name, images in train_images.items():
    if len(images) < max_class_size:
        # If the class has fewer images than the max_class_size, oversample it
        train_images[class_name] = oversample_class(images, max_class_size)
    print(f"Oversampled class {class_name} to {len(train_images[class_name])} images")

# Step 4: Recalculate the number of images in each class after oversampling
train_counts = {class_name: len(images) for class_name, images in train_images.items()}





Loaded 0 images from class Train in Dataset/Train
Loaded 7215 images from class happy in Dataset/Train
Loaded 4830 images from class sad in Dataset/Train
Loaded 4097 images from class fear in Dataset/Train
Loaded 3171 images from class surprise in Dataset/Train
Loaded 4965 images from class neutral in Dataset/Train
Loaded 3995 images from class angry in Dataset/Train
Loaded 436 images from class disgust in Dataset/Train
Loaded 0 images from class Test in Dataset/Test
Loaded 1774 images from class happy in Dataset/Test
Loaded 1247 images from class sad in Dataset/Test
Loaded 1024 images from class fear in Dataset/Test
Loaded 831 images from class surprise in Dataset/Test
Loaded 1233 images from class neutral in Dataset/Test
Loaded 958 images from class angry in Dataset/Test
Loaded 111 images from class disgust in Dataset/Test
dict_keys(['happy', 'sad', 'fear', 'surprise', 'neutral', 'angry', 'disgust'])
dict_keys(['happy', 'sad', 'fear', 'surprise', 'neutral', 'angry', 'disgust'])
Augme

In [6]:
X_train, y_train, X_test, y_test = [], [], [], []

# Flatten training images and store them along with their labels
for label, images in train_images.items():
    for img in images:
        X_train.append(img)  
        y_train.append(label)

# Flatten testing images and store them along with their labels
for label, images in test_images.items():
    for img in images:
        X_test.append(img)
        y_test.append(label)

# Convert the lists to numpy arrays after collecting all data
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

In [7]:
# # Loading the Dataset
# train_dir = 'Dataset/Train'
# test_dir = 'Dataset/Test'

# print(os.listdir(train_dir))

# def get_data(dir):
#     images = []
#     labels = []
    
#     for cat in os.listdir(dir):
#         for img_name in os.listdir(os.path.join(dir, cat)):
#             img = cv2.imread(os.path.join(dir,cat,img_name), cv2.IMREAD_GRAYSCALE)
#             images.append(img)
#             labels.append(cat)
    
#     return np.array(images), np.array(labels)

# train_images, train_labels = get_data(train_dir)
# test_images, test_labels = get_data(test_dir)

# all_images = np.concatenate((train_images, test_images))
# all_labels = np.concatenate((train_labels, test_labels))

# print(all_images.shape, all_labels.shape)


In [8]:
# print(np.unique(all_labels))
# # describe the labels
# print(pd.Series(all_labels).value_counts())

In [9]:
# # Convert flattened images into a list of strings for saving
# image_strings = [' '.join(map(str, img)) for img in all_images]

# # Create a DataFrame with image data as a single column and labels
# df = pd.DataFrame({'image_data': image_strings, 'label': all_labels})

# # Save to CSV
# df.to_csv('all_images_labels.csv', index=False)

In [10]:
# # Saving the images and labels to a CSV file
# def save_images_labels_to_csv(images, labels, output_csv):
#     with open(output_csv, 'w') as f:
#         # Write the header for image pixels and the label
#         columns = ['label'] + [f'pixel_{i}' for i in range(images[0].size)] 
#         f.write(','.join(columns) + '\n')
        
#         # Process and write each image's flattened data
#         for image, label in zip(images, labels):
#             # Flatten the image to a 1D array
#             flattened_image = image.flatten()
#             # Convert the array to a string format suitable for CSV
#             row = np.append(label, flattened_image)
#             row_str = ','.join(map(str, row))
#             # Write the row to the CSV
#             f.write(row_str + '\n')
    
#     print(f"Data saved to {output_csv}")

# # Save all images and labels to a single CSV file
# save_images_labels_to_csv(all_images, all_labels, 'all_images_labels.csv')

In [11]:
# X_train, X_test, y_train, y_test = train_test_split(all_images, all_labels, test_size=0.2, random_state=42)

# print(f"Train data shape: {X_train.shape}")
# print(f"Test data shape: {X_test.shape}")

In [12]:
# # Loading the images from the csv
# image_size = 48  # Assuming 48x48 images

# # Load the data from the CSV
# def load_images_labels_from_csv(csv_file):
#     df = pd.read_csv(csv_file)
#     labels = df['label'].values
#     # Drop the label column to retain only pixel data
#     image_data = df.drop(columns=['label']).values
#     # Reshape the image data back to original shape (48x48)
#     images = [np.array(image, dtype=np.uint8).reshape((image_size, image_size)) for image in image_data]
#     return images, labels

# # Load images and labels
# all_images, all_labels = load_images_labels_from_csv('all_images_labels.csv')

In [13]:
# print(len(all_labels))

In [14]:
# Extract HOG features
def extract_hog_features(image):
    return hog(image, pixels_per_cell=(8, 8), cells_per_block=(2, 2), block_norm='L2-Hys')

# Extract ORB features
def extract_orb_features(image, max_features=128):
    orb = cv2.ORB_create()
    _, descriptors = orb.detectAndCompute(image, None)
    if descriptors is None:
        return np.zeros(max_features * 32)
    if descriptors.shape[0] < max_features:
        padding = np.zeros((max_features - descriptors.shape[0], descriptors.shape[1]))
        descriptors = np.vstack((descriptors, padding))
    return descriptors[:max_features].flatten()

# Extract histogram features (for grayscale images)
def extract_histogram(image):
    # Compute the histogram with 256 bins (grayscale)
    hist = cv2.calcHist([image], [0], None, [256], [0, 256])
    # Normalize the histogram
    hist = cv2.normalize(hist, hist).flatten()
    return hist

# Combine selected features
def extract_combined_features(image):
    hog_features = extract_hog_features(image)
    orb_features = extract_orb_features(image)
    histogram = extract_histogram(image)
    
    return np.concatenate([hog_features, orb_features, histogram])


In [15]:
# print(type(all_images[0][1]))

In [16]:
def save_features_to_csv(features, labels, output_csv):
    # Combine features and labels
    data = [np.append(label, feature) for feature, label in zip(features, labels)]
    # Create a DataFrame
    df = pd.DataFrame(data)
    # Save to CSV
    df.to_csv(output_csv, index=False, header=False)
    print(f"Features saved to {output_csv}")


In [17]:
# X_train, X_test, y_train, y_test = train_test_split(all_images, all_labels, test_size=0.2, random_state=42)

# print(f"Train data shape: {X_train.shape}")
# print(f"Test data shape: {X_test.shape}")

In [18]:
print(X_train.shape)

(101010, 48, 48)


In [19]:
# Step 1: Extract features for training and testing images
X_train_features = [extract_combined_features(img) for img in X_train]
X_test_features = [extract_combined_features(img) for img in X_test]

# Step 2: Convert features to numpy arrays for scaling and PCA
# X_train_features = np.array(train_features)
# X_test_features = np.array(test_features)

print(type(X_train_features))
print(X_train_features[0].shape)  

# Step 3: Standardize the training features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_features)

# Step 4: Apply PCA to the scaled training features
pca = PCA(n_components=100)  # Adjust n_components as needed
X_train_pca = pca.fit_transform(X_train_scaled)

# Step 5: Save the processed training features to a CSV
save_features_to_csv(X_train_pca, y_train, 'train_features.csv')

# Step 6: Apply the same scaler and PCA to the test features
X_test_scaled = scaler.transform(X_test_features)
X_test_pca = pca.transform(X_test_scaled)

# Step 7: Save the processed testing features to a CSV
save_features_to_csv(X_test_pca, y_test, 'test_features.csv')

<class 'list'>
(5252,)
Features saved to train_features.csv
Features saved to test_features.csv


### Train the model

In [20]:
# import pandas as pd
# from lazypredict.Supervised import LazyClassifier
# from sklearn.model_selection import train_test_split

# # Load the preprocessed features and labels from CSV
# train_df = pd.read_csv('train_features.csv', header=None)

# # Separate features and labels
# X_train = train_df.iloc[:, 1:].values  # Features
# y_train = train_df.iloc[:, 0].values  # Labels (assuming label is in the first column)

# # Load the preprocessed features and labels from CSV
# test_df = pd.read_csv('test_features.csv', header=None)

# # Separate features and labels
# X_test = test_df.iloc[:, 1:].values  # Features
# y_test = test_df.iloc[:, 0].values  # Labels (assuming label is in the first column)

# # Split the loaded data into training and testing sets
# # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Use LazyClassifier for a quick comparison of models
# clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
# models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# # Print out the model performance
# print(models)

#### Training the best model

In [21]:
# Load the preprocessed features and labels from CSV
train_df = pd.read_csv('train_features.csv', header=None)

# Separate features and labels
X_train = train_df.iloc[:, 1:].values  # Features
y_train = train_df.iloc[:, 0].values  # Labels (assuming label is in the first column)

# Load the preprocessed features and labels from CSV
test_df = pd.read_csv('test_features.csv', header=None)

# Separate features and labels
X_test = test_df.iloc[:, 1:].values  # Features
y_test = test_df.iloc[:, 0].values  # Labels (assuming label is in the first column)

# Split the dataset into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the SVC model
model = SVC(kernel='rbf')  # You can change the kernel to 'rbf', 'poly', etc.

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[ 405   23   78  113  150  146   43]
 [  19   64    7   10    1    8    2]
 [ 128   11  359   94  134  196  102]
 [  83    5   62 1351  116  134   23]
 [  99    9   87  156  652  183   47]
 [ 157   11  111  153  211  578   26]
 [  33    8   64   53   64   43  566]]

Classification Report:
              precision    recall  f1-score   support

       angry       0.44      0.42      0.43       958
     disgust       0.49      0.58      0.53       111
        fear       0.47      0.35      0.40      1024
       happy       0.70      0.76      0.73      1774
     neutral       0.49      0.53      0.51      1233
         sad       0.45      0.46      0.46      1247
    surprise       0.70      0.68      0.69       831

    accuracy                           0.55      7178
   macro avg       0.53      0.54      0.53      7178
weighted avg       0.55      0.55      0.55      7178



In [22]:
print("\nModel Accuracy:")
print(model.score(X_test, y_test))


Model Accuracy:
0.5537754249094455


In [23]:
with open('svc_model2.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)