## Gender Classification

### Model - PCA and SVM

### Input - Images

In [None]:
import dlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# On the first run, uncomment the two commands below to download and unpack the dlib 68-point landmark model
# ! wget http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2
# ! bzip2 -d shape_predictor_68_face_landmarks.dat.bz2

In [None]:
# Read the DataFrame from the pickle file.
# NOTE(security): pickle.load can execute arbitrary code while unpickling,
# so only open dataset files that come from a trusted source.
with open('custom_dataset.pkl', 'rb') as f:
    loaded_df = pickle.load(f)

In [None]:
# Turn every stored pixel sequence (presumably a flat list of 512 * 512
# values) into a 512 x 512 NumPy matrix, for the front and the side view.
for image_column in ('image_front_data', 'image_side_data'):
    loaded_df[image_column] = loaded_df[image_column].apply(
        lambda pixels: np.array(pixels).reshape(512, 512)
    )

In [None]:
# Preview the first rows of the dataset (shown as the cell output).
loaded_df.head()

In [None]:
# Print the column names, dtypes and non-null counts of the dataset.
loaded_df.info()

In [None]:
# Load face detector and landmark predictor.
# `detector` returns the face rectangles found in an image; `predictor`
# places the 68 landmark points inside one such rectangle.
# The .dat model file must already be in the working directory (see the
# commented-out download commands near the top of the notebook).
detector = dlib.get_frontal_face_detector()
predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")

In [None]:
# Feature table: receives one row of landmark coordinates per image in which
# a face was detected.
temp = pd.DataFrame()

In [None]:
# Gender labels, one per row of `temp`: 0 for 'Male', 1 for any other value.
labels = []

In [None]:
# Extract the 68 dlib facial landmarks from every front-view image and turn
# them into one 136-value feature row (x0, y0, x1, y1, ...) plus a gender
# label.
#
# Rows and labels are gathered locally and appended to `temp` / `labels` once,
# after the loop: calling pd.concat per image re-copies the whole frame every
# iteration (quadratic cost), and committing both together keeps them aligned
# even if an image raises part-way through.
#
# NOTE(review): dlib normally expects 8-bit image arrays -- confirm the dtype
# produced by the reshape cell above.
new_rows = []
new_labels = []

for sample_id, image, sex in zip(
    loaded_df['id'], loaded_df['image_front_data'], loaded_df['sex']
):
    # Detect faces
    faces = detector(image)

    if not faces:
        print('No face detected :', sample_id)
        continue

    # Only the first detected face is used.
    landmarks = predictor(image, faces[0])

    if not landmarks:
        print('No landmarks detected :', sample_id)
        continue

    # Flatten the 68 (x, y) points into [x0, y0, x1, y1, ...].
    feature = []
    for part_index in range(68):
        point = landmarks.part(part_index)
        feature.extend((point.x, point.y))

    new_rows.append(feature)
    # Label encoding: 0 for 'Male', 1 for every other value of `sex`.
    new_labels.append(0 if sex == 'Male' else 1)

if new_rows:
    temp = pd.concat([temp, pd.DataFrame(new_rows)], ignore_index=True)
    labels.extend(new_labels)

In [None]:
# Class balance: (number of samples labelled 0, number labelled 1).
labels.count(0), labels.count(1)

In [None]:
# Preview the first rows of the extracted landmark features.
temp.head()

In [None]:
# (rows, columns) of the feature table: one row per image that yielded
# landmarks, two columns (x and y) per landmark point.
temp.shape

In [None]:
def PCA(dataset, n_components=20, *, plot=True):
    """Project ``dataset`` onto its leading principal components.

    The data are mean-centred, the covariance matrix is eigendecomposed, and
    the centred data are projected onto the eigenvectors with the largest
    eigenvalues. The number of components needed to explain more than 99% of
    the variance is printed for information only; the projection itself uses
    ``n_components``.

    Parameters
    ----------
    dataset : array-like of shape (n_samples, n_features)
        Input samples, one per row.
    n_components : int, default 20
        Number of leading components to keep. If it exceeds ``n_features``
        all components are kept.
    plot : bool, default True
        When true, draw the cumulative explained-variance curve with the
        module-level ``plt`` (matplotlib.pyplot). Pass ``False`` for
        non-interactive use.

    Returns
    -------
    W : ndarray of shape (n_features, k)
        Projection matrix; each column is one principal direction, ordered by
        decreasing eigenvalue (``k = min(n_components, n_features)``).
    dataset_pca : ndarray of shape (n_samples, k)
        The centred data projected onto ``W``.
    mean : ndarray of shape (n_features,)
        Per-feature mean that was subtracted before projecting.

    Raises
    ------
    ValueError
        If ``dataset`` is not a non-empty 2-D array or ``n_components < 1``.
    """
    data = np.asarray(dataset, dtype=float)
    if data.ndim != 2 or data.shape[0] == 0:
        raise ValueError(
            'dataset must be a non-empty 2-D array of shape '
            '(n_samples, n_features)'
        )
    if n_components < 1:
        raise ValueError('n_components must be at least 1')

    mean = np.mean(data, axis=0)
    z = data - mean
    cov_matrix = z.T.dot(z) / len(data)

    # The covariance matrix is symmetric, so use eigh: it guarantees real
    # eigenvalues and orthonormal eigenvectors, whereas eig may return
    # complex values caused by round-off.
    eig_values, eig_vectors = np.linalg.eigh(cov_matrix)

    # eigh returns ascending eigenvalues; reorder to descending and keep the
    # eigenvectors as COLUMNS so that slicing columns selects components.
    order = np.argsort(eig_values)[::-1]
    sorted_eigvalues = eig_values[order]
    sorted_eigvectors = eig_vectors[:, order]

    # Round-off can push zero eigenvalues slightly negative; clip them so the
    # explained-variance ratio stays monotone.
    variances = np.clip(sorted_eigvalues, 0.0, None)
    total_variance = variances.sum()
    if total_variance > 0:
        cumulative_variance = np.cumsum(variances) / total_variance
    else:
        # Every sample is identical: there is no variance left to explain.
        cumulative_variance = np.ones_like(variances)

    if plot:
        plt.plot(cumulative_variance)
        plt.xlabel('Number of Principal Components')
        plt.ylabel('Cumulative Variance')
        plt.title('Cumulative Variance vs Number of Principal Components')
        plt.show()

    # Smallest number of components whose cumulative variance exceeds 99%.
    n_required = int(np.argmax(cumulative_variance > 0.99)) + 1

    print('Number of Principal Components required:', n_required)

    W = sorted_eigvectors[:, :n_components]
    print('Shape of W:', W.shape)

    dataset_pca = z.dot(W)
    print('Shape of Reduced Dataset:', dataset_pca.shape)
    return W, dataset_pca, mean

In [None]:
# Run PCA on the landmark feature table; keep the projection matrix (W) and
# the per-feature mean alongside the reduced data.
W, dataset_reduced, mean = PCA(np.array(temp))

In [None]:
# Train and evaluate a linear SVM on the PCA-reduced features.
# NOTE(review): PCA was fitted on all samples before this split, so the
# held-out 20% influenced the projection -- consider fitting PCA on the
# training split only.

# Hold out 20% of the samples for testing; random_state fixes the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(dataset_reduced, labels, test_size=0.2, random_state=42)

clf = SVC(kernel='linear')
clf.fit(X_train, y_train)

# Fraction of held-out samples whose predicted label matches the true label.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

In [None]:
# Baseline: train and evaluate the same linear SVM on the original
# (un-reduced) landmark features, using the same test_size and random_state
# as the PCA experiment.
# Note: this cell rebinds X_train, X_test, y_train, y_test, clf, y_pred and
# accuracy from the previous cell.
X_train, X_test, y_train, y_test = train_test_split(temp, labels, test_size=0.2, random_state=42)

clf = SVC(kernel='linear')
clf.fit(X_train, y_train)

# Fraction of held-out samples whose predicted label matches the true label.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)