In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
import numpy as np
import math
import pandas as pd
import matplotlib.pyplot as plt
import random
import glob
import cv2
import os
from numpy import mean
from numpy import cov
from numpy.linalg import eig

In [0]:
def rgb2gray(rgb):
    return np.dot(rgb[...,:3], [0.2989, 0.5870, 0.1140])

## We read the files using OpenCV and store them in a list after downscaling the images and flattening them. We also create a dictionary mapping the file labels to the image matrix.

In [None]:
faces = []
path= '/content/drive/My Drive/SMAI_Assignment3_Dataset/dataset/'
data_path = os.path.join(path,'*g')
files = glob.glob(data_path)
index = path.rindex('/')
labels = []
# print(index)
labels_dict = {}
for f1 in files:
    # print(f1)
    file_name = f1[index+1:]
    file_label, name = file_name.split('_')
    labels.append(file_label)
    img = cv2.imread(f1)
    img = rgb2gray(img)
    img = cv2.resize(img, (100,100), interpolation = cv2.INTER_AREA)
    face = img.flatten()
    faces.append(face)
    if file_label in labels_dict:
        labels_dict[file_label].append(face)
    else:
        labels_dict[file_label] = [face] 

faces = np.array(faces)
print(faces)
faces.shape

### We calculate the mean of each column and center the values in each column of the matrix by subtracting the mean column value. We also find the covariance matrix of the centered matrix.

In [0]:
img_mean = mean(faces)
temp = faces - img_mean
covariance = cov(temp.T)

### We calculate the eigen decomposition of the covariance matrix, thus resulting in a list of eigenvalues and a list of eigenvectors. We sort the eigen values in descending order and accordingly store the eigen vectors.

In [0]:
e, v = eig(covariance)
indexes = e.argsort()[::-1]   
eigen_values = e[indexes]
eigen_vectors = v[:,indexes]

### We calculate the number of components required by keeping a variance of atleast 90%.

In [7]:
total = np.sum(eigen_values)
variance_reqd = 0.90
var_list = list()
no_of_components = 0
components_reqd = 0
sum1 = 0
while True:
  sum1 += eigen_values[no_of_components]
  var = sum1/total
  var_list.append(var)
  var_achieved = np.real(var)
  if var_achieved >= variance_reqd:
      components_reqd = no_of_components + 1
      break
  no_of_components += 1


print("No. of components required to achieve less than 10% error: ", components_reqd)

No. of components required to achieve less than 10% error:  76


### We perform PCA on the number of components as found out from above and then form the reconstruction matrix, which will be used to reconstruct the images.

In [51]:
components = components_reqd

vectors = eigen_vectors[:,:components]
transformation_matrix = np.real(vectors)
pca_projections = np.dot(faces, transformation_matrix)
reconstruction_matrix = np.dot(pca_projections, transformation_matrix.T)

print(faces.shape)
print(reconstruction_matrix.shape)

(520, 10000)
(520, 10000)


In [0]:
i=0
labels_dict1 = {}
for i in range(reconstruction_matrix.shape[0]):
    face = reconstruction_matrix[i]
    if labels[i] in labels_dict1:
        labels_dict1[labels[i]].append(face)
    else:
        labels_dict1[labels[i]] = [face] 

faces = np.array(faces)
# print(faces)
# faces.shape

## We create the training and testing sets by splitting the data

In [0]:
num_of_classes = len(labels_dict1)

def one_hot_encoding(label):
  one_hot_label = []
  one_hot_label = [0 for i in range(num_of_classes)]
  one_hot_label[label] = 1
  return one_hot_label

X_train = [0 for i in range(10000)]
y_train = [0 for i in range(8)]

X_test = [0 for i in range(10000)]
y_test = []

for key,value in labels_dict1.items():
  label = int(key)
  one_hot_label = one_hot_encoding(label)
  data = np.array(value)
  train = data[:55,:]
  test = data[55:,:]
  X_train = np.vstack((X_train, train))
  X_test = np.vstack((X_test, test))
  y_train = np.vstack((y_train,np.array([one_hot_label]*55)))
  for i in range(10):
    y_test.append(label)

X_train = X_train[1:,:]
y_train = y_train[1:,:]
X_test = X_test[1:,:]
y_test = np.array(y_test)

## We scale the data and then add the bias column.

In [0]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train=scaler.transform(X_train)
scaler.fit(X_test)
X_test=scaler.transform(X_test)

In [0]:
print(X_train.shape[0])
ones = np.ones([X_train.shape[0],1])
X_train = np.concatenate((ones,X_train),axis=1)
ones = np.ones([X_test.shape[0],1])
X_test = np.concatenate((ones,X_test),axis=1)

## Logistic regression function

In [0]:
def sigmoid(z):
  return 1 / (1 + np.exp(-z))

def loss_func(h, y):
  return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()


def logistic_regression(X_train, y_train, iter, alpha):
  theta = np.random.rand(X_train.shape[1])
  count = 1
  while(count<=iter):
    temp = np.dot(X_train, theta)
    h = sigmoid(temp)
    # print(h.dtype)
    count = count+1

    error = h-y_train


    gradient = np.dot(X_train.T, error)/y_train.shape[0]
    theta = theta - alpha*gradient
    # print(count)
    # z = np.dot(X_train, theta)
    # h = sigmoid(z)
    # loss = loss_func(h, y_train)

    # print(loss)

  return theta

## We run the logistic regression function using learning rate as 0.001 for 10000 iterations

In [0]:
weights = [[0 for i in range(10001)]]
for i in range(num_of_classes):
  weights = np.vstack((weights,logistic_regression(X_train, y_train[:,i], 10000, 0.001)))


weights = weights[1:]

## We make the predictions.

In [0]:
predictions = sigmoid(np.dot(X_test, weights.T))
print(predictions)
print(predictions.shape)

y_pred=[]

for i in range(predictions.shape[0]):
  maxElement = np.amax(predictions[i])
  for j in range(8):
    if(predictions[i][j] == maxElement):
      ind = j
      break
  y_pred.append(j)

# print(len(y_pred))
# print(y_pred)
# print(y_test)

## The accuracy is found out to be 0.65
### Due the random initialization of weights, the accuracy may vary slightly in different trials.

In [160]:
a_1 = (y_pred == y_test).mean()
print("Accuracy: ", a_1)

Accuracy:  0.65


## The confusion matrix and f1 score obtained are printed below

In [157]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,y_pred))

[[ 5  2  0  0  2  0  0  1]
 [ 2  3  1  2  1  0  1  0]
 [ 0  1  5  2  0  1  1  0]
 [ 0  0  0 10  0  0  0  0]
 [ 0  0  1  0  8  1  0  0]
 [ 0  0  2  0  0  8  0  0]
 [ 2  2  1  1  0  0  4  0]
 [ 0  0  0  0  0  1  0  9]]


In [159]:
from sklearn.metrics import f1_score
f1_1 = f1_score(y_test,y_pred, average='weighted')
print("f1 score: ", f1_1)

f1 score:  0.6354323308270677


# Without PCA
## In this section we test our model without applying PCA

## First we create the training and testing sets by splitting the data

In [0]:
num_of_classes = len(labels_dict)

def one_hot_encoding(label):
  one_hot_label = []
  one_hot_label = [0 for i in range(num_of_classes)]
  one_hot_label[label] = 1
  return one_hot_label

X_train = [0 for i in range(10000)]
y_train = [0 for i in range(8)]

X_test = [0 for i in range(10000)]
y_test = []

for key,value in labels_dict.items():
  label = int(key)
  one_hot_label = one_hot_encoding(label)
  # print(label, one_hot_label)
  data = np.array(value)
  train = data[:55,:]
  test = data[55:,:]
  X_train = np.vstack((X_train, train))
  X_test = np.vstack((X_test, test))
  y_train = np.vstack((y_train,np.array([one_hot_label]*55)))
  for i in range(10):
    y_test.append(label)

X_train = X_train[1:,:]
y_train = y_train[1:,:]
X_test = X_test[1:,:]
y_test = np.array(y_test)

## We scale the data and then add the bias column.

In [0]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train=scaler.transform(X_train)
scaler.fit(X_test)
X_test=scaler.transform(X_test)

In [0]:
print(X_train.shape[0])
ones = np.ones([X_train.shape[0],1])
X_train = np.concatenate((ones,X_train),axis=1)
ones = np.ones([X_test.shape[0],1])
X_test = np.concatenate((ones,X_test),axis=1)

## Logistic regression function

In [0]:
def sigmoid(z):
  return 1 / (1 + np.exp(-z))

def loss_func(h, y):
  return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()


def logistic_regression(X_train, y_train, iter, alpha):
  theta = np.random.rand(X_train.shape[1])
  count = 1
  while(count<=iter):
    temp = np.dot(X_train, theta)
    h = sigmoid(temp)
    # print(h.dtype)
    count = count+1

    error = h-y_train


    gradient = np.dot(X_train.T, error)/y_train.shape[0]
    theta = theta - alpha*gradient
    # print(count)
    # z = np.dot(X_train, theta)
    # h = sigmoid(z)
    # loss = loss_func(h, y_train)

    # print(loss)

  return theta

## We run the logistic regression function using learning rate as 0.01 for 10000 iterations

In [0]:
weights = [[0 for i in range(10001)]]
for i in range(num_of_classes):
  weights = np.vstack((weights,logistic_regression(X_train, y_train[:,i], 10000, 0.01)))


weights = weights[1:]

## We make the predictions

In [0]:
predictions = sigmoid(np.dot(X_test, weights.T))
print(predictions)
print(predictions.shape)

y_pred=[]

for i in range(predictions.shape[0]):
  maxElement = np.amax(predictions[i])
  for j in range(8):
    if(predictions[i][j] == maxElement):
      ind = j
      break
  y_pred.append(j)

# print(len(y_pred))
# print(y_pred)
# print(y_test)

## The accuracy is found out to be 0.65
### Due the random initialization of weights, the accuracy may vary slightly in different trials.

In [162]:
a_2 = (y_pred == y_test).mean()
print("Accuracy: ", a_2)

Accuracy:  0.6875


## The confusion matrix and f1 score obtained are printed below

In [0]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,y_pred))

[[7 0 1 1 1 0 0 0]
 [1 6 0 1 0 0 2 0]
 [1 0 8 0 0 0 1 0]
 [0 2 0 7 0 1 0 0]
 [0 1 0 0 4 4 1 0]
 [0 0 2 0 0 8 0 0]
 [0 1 2 0 0 0 6 1]
 [0 0 0 0 0 1 0 9]]


In [163]:
from sklearn.metrics import f1_score
f1_2 = f1_score(y_test,y_pred, average='weighted')
print("f1 score: ", f1_2)

f1 score:  0.6836670480549198


# Summary

In [164]:
without_pca = ["Model without PCA", a_1, f1_1]
with_pca = ["Model without PCA", a_2, f1_2]
data = [without_pca, with_pca]
df = pd.DataFrame(data, columns = ['Model', 'Accuracy', 'f1 score'])
df

Unnamed: 0,Model,Accuracy,f1 score
0,Model without PCA,0.65,0.635432
1,Model without PCA,0.6875,0.683667


## Thus we see that our model performed relatively well on applying PCA, getting an accuracy score comparable to that of the model where PCA is not applied.