In [None]:
###########################################################################
# I would like to acknowledge the authors of this task and these images.
# Kaggle problem: https://www.kaggle.com/andrewmvd/face-mask-detection
# Source of the data: https://makeml.app/datasets/mask
###########################################################################

In [2]:
#### 
# Set model_num to the model this preprocessing run is for.
# The value only selects which output file names the later "save" cells use
# (face array .npy and combined label .csv).
####
# 1 = PCA SVM       (also the fallback for any value other than 2 or 3)
# 2 = RandomForest
# 3 = CNN
model_num = 3

In [3]:
!pip install opencv-python


Collecting opencv-python
  Using cached opencv_python-4.3.0.36-cp37-cp37m-win_amd64.whl (33.4 MB)
Installing collected packages: opencv-python
Successfully installed opencv-python-4.3.0.36


In [4]:
import os, sys
import pandas as pd
import numpy as np
import cv2

import matplotlib as mpl
import matplotlib.pyplot as plt

# 1. Image Processing

In [5]:
# Load the annotation table saved by the XMLParsing step.
# The first CSV column is used as the DataFrame index.
data_table_csv = r"C:\Users\vigne\OneDrive\Desktop\adv_ds_project\data_table.csv"
df = pd.read_csv(data_table_csv, index_col=[0])
df.head()

Unnamed: 0,Filename,ImgDim,Bndbox,ClassStr
0,C:/Users/vigne/OneDrive/Desktop/adv_ds_project...,"[512, 366]","[79, 109, 105, 142]",without_mask
1,C:/Users/vigne/OneDrive/Desktop/adv_ds_project...,"[512, 366]","[185, 226, 100, 144]",with_mask
2,C:/Users/vigne/OneDrive/Desktop/adv_ds_project...,"[512, 366]","[325, 360, 90, 141]",without_mask
3,C:/Users/vigne/OneDrive/Desktop/adv_ds_project...,"[400, 156]","[321, 354, 34, 69]",with_mask
4,C:/Users/vigne/OneDrive/Desktop/adv_ds_project...,"[400, 156]","[224, 261, 38, 73]",with_mask


In [6]:
# Extract the bounding-box coordinates from df into four parallel lists.
# Each Bndbox entry is a string such as "[79, 109, 105, 142]"; the four values
# are used later as xmin, xmax, ymin, ymax when cropping faces.
df_row_length = df.shape[0]
xmin_list = []
xmax_list = []
ymin_list = []
ymax_list = []

# Iterate over the column values themselves instead of df.Bndbox[i]:
# df.Bndbox[i] is a label lookup and only works when the index (taken from
# the CSV) happens to be exactly 0..n-1.
for bndbox in df.Bndbox:
    # Parse the string once: drop the surrounding brackets, split on commas,
    # and keep the first four fields (int() tolerates the leading spaces).
    xmin, xmax, ymin, ymax = (
        int(value) for value in bndbox.strip('[] ').split(',')[:4]
    )
    xmin_list.append(xmin)
    xmax_list.append(xmax)
    ymin_list.append(ymin)
    ymax_list.append(ymax)
    

In [7]:
# Load images
path = "C:/Users/vigne/OneDrive/Desktop/adv_ds_project/images/"
# os.listdir returns entries in arbitrary order. Sort them so the image order
# is reproducible; later cells pair images with per-image face counts by
# position, and those counts come from a groupby that sorts by filename.
dirs = sorted(os.listdir(path))

# Gather all .png file paths, skipping any other entries in the directory
# (e.g. thumbnail caches) that would otherwise be passed to the image reader.
img_list = [path + file for file in dirs if file.lower().endswith('.png')]

In [8]:
# Count how many annotated faces each image has.
# Grouping by Filename gives one row per image (in sorted Filename order);
# the non-null count of ImgDim is used as the number of faces for that image.
df1 = df.groupby('Filename').count().reset_index()
faces_per_img_list = df1['ImgDim']

In [9]:
# Number of distinct Filename groups in df, i.e. how many images have annotations
len(faces_per_img_list)

853

In [10]:
# Number of files found in the image directory; this should equal the number
# of annotated images, because the cropping loop pairs the two by position
len(img_list)

853

In [11]:
# Collect all faces from all images and standardize their sizes.
#
# Images are visited in img_list order. For image ix, the next
# faces_per_img_list[ix] bounding boxes are consumed from the coordinate
# lists through the running counter j, so face_list ends up in the same
# order as the bounding-box lists.

face_list = []
size = 100
j = 0

# Fail early on inconsistent inputs. Without these checks a mismatch either
# truncates the result silently or pairs boxes with the wrong image.
if len(img_list) != len(faces_per_img_list):
    raise ValueError(
        "Number of images (%d) does not match number of annotated images (%d)"
        % (len(img_list), len(faces_per_img_list))
    )
if int(sum(faces_per_img_list)) != len(xmin_list):
    raise ValueError(
        "Total faces per image (%d) does not match number of bounding boxes (%d)"
        % (int(sum(faces_per_img_list)), len(xmin_list))
    )

for ix in range(0, len(img_list)):
    # Read image as 3-channel colour (flag 1)
    img = cv2.imread(img_list[ix], 1)
    # cv2.imread signals failure by returning None instead of raising
    if img is None:
        raise IOError("Could not read image: " + img_list[ix])
    # Convert to single-channel grayscale
    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Extract the faces in each image
    for i in range(0, faces_per_img_list[ix]):

        # Crop the face: rows are y, columns are x
        face = gray_img[ymin_list[j]:ymax_list[j], xmin_list[j]:xmax_list[j]]
        # An empty crop cannot be resized; report which box is at fault
        if face.size == 0:
            raise ValueError(
                "Empty crop for bounding box %d in image %s" % (j, img_list[ix])
            )
        # Resize to standard image size - width = size & height = size - using INTER_LINEAR for interpolation
        face_resized = cv2.resize(face, (size, size), interpolation=cv2.INTER_LINEAR)
        # Normalize image data from 0..255 to 0..1
        face_resized_normalized = face_resized / 255.0
        # Store each face in the list
        face_list.append(face_resized_normalized)
        j += 1
        
    


In [12]:
# Stack the per-face arrays into one array of shape (n_faces, size, size).
# f1 keeps a reference to the original Python list of faces.
f1 = face_list
stacked_faces = np.array(face_list)
n_faces = stacked_faces.shape[0]
face_list = stacked_faces.reshape((n_faces, size, size))


In [13]:
# Sanity check: expected shape is (number of faces, size, size)
face_list.shape

(4072, 100, 100)

In [14]:
# Save the numpy array to be used in other notebooks for modelling.
# The output file depends on model_num; anything other than 2 or 3 goes to
# the PCA file.
face_npy_by_model = {
    2: r'C:\Users\vigne\OneDrive\Desktop\adv_ds_project\face_list.npy',
    3: r'C:\Users\vigne\OneDrive\Desktop\adv_ds_project\face_list_cnn.npy',
}
face_npy_path = face_npy_by_model.get(
    model_num,
    r'C:\Users\vigne\OneDrive\Desktop\adv_ds_project\face_list_pca.npy',
)
np.save(face_npy_path, face_list)
        

In [15]:
import pickle as pk

# Also persist the face array as a pickle file.
# A context manager closes the file handle as soon as the dump finishes, so
# the data is fully flushed to disk (the bare open() call left it open).
with open(r"C:\Users\vigne\OneDrive\Desktop\adv_ds_project\face_list2.pkl", "wb") as pkl_file:
    pk.dump(face_list, pkl_file)

# 2. Labelling

In [16]:
# Turn the class names in ClassStr into integer labels.
# Import preprocessing from sklearn
from sklearn import preprocessing

# Fit the encoder on the class names, then write the integer codes to a new
# 'label' column of df
le = preprocessing.LabelEncoder()
le.fit(df['ClassStr'])
df['label'] = le.transform(df['ClassStr'])
df.head()

Unnamed: 0,Filename,ImgDim,Bndbox,ClassStr,label
0,C:/Users/vigne/OneDrive/Desktop/adv_ds_project...,"[512, 366]","[79, 109, 105, 142]",without_mask,2
1,C:/Users/vigne/OneDrive/Desktop/adv_ds_project...,"[512, 366]","[185, 226, 100, 144]",with_mask,1
2,C:/Users/vigne/OneDrive/Desktop/adv_ds_project...,"[512, 366]","[325, 360, 90, 141]",without_mask,2
3,C:/Users/vigne/OneDrive/Desktop/adv_ds_project...,"[400, 156]","[321, 354, 34, 69]",with_mask,1
4,C:/Users/vigne/OneDrive/Desktop/adv_ds_project...,"[400, 156]","[224, 261, 38, 73]",with_mask,1


In [17]:
# Distinct integer labels, in order of first appearance in df
label_cat = pd.unique(df['label'])
label_cat

array([2, 1, 0])

In [18]:
# One hot encoding for the labels
ohe = preprocessing.OneHotEncoder(handle_unknown='ignore')
# The encoder returns a sparse matrix with one column per label value.
# Build the frame on df's own index: pd.concat(axis=1) aligns rows by index,
# so a default 0..n-1 index here would misalign (and NaN-pad) the result
# whenever df's index, which is read from the CSV, is anything else.
ohe_df = pd.DataFrame(ohe.fit_transform(df[['label']]).toarray(), index=df.index)
# Append the one-hot columns (named 0, 1, 2, ...) next to the original columns
combined_df = pd.concat([df, ohe_df], axis=1)
combined_df.head()

Unnamed: 0,Filename,ImgDim,Bndbox,ClassStr,label,0,1,2
0,C:/Users/vigne/OneDrive/Desktop/adv_ds_project...,"[512, 366]","[79, 109, 105, 142]",without_mask,2,0.0,0.0,1.0
1,C:/Users/vigne/OneDrive/Desktop/adv_ds_project...,"[512, 366]","[185, 226, 100, 144]",with_mask,1,0.0,1.0,0.0
2,C:/Users/vigne/OneDrive/Desktop/adv_ds_project...,"[512, 366]","[325, 360, 90, 141]",without_mask,2,0.0,0.0,1.0
3,C:/Users/vigne/OneDrive/Desktop/adv_ds_project...,"[400, 156]","[321, 354, 34, 69]",with_mask,1,0.0,1.0,0.0
4,C:/Users/vigne/OneDrive/Desktop/adv_ds_project...,"[400, 156]","[224, 261, 38, 73]",with_mask,1,0.0,1.0,0.0


In [19]:
# Save combined_df to a csv file.
# The output file depends on model_num; anything other than 2 or 3 goes to
# the PCA file.
combined_csv_path = (
    r"C:\Users\vigne\OneDrive\Desktop\adv_ds_project\combined_df.csv"
    if model_num == 2
    else r"C:\Users\vigne\OneDrive\Desktop\adv_ds_project\combined_df_cnn.csv"
    if model_num == 3
    else r"C:\Users\vigne\OneDrive\Desktop\adv_ds_project\combined_df_pca.csv"
)
combined_df.to_csv(combined_csv_path)