# Second Data Preparation Process

In this notebook, the selected data will be read from pkl files.

After that, the faces in the images of the selected data will be detected,
and the position of each detected face will be added to the dataframe as a numpy array in [x, y, width, height] format.

The organized dataframes will be saved as pkl files for future use after checking.

Since face detection is a study in itself (See https://en.wikipedia.org/wiki/Face_detection),
this will be done in this project using the haarcascade_frontalface_alt2 xml file
 (See https://docs.opencv.org/3.4/db/d28/tutorial_cascade_classifier.html and
https://github.com/opencv/opencv/tree/master/data/haarcascades)
trained for face detection in the opencv library.

Also see https://en.wikipedia.org/wiki/Cascading_classifiers for details of Cascading Classifier

xml file to be used for face detection will be added to <ins>/Cascade/</ins> folder.

This project will not focus on face detection. 
For this reason, the details of face detection will not be mentioned.

In [1]:
#Importing libraries
import numpy as np
import cv2
import pandas as pd

In [2]:
#Reporting the version of each imported library,
#so that the environment this notebook was run in can be reproduced
print(f'numpy Version: {np.__version__}')
print(f'cv2 Version: {cv2.__version__}')
print(f'pandas Version: {pd.__version__}')

numpy Version: 1.21.5
cv2 Version: 3.4.2
pandas Version: 1.3.5


In [3]:
#Defining the function, to detect faces in images using the cascade classifier.

#The situations that may occur for each image were examined in 3 different ways.

# 1) Only one face can be detected in the image
#In this case the detected face frame is assigned directly as the face frame in this image

# 2) Face may not be detected in the image
#In this case, the face frame in the image is considered as a frame that covers the entire image.
#Since the dataset is prepared to highlight the face of the person directly, 
#it will not be a big problem to get the whole image.

# 3) Multiple faces can be detected in the image
#In this case, the frame of the face that occupies the largest area among the detected faces is used.
#The face that occupies the largest area is most likely to be the face closest to the viewing device.
#It can be assumed that the face closest to the viewing device is also the face to be highlighted.

#Since the images are 250x250 in size,
#there is no harm in using the numpy data type uint8 whose boundaries are between [0, 255]
#For this reason, numpy arrays holding face frame information are converted to uint8 data type.

def FaceDetection(cascade : cv2.CascadeClassifier, imageList : object) -> list:
    """Detect one face frame per image with a cascade classifier.

    Exactly one frame ``[x, y, width, height]`` is produced for every image:

    * one detection       -> that detection is used,
    * no detection        -> a frame covering the whole 250x250 image is used,
    * several detections  -> the detection with the largest area is used
      (the first one wins when areas are equal).

    Frames are stored as ``np.uint8`` arrays. This is only adequate while
    every coordinate and size is at most 255, which holds for the 250x250
    images this notebook works with.

    Parameters
    ----------
    cascade : cv2.CascadeClassifier
        Loaded classifier whose ``detectMultiScale`` is run on each image.
    imageList : iterable
        Images in BGR channel order; each one is converted to grayscale
        before detection.

    Returns
    -------
    list
        ``[detectionResult, faceList]`` where ``detectionResult`` is a dict
        with the keys ``'SingleFace'``, ``'NoFace'``, ``'MultipleFace'`` and
        ``'totalImage'``, and ``faceList`` holds one uint8 frame per image,
        in the order of ``imageList``.
    """

    faceList = []
    detectionResult = {'SingleFace' : 0, 'NoFace' : 0, 'MultipleFace' : 0, 'totalImage' : 0}

    for img in imageList:

        #Since opencv haarcascade classifiers work on single-channel images
        #The image is used by converting it to grayscale to get the position of the face.
        #see https://docs.opencv.org/3.4/db/d28/tutorial_cascade_classifier.html
        grayImage = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        #The classifier given as argument is used (not a notebook-level global),
        #so the function works with any classifier the caller passes in.
        faces = cascade.detectMultiScale(grayImage)
        lenFaces = len(faces)

        if lenFaces == 1:
            faceList.append(faces[0].astype(np.uint8))
            detectionResult['SingleFace'] += 1

        elif lenFaces == 0:
            #No face found: fall back to a frame covering the entire 250x250 image.
            faceList.append(np.array([0, 0, 250, 250], dtype = np.uint8))
            detectionResult['NoFace'] += 1

        else:
            #Several faces found: keep the one with the largest width * height.
            #max() returns the first maximal element, so ties go to the earliest detection.
            #int() keeps the area computation in Python integers.
            largestFace = max(faces, key = lambda face: int(face[2]) * int(face[3]))
            faceList.append(np.asarray(largestFace).astype(np.uint8))
            detectionResult['MultipleFace'] += 1

    #Every image falls into exactly one of the three categories above.
    detectionResult['totalImage'] = detectionResult['SingleFace'] + detectionResult['NoFace'] + detectionResult['MultipleFace']

    return [detectionResult, faceList]

In [4]:
#Loading cascade classifier for face detection
faceCascade = cv2.CascadeClassifier('../Cascade/haarcascade_frontalface_alt2.xml')

#cv2.CascadeClassifier does not raise when the xml file cannot be loaded,
#it returns an empty classifier that only fails later inside detectMultiScale.
#The load is therefore verified here so that a wrong path is reported immediately.
if faceCascade.empty():
    raise IOError('Cascade classifier could not be loaded from ../Cascade/haarcascade_frontalface_alt2.xml')

In [5]:
#Reading the selected training data from its pkl file
#The last expression displays the dataframe (columns PersonID and ImageBGR) in the notebook
trainingDf = pd.read_pickle("../Data/RawData/Selected/Training.pkl")
trainingDf

Unnamed: 0,PersonID,ImageBGR
0,150,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ..."
1,148,"[[[4, 24, 41], [4, 24, 41], [5, 25, 42], [6, 2..."
2,12,"[[[0, 0, 1], [0, 0, 1], [0, 0, 1], [0, 0, 1], ..."
3,120,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ..."
4,126,"[[[3, 8, 11], [4, 9, 12], [4, 9, 12], [2, 7, 1..."
...,...,...
4574,236,"[[[6, 8, 2], [19, 23, 17], [41, 48, 43], [74, ..."
4575,222,"[[[17, 105, 181], [20, 108, 184], [22, 110, 18..."
4576,222,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ..."
4577,396,"[[[228, 218, 211], [221, 211, 204], [214, 204,..."


In [6]:
#Detecting faces in all images of trainingDf
#result is a two-element list: [detection counts, list of face frames]
result = FaceDetection(faceCascade, trainingDf.ImageBGR)

#Displaying result[0], the dictionary of detection counts
result[0]

{'SingleFace': 4221, 'NoFace': 47, 'MultipleFace': 311, 'totalImage': 4579}

In [7]:
#Displaying the length of result[1] (the list of face frames)
#It holds one frame per image, so it should match the 'totalImage' count above
len(result[1])

4579

In [8]:
#Adding the face frame of each image to trainingDf as the new FaceFrame column
#The last expression displays the updated dataframe
trainingDf['FaceFrame'] = result[1]
trainingDf

Unnamed: 0,PersonID,ImageBGR,FaceFrame
0,150,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...","[72, 69, 112, 112]"
1,148,"[[[4, 24, 41], [4, 24, 41], [5, 25, 42], [6, 2...","[76, 72, 109, 109]"
2,12,"[[[0, 0, 1], [0, 0, 1], [0, 0, 1], [0, 0, 1], ...","[68, 67, 113, 113]"
3,120,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...","[69, 70, 110, 110]"
4,126,"[[[3, 8, 11], [4, 9, 12], [4, 9, 12], [2, 7, 1...","[68, 66, 119, 119]"
...,...,...,...
4574,236,"[[[6, 8, 2], [19, 23, 17], [41, 48, 43], [74, ...","[74, 73, 104, 104]"
4575,222,"[[[17, 105, 181], [20, 108, 184], [22, 110, 18...","[56, 63, 127, 127]"
4576,222,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...","[70, 69, 112, 112]"
4577,396,"[[[228, 218, 211], [221, 211, 204], [214, 204,...","[70, 70, 115, 115]"


In [9]:
#Saving the training data in the WithFaces folder as a pkl file for future use
trainingDf.to_pickle("../Data/WithFaces/Training.pkl")
#trainingDf is not used again in this notebook, so the name is deleted
del trainingDf

In [10]:
#Reading the selected validation data from its pkl file
#The last expression displays the dataframe (columns PersonID and ImageBGR) in the notebook
validationDf = pd.read_pickle("../Data/RawData/Selected/Validation.pkl")
validationDf

Unnamed: 0,PersonID,ImageBGR
0,378,"[[[2, 0, 0], [2, 0, 0], [2, 0, 0], [2, 0, 0], ..."
1,148,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ..."
2,363,"[[[0, 0, 1], [0, 0, 1], [0, 0, 1], [0, 0, 1], ..."
3,152,"[[[211, 222, 230], [210, 221, 229], [210, 219,..."
4,10,"[[[32, 17, 15], [32, 17, 15], [33, 18, 16], [3..."
...,...,...
698,209,"[[[249, 251, 251], [250, 252, 252], [250, 252,..."
699,167,"[[[8, 10, 4], [8, 10, 4], [8, 10, 4], [8, 10, ..."
700,115,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ..."
701,186,"[[[16, 3, 0], [16, 3, 0], [16, 3, 0], [16, 3, ..."


In [11]:
#Detecting faces in all images of validationDf
#result is a two-element list: [detection counts, list of face frames]
result = FaceDetection(faceCascade, validationDf.ImageBGR)

#Displaying result[0], the dictionary of detection counts
result[0]

{'SingleFace': 628, 'NoFace': 8, 'MultipleFace': 67, 'totalImage': 703}

In [12]:
#Displaying the length of result[1] (the list of face frames)
#It holds one frame per image, so it should match the 'totalImage' count above
len(result[1])

703

In [13]:
#Adding the face frame of each image to validationDf as the new FaceFrame column
#The last expression displays the updated dataframe
validationDf['FaceFrame'] = result[1]
validationDf

Unnamed: 0,PersonID,ImageBGR,FaceFrame
0,378,"[[[2, 0, 0], [2, 0, 0], [2, 0, 0], [2, 0, 0], ...","[71, 66, 116, 116]"
1,148,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...","[69, 68, 115, 115]"
2,363,"[[[0, 0, 1], [0, 0, 1], [0, 0, 1], [0, 0, 1], ...","[68, 67, 117, 117]"
3,152,"[[[211, 222, 230], [210, 221, 229], [210, 219,...","[68, 70, 114, 114]"
4,10,"[[[32, 17, 15], [32, 17, 15], [33, 18, 16], [3...","[73, 70, 109, 109]"
...,...,...,...
698,209,"[[[249, 251, 251], [250, 252, 252], [250, 252,...","[75, 75, 96, 96]"
699,167,"[[[8, 10, 4], [8, 10, 4], [8, 10, 4], [8, 10, ...","[71, 68, 113, 113]"
700,115,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...","[70, 69, 115, 115]"
701,186,"[[[16, 3, 0], [16, 3, 0], [16, 3, 0], [16, 3, ...","[69, 69, 108, 108]"


In [14]:
#Saving the validation data in the WithFaces folder as a pkl file for future use
validationDf.to_pickle("../Data/WithFaces/Validation.pkl")
#validationDf is not used again in this notebook, so the name is deleted
del validationDf

In [15]:
#Reading the selected test data from its pkl file
#The last expression displays the dataframe (columns PersonID and ImageBGR) in the notebook
testDf = pd.read_pickle("../Data/RawData/Selected/Test.pkl")
testDf

Unnamed: 0,PersonID,ImageBGR
0,124,"[[[115, 91, 7], [117, 93, 9], [117, 93, 9], [1..."
1,366,"[[[137, 130, 115], [134, 127, 112], [131, 124,..."
2,311,"[[[23, 47, 183], [42, 62, 193], [47, 59, 183],..."
3,0,"[[[26, 65, 74], [29, 68, 77], [30, 69, 78], [2..."
4,19,"[[[51, 18, 2], [51, 18, 2], [51, 19, 0], [51, ..."
...,...,...
698,169,"[[[138, 118, 83], [138, 118, 83], [137, 118, 8..."
699,171,"[[[62, 63, 24], [61, 62, 23], [60, 61, 22], [6..."
700,56,"[[[46, 106, 152], [47, 107, 153], [49, 109, 15..."
701,95,"[[[94, 131, 175], [98, 135, 179], [99, 136, 18..."


In [16]:
#Detecting faces in all images of testDf
#result is a two-element list: [detection counts, list of face frames]
result = FaceDetection(faceCascade, testDf.ImageBGR)

#Displaying result[0], the dictionary of detection counts
result[0]

{'SingleFace': 644, 'NoFace': 5, 'MultipleFace': 54, 'totalImage': 703}

In [17]:
#Displaying the length of result[1] (the list of face frames)
#It holds one frame per image, so it should match the 'totalImage' count above
len(result[1])

703

In [18]:
#Adding the face frame of each image to testDf as the new FaceFrame column
#The last expression displays the updated dataframe
testDf['FaceFrame'] = result[1]
testDf

Unnamed: 0,PersonID,ImageBGR,FaceFrame
0,124,"[[[115, 91, 7], [117, 93, 9], [117, 93, 9], [1...","[69, 70, 115, 115]"
1,366,"[[[137, 130, 115], [134, 127, 112], [131, 124,...","[79, 77, 99, 99]"
2,311,"[[[23, 47, 183], [42, 62, 193], [47, 59, 183],...","[73, 72, 107, 107]"
3,0,"[[[26, 65, 74], [29, 68, 77], [30, 69, 78], [2...","[71, 72, 108, 108]"
4,19,"[[[51, 18, 2], [51, 18, 2], [51, 19, 0], [51, ...","[70, 69, 115, 115]"
...,...,...,...
698,169,"[[[138, 118, 83], [138, 118, 83], [137, 118, 8...","[69, 68, 112, 112]"
699,171,"[[[62, 63, 24], [61, 62, 23], [60, 61, 22], [6...","[72, 69, 114, 114]"
700,56,"[[[46, 106, 152], [47, 107, 153], [49, 109, 15...","[66, 67, 118, 118]"
701,95,"[[[94, 131, 175], [98, 135, 179], [99, 136, 18...","[70, 71, 111, 111]"


In [19]:
#Saving the test data in the WithFaces folder as a pkl file for future use
testDf.to_pickle("../Data/WithFaces/Test.pkl")
#testDf is not used again in this notebook, so the name is deleted
del testDf

In [20]:
#Reading the selected person data from its pkl file
#The last expression displays the dataframe (columns ID and Name) in the notebook
personDf = pd.read_pickle("../Data/RawData/Selected/Person.pkl")
personDf

Unnamed: 0,ID,Name
0,0,Abdullah Gul
1,1,Adrien Brody
2,2,Ahmed Chalabi
3,3,Ai Sugiyama
4,4,Alan Greenspan
...,...,...
418,418,Yasser Arafat
419,419,Yoko Ono
420,420,Yoriko Kawaguchi
421,421,Zhu Rongji


In [21]:
#Saving the person data in the WithFaces folder as a pkl file for future use
#The dataframe is written as it was read; no column is added to it in this notebook
personDf.to_pickle("../Data/WithFaces/Person.pkl")