In [None]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from mtcnn.mtcnn import MTCNN
import tensorflow as tf
from PIL import Image
import seaborn as sns
import pandas as pd
import numpy as np
import cv2
import os

In [None]:
# Root folder of the image dataset; each image's label is read from its sub-folder.
img_path = "images"


### Load dataset

In [None]:
def load_data(path):
    """Index the image files stored under ``path`` into a DataFrame.

    Every file found inside a sub-directory of ``path`` becomes one row. Its
    label is the name of the first-level sub-directory (directly below
    ``path``) that it lives under.

    Parameters
    ----------
    path : str
        Root folder of the dataset, laid out as ``<path>/<label>/<file>``.

    Returns
    -------
    pandas.DataFrame
        One row per file with the columns ``filename`` (base name of the
        file) and ``label``. Empty when no file is found.
    """
    names = []
    labels = []
    # Walk the directory that was passed in rather than a module-level global.
    for subdir, dirs, _files in os.walk(path):
        for dir_name in dirs:
            folder = os.path.join(subdir, dir_name)
            # Take the first component *below the root*, so the label is right
            # no matter how many components the root path itself has.
            label = os.path.relpath(folder, path).split(os.sep)[0]
            for entry in os.listdir(folder):
                # Nested folders are visited by os.walk on their own turn;
                # they must not be recorded as if they were image files.
                if not os.path.isfile(os.path.join(folder, entry)):
                    continue
                names.append(entry)
                labels.append(label)
    return pd.DataFrame({'filename': names, 'label': labels})
# Index the whole dataset, then preview the first rows.
df = load_data(path=img_path)

df.head(5)

In [None]:
# Number of images available for each label.
df.loc[:, "label"].value_counts()

In [None]:
# Bar chart of the number of images per label.
fig, ax = plt.subplots(figsize=(7, 7))
sns.countplot(x=df['label'], ax=ax)
plt.show()

In [None]:
# Fix the random seeds, then split the data into train / validation sets.
seed = 0
np.random.seed(seed)
# NOTE(review): TensorFlow is seeded with 3 while NumPy and the split use `seed` (0) — confirm this is intended.
tf.random.set_seed(3)


# Hold out 20 % of the rows for validation; `random_state` makes the shuffle repeatable.
train_set, valid_set = train_test_split(
    df,
    test_size=0.2,
    random_state=seed,
)

In [None]:
# (rows, columns) of the train split, then of the validation split.
(train_set.shape, valid_set.shape)

In [None]:
# Renumber the shuffled rows 0..n-1 so that looking rows up by position in a loop
# does not raise KeyError; the previous index is kept as an extra column.
train_set = train_set.reset_index(drop=False)
train_set.head(5)

In [None]:
# Renumber the validation rows 0..n-1 as well (previous index kept as an extra column).
valid_set = valid_set.reset_index(drop=False)
valid_set.head(5)

In [None]:
# Label histograms of both splits, drawn in the first and third slots of a
# 1x3 grid (the middle slot is left empty).
fig = plt.figure(figsize=(10, 5))

train_ax = fig.add_subplot(1, 3, 1)
sns.countplot(x=train_set['label'], ax=train_ax)
train_ax.set_title('Train set label distribution')

valid_ax = fig.add_subplot(1, 3, 3)
sns.countplot(x=valid_set['label'], ax=valid_ax)
valid_ax.set_title('valid set label distribution')

plt.show()

### Detect faces with MTCNN

In [None]:
_face_detector = None  # shared MTCNN instance, created on first use


def _get_face_detector():
    """Return the shared MTCNN detector, building it only on the first call."""
    global _face_detector
    if _face_detector is None:
        _face_detector = MTCNN()
    return _face_detector


def crop_faces(image, required_size=(160, 160), detector=None):
    """Detect the first face in an image and return it cropped and resized.

    Parameters
    ----------
    image : str or file object
        Whatever ``Image.open`` accepts (typically the path of the image).
    required_size : tuple of int, optional
        ``(width, height)`` the cropped face is resized to. Defaults to
        160x160, the size required by FaceNet.
    detector : object, optional
        Object exposing ``detect_faces(pixels)``. When omitted, one shared
        ``MTCNN`` instance is reused across calls instead of rebuilding the
        detector for every image.

    Returns
    -------
    numpy.ndarray
        Pixels of the first detected face, resized to ``required_size``.

    Raises
    ------
    IndexError
        If the detector reports no face for this image.
    """
    # The context manager releases the file handle even if decoding fails.
    with Image.open(image) as img:
        # Convert to RGB so the array always has three channels.
        pixels = np.asarray(img.convert('RGB'))

    if detector is None:
        detector = _get_face_detector()
    detections = detector.detect_faces(pixels)
    if not detections:
        # Same exception type an empty result produced before, with a usable message.
        raise IndexError('no face detected in {!r}'.format(image))

    x1, y1, width, height = detections[0]['box']
    # Compute the far corner from the reported origin *before* clamping, so the
    # crop keeps the right/bottom edge the detector reported.
    x2, y2 = x1 + width, y1 + height
    # The reported corner can be negative (outside the image). Clamp it to the
    # image edge; abs() would shift the whole crop window instead.
    x1, y1 = max(x1, 0), max(y1, 0)

    face = pixels[y1:y2, x1:x2]

    resized = Image.fromarray(face).resize(required_size)
    return np.asarray(resized)

In [None]:
def extract_faces(img_path, dataframe):
    """Crop the face out of every image listed in ``dataframe``.

    Each row is resolved to ``<img_path>/<label>/<filename>``, i.e. the file
    is read from the folder named after the row's own label. Looking the file
    up this way, instead of testing every row against every folder, keeps a
    file name that exists in several label folders from being cropped from the
    wrong folder and added more than once.

    Parameters
    ----------
    img_path : str
        Root folder of the dataset.
    dataframe : pandas.DataFrame
        Must provide the columns ``filename`` and ``label``. Its index is not
        used, so it does not need to be reset beforehand.

    Returns
    -------
    tuple of numpy.ndarray
        ``(faces, labels)``, aligned element-wise and ordered like the rows of
        ``dataframe``. Rows whose file does not exist are skipped. Both arrays
        are empty when nothing is found (including when ``img_path`` does not
        exist).
    """
    faces = []
    labels = []
    # Iterate over the column values directly: positional, index-independent.
    for filename, label in zip(dataframe['filename'], dataframe['label']):
        path = os.path.join(img_path, label, filename)
        if not os.path.exists(path):
            continue
        faces.append(crop_faces(path))
        labels.append(label)
    return np.asarray(faces), np.asarray(labels)

In [None]:
# Crop the faces of the training split.
X_train, y_train = extract_faces(img_path=img_path, dataframe=train_set)

# Crop the faces of the validation split.
X_test, y_test = extract_faces(img_path=img_path, dataframe=valid_set)

# Report how many faces each split produced.
print('Amount of train data is ', len(X_train))
print('Amount of test data is ', len(X_test))

In [None]:
# Shapes of the two face arrays, one per line (train first, then test).
print(X_train.shape, X_test.shape, sep='\n')

In [None]:
# Write the four arrays into a single compressed .npz archive.
# The keys are spelled out; they are the same names NumPy assigns to
# positional arrays (arr_0, arr_1, ...), so the archive contents are unchanged.
np.savez_compressed(
    'all_face_data.npz',
    arr_0=X_train,
    arr_1=y_train,
    arr_2=X_test,
    arr_3=y_test,
)
print('Done')