In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models


class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1,32,4)
        self.conv2 = nn.Conv2d(32,64,4)
        self.conv3 = nn.Conv2d(64,128,4)
        
        x = torch.rand(64,64).view(-1,1,64,64)
        self._to_linear = None
        self.convs(x)
        
        self.fc1 = nn.Linear(self._to_linear, 512)
        self.fc2 = nn.Linear(512, 2)
        #print(x[0].shape)
    def convs(self, x):
        x =F.max_pool2d(F.relu(self.conv1(x)),(2,2))
        x =F.max_pool2d(F.relu(self.conv2(x)),(2,2))
        x =F.max_pool2d(F.relu(self.conv3(x)),(2,2))
        
        
        if self._to_linear is None:
            self._to_linear = x[0].shape[0]*x[0].shape[1]*x[0].shape[2]
        return x
    
    def forward(self, x):
        x = self.convs(x)
        x = x.view(-1, self._to_linear)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.softmax(x,dim=1)

In [10]:
import warnings
warnings.filterwarnings('ignore')
print("Warnings ignored!!")



The next few lines are just to load the trained model and check the accuracy of the model on a reshuffled data.

In [4]:
from fastai import *
from fastai.vision import *

In [5]:
smile_detector = torch.load('trained-1.pth')
body = create_body(models.resnet50, True, None)
data_classes = 2
nf = callbacks.hooks.num_features_model(body)*2
head = create_head(nf, data_classes, None, ps=0.5, bn_final = False)
model = nn.Sequential(body, head)
model.load_state_dict(smile_detector['model'])

NameError: name 'create_body' is not defined

In [25]:
import numpy as np
from tqdm import tqdm

In [12]:
training_data = np.load('training_data.npy',allow_pickle = True)
np.random.shuffle(training_data)

In [13]:
X = torch.Tensor([i[0] for i in training_data]).view(-1,64,64)
y = torch.Tensor([i[1] for i in training_data])
test_pct = 0.2
val_size = int(len(X)*test_pct)
train_X = X[:-val_size]
test_X = X[-val_size:]
train_y = y[:-val_size]
test_y = y[-val_size:]

In [14]:
correct = 0
total = 0
with torch.no_grad():
    for i in tqdm(range(len(test_X))):
        real_class = torch.argmax(test_y[i])
        net_out = smile_detector(test_X[i].view(-1,1,64,64))[0]
        predicted_class = torch.argmax(net_out)
        if predicted_class ==real_class:
            correct+=1
        total+=1

print('Accuracy:', round(correct/total,3))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 80/80 [00:00<00:00, 272.84it/s]

Accuracy: 0.95





There are three steps to this smile detector function. Detect the face, extract the face, determine smiling or not using model. Face detection was done using an existing face detector that is widely used. The images were converted to gray scale as the face detector seem to only work on gray scaled images. 

The extract_face function ensures that the picture extracted matches the olivetti data set where the size of the image has to be 64x64. 

The smiling function uses the face_detect and extract_face function and outputs 'smiling' or 'not smiling' if a face is detected.

In [48]:
from torchvision import transforms
transform = transforms.Compose([transforms.Resize(256),
                                transforms.CenterCrop(224),
                                transforms.ToTensor(), 
                                transforms.Normalize((0.5),(0.5))])

#mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]

In [37]:
transform

Compose(
    Resize(size=256, interpolation=PIL.Image.BILINEAR)
    CenterCrop(size=(224, 224))
    ToTensor()
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
)

In [64]:
import cv2
from scipy.ndimage import zoom
from PIL import Image

faceCascade = cv2.CascadeClassifier("haarcascade_frontalface_default.xml")



def face_detect(path):
    test_img = cv2.imread(path)
    grayed_img = cv2.cvtColor(test_img, cv2.COLOR_BGR2GRAY)
    detected_face = faceCascade.detectMultiScale(grayed_img,1.1,5)
    return grayed_img, detected_face


def extract_face(grayed, detected_face_img):
    if len(detected_face_img)==0:
        return 'No face detected'
    for (x,y,w,h) in detected_face_img:
        extracted_face = grayed[y:y+h, x:x+w]
        extracted_face = zoom(extracted_face, (64. / extracted_face.shape[0], 64. / extracted_face.shape[1]))
        extracted_face = extracted_face.astype(float)
        extracted_face /= float(extracted_face.max())
    return extracted_face
                            
                            
def smiling(path):
    grayed_img,detected_face = face_detect(path)
    if len(detected_face)== 0:
        return 'Face not detected'
    

    else:
        face_extract = extract_face(grayed_img,detected_face)
        face_extract = (face_extract*255).astype('int')
        print(face_extract)
        labels = ('not smiling','smiling')
#         img_t = transform(face_extract)
#         batch_t = torch.unsqueeze(img_t, 0)
        net_out = model(torch.Tensor(face_extract).view(-1,1,64,64))
        category = torch.argmax(net_out)
        return labels[category]

Some images were downloaded from flickr to be tested and one of the test can be found below.

In [65]:
import matplotlib.pyplot as plt


g,d_f = face_detect('smile_test_7.jpg')
face_extract = extract_face(g,d_f)
test_img = smiling('smile_test_7.jpg')
print(test_img)
plt.figure(figsize=(8,8))
plt.imshow(face_extract)
print(face_extract)


[[120  91  60  97 ...  43  53  91 149]
 [ 95  66  97  84 ...  34  44  82 124]
 [ 53  85  57  66 ... 117  85  82 104]
 [ 72  57  47  55 ...  57  91  57  79]
 ...
 [ 68  52  73  52 ... 105  49  91  43]
 [ 66  78  70  42 ... 123  26 108  28]
 [ 65  70  66  44 ...  63  36  26  43]
 [ 50  65  52  49 ... 110 115  37  36]]


RuntimeError: Given groups=1, weight of size 64 3 7 7, expected input[1, 1, 64, 64] to have 3 channels, but got 1 channels instead

Accuracy of the model itself may not be very good due to lack of data and an model that may have been overfitted due to large number of epochs. May explore using the entire dataset fully instead of just 80% of the data but that would make it harder for validation as would need to have another set of images where faces can be detected. 

In [12]:
print(type(smile_detector))

<class 'dict'>
