## Import Libraries and Models

In [15]:
#manipulate paths
import os

#Audio utilities libraries
from scipy.io import wavfile as wav
import librosa
import sounddevice as sd
import IPython.display as ipd

#numpy
import numpy as np

#plot libraries
import matplotlib as mpl
import matplotlib.pyplot as plt

#tree load
import joblib

#keras for predictions
import keras
from keras.applications.mobilenet_v2 import preprocess_input
from keras.preprocessing.image import ImageDataGenerator, img_to_array

import cv2 as cv

In [16]:
# Load the Haar-cascade model used to detect frontal faces.
face_detector = cv.CascadeClassifier('haarcascade_frontalface_default.xml')

# CascadeClassifier does not raise when the XML file is missing or invalid; it
# silently returns an empty classifier that only fails later, inside
# detectMultiScale, with an opaque assertion error. Fail here instead.
if face_detector.empty():
    raise RuntimeError(
        "Could not load the face cascade 'haarcascade_frontalface_default.xml'; "
        "check that the file exists in the working directory and is a valid cascade."
    )

In [17]:
# Network applied to the cropped face images in process_frame.
facenet = keras.models.load_model(filepath='FaceRecognition.h5')

# Network applied to the MFCC features of the recorded voice sample.
recnet = keras.models.load_model(filepath='FFNN.h5')

## Demo

In [18]:
# Labels shown for the face classifier; the position in the list is used as
# the predicted class index (names[y_pred]).
names = [
    'Lorenzo',
    'Raffaele',
    'Riprova',
]

In [23]:
def process_frame(img, min_confidence=0.85):
    """Detect frontal faces in a frame, classify each one and annotate the frame.

    Every detection returned by ``face_detector`` is cropped, resized to
    224x224, passed through ``preprocess_input`` and classified by
    ``facenet``. A green box and a ``"<name> <score>"`` label are then drawn
    on ``img`` for each detection.

    Parameters
    ----------
    img : numpy.ndarray
        Frame to analyse. It is converted with ``cv.COLOR_BGR2GRAY``, so BGR
        channel order is expected. The array is annotated in place.
    min_confidence : float, optional
        The top class probability must exceed this value for the class name
        to be shown; otherwise the face is labelled ``"Unknown"``.

    Returns
    -------
    tuple
        ``(img, faces)``: the annotated frame and the detections exactly as
        returned by ``face_detector.detectMultiScale``.
    """
    gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
    faces = face_detector.detectMultiScale(gray, minSize=(250,250), minNeighbors=10)

    # First pass: classify every face while the frame is still untouched.
    # Drawing is deferred so that neither the green outline nor the text of
    # another detection ends up inside a crop fed to the network.
    labels = []
    for (x, y, w, h) in faces:
        face = img[y:y+h, x:x+w]

        # NOTE(review): the crop is passed on in the frame's BGR order --
        # confirm this matches the channel order facenet was trained with.
        img_pixels = cv.resize(face, (224, 224))
        img_pixels = img_to_array(img_pixels)
        img_pixels = np.expand_dims(img_pixels, axis=0)
        img_pixels = preprocess_input(img_pixels)

        y_prob = facenet.predict(img_pixels)
        y_pred = int(np.argmax(y_prob))
        score = float(y_prob[0, y_pred])

        # Also guard the lookup: a model with more outputs than `names`
        # entries would otherwise raise IndexError.
        if score > min_confidence and y_pred < len(names):
            label = names[y_pred]
        else:
            label = "Unknown"

        # Separate name and score with a space so the overlay is readable.
        labels.append('{} {:.2f}'.format(label, score))

    # Second pass: draw the box and label of each detection on the frame.
    for (x, y, w, h), text in zip(faces, labels):
        cv.rectangle(img, (x,y), (x+w,y+h), (0,255,0), 2)
        cv.putText(img,
                   text,
                   (x+5,y-5),
                   cv.FONT_HERSHEY_SIMPLEX,
                   1,
                   (255,255,255),
                   2)

    return img, faces

In [24]:
# Live demo: show the webcam stream with face annotations; pressing "r"
# records a short voice sample and classifies it, pressing "q" quits.
cap = cv.VideoCapture(0)
duration = 2 # recording length in seconds
rec_rate = 44100
sent='Press "r" to register'
cv.startWindowThread()

# Message for each class index predicted by recnet.
audio_results = (
    "Raffaele ha detto acconsento",
    "Raffaele ha detto rifiuto",
    "Lorenzo ha detto acconsento",
    "Lorenzo ha detto rifiuto",
)

try:
    while True:
        ok, frame = cap.read()
        if not ok or frame is None:
            # Camera missing, busy or disconnected: stop instead of passing
            # None to process_frame.
            break

        frame, faces = process_frame(frame)
        cv.putText(frame, sent, (15, 37), cv.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2)
        cv.rectangle(frame, (1, 1), (850, 45), (255,255,255), 2)
        cv.imshow('Video', frame)

        # Poll the keyboard once per frame. Calling waitKey twice makes each
        # call consume key presses meant for the other check, so "q" and "r"
        # were randomly ignored.
        key = cv.waitKey(20) & 0xFF

        if key == ord("q"):
            break

        if key == ord('r'):
            # Record from the default microphone (blocks for `duration` seconds).
            prova = sd.rec(int(duration * rec_rate), samplerate=rec_rate, channels=1, blocking=True)
            wav.write('test.wav', rate=rec_rate, data=(prova))
            file_rate, rec = wav.read('test.wav')

            # Average the 20 MFCCs over time to get one feature vector.
            # `y` is passed by keyword: it is keyword-only in librosa >= 0.10.
            mfcc = np.mean(librosa.feature.mfcc(y=rec*1.0, sr=int(file_rate), n_mfcc=20).T, axis=0)
            mfcc = mfcc.reshape(1,mfcc.shape[0])
            prob_audio = recnet.predict(mfcc)

            pred_audio = int(np.argmax(prob_audio[0]))
            top_score = float(prob_audio[0][pred_audio])

            # Low confidence (or a class without a message) asks for a retry.
            # The displayed score is always the top probability, so no
            # out-of-range sentinel index is used on prob_audio.
            if top_score < 0.6 or pred_audio >= len(audio_results):
                res = "Retry"
            else:
                res = audio_results[pred_audio]

            sent = '{} {:.2f}'.format(res, top_score)
finally:
    # Always free the camera and close the window, even after an error.
    cap.release()
    cv.destroyAllWindows()