# Step 0. Import libraries and install all dependencies

In [1]:
#Clone the yolov5 github repo if needed
#!git clone https://github.com/ultralytics/yolov5

#Install the yolov5 requirements
#%cd yolov5
#%pip install -r requirements.txt

In [1]:
#Import libraries
import cv2
import torch
import numpy as np
import time
import os
from tensorflow.keras.models import load_model

#Fixes an error when loading the yolov5 model on Windows
#(presumably best.pt stores PosixPath objects that Windows cannot instantiate - TODO confirm).
#The alias is only applied on Windows: on any other system replacing PosixPath
#with WindowsPath would break pathlib.Path itself.
import pathlib
from pathlib import Path
if os.name == 'nt':
    pathlib.PosixPath = pathlib.WindowsPath

# Step 1. Define all functions

In [2]:
# Define a function to load yolov5 trained model
def load_custom_yolo(path_to_yolo_repo = r"C:\Users\kurom\Desktop\AI_GBC\S2\DL_II\Final_Project\yolov5"):
    """
    Load the custom fine tuned yolov5 model from a local clone of the yolov5 repo.

    The yolov5 repo must have been cloned beforehand and the best.pt weights file
    must be inside that folder; if needed, manually copy it there.

    path_to_yolo_repo -> path to the local yolov5 folder (contains hubconf.py and best.pt)

    Returns the loaded model with conf = 0.5 and iou = 0.3.
    Raises FileNotFoundError when the repo folder or best.pt cannot be found.

    The working directory of the notebook is NOT changed: the repo folder and an
    absolute weights path are passed directly to torch.hub.load.
    """
    #Resolve everything to absolute paths so the current working directory does not matter
    repo_dir = os.path.abspath(path_to_yolo_repo)
    weights_path = os.path.join(repo_dir, 'best.pt')

    #Fail early with a clear message instead of an obscure error inside torch.hub
    if not os.path.isdir(repo_dir):
        raise FileNotFoundError("yolov5 repo folder not found: " + repo_dir)
    if not os.path.isfile(weights_path):
        raise FileNotFoundError("best.pt not found inside yolov5 repo: " + weights_path)

    #Load Model (source='local' makes torch.hub read hubconf.py from repo_dir)
    custom_yolo = torch.hub.load(repo_dir, 'custom', path=weights_path, source='local')

    # Changing settings to prevent finding the faces multiple times
    custom_yolo.conf = 0.5
    custom_yolo.iou = 0.3

    print('\n')
    print("Yolo custom model successfully loaded...")

    return custom_yolo

#Define a function to load pre trained CNN to perform image classification
def load_img_clf(path_to_model_type = "baseline"):
    """
    Load a pre trained keras image classifier from a saved model file.

    path_to_model_type -> key that selects which saved model is loaded
    ('baseline', 'custom_resnet_hub' or 'custom_VGG16_dcl'). To add more models,
    add a new entry to model_paths below; modify the paths there if your files
    live somewhere else.

    Returns the model returned by load_model.
    Raises ValueError when path_to_model_type is not one of the known keys.
    """
    #Known model types -> saved model location
    model_paths = {
        'baseline': r"C:\Users\kurom\Desktop\AI_GBC\S2\DL_II\Final_Project\model.h5",
        'custom_resnet_hub': r"C:\Users\kurom\Desktop\AI_GBC\S2\DL_II\Final_Project\custom_resnet_hub.pb",
        'custom_VGG16_dcl': r"C:\Users\kurom\Desktop\AI_GBC\S2\DL_II\Final_Project\VGG_16_Cata.h5",
    }

    #Reject unknown keys explicitly instead of failing later on an unbound variable
    if path_to_model_type not in model_paths:
        raise ValueError(
            "Unknown path_to_model_type " + repr(path_to_model_type)
            + ", expected one of " + str(sorted(model_paths))
        )

    # Load the pre-trained model
    clf_model = load_model(model_paths[path_to_model_type])

    print('\n')
    print("Image Classifier successfully loaded ... ")

    return clf_model

#Define a function that executes preprocessing 
def image_preprocess(face, type='default', width=48, height=48):
    """
    Pre process a sliced_frame (a crop containing only 1 face) so it can be
    passed to the classifier model for inference.

    face -> image array of the cropped face
    type -> selects the pre processing recipe, only 'default' is implemented
    width, height -> target size used when resizing the crop

    Returns the resized crop scaled to [0, 1] with one extra leading axis
    added by np.expand_dims(axis=0).
    Raises ValueError for an unsupported type or for an empty crop.
    """
    #Fail loudly on unknown recipes instead of silently returning None
    if type != 'default':
        raise ValueError("Unsupported pre processing type " + repr(type) + ", expected 'default'")

    #An empty crop cannot be resized, report it with a clear message
    if face is None or getattr(face, 'size', 0) == 0:
        raise ValueError("Cannot pre process an empty face crop")

    # Resize the image
    resized_face = cv2.resize(face, (width, height))

    # Normalize the image (convert pixel values to [0, 1])
    normalized_face = resized_face.astype(float) / 255.0

    #Expand dims: add a leading batch axis
    return np.expand_dims(normalized_face, axis=0)
    
#Define a function to handle inference (predictions)
def inference(model, face, label_dict_type='baseline'):
    """
    Run the classifier on a single pre processed face and return the predicted
    emotion string i.e "Happy" and the associated probability as a string i.e "0.87".

    model -> object exposing predict(), called as model.predict(face, verbose=0)
    face -> pre processed face batch passed to model.predict
    label_dict_type -> selects the label map: 'baseline', 'resnet' or 'vgg16'

    Returns (label_str, score_str).
    Raises ValueError when label_dict_type is not one of the known keys.
    """
    #Label map depends on the model we are using for inference
    six_emotions = {0: 'Angry', 1: 'Fearful', 2: 'Happy', 3: 'Neutral', 4: 'sad', 5: 'surprised'}
    label_maps = {
        'resnet': six_emotions,
        'baseline': {0: 'Angry', 1: 'Disgust', 2: 'Fear', 3: 'Happy', 4: 'sad', 5: 'Surprised', 6: 'Neutral'},
        'vgg16': six_emotions,
    }

    #Reject unknown keys explicitly instead of failing later on an unbound variable
    if label_dict_type not in label_maps:
        raise ValueError(
            "Unknown label_dict_type " + repr(label_dict_type)
            + ", expected one of " + str(sorted(label_maps))
        )
    label_dict = label_maps[label_dict_type]

    #Make prediction, verbose=0 avoids printing a progress bar for every face of every frame
    pred = model.predict(face, verbose=0)
    label_key = int(pred.argmax(axis=1)[0])
    label_str = label_dict[label_key]
    score = round(float(pred[0][label_key]), 2)

    return label_str, str(score)

#Draw the green model banner (box + caption) at the top of the frame
def _draw_model_banner(frame, model_caption, font):
    """Draw the filled green box and write model_caption inside it, in place on frame."""
    cv2.rectangle(frame, (220, 1), (420, 25), color=(0,255,0), thickness=-1)
    cv2.putText(img=frame, text=model_caption, org=(260,20), 
                fontFace=font, fontScale=0.5, color=(0,0,0), thickness=1, 
                lineType=cv2.LINE_AA, bottomLeftOrigin=False)

#Draw the bbox and the two caption boxes for one detected face
def _draw_face_annotations(frame, box, face_caption, emotion_caption, font):
    """Draw bbox, face detection caption and emotion caption for one face, in place on frame."""
    x_1, y_1, x_2, y_2 = box

    #Draw bbox rectangle for face
    cv2.rectangle(frame, (x_1, y_1), (x_2, y_2), color=(0,0,255), thickness=1)
    #Draw face detection label rectangle
    cv2.rectangle(frame, (x_1, y_1-23), (x_2, y_1-7), color=(0,0,255), thickness=-1)
    #Draw emotion detection label rectangle
    cv2.rectangle(frame, (x_1, y_1-45), (x_2, y_1-28), color=(255,0,0), thickness=-1)

    #Write face detection caption
    cv2.putText(img=frame, text=face_caption, org=(x_1,y_1-10), 
                fontFace=font, fontScale=0.5, color=(255,255,255), thickness=1, 
                lineType=cv2.LINE_AA, bottomLeftOrigin=False)
    #Write emotion detection caption
    cv2.putText(img=frame, text=emotion_caption, org=(x_1,y_1-33), 
                fontFace=font, fontScale=0.5, color=(255,255,255), thickness=1, 
                lineType=cv2.LINE_AA, bottomLeftOrigin=False)

#Define main loop that will control the flow of this script
def main_loop(face_detector, emotion_clf, emotion_label_dict_type='baseline', process_frames=1, font = cv2.FONT_HERSHEY_DUPLEX, width=48, height=48, model_caption='baseline', max_faces=2, camera_index=0):
    """
    Run the live loop: read camera frames, detect faces, classify the emotion of
    each face and show the annotated frame until 'q' is pressed or no frame can be read.

    face_detector -> yolo fine tuned model, called on each processed frame; its result
    must expose .pred[0] with one row per face: x1, y1, x2, y2, detection score, ...
    emotion_clf -> keras pre trained model for img (emotion) classification 
    emotion_label_dict_type -> used in inference() call, controls the mapping of labels to emotions
    process_frames -> number of consecutive frames processed before one frame is skipped,
    this is to reduce processing load on pc, if unsure just leave it on 1
    font -> font used to write on the live video feed
    width, height -> used in image_preprocess(), shape a face is resized to before feeding it
    to the classifier model, must match the input shape of the classifier you choose
    model_caption -> descriptive caption written in the green box at the top of the live video
    feed, manually adjust it to the model you are testing, example "custom VGG 16 Cata"
    max_faces -> maximum number of faces annotated per frame (None = no limit)
    camera_index -> index passed to cv2.VideoCapture, 0 is the default camera

    The camera and the OpenCV windows are always released, even if an error or a
    kernel interrupt happens in the middle of the loop.
    """

    # Initialize the video capture from camera
    cap = cv2.VideoCapture(camera_index)

    #Control frame skipping to reduce processing load on pc
    process_frame = True
    frame_count = 0

    try:
        #Tell the user why nothing is shown instead of returning silently
        if not cap.isOpened():
            print("Could not open camera " + str(camera_index))
            return

        while True:
            # Read a frame from the camera
            ret, frame = cap.read()
            if not ret:
                break

            # Skip this frame and process the next one
            if not process_frame:
                process_frame = True
                continue

            frame_h, frame_w = frame.shape[:2]

            # 1st detect faces in frame. OpenCV frames are BGR while the yolov5 hub
            # model expects RGB numpy images, so convert a copy for detection only.
            detections = face_detector(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)).pred[0]

            #2nd if at least 1 face was detected
            if len(detections) > 0:
                face_number = 1 #detected face count
                for det in detections:
                    #Retrieve bbox coordinates for this face, clamped to the frame so that
                    #negative values cannot wrap around when slicing
                    x_1 = min(max(int(det[0]), 0), frame_w)
                    y_1 = min(max(int(det[1]), 0), frame_h)
                    x_2 = min(max(int(det[2]), 0), frame_w)
                    y_2 = min(max(int(det[3]), 0), frame_h)

                    #An empty crop cannot be resized, ignore this detection
                    if x_2 <= x_1 or y_2 <= y_1:
                        continue

                    #Crop face
                    #NOTE(review): the crop stays in BGR as before - confirm the channel
                    #order the classifier was trained with
                    sliced_face = frame[y_1:y_2, x_1:x_2]

                    #pre process sliced_face for img clf
                    processed_face = image_preprocess(face=sliced_face, type='default', width=width, height=height)

                    #make emotion prediction and retrieve emotion string and score for that emotion
                    emotion_str, e_score_str = inference(model=emotion_clf, face=processed_face, label_dict_type=emotion_label_dict_type)

                    #Retrieve probability score for object detection (this is different from img classification)
                    f_score_str = str(round(float(det[4]), 2))

                    _draw_face_annotations(
                        frame, (x_1, y_1, x_2, y_2),
                        face_caption='Face #'+str(face_number)+' '+f_score_str,
                        emotion_caption=emotion_str+' '+e_score_str,
                        font=font,
                    )

                    #Don't annotate more than the specified number of faces
                    if max_faces is not None and face_number >= max_faces:
                        break
                    face_number += 1 #Update detected faces counter

                #Banner is drawn once per frame, after the faces, so it stays readable
                _draw_model_banner(frame, model_caption, font)
            else: #When no faces are detected
                #Draw no face detected label rectangle
                cv2.rectangle(frame, (20, 1), (255, 25), color=(0,0,255), thickness=-1)
                _draw_model_banner(frame, model_caption, font)
                #Draw no face detected caption
                cv2.putText(img=frame, text='No faces detected', org=(20,20), fontFace=font, 
                            fontScale=0.8, color=(255,255,255), thickness=1, lineType=cv2.LINE_AA, 
                            bottomLeftOrigin=False)

            frame_count += 1 #update frame skipping control counter
            #when number of processed frames = the number of frames to be processed
            if frame_count == process_frames:
                frame_count = 0 #reset counter
                process_frame = False #skip next frame

            # Display the frame with bounding boxes, rectangles and captions
            cv2.imshow('Live Video', frame)

            # Check for q key press to exit
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Release the video capture and close the OpenCV windows
        cap.release()
        cv2.destroyAllWindows()


In [3]:
#Always load the yolo face detector first. The same custom_yolo_model is passed to main_loop
#by every classifier cell below, so this cell only needs to be re-run if a different
#object detector is tried for face detection (not for now).
custom_yolo_model = load_custom_yolo()

  from .autonotebook import tqdm as notebook_tqdm
YOLOv5  v7.0-294-gdb125a20 Python-3.10.8 torch-2.2.1+cpu CPU

Fusing layers... 
Model summary: 157 layers, 7012822 parameters, 0 gradients, 15.8 GFLOPs
Adding AutoShape... 




Yolo custom model successfully loaded...


### Evaluate different models: in each cell, first load the classifier and then call the main loop function (see the examples below)

In [None]:
#baseline: load the classifier, then start the live loop with its label map and input size
clf_model = load_img_clf(path_to_model_type='baseline')
main_loop(
    face_detector=custom_yolo_model,
    emotion_clf=clf_model,
    emotion_label_dict_type='baseline',
    process_frames=1,
    font=cv2.FONT_HERSHEY_DUPLEX,
    width=48,
    height=48,
    model_caption='baseline',
)

In [None]:
#VGG16: load the classifier, then start the live loop with its label map and input size
clf_model = load_img_clf(path_to_model_type='custom_VGG16_dcl')
main_loop(
    face_detector=custom_yolo_model,
    emotion_clf=clf_model,
    emotion_label_dict_type='vgg16',
    process_frames=1,
    font=cv2.FONT_HERSHEY_DUPLEX,
    width=224,
    height=224,
    model_caption='custom_VGG16',
)

In [None]:
#resnet: load the classifier, then start the live loop with its label map and input size
clf_model = load_img_clf(path_to_model_type='custom_resnet_hub')
main_loop(
    face_detector=custom_yolo_model,
    emotion_clf=clf_model,
    emotion_label_dict_type='resnet',
    process_frames=1,
    font=cv2.FONT_HERSHEY_DUPLEX,
    width=48,
    height=48,
    model_caption='custom_resnet',
)