In [None]:
from scipy.spatial import distance
from imutils import face_utils
import imutils
import dlib
import cv2
from deepface import DeepFace
import numpy as np 
import pyaudio
import wave
import matplotlib.pyplot as plt
import threading
import keras
import librosa


In [None]:
def eye_aspect_ratio(eye):
    """Return the eye aspect ratio (EAR) for one eye.

    Args:
        eye: indexable collection of at least six landmark points; points
            1/5 and 2/4 are paired as the two vertical spans, points 0/3 as
            the horizontal span.

    Returns:
        The sum of the two vertical distances divided by twice the
        horizontal distance.
    """
    vertical_total = (distance.euclidean(eye[1], eye[5])
                      + distance.euclidean(eye[2], eye[4]))
    horizontal = distance.euclidean(eye[0], eye[3])
    return vertical_total / (2.0 * horizontal)

In [None]:
# Drowsiness settings used by the webcam loop: an eye aspect ratio below
# `thresh` counts as a closed-eye frame, and the alert is drawn once the
# closed-eye counter reaches `frame_check`.
thresh, frame_check = 0.25, 20

# dlib face detector and the landmark predictor loaded from the .dat file,
# which must be present in the working directory.
detect = dlib.get_frontal_face_detector()
predict = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")

In [None]:
# Start/end slice bounds of each eye inside the landmark array, as published
# by imutils.
landmark_idxs = face_utils.FACIAL_LANDMARKS_IDXS
lStart, lEnd = landmark_idxs["left_eye"]
rStart, rEnd = landmark_idxs["right_eye"]

In [None]:
# Place the image next to this notebook and put its file name here.
imgpath = "sad face.jpg"
image = cv2.imread(imgpath)

# Run only the emotion model and report the strongest emotion of the first
# entry in the result list.
analyze = DeepFace.analyze(img_path=imgpath, actions=['emotion'])
print(analyze[0].get('dominant_emotion'))

In [None]:
# Full attribute analysis (age, gender, race, emotion) of a sample image.
# NOTE(review): the path below is anchored at the filesystem root ("/...");
# confirm whether a path relative to the notebook was intended.
obj = DeepFace.analyze(
    img_path="/Drowsiness-Detection-system-with-emotion-analysis-for-improved-vehicle-experience/Combination of Both/emotions/scared.PNG",
    actions=['age', 'gender', 'race', 'emotion'],
)
print(obj)

In [None]:
# Record audio from the microphone and save it as a WAV file

def mp3_to_wav(seconds=8, filename='output.wav'):
    """Record audio from a PyAudio input stream and save it as a WAV file.

    Despite its name, this function performs no MP3 conversion: it opens a
    mono, 16-bit, 16 kHz input stream, reads ``seconds`` worth of buffers and
    writes the raw frames to ``filename``.

    Args:
        seconds: recording length in seconds (default 8).
        filename: path of the WAV file to create or overwrite
            (default ``'output.wav'``).

    Returns:
        None. The recording is written to ``filename``.
    """
    FRAMES_PER_BUFFER = 3200
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000

    pa = pyaudio.PyAudio()
    try:
        # Ask for the sample width while the PyAudio instance is still alive
        # instead of after terminate().
        sample_width = pa.get_sample_size(FORMAT)

        stream = pa.open(
            format=FORMAT,
            channels=CHANNELS,
            rate=RATE,
            input=True,
            frames_per_buffer=FRAMES_PER_BUFFER
        )
        try:
            print('Recording Started')
            buffer_count = int(RATE / FRAMES_PER_BUFFER * seconds)
            frames = [stream.read(FRAMES_PER_BUFFER) for _ in range(buffer_count)]
        finally:
            # Always release the audio device, even if a read fails.
            stream.stop_stream()
            stream.close()
    finally:
        pa.terminate()

    # The context manager closes the file even if writing raises.
    with wave.open(filename, 'wb') as wav_file:
        wav_file.setnchannels(CHANNELS)
        wav_file.setsampwidth(sample_width)
        wav_file.setframerate(RATE)
        wav_file.writeframes(b''.join(frames))

    # reading input from file


    # file = wave.open('lemaster_tech.wav', 'rb')

    # sample_freq = file.getframerate()
    # frames = file.getnframes()
    # signal_wave = file.readframes(-1)

    # file.close()

    # time = frames / sample_freq


    # # if one channel use int16, if 2 use int32
    # audio_array = np.frombuffer(signal_wave, dtype=np.int16)

    # times = np.linspace(0, time, num=frames)

    # plt.figure(figsize=(15, 5))
    # plt.plot(times, audio_array)
    # plt.ylabel('Signal Wave')
    # plt.xlabel('Time (s)')
    # plt.xlim(0, time)
    # plt.title('The Thing I Just Recorded!!')
    # plt.show()

In [None]:
def convert_class_to_emotion(pred):
    """
    Method to convert the predictions (int) into human readable strings.

    Args:
        pred: predicted class index, compared by equality against 0..7
            (a plain int or a NumPy integer both work).

    Returns:
        The emotion label for that index.

    Raises:
        ValueError: if ``pred`` matches none of the known class indices
            (previously this surfaced as an UnboundLocalError).
    """
    # Position in the tuple is the class index.
    emotion_labels = ('neutral', 'calm', 'happy', 'sad',
                      'angry', 'fearful', 'disgust', 'surprised')

    for index, label in enumerate(emotion_labels):
        if index == pred:
            return label
    raise ValueError(f"Unknown emotion class index: {pred!r}")


# Loaded Keras models keyed by file path, so repeated predictions reuse them.
_VOICE_MODEL_CACHE = {}


def _get_voice_model(path):
    """Load the Keras model at ``path`` on first use and reuse it afterwards."""
    if path not in _VOICE_MODEL_CACHE:
        _VOICE_MODEL_CACHE[path] = keras.models.load_model(path)
    return _VOICE_MODEL_CACHE[path]


def mer_prediction(file):
    """Predict the emotion expressed in an audio file.

    The audio is loaded with librosa, 40 MFCCs are computed and averaged over
    axis 0 of the transposed MFCC matrix, and the resulting vector (with a
    trailing and then a leading singleton axis added) is passed to the model
    stored in ``Emotion_Voice_Detection_Model.h5``.

    Args:
        file: path of the audio file to analyse.

    Returns:
        The predicted emotion label, as produced by
        ``convert_class_to_emotion``. The label is also printed.
    """
    path = 'Emotion_Voice_Detection_Model.h5'
    # Loading the model is expensive and this function is called repeatedly
    # from the video loop, so the model is loaded only once.
    loaded_model = _get_voice_model(path)

    data, sampling_rate = librosa.load(file)
    mfccs = np.mean(librosa.feature.mfcc(y=data, sr=sampling_rate, n_mfcc=40).T, axis=0)
    x = np.expand_dims(mfccs, axis=1)
    x = np.expand_dims(x, axis=0)
    predictions = loaded_model.predict(x)
    predicted_class = np.argmax(predictions, axis=1)  # Get class index

    emotion = convert_class_to_emotion(predicted_class[0])
    print("Prediction is", emotion)
    return emotion


In [None]:
def _draw_alert(frame, message):
    """Draw ``message`` in red on ``frame`` (in place) at (10, 30) and (10, 325)."""
    for origin in ((10, 30), (10, 325)):
        cv2.putText(frame, message, origin,
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)


# Haar cascade used to locate faces before running the emotion analysis.
face_cascade_name = cv2.data.haarcascades + 'haarcascade_frontalface_alt.xml'
face_cascade = cv2.CascadeClassifier()
if not face_cascade.load(cv2.samples.findFile(face_cascade_name)):
    # An unloaded classifier would only fail later inside detectMultiScale,
    # so stop here with a clear message before the camera is opened.
    raise RuntimeError("Error loading xml file")

video = cv2.VideoCapture(0)  # request input from the default webcam

# Record audio in the background while the video loop runs.
mp3_thread = threading.Thread(target=mp3_to_wav, args=())
mp3_thread.start()

# Closed-eye frame counter; it must exist before the first `flag += 1`.
flag = 0

try:
    while True:
        ret, frame = video.read()
        if not ret:
            # No frame was delivered (camera missing, unplugged or busy).
            print("Could not read a frame from the camera; stopping.")
            break

        frame = imutils.resize(frame, width=450)
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

        # --- Drowsiness: eye aspect ratio from dlib landmarks ---
        subjects = detect(gray, 0)
        for subject in subjects:
            shape = predict(gray, subject)
            shape = face_utils.shape_to_np(shape)  # convert to a NumPy array
            leftEye = shape[lStart:lEnd]
            rightEye = shape[rStart:rEnd]
            leftEAR = eye_aspect_ratio(leftEye)
            rightEAR = eye_aspect_ratio(rightEye)
            ear = (leftEAR + rightEAR) / 2.0
            leftEyeHull = cv2.convexHull(leftEye)
            rightEyeHull = cv2.convexHull(rightEye)
            cv2.drawContours(frame, [leftEyeHull], -1, (0, 255, 0), 1)
            cv2.drawContours(frame, [rightEyeHull], -1, (0, 255, 0), 1)
            if ear < thresh:
                flag += 1
                if flag >= frame_check:
                    _draw_alert(frame, "****************ALERT!****************")
            else:
                flag = 0

        # --- Facial emotion: Haar cascade faces, then DeepFace ---
        face = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5)

        for x, y, w, h in face:
            # Outline every detected face.
            cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 0, 255), 1)

        if len(face) > 0:
            # DeepFace receives the whole frame, so one call per frame is
            # enough regardless of how many faces the cascade reported.
            try:
                analysis = DeepFace.analyze(frame, actions=['emotion'])
                dominant_emotion = analysis[0].get('dominant_emotion')
            except Exception:
                # Best effort: skip the emotion overlay for this frame when
                # the analysis fails. KeyboardInterrupt still propagates.
                dominant_emotion = None

            if dominant_emotion in ('sad', 'fear', 'surprise', 'angry'):
                _draw_alert(frame, "************ALERT!" + dominant_emotion + "************")

        # --- Voice emotion: analyse each finished recording, then restart ---
        if not mp3_thread.is_alive():
            emotion = mer_prediction(file='output.wav')
            print(emotion)
            # The voice model's label is 'surprised' (see
            # convert_class_to_emotion), not 'surprise'.
            if emotion in ('sad', 'fearful', 'surprised', 'angry'):
                _draw_alert(frame, "************ALERT!" + emotion + "************")
            mp3_thread = threading.Thread(target=mp3_to_wav, args=())
            mp3_thread.start()

        # Show the annotated frame to the user.
        cv2.imshow('frame', frame)

        key = cv2.waitKey(1) & 0xFF
        if key == ord('q'):  # 'q' stops the loop
            break
finally:
    # Free the camera and close the window on normal exit and on errors.
    cv2.destroyAllWindows()
    video.release()
    