# In [2]:
import tkinter as tk
from tkinter import ttk
import cv2
import numpy as np
import pyaudio
import librosa
from keras.models import load_model
from PIL import Image, ImageTk

# Load the trained model once, when this module is executed.
# The path 'emotion_model.h5' is relative, so it is resolved against the
# current working directory. The module-level name `model` is read by
# EmotionGUI.predict_emotion.
model = load_model('emotion_model.h5')

class EmotionGUI:
    """Tkinter window with a live webcam preview and a speech-emotion readout.

    The window contains a label showing the camera feed, a label showing the
    last detected emotion, and a button that records a short audio clip from
    the default input device and classifies it with the module-level
    ``model``.
    """

    # Audio capture settings shared by recording and feature extraction.
    SAMPLE_RATE = 22050      # Hz
    RECORD_SECONDS = 3       # length of the clip recorded per button press
    CHUNK_FRAMES = 1024      # frames requested per PyAudio read
    N_MFCC = 40              # number of MFCC coefficients fed to the model

    # Delay between two camera-feed refreshes, in milliseconds.
    FRAME_DELAY_MS = 30

    # Entry i names the class at index i of the model output.
    # NOTE(review): the order must match the labels used at training time and
    # 'ps' is presumably "pleasant surprise" -- confirm against the training
    # script.
    EMOTION_LABELS = ('angry', 'disgust', 'fear', 'happy', 'neutral', 'ps', 'sad')

    def __init__(self, root: "tk.Tk") -> None:
        """Build the widgets on *root*, open camera 0 and start the preview.

        Args:
            root: The Tk root window that hosts the widgets.
        """
        self.root = root
        self.root.title("Speech Emotion Recognition")

        # Identifier of the pending `after` callback, so quit() can cancel it.
        self._after_id = None

        # Initialize camera
        self.video_capture = cv2.VideoCapture(0)

        # Create label to display live camera feed
        self.camera_label = ttk.Label(root)
        self.camera_label.pack()

        # Create label to display detected emotion
        self.label = ttk.Label(root, text="Detected Emotion:", font=('Helvetica', 16))
        self.label.pack(pady=10)

        # Create buttons
        self.capture_button = ttk.Button(root, text="Capture Video and Voice", command=self.capture_video_voice)
        self.capture_button.pack(pady=10)

        # Start the self-rescheduling camera refresh loop
        self.update_camera_feed()

        # Release the camera before the window is destroyed
        self.root.protocol("WM_DELETE_WINDOW", self.quit)

    def capture_video_voice(self) -> None:
        """Record a short audio clip, classify it and display the result.

        Only the audio is used for the prediction; no camera frame is
        involved. Recording and inference run synchronously inside the Tk
        callback, so the window (including the camera preview) is not
        refreshed until this method returns.
        """
        audio_data = self.record_audio(self.SAMPLE_RATE, self.RECORD_SECONDS)

        # Use the same rate for feature extraction as was used for recording.
        emotion = self.predict_emotion(audio_data, sr=self.SAMPLE_RATE)

        # Display the detected emotion
        self.label.config(text=f"Detected Emotion: {emotion}")

    def record_audio(self, fs, duration):
        """Record mono float32 audio from the default input device.

        Args:
            fs: Sampling rate in Hz.
            duration: Requested clip length in seconds.

        Returns:
            A 1-D ``np.float32`` array of samples. The number of chunks read
            is truncated to an integer, so the clip can be up to one chunk
            shorter than ``fs * duration`` samples. The array is a read-only
            view of the recorded bytes.

        Raises:
            Whatever PyAudio raises when the device cannot be opened or read;
            the stream and the PyAudio instance are released in either case.
        """
        chunk = self.CHUNK_FRAMES
        p = pyaudio.PyAudio()
        try:
            stream = p.open(format=pyaudio.paFloat32, channels=1, rate=fs,
                            input=True, frames_per_buffer=chunk)
            try:
                print("Recording...")
                frames = [stream.read(chunk)
                          for _ in range(int(fs / chunk * duration))]
                print("Finished recording.")
            finally:
                # Always give the device back, even if a read failed.
                stream.stop_stream()
                stream.close()
        finally:
            p.terminate()
        return np.frombuffer(b''.join(frames), dtype=np.float32)

    def predict_emotion(self, audio_data, sr=None):
        """Classify an audio clip and return the predicted emotion label.

        Args:
            audio_data: 1-D float array of audio samples.
            sr: Sampling rate of *audio_data* in Hz. Defaults to
                ``SAMPLE_RATE`` (22050), the rate used by ``record_audio``
                when called from ``capture_video_voice``.

        Returns:
            The entry of ``EMOTION_LABELS`` at the arg-max of the model
            output.
        """
        if sr is None:
            sr = self.SAMPLE_RATE

        # Average each MFCC coefficient over time -> vector of N_MFCC values.
        mfcc = np.mean(
            librosa.feature.mfcc(y=audio_data, sr=sr, n_mfcc=self.N_MFCC).T,
            axis=0,
        )
        # Add a leading and a trailing axis: (N_MFCC,) -> (1, N_MFCC, 1).
        # NOTE(review): assumes this is the input shape the model was trained
        # with -- confirm.
        mfcc = np.expand_dims(mfcc, axis=0)
        mfcc = np.expand_dims(mfcc, axis=2)

        prediction = model.predict(mfcc)
        return self.EMOTION_LABELS[int(np.argmax(prediction))]

    def update_camera_feed(self) -> None:
        """Show the next camera frame and schedule the following refresh.

        A failed read (camera busy, unplugged, not yet ready) is skipped
        instead of raising, so the refresh loop keeps running and the preview
        resumes as soon as frames are available again.
        """
        ret, frame = self.video_capture.read()

        if ret and frame is not None:
            # OpenCV delivers BGR; PIL expects RGB.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            img = ImageTk.PhotoImage(Image.fromarray(frame_rgb))

            # Keep a reference on the widget so the image is not
            # garbage-collected while it is being displayed.
            self.camera_label.img = img
            self.camera_label.configure(image=img)

        # Remember the callback id so quit() can cancel the pending refresh.
        self._after_id = self.root.after(self.FRAME_DELAY_MS, self.update_camera_feed)

    def quit(self) -> None:
        """Stop the refresh loop, release the camera and close the window."""
        # Cancel first so no refresh runs against a released camera.
        if self._after_id is not None:
            self.root.after_cancel(self._after_id)
            self._after_id = None
        self.video_capture.release()
        self.root.destroy()

# Create and run the GUI
# Build the Tk root window, attach the EmotionGUI widgets to it, and enter the
# Tk event loop; mainloop() blocks until the window is destroyed.
# NOTE(review): these statements run whenever the module is executed or
# imported (there is no `if __name__ == "__main__":` guard).
root = tk.Tk()
gui = EmotionGUI(root)
root.mainloop()
