In [42]:
import os
import json
import librosa
import numpy as np
import tkinter as tk
from tkinter import filedialog, messagebox
from tensorflow.keras.models import model_from_json
from sklearn.preprocessing import LabelEncoder

# Load emotion model: the architecture is read from 'model1.json' and the
# weights from 'Speech_Model.h5'. Both paths are relative to the current
# working directory, and loading happens once at import / cell execution.
with open('model1.json', 'r') as json_file:
    loaded_model_json = json_file.read()
emotion_model = model_from_json(loaded_model_json)
emotion_model.load_weights('Speech_Model.h5')

# Load gender model the same way: architecture from 'model2.json', weights
# from 'Gender_Model.h5'.
with open('model2.json', 'r') as json_file:
    loaded_gender_model_json = json_file.read()
gender_model = model_from_json(loaded_gender_model_json)
gender_model.load_weights('Gender_Model.h5')

# NOTE: model summaries are not printed here. Call emotion_model.summary() /
# gender_model.summary() manually to check the expected input shapes.

# Emotion class names, defined by hand.
# NOTE(review): presumably these must match the classes (and their order)
# used when the emotion model was trained -- confirm against the training code.
emotion_classes = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad']

# Label encoder fitted on the emotion names; it is used to map a predicted
# class index back to its name.
emotion_encoder = LabelEncoder().fit(emotion_classes)

# Mapping from gender name to integer label.
label2int = dict(male=1, female=0)

def preprocess_audio(file_path):
    """Load an audio file and return a trimmed, normalized waveform.

    The file is loaded at a 16 kHz sample rate, run through
    ``librosa.effects.trim`` and then through ``librosa.util.normalize``.

    Args:
        file_path: Path of the audio file to load.

    Returns:
        A ``(waveform, sample_rate)`` tuple.
    """
    waveform, sample_rate = librosa.load(file_path, sr=16000)
    # trim() returns (trimmed_signal, interval); only the signal is needed.
    trimmed, _ = librosa.effects.trim(waveform)
    return librosa.util.normalize(trimmed), sample_rate

def extract_feature(file_name, **kwargs):
    """Extract a flat feature vector from an audio file.

    Each requested feature is computed with librosa, averaged over its
    frames, and appended to the result in a fixed order: MFCC, chroma, mel
    spectrogram, spectral contrast, tonnetz.

    Args:
        file_name: Path of the audio file to load (librosa's default sample
            rate is used).
        **kwargs: Boolean switches selecting the features to include:
            ``mfcc`` (40 coefficients), ``chroma``, ``mel``, ``contrast``
            and ``tonnetz``. Every switch defaults to ``False``.

    Returns:
        A 1-D NumPy array with the selected features concatenated; it is
        empty when no feature is requested.
    """
    use_mfcc = kwargs.get("mfcc", False)
    use_chroma = kwargs.get("chroma", False)
    use_mel = kwargs.get("mel", False)
    use_contrast = kwargs.get("contrast", False)
    use_tonnetz = kwargs.get("tonnetz", False)

    X, sample_rate = librosa.load(file_name)

    # The magnitude spectrogram is only needed by chroma and contrast.
    stft = None
    if use_chroma or use_contrast:
        stft = np.abs(librosa.stft(X))

    result = np.array([])
    if use_mfcc:
        mfcc_mean = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
        result = np.hstack((result, mfcc_mean))
    if use_chroma:
        chroma_mean = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
        result = np.hstack((result, chroma_mean))
    if use_mel:
        # The waveform must be passed as ``y=``: recent librosa releases
        # reject it as a positional argument.
        mel_mean = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
        result = np.hstack((result, mel_mean))
    if use_contrast:
        contrast_mean = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
        result = np.hstack((result, contrast_mean))
    if use_tonnetz:
        harmonic = librosa.effects.harmonic(X)
        tonnetz_mean = np.mean(librosa.feature.tonnetz(y=harmonic, sr=sample_rate).T, axis=0)
        result = np.hstack((result, tonnetz_mean))
    return result

def extract_features(y, sr, n_mfcc=13):
    """Return the frame-averaged MFCCs of a waveform.

    Args:
        y: Audio waveform passed to ``librosa.feature.mfcc``.
        sr: Sample rate of ``y``.
        n_mfcc: Number of MFCC coefficients to compute (default 13).

    Returns:
        The mean over axis 0 of the transposed MFCC matrix, i.e. one
        averaged value per coefficient.
    """
    coefficients = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    return np.mean(coefficients.T, axis=0)

def extract_gender_features(y, sr, n_mfcc=128):
    """Return the frame-averaged MFCCs used as gender-model input.

    Args:
        y: Audio waveform passed to ``librosa.feature.mfcc``.
        sr: Sample rate of ``y``.
        n_mfcc: Number of MFCC coefficients to compute (default 128).

    Returns:
        The mean over axis 0 of the transposed MFCC matrix, i.e. one
        averaged value per coefficient.
    """
    coefficient_matrix = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    averaged = np.mean(coefficient_matrix.T, axis=0)
    return averaged

def predict_gender(file_path):
    """Classify the speaker of an audio file as "male" or "female".

    The file is preprocessed, reduced to 128 averaged MFCC values and fed to
    ``gender_model`` as a batch of one. The first output value is compared
    against 0.5: below the threshold means "female", otherwise "male".

    NOTE(review): ``extract_feature(..., mel=True)`` is not called anywhere
    in the visible code -- confirm that the gender model was trained on
    MFCCs rather than on those mel features.

    Args:
        file_path: Path of the audio file to classify.

    Returns:
        The string "female" or "male".
    """
    waveform, sample_rate = preprocess_audio(file_path)
    mfcc_means = extract_gender_features(waveform, sample_rate, n_mfcc=128)
    # Prepend the batch axis expected by predict().
    batch = mfcc_means[np.newaxis, ...]
    print(f"Gender Features shape: {batch.shape}")
    score = gender_model.predict(batch)[0][0]
    if score < 0.5:
        return "female"
    return "male"

def predict_emotion(file_path):
    """Predict the emotion expressed in an audio file.

    The file is preprocessed, reduced to 13 averaged MFCC values, reshaped
    to a ``(1, 13, 1, 1)`` batch and passed to ``emotion_model``. The index
    of the largest output is mapped back to a name with ``emotion_encoder``.

    Args:
        file_path: Path of the audio file to analyse.

    Returns:
        The predicted emotion name, or ``None`` when the model's winning
        class index has no corresponding entry in ``emotion_encoder``.
    """
    n_mfcc = 13
    y, sr = preprocess_audio(file_path)
    features = extract_features(y, sr, n_mfcc=n_mfcc)
    # Batch of one sample shaped (13, 1, 1) to match the model input.
    features = features.reshape(1, n_mfcc, 1, 1)
    print(f"Emotion Features shape: {features.shape}")
    predicted_index = int(emotion_model.predict(features).argmax())
    # inverse_transform() raises on an index it has not seen, so an
    # out-of-range prediction is turned into the documented None result
    # instead of an exception.
    if predicted_index >= len(emotion_encoder.classes_):
        return None
    return emotion_encoder.inverse_transform([predicted_index])[0]

class EmotionDetectionApp:
    """Tkinter front end for detecting the emotion in an uploaded recording.

    The window shows a prompt, an upload button and a result label. An
    uploaded file is first classified by gender; only files classified as
    "female" are passed on to emotion detection.
    """

    def __init__(self, root):
        """Attach the app to ``root`` (a Tk root window) and build its widgets."""
        self.root = root
        self.root.title("Emotion Detection from Voice")
        self.create_widgets()

    def create_widgets(self):
        """Create and pack the prompt label, upload button and result label."""
        self.label = tk.Label(self.root, text="Upload a Voice Recording")
        self.label.pack(pady=10)

        self.upload_button = tk.Button(self.root, text="Upload Audio", command=self.upload_audio)
        self.upload_button.pack(pady=5)

        self.result_label = tk.Label(self.root, text="")
        self.result_label.pack(pady=10)

    def upload_audio(self):
        """Ask for an audio file, run the predictions and display the result.

        Does nothing when the file dialog is cancelled. Any failure while
        loading or analysing the file is reported in an error dialog instead
        of being left to Tk's default callback handler.
        """
        file_path = filedialog.askopenfilename(filetypes=[("Audio Files", "*.wav *.mp3")])
        if not file_path:
            return

        try:
            gender = predict_gender(file_path)
            # Emotion detection is only run for voices classified as female.
            emotion = predict_emotion(file_path) if gender == "female" else None
        except Exception as exc:  # UI boundary: surface the failure to the user
            import traceback

            # Keep the full traceback on the console for debugging.
            traceback.print_exc()
            self.result_label.config(text="")
            messagebox.showerror("Error", f"Could not process the audio file:\n{exc}")
            return

        if gender != "female":
            self.result_label.config(text="Please upload a female voice")
            return

        if emotion:
            self.result_label.config(text=f"Detected Emotion: {emotion}")
        else:
            self.result_label.config(text="Could not detect emotion. Please upload a clear voice recording.")

if __name__ == "__main__":
    # Create the Tk root window, attach the app to it and start the Tk
    # event loop.
    root = tk.Tk()
    app = EmotionDetectionApp(root)
    root.mainloop()


Gender Features shape: (1, 128)
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 321ms/step
Emotion Features shape: (1, 13, 1, 1)
1/1 ━━━━━━━━━━━━━━━━━━━━ 1s 576ms/step
Gender Features shape: (1, 128)
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 116ms/step
