In [17]:
import soundfile
import numpy as np
import PySimpleGUI as sg
import librosa
import glob
import os
import pickle
import pyaudio
import wave
from sys import byteorder
from array import array
from struct import pack
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# Maps the two-digit emotion code found in a dataset file name (the third
# dash-separated field, as read by load_data) to its emotion label.
# Codes are assigned in order, starting at "01".
int2emotion = {
    "{:02d}".format(code): label
    for code, label in enumerate(
        (
            "neutral",
            "calm",
            "happy",
            "sad",
            "angry",
            "fearful",
            "disgust",
            "surprised",
        ),
        start=1,
    )
}

# Emotion labels that load_data keeps; samples whose label is not in this set
# are skipped. Every entry must match a value of int2emotion exactly, otherwise
# that class is silently dropped from the dataset (the previous spelling
# "disguest" excluded every "disgust" sample).
AVAILABLE_EMOTIONS = {
    "neutral",
    "calm",
    "happy",
    "sad",
    "angry",
    "fearful",
    "disgust",
    "surprised"
}

def extract_feature(file_name, **kwargs):
    """
    Build a 1-D feature vector for one audio file.

    The audio is decoded with ``librosa.load`` (default settings). Each
    requested feature is computed per frame, averaged over time, and the
    averages are concatenated in the fixed order:
    mfcc, chroma, mel, contrast, tonnetz.

    Parameters
    ----------
    file_name : str
        Path to the audio file.
    **kwargs
        Truthy flags selecting the features to include:
        ``mfcc`` (40 coefficients), ``chroma``, ``mel``, ``contrast``,
        ``tonnetz``. Flags that are missing or falsy are skipped.

    Returns
    -------
    numpy.ndarray
        Concatenated feature vector; an empty array when no flag is set.
    """
    use_mfcc = kwargs.get("mfcc")
    use_chroma = kwargs.get("chroma")
    use_mel = kwargs.get("mel")
    use_contrast = kwargs.get("contrast")
    use_tonnetz = kwargs.get("tonnetz")

    # librosa.load does the decoding itself; the previous extra
    # soundfile.SoundFile handle was opened but never read from.
    X, sample_rate = librosa.load(file_name)

    # The magnitude spectrogram is shared by chroma and spectral contrast, so
    # compute it once and only when one of them is requested.
    stft = np.abs(librosa.stft(X)) if (use_chroma or use_contrast) else None

    # Seed with an empty float array so the result dtype is promoted exactly
    # as it was when features were stacked one at a time.
    parts = [np.array([])]
    if use_mfcc:
        parts.append(np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0))
    if use_chroma:
        parts.append(np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0))
    if use_mel:
        parts.append(np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0))
    if use_contrast:
        parts.append(np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0))
    if use_tonnetz:
        harmonic = librosa.effects.harmonic(X)
        parts.append(np.mean(librosa.feature.tonnetz(y=harmonic, sr=sample_rate).T, axis=0))
    return np.hstack(parts)


def load_data(test_size=0.2, data_glob=None):
    """
    Load the labelled audio files, extract features and split train/test.

    Parameters
    ----------
    test_size : float
        Fraction of samples placed in the test split (passed to
        ``train_test_split``).
    data_glob : str, optional
        Glob pattern matching the ``.wav`` files to load. Defaults to the
        original hard-coded dataset location so existing callers keep working;
        pass a pattern to run on another machine.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` as returned by
        ``train_test_split`` with ``random_state=7``.

    Raises
    ------
    ValueError
        If the pattern matches no file, or no matched file has a label in
        ``AVAILABLE_EMOTIONS``.
    """
    if data_glob is None:
        data_glob = "C:\\Users\\Raaj\\Machine Learning\\Speech Emotion Recognition\\ravdess-emotional-speech-audio\\Actor_*\\\\*.wav"

    X, y = [], []
    # glob returns files in an OS-dependent order; sorting makes the seeded
    # split below reproducible across machines and runs.
    for file in sorted(glob.glob(data_glob)):
        basename = os.path.basename(file)
        # The emotion code is the third dash-separated field of the file name.
        emotion = int2emotion[basename.split("-")[2]]
        if emotion not in AVAILABLE_EMOTIONS:
            continue
        features = extract_feature(file, mfcc=True, chroma=True, mel=True)
        X.append(features)
        y.append(emotion)

    if not X:
        # Fail with an actionable message instead of the generic error that
        # train_test_split raises on an empty dataset.
        raise ValueError("No usable audio files matched pattern: {!r}".format(data_glob))
    return train_test_split(np.array(X), y, test_size=test_size, random_state=7)

X_train, X_test, y_train, y_test = load_data(test_size=0.25)

# Preprocessing: normalize input features. The scaler is fitted on the training
# split only and then applied to the test split. Any later prediction with the
# trained model must pass its features through this same fitted `scaler`.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model tuning: grid search with cross-validation
param_grid = {
    'hidden_layer_sizes': [(500,), (500, 250), (500, 250, 100)],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive'],
}

# A fixed random_state makes weight initialisation, and therefore the selected
# hyper-parameters and reported accuracy, reproducible between runs.
grid_search = GridSearchCV(
    MLPClassifier(max_iter=1500, random_state=7), param_grid, cv=3, verbose=2
)
grid_search.fit(X_train_scaled, y_train)

best_params = grid_search.best_params_
model = grid_search.best_estimator_

# Evaluate the model on the held-out split
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("[+] Number of training samples:", X_train.shape[0])
print("[+] Number of testing samples:", X_test.shape[0])
print("[+] Number of features:", X_train.shape[1])
print("Best parameters:", best_params)
print("Accuracy: {:.2f}%".format(accuracy * 100))


# exist_ok avoids the check-then-create race of isdir() followed by mkdir().
os.makedirs("result", exist_ok=True)

# The context manager guarantees the file is flushed and closed even if
# pickling fails part-way.
with open("result/mlp_classifier.model", "wb") as model_file:
    pickle.dump(model, model_file)

# Amplitude cutoff that is_silent() and trim() compare samples against.
THRESHOLD = 500
# Number of frames requested per stream read in record(); also passed to
# PyAudio as frames_per_buffer.
CHUNK_SIZE = 1024
# PyAudio sample format used to open the stream and to look up the sample width.
FORMAT = pyaudio.paInt16
# Sampling rate: used for the input stream, the padding length in
# add_silence(), and the frame rate of the written WAV file.
RATE = 16000
# record() stops once the count of silent chunks seen after sound started
# exceeds this value.
SILENCE = 30

def is_silent(snd_data, threshold=None):
    """
    Return True when no sample in ``snd_data`` reaches the silence threshold.

    Parameters
    ----------
    snd_data : iterable of int
        Signed audio samples.
    threshold : int, optional
        Amplitude cutoff; defaults to the module-level ``THRESHOLD``.

    Returns
    -------
    bool
        True if the peak absolute amplitude is below the threshold. An empty
        chunk is treated as silent.
    """
    if threshold is None:
        threshold = THRESHOLD
    # Compare magnitudes: samples are signed, so a loud chunk whose peaks are
    # all negative must not count as silent (trim() also uses abs()).
    peak = max((abs(sample) for sample in snd_data), default=0)
    return peak < threshold

def normalize(snd_data):
    """
    Scale the samples so the loudest one has an absolute value of 16384.

    Parameters
    ----------
    snd_data : iterable of int
        Signed audio samples (must be re-iterable, e.g. an array or list).

    Returns
    -------
    array.array
        New ``array('h')`` with every sample multiplied by the same factor and
        truncated to int. Empty or all-zero input is returned unscaled (as
        zeros) instead of raising.
    """
    MAXIMUM = 16384
    peak = max((abs(sample) for sample in snd_data), default=0)
    # With no signal at all there is nothing to scale; using a factor of 1
    # avoids the division by zero that an all-zero recording would cause.
    times = float(MAXIMUM) / peak if peak else 1.0
    return array('h', (int(sample * times) for sample in snd_data))

def trim(snd_data):
    """
    Strip the quiet samples from both ends of ``snd_data``.

    A sample is quiet when its absolute value does not exceed ``THRESHOLD``.
    Everything from the first loud sample to the last loud sample is kept,
    including quiet samples in between. The input is not modified.

    Returns a new ``array('h')``; it is empty when no sample is loud.
    """
    def _drop_leading_quiet(samples):
        # Copy samples starting at the first one above the threshold.
        kept = array('h')
        started = False
        for sample in samples:
            started = started or abs(sample) > THRESHOLD
            if started:
                kept.append(sample)
        return kept

    # Trim the front, then flip and trim again to remove the tail, and flip
    # back to restore the original order.
    front_trimmed = _drop_leading_quiet(snd_data)
    back_trimmed = _drop_leading_quiet(front_trimmed[::-1])
    return back_trimmed[::-1]

def add_silence(snd_data, seconds):
    """
    Return ``snd_data`` padded with ``seconds`` of zero samples on each side.

    The pad length is ``int(seconds * RATE)`` samples. The result is a new
    ``array('h')``; the input is left untouched.
    """
    pad_length = int(seconds * RATE)
    padding = array('h', [0]) * pad_length

    padded = array('h', padding)
    padded.extend(snd_data)
    padded.extend(padding)
    return padded
def record():
    """
    Record mono audio from the default input device until speech has ended.

    Chunks of ``CHUNK_SIZE`` frames are read continuously. Recording stops once
    sound has been detected and more than ``SILENCE`` silent chunks have been
    seen since then. The silent-chunk count is cumulative; it is not reset when
    sound resumes. The captured audio is then normalized, trimmed of
    leading/trailing quiet samples and padded with 0.5 s of silence per side.

    Returns
    -------
    tuple
        ``(sample_width, samples)`` where ``sample_width`` is the byte width
        reported by PyAudio for ``FORMAT`` and ``samples`` is an ``array('h')``.
    """
    p = pyaudio.PyAudio()
    try:
        # Input-only stream: nothing is ever written to it, so requesting an
        # output channel would only add a needless dependency on a playback
        # device.
        stream = p.open(format=FORMAT, channels=1, rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK_SIZE)
        try:
            num_silent = 0
            snd_started = False
            r = array('h')

            while True:
                snd_data = array('h', stream.read(CHUNK_SIZE))
                if byteorder == 'big':
                    snd_data.byteswap()
                r.extend(snd_data)

                silent = is_silent(snd_data)

                if silent and snd_started:
                    num_silent += 1
                elif not silent and not snd_started:
                    snd_started = True

                if snd_started and num_silent > SILENCE:
                    break
        finally:
            # Release the audio device even if reading fails or is interrupted.
            stream.stop_stream()
            stream.close()

        sample_width = p.get_sample_size(FORMAT)
    finally:
        p.terminate()

    r = normalize(r)
    r = trim(r)
    r = add_silence(r, 0.5)
    return sample_width, r

def record_to_file(path):
    """
    Record audio with ``record()`` and write it to ``path`` as a mono WAV file.

    Parameters
    ----------
    path : str
        Destination file; it is created or overwritten.
    """
    sample_width, data = record()
    # Serialize the samples as little-endian 16-bit integers.
    data = pack('<' + ('h'*len(data)), *data)

    # The context manager closes the writer (finalising the WAV header) even
    # if one of the calls below raises.
    with wave.open(path, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(data)
    

    

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] END alpha=0.0001, hidden_layer_sizes=(500,), learning_rate=constant; total time=  15.9s
[CV] END alpha=0.0001, hidden_layer_sizes=(500,), learning_rate=constant; total time=  16.3s
[CV] END alpha=0.0001, hidden_layer_sizes=(500,), learning_rate=constant; total time=  16.4s
[CV] END alpha=0.0001, hidden_layer_sizes=(500,), learning_rate=adaptive; total time=  16.3s
[CV] END alpha=0.0001, hidden_layer_sizes=(500,), learning_rate=adaptive; total time=  15.9s
[CV] END alpha=0.0001, hidden_layer_sizes=(500,), learning_rate=adaptive; total time=  17.3s
[CV] END alpha=0.0001, hidden_layer_sizes=(500, 250), learning_rate=constant; total time=  15.3s
[CV] END alpha=0.0001, hidden_layer_sizes=(500, 250), learning_rate=constant; total time=  16.0s
[CV] END alpha=0.0001, hidden_layer_sizes=(500, 250), learning_rate=constant; total time=  16.0s
[CV] END alpha=0.0001, hidden_layer_sizes=(500, 250), learning_rate=adaptive; total time= 

In [19]:
if __name__ == "__main__":
    # NOTE(security): pickle.load can execute arbitrary code; only load model
    # files produced by this notebook.
    with open("result/mlp_classifier.model", "rb") as model_file:
        model = pickle.load(model_file)
    print("Please talk")
    filename = "test.wav"
    record_to_file(filename)
    features = extract_feature(filename, mfcc=True, chroma=True, mel=True).reshape(1, -1)
    # The classifier was fitted on StandardScaler-transformed features, so the
    # recording must go through the same fitted `scaler` (defined in the
    # training cell) before prediction; raw features give meaningless output.
    features = scaler.transform(features)
    result = model.predict(features)[0]
    outpu="The emotion speaker expressing is : "+result

    # Show the prediction in a small dialog. This lives inside the guard
    # because it depends on `outpu`, which only exists when the guard runs.
    layout = [[sg.Text(outpu)], [sg.Button("OK")]]
    window = sg.Window("Results", layout, margins=(100, 50))
    while True:
        event, values = window.read()
        if event == "OK" or event == sg.WIN_CLOSED:
            break

    window.close()

Please talk
