In [1]:
import os
import numpy as np
import pandas as pd
import librosa
import joblib
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential,load_model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
import tensorflow as tf
import tkinter as tk
from tkinter import filedialog

In [2]:
# Root folder of the dataset; the cell below treats every entry listed here
# as a per-actor sub-folder.
ravdess = "data"

# os.listdir() returns entries in arbitrary order and also returns plain files
# (e.g. OS metadata files). Keep only sub-directories and sort them so that the
# resulting DataFrame row order is reproducible across machines.
ravdess_directory_list = sorted(
    entry
    for entry in os.listdir(ravdess)
    if os.path.isdir(os.path.join(ravdess, entry))
)

In [4]:
# Emotion code (third dash-separated field of the file name) -> label.
emotion_labels = {
    1: 'neutral', 2: 'calm', 3: 'happy', 4: 'sad',
    5: 'angry', 6: 'fear', 7: 'disgust', 8: 'surprise'
}

emotions = []
paths = []

for actor in ravdess_directory_list:
    actor_path = os.path.join(ravdess, actor)
    # Stray files next to the actor folders would make os.listdir() raise.
    if not os.path.isdir(actor_path):
        continue

    # Sorted so the row order does not depend on the filesystem.
    for audio_file in sorted(os.listdir(actor_path)):
        file_parts = audio_file.split('.')[0].split('-')
        if len(file_parts) <= 2:
            continue
        try:
            emotion_code = int(file_parts[2])
        except ValueError:
            # Third field is not numeric: not a file following the naming scheme.
            continue
        if emotion_code not in emotion_labels:
            # Unknown codes would map to NaN and break label encoding later on.
            continue
        emotions.append(emotion_code)
        paths.append(os.path.join(actor_path, audio_file))

data = pd.DataFrame({
    'Emotion': emotions,
    'File_Path': paths
})

# Replace the numeric code by its human-readable label.
data['Emotion'] = data['Emotion'].map(emotion_labels)

head_and_tail = pd.concat([data.head(), data.tail()])
print("First and last few rows of the DataFrame:")
print(head_and_tail)


First and last few rows of the DataFrame:
       Emotion                               File_Path
0      neutral  data\Actor_01\03-01-01-01-01-01-01.wav
1      neutral  data\Actor_01\03-01-01-01-01-02-01.wav
2      neutral  data\Actor_01\03-01-01-01-02-01-01.wav
3      neutral  data\Actor_01\03-01-01-01-02-02-01.wav
4         calm  data\Actor_01\03-01-02-01-01-01-01.wav
1435  surprise  data\Actor_24\03-01-08-01-02-02-24.wav
1436  surprise  data\Actor_24\03-01-08-02-01-01-24.wav
1437  surprise  data\Actor_24\03-01-08-02-01-02-24.wav
1438  surprise  data\Actor_24\03-01-08-02-02-01-24.wav
1439  surprise  data\Actor_24\03-01-08-02-02-02-24.wav


In [5]:
class FeatureExtractor:
    """Compute frame-level librosa features and flatten them into 1-D vectors.

    Attributes:
        frame_length: analysis window size passed to the ZCR and RMS extractors.
        hop_length: hop size passed to every extractor.
    """

    def __init__(self, frame_length=2048, hop_length=512):
        self.frame_length, self.hop_length = frame_length, hop_length

    def zcr(self, data):
        """Return the zero-crossing rate of ``data`` as a flat array."""
        rate = librosa.feature.zero_crossing_rate(
            data, frame_length=self.frame_length, hop_length=self.hop_length
        )
        return rate.flatten()

    def rmse(self, data):
        """Return the RMS energy of ``data`` as a flat array."""
        energy = librosa.feature.rms(
            y=data, frame_length=self.frame_length, hop_length=self.hop_length
        )
        return energy.flatten()

    def mfcc(self, data, sr, n_mfcc=13, flatten=True):
        """Return MFCCs, transposed; flattened to 1-D unless ``flatten`` is false."""
        coeffs = librosa.feature.mfcc(
            y=data, sr=sr, n_mfcc=n_mfcc, hop_length=self.hop_length
        )
        transposed = coeffs.T
        if flatten:
            return transposed.flatten()
        return transposed

    def chroma(self, data, sr):
        """Return the chroma STFT of ``data``, transposed and flattened."""
        chromagram = librosa.feature.chroma_stft(
            y=data, sr=sr, hop_length=self.hop_length
        )
        return chromagram.T.flatten()

    def spectral_contrast(self, data, sr):
        """Return the spectral contrast of ``data``, transposed and flattened."""
        contrast = librosa.feature.spectral_contrast(
            y=data, sr=sr, hop_length=self.hop_length
        )
        return contrast.T.flatten()

    def mel_spectrogram(self, data, sr):
        """Return the mel spectrogram in dB, flattened.

        Unlike the other extractors, the matrix is flattened without being
        transposed first.
        """
        mel_power = librosa.feature.melspectrogram(
            y=data, sr=sr, hop_length=self.hop_length
        )
        mel_db = librosa.power_to_db(mel_power)
        return mel_db.flatten()

    def extract_features(self, data, sr):
        """Concatenate all features into one vector.

        Order: ZCR, RMS, MFCC, chroma, spectral contrast, mel spectrogram.
        """
        parts = [
            self.zcr(data),
            self.rmse(data),
            self.mfcc(data, sr),
            self.chroma(data, sr),
            self.spectral_contrast(data, sr),
            self.mel_spectrogram(data, sr),
        ]
        return np.concatenate(parts)


In [6]:
class DataAugmentation:
    """Stateless waveform augmentations used to enlarge the training set."""

    @staticmethod
    def noise(data, noise_factor=0.005):
        """Return ``data`` plus Gaussian noise.

        The noise amplitude is ``noise_factor`` times a uniform random draw
        times the maximum value of ``data``.
        """
        # Draw order (uniform first, then normal) is kept so seeded runs match.
        amplitude = noise_factor * np.random.uniform() * np.amax(data)
        gaussian = np.random.normal(size=data.shape[0])
        return data + amplitude * gaussian

    @staticmethod
    def pitch(data, sr, n_steps=4):
        """Return ``data`` pitch-shifted by ``n_steps`` via librosa."""
        shifted = librosa.effects.pitch_shift(y=data, sr=sr, n_steps=n_steps)
        return shifted

In [7]:
from joblib import Parallel, delayed

class AudioProcessor:
    """Load audio files and turn them into fixed-width feature matrices.

    Combines a ``FeatureExtractor`` with ``DataAugmentation`` so that each
    file can yield its original features plus augmented variants.
    """

    def __init__(self, frame_length=2048, hop_length=512):
        self.feature_extractor = FeatureExtractor(frame_length, hop_length)
        self.augmenter = DataAugmentation()

    def get_features(self, path, duration=2.5, offset=0.6, augment=True):
        """Extract feature vectors for one audio file.

        Args:
            path: audio file to load with ``librosa.load``.
            duration: seconds of audio to load.
            offset: seconds to skip at the start of the file.
            augment: when true (default, original behaviour) also return the
                augmented variants; when false return only the original row,
                which is what inference needs.

        Returns:
            2-D array with one row per variant, in the order: original,
            noised, pitch-shifted, pitch-shifted + noised. Only the first row
            is present when ``augment`` is false.
        """
        data, sr = librosa.load(path, duration=duration, offset=offset)
        extract = self.feature_extractor.extract_features

        features = [extract(data, sr)]
        if augment:
            # Same call order as before so seeded random draws are unchanged.
            features.append(extract(self.augmenter.noise(data), sr))
            pitched_audio = self.augmenter.pitch(data, sr)
            features.append(extract(pitched_audio, sr))
            features.append(extract(self.augmenter.noise(pitched_audio), sr))

        return np.array(features)

    def process_feature(self, path, emotion):
        """Return ``(rows, labels)`` for one file: feature rows and one label per row."""
        features = self.get_features(path)
        # Keep rows as arrays: converting every value to a Python float
        # multiplies memory use and slows the transfer back from workers.
        return list(features), [emotion] * len(features)

    def process_dataset(self, df, n_jobs=-1):
        """Extract features for every row of ``df`` in parallel.

        Args:
            df: frame with ``File_Path`` and ``Emotion`` columns.
            n_jobs: worker count forwarded to ``joblib.Parallel``.

        Returns:
            ``(X, Y)`` where ``X`` is a float matrix whose rows are zero-padded
            on the right to the longest feature vector and ``Y`` holds the
            matching labels. Both are empty when ``df`` has no rows.
        """
        paths = df['File_Path'].values
        emotions = df['Emotion'].values

        # Nothing to process: max() below would raise on an empty sequence.
        if len(paths) == 0:
            return np.empty((0, 0)), np.array([])

        results = Parallel(n_jobs=n_jobs)(
            delayed(self.process_feature)(path, emotion)
            for path, emotion in zip(paths, emotions)
        )

        X, Y = [], []
        for rows, labels in results:
            X.extend(rows)
            Y.extend(labels)

        # Right-pad every row with zeros up to the longest one.
        max_len = max(len(x) for x in X)
        X_padded = np.zeros((len(X), max_len), dtype=float)
        for target, row in zip(X_padded, X):
            target[:len(row)] = row

        return X_padded, np.array(Y)

# Extract the feature matrix X and label vector Y for every file listed in `data`.
processor = AudioProcessor()
X, Y = processor.process_dataset(df=data)

In [8]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
def prepare_data_for_cnn(X, Y):
    """Encode labels, split, scale and reshape the data for the CNN.

    Side effects: writes the fitted scaler to ``scaler.pkl`` and the fitted
    label encoder to ``label_encoder.pkl`` in the working directory.

    Args:
        X: 2-D feature matrix, one row per sample.
        Y: 1-D array of class labels.

    Returns:
        ``(X_train, X_test, Y_train, Y_test, num_classes)``. The X arrays have
        shape ``(samples, features, 1, 1)`` and the Y arrays are one-hot.
    """
    encoder = LabelEncoder()
    Y_encoded = encoder.fit_transform(Y)
    joblib.dump(encoder, 'label_encoder.pkl')

    num_classes = len(encoder.classes_)
    Y_onehot = to_categorical(Y_encoded, num_classes=num_classes)

    # Split BEFORE scaling. Same rows land in each split as before because the
    # shuffle depends only on the sample count and random_state.
    # NOTE(review): if X holds several augmented rows derived from the same
    # clip, a plain random split can place siblings on both sides and inflate
    # the test score; a group-aware split would be needed to rule that out.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y_onehot, test_size=0.2, random_state=42)

    # Fit the scaler on the training rows only so that test-set statistics do
    # not leak into preprocessing; the test rows are merely transformed.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    joblib.dump(scaler, 'scaler.pkl')

    # Conv2D expects (height, width, channels): treat the vector as an N x 1 image.
    X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1, 1))
    X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1, 1))

    return X_train, X_test, Y_train, Y_test, num_classes

# Build the train/test tensors and report their shapes as a sanity check.
X_train, X_test, Y_train, Y_test, num_classes = prepare_data_for_cnn(X, Y)
print(*(split.shape for split in (X_train, Y_train, X_test, Y_test)))

(4608, 17496, 1, 1) (4608, 8) (1152, 17496, 1, 1) (1152, 8)


In [9]:
from tensorflow.keras.optimizers import Adam

def build_cnn_model(input_shape, num_classes):
    """Build and compile the emotion classifier.

    Two Conv2D + MaxPooling2D stages followed by a 64-unit dense layer and a
    softmax over ``num_classes``.

    Args:
        input_shape: shape of one sample, without the batch dimension.
        num_classes: number of output classes.

    Returns:
        The compiled ``Sequential`` model (Adam, categorical cross-entropy,
        accuracy metric).
    """
    layers = [
        Conv2D(16, (3, 3), activation='relu', input_shape=input_shape, padding='same'),
        MaxPooling2D((2, 1)),  # pool along the first axis only
        Conv2D(32, (3, 3), activation='relu', padding='same'),
        MaxPooling2D((2, 1)),
        Flatten(),
        Dense(64, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ]
    model = Sequential(layers)
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model
# Build the network for the per-sample shape, train it, and persist it to disk.
cnn_model = build_cnn_model(X_train.shape[1:], num_classes)
cnn_model.fit(
    X_train,
    Y_train,
    validation_data=(X_test, Y_test),  # the held-out split doubles as validation data
    epochs=10,
    batch_size=32,
)
cnn_model.save('cnn_model.h5')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/15
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 254ms/step - accuracy: 0.3248 - loss: 1.9655 - val_accuracy: 0.5408 - val_loss: 1.2076
Epoch 2/15
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 234ms/step - accuracy: 0.7215 - loss: 0.8201 - val_accuracy: 0.7995 - val_loss: 0.5502
Epoch 3/15
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 250ms/step - accuracy: 0.9530 - loss: 0.2123 - val_accuracy: 0.8976 - val_loss: 0.3410
Epoch 4/15
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 248ms/step - accuracy: 0.9897 - loss: 0.0608 - val_accuracy: 0.9149 - val_loss: 0.2961
Epoch 5/15
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 230ms/step - accuracy: 0.9995 - loss: 0.0121 - val_accuracy: 0.9366 - val_loss: 0.2402
Epoch 6/15
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 226ms/step - accuracy: 1.0000 - loss: 0.0024 - val_accuracy: 0.9375 - val_loss: 0.2355
Epoch 7/15



In [10]:
# Score the trained model on the held-out split; the result is the cell output.
cnn_model.evaluate(x=X_test, y=Y_test)

[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 71ms/step - accuracy: 0.9347 - loss: 0.3368


[0.27826908230781555, 0.9392361044883728]

In [38]:
import tkinter as tk
from tkinter import filedialog, messagebox
import sounddevice as sd
import numpy as np
import joblib
from keras.models import load_model
import librosa
import soundfile as sf

# Restore the artifacts saved earlier in this notebook: label encoder, scaler
# and trained network.
# NOTE: joblib.load unpickles arbitrary objects; only load files you trust.
label_encoder = joblib.load('label_encoder.pkl')
scaler = joblib.load('scaler.pkl')
model = load_model('cnn_model.h5')

# Feature pipeline used at inference time; AudioProcessor must already be
# defined in this kernel.
processor = AudioProcessor()

def record_audio(fs=44100, duration=5, out_path='recorded_audio.wav'):
    """Record mono audio from the default input device and save it to disk.

    Blocks until the recording is complete.

    Args:
        fs: sample rate in Hz.
        duration: recording length in seconds.
        out_path: file the recording is written to.

    Returns:
        ``out_path``, so the result can be passed straight to the predictor.
    """
    print("Recording...")
    audio = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype='float32')
    sd.wait()  # sd.rec returns immediately; wait until recording is finished
    print("Recording finished")

    # sd.rec yields a (frames, channels) array; collapse it to 1-D before writing.
    audio = audio.flatten()
    sf.write(out_path, audio, fs)
    return out_path

def open_file():
    """Show a file-open dialog limited to .wav/.mp3 and return its result."""
    audio_types = [("Audio Files", "*.wav *.mp3")]
    return filedialog.askopenfilename(filetypes=audio_types)

def predict_emotion(file_path):
    """Predict the emotion label for one audio file.

    Args:
        file_path: path of the audio file to classify.

    Returns:
        The predicted label as decoded by the saved label encoder.
    """
    # get_features returns one row per variant (original first, then the
    # augmented copies). Only the original signal should be classified.
    all_features = processor.get_features(file_path)
    features = np.asarray(all_features[0], dtype=float)

    # Clips shorter than the loaded window give fewer frames, so align the
    # vector to the width the scaler was fitted on: zero-pad on the right
    # (as done for the training matrix) or truncate.
    expected_len = scaler.n_features_in_
    if features.shape[0] < expected_len:
        features = np.pad(features, (0, expected_len - features.shape[0]), 'constant')
    else:
        features = features[:expected_len]

    # The scaler needs a 2-D (samples, features) array; the model needs
    # (samples, features, 1, 1).
    features = scaler.transform(features.reshape(1, -1))
    features = features.reshape((features.shape[0], features.shape[1], 1, 1))

    prediction = model.predict(features)
    predicted_emotion = label_encoder.inverse_transform(np.argmax(prediction, axis=1))[0]
    return predicted_emotion

def record_and_predict():
    """Button handler: record a clip, classify it, and show the result.

    Any exception is reported in an error dialog instead of propagating into
    the Tk event loop.
    """
    try:
        emotion = predict_emotion(record_audio())
        messagebox.showinfo("Prediction", f"Predicted Emotion: {emotion}")
    except Exception as e:
        messagebox.showerror("Error", f"An error occurred: {str(e)}")

def select_and_predict():
    """Button handler: let the user pick a file, classify it, and show the result.

    Does nothing when the dialog is cancelled. Any exception is reported in an
    error dialog instead of propagating into the Tk event loop.
    """
    try:
        file_path = open_file()
        # The dialog returns an empty value when cancelled; that is not an
        # error, so do not push it through the predictor.
        if not file_path:
            return
        emotion = predict_emotion(file_path)
        messagebox.showinfo("Prediction", f"Predicted Emotion: {emotion}")
    except Exception as e:
        messagebox.showerror("Error", f"An error occurred: {str(e)}")

# Build the main window: a fixed-size frame with one button per input mode.
root = tk.Tk()
root.title("Emotion Recognition")
root.geometry("400x200")  # width x height, in pixels

record_button = tk.Button(master=root, text="Record Audio", command=record_and_predict)
select_button = tk.Button(master=root, text="Select Audio File", command=select_and_predict)

# Pack in display order, top to bottom.
record_button.pack(pady=10)
select_button.pack(pady=10)

# Hand control to Tk; this call blocks until the window is closed.
root.mainloop()




Recording...
Recording finished




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
