In [2]:
import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from keras.utils import to_categorical
from keras.layers import Conv2D, Flatten, Dense

In [3]:
# Root folder of the audio dataset that the notebook walks for .wav files.
# Set the INSAES_DATA_DIR environment variable to point at a copy stored
# elsewhere; the original author's location is kept as the fallback so
# existing setups keep working unchanged.
database = os.environ.get(
    "INSAES_DATA_DIR",
    "/Users/rumen/Plovdiv University/Дипломна работа/InSAES_data",
)

In [4]:
# Extracting data: walk the dataset tree and turn every .wav file into one
# 13-value feature vector (the mean of each MFCC coefficient along axis 1).
features, total_emotions = [], []

for folder, subfolders, filenames in os.walk(database):
    # os.walk yields entries in arbitrary filesystem order. Sorting (in place
    # for the sub-folders, which steers the top-down traversal) fixes the
    # sample order, so the seeded train/test split further down picks the
    # same files on every machine and on every run.
    subfolders.sort()
    for filename in sorted(filenames):
        if not filename.endswith(".wav"):
            continue
        audio_path = os.path.join(folder, filename)
        # The label is whatever follows the last underscore of the folder name.
        emotion = os.path.basename(folder).split("_")[-1]
        # At most the first 3 seconds of each recording are read.
        audio, sr = librosa.load(audio_path, duration=3)
        mfcc = np.mean(librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13), axis=1)
        features.append(mfcc)
        total_emotions.append(emotion)

In [5]:
# Turn the two Python lists into NumPy arrays so they can be boolean-masked
# and reshaped by the later cells.
features, total_emotions = np.array(features), np.array(total_emotions)

In [6]:
# Map every emotion name to an integer class id
label_encoder = LabelEncoder()
label_encoder.fit(total_emotions)
labels_encoded = label_encoder.transform(total_emotions)

In [None]:
# Pick one random recording per emotion from the "YAF_<emotion>" folders and
# overlay their waveforms in a single figure.
lst = []

for i in ["angry", "disgust", "fear", "happy", "neutral", "sad"]:
    curr_emotion_name = os.path.join(database, f"YAF_{i}")
    if not os.path.exists(curr_emotion_name):
        print(f"Directory {curr_emotion_name} does not exist")
        continue

    # Sorted so the candidate order does not depend on the filesystem.
    wav_files = sorted(x for x in os.listdir(curr_emotion_name) if x.endswith(".wav"))
    if not wav_files:
        # np.random.choice raises on an empty list; report and move on.
        print(f"Directory {curr_emotion_name} contains no .wav files")
        continue

    # Only one recording per emotion is plotted, so draw exactly one file.
    # Drawing three without replacement (as before) raised for folders that
    # hold fewer than three recordings.
    random_data = np.random.choice(wav_files, size=1, replace=False)

    for k in random_data:
        file_path = os.path.join(curr_emotion_name, k)
        audio, sr = librosa.load(file_path, duration=3)
        lst.append((audio, sr, i))

# Visualising results: one colour per emotion
plt.figure(figsize=(12, 8))
for (audio, sr, emotion), i in zip(lst, ["b", "g", "r", "c", "m", "y", "k"]):
    plt.plot(audio, label=emotion, color=i)

plt.title("Waveforms of Selected Samples")
plt.xlabel("Sample index")
plt.ylabel("Amplitude")
plt.legend(loc="upper right")
plt.tight_layout()
plt.show()

In [None]:
def augment_audio(features_in, total_emotions_in, sr=22050, noise_level=0.005,
                  stretch_rate=1.2, n_steps=2):
    """Return the input samples plus three augmented variants of each one.

    For every (sample, label) pair the output holds, in this order: the
    untouched sample, a copy with Gaussian noise added, a time-stretched copy
    and a pitch-shifted copy. Each variant keeps the label of its source.

    Time stretching changes the number of values in a signal, so the
    stretched copy is zero-padded / trimmed back to the length of its source
    (the pitch-shifted copy gets the same treatment as a safeguard). Without
    that step the variants of one sample had different lengths and could not
    be stacked into a regular array.

    NOTE(review): time_stretch and pitch_shift are audio-domain effects meant
    for raw waveforms. The notebook currently passes per-file MFCC mean
    vectors, for which these two transforms have no acoustic meaning --
    consider augmenting the waveform before feature extraction.

    Parameters
    ----------
    features_in : iterable of 1-D array-like
        Samples to augment. They must all share one length for the result
        to stack into a 2-D array.
    total_emotions_in : iterable
        Label of each sample, aligned with ``features_in``.
    sr : int, default 22050
        Sampling rate handed to ``librosa.effects.pitch_shift``.
    noise_level : float, default 0.005
        Scale applied to the standard-normal noise.
    stretch_rate : float, default 1.2
        ``rate`` argument of ``librosa.effects.time_stretch``.
    n_steps : float, default 2
        ``n_steps`` argument of ``librosa.effects.pitch_shift``.

    Returns
    -------
    (np.ndarray, np.ndarray)
        Augmented samples and their labels; four entries per input pair.
    """
    lst_f = []
    lst_e = []

    for f, e in zip(features_in, total_emotions_in):
        f = np.asarray(f)
        size = len(f)

        noisy = f + noise_level * np.random.randn(size)
        stretched = librosa.effects.time_stretch(f, rate=stretch_rate)
        shifted = librosa.effects.pitch_shift(f, sr=sr, n_steps=n_steps)

        variants = (
            f,
            noisy,
            # Bring both audio-domain variants back to the source length.
            librosa.util.fix_length(stretched, size=size),
            librosa.util.fix_length(shifted, size=size),
        )
        lst_f.extend(variants)
        lst_e.extend([e] * len(variants))

    return np.array(lst_f), np.array(lst_e)


# Run the augmentation over the extracted features and their labels
additional_features, augmented_t_emotions = augment_audio(
    features_in=features,
    total_emotions_in=total_emotions,
)

# Report the number of samples before and after augmentation
for caption, samples in (("Original samples:", features),
                         ("Current samples:", additional_features)):
    print(caption, len(samples))

In [9]:
# Fixed emotion -> class index table; labels missing from it are encoded as -1
emotion_idxs = {
    name: idx
    for idx, name in enumerate(["angry", "disgust", "fear", "happy", "neutral", "sad"])
}
encoded_labels = np.array([emotion_idxs.get(name, -1) for name in total_emotions])

# Keep only the samples whose emotion is in the table, then hold out 20% of
# them with a fixed seed.
valid_rows = encoded_labels != -1
X_train, X_test, y_train, y_test = train_test_split(
    features[valid_rows],
    encoded_labels[valid_rows],
    test_size=0.2,
    random_state=42,
)

# Append a trailing axis of size 1 to both feature matrices
X_train, X_test = (split[..., np.newaxis] for split in (X_train, X_test))

# One-hot encode the class indices
y_train, y_test = (to_categorical(y, len(emotion_idxs)) for y in (y_train, y_test))

In [None]:
# CNN definition: one convolution, flattened, followed by two dense layers
model = Sequential([
    # NOTE(review): the declared input is 13x1x1, so with a 3x3 kernel and
    # "same" padding two of the three kernel columns only ever see padding.
    Conv2D(32, (3, 3), activation="relu", input_shape=(13, 1, 1), padding="same"),
    # Collapse the feature maps into a single vector
    Flatten(),
    # Hidden fully connected layer
    Dense(32, activation="relu"),
    # Output layer: six softmax units, one per entry of the emotion table
    Dense(6, activation="softmax"),
])
# Compile the model
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()

In [None]:
# Train for 10 epochs; the held-out split is also passed as validation data
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=10,
    batch_size=64,
    validation_data=(X_test, y_test),
)

# Model evaluation on the held-out split
loss, accuracy = model.evaluate(x=X_test, y=y_test)

print("***********")
for caption, value in (("Test loss:", loss), ("Test accuracy:", accuracy)):
    print(caption, value)

In [None]:
# Evaluate on the test split again; element 1 of the result is the accuracy
# metric the model was compiled with.
test_accuracy = model.evaluate(X_test, y_test)[1]
print("Accuracy of model on test data: " , test_accuracy * 100 , "%")

In [None]:
# Training vs. validation loss per epoch
for curve in ("loss", "val_loss"):
    plt.plot(history.history[curve])
plt.title("Model Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend(["Train", "Validation"], loc="upper left")
plt.show()

In [None]:
# Training vs. validation accuracy per epoch
for curve in ("accuracy", "val_accuracy"):
    plt.plot(history.history[curve])
plt.title("Model Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend(["Train", "Validation"], loc="upper left")
plt.show()

In [None]:
# Class index with the highest predicted probability for every test sample
predicted_emotions = np.argmax(model.predict(X_test), axis=1)

# Undo the one-hot encoding of the ground truth
true_emotions = np.argmax(y_test, axis=1)

# Class names listed in index order (0..5), the same order as emotion_idxs.
class_names = ["angry", "disgust", "fear", "happy", "neutral", "sad"]

# Passing `labels` pins the report to all six classes. Without it,
# classification_report infers the classes from the data and raises when a
# class is absent from the test split, because target_names then no longer
# matches the number of classes found.
report = classification_report(true_emotions, predicted_emotions,
                               labels=list(range(len(class_names))),
                               target_names=class_names)
print(report)

In [None]:
# Call fit again on the already-trained model for 10 more epochs, this time
# validating on 30% of the training data; `history` is overwritten by this run.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=10,
    validation_split=0.3,
)

In [None]:
# Evaluate on the training split; element 1 of the result is the accuracy
# metric the model was compiled with.
train_accuracy = model.evaluate(X_train, y_train)[1]
print("Accuracy of model on train data: " , train_accuracy * 100 , "%")

In [None]:
# Write the trained model to a .h5 file in the current working directory
model.save(filepath="InSAES_model.h5")
print("Model saved correctly!")