In [5]:
import pandas as pd
import numpy as np
np.random.seed(42)

import os
import shutil
import glob
import re

import IPython.display as ipd
import tensorflow as tf
import librosa
import librosa.display
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tqdm import tqdm


%matplotlib inline
matplotlib.style.use('ggplot')
import warnings 
warnings.filterwarnings('ignore')

In [None]:
import re


# Location of the extracted audio clips. glob needs a file-system pattern; the
# Kaggle CLI command string used previously never matched any file.
# NOTE(review): RAVDESS_DIR is an assumption - point it at the folder where the
# `uwrfkaggler/ravdess-emotional-speech-audio` archive was unpacked.
RAVDESS_DIR = 'ravdess-emotional-speech-audio'
RAVDESS_PATTERN = os.path.join(RAVDESS_DIR, 'Actor_*', '*.wav')

# Lookup tables for the first six dash-separated codes of a file name, in the
# order in which the codes appear.
features = {'modality':{'01':'full-AV',
                        '02':'video-only',
                        '03':'audio-only'},
            'vocal channel':{'01':'speech',
                             '02':'song'},
            'emotion':{'01':'neutral',
                       '02':'calm',
                       '03':'happy',
                       '04':'sad',
                       '05':'angry',
                       '06':'fearful',
                       '07':'disgust',
                       '08':'surprised'},
            'emotional intensity':{'01':'normal',
                                   '02':'strong'},
            'statement':{'01':'Kids are talking by the door',
                         '02':'Dogs are sitting by the door'},
            'repetition':{'01':'1st repetition',
                          '02':'2nd repetition'}}

# Splits on dashes, the literal ".wav" suffix (dot escaped) and parentheses.
_FNAME_SPLIT = re.compile(r"-|\.wav|\(|\)")


def parse_ravdess_fname(fname):
    """Decode a RAVDESS file name into a dict of readable labels.

    Parameters
    ----------
    fname : str
        Base name such as ``'03-01-05-02-01-02-12.wav'``.

    Returns
    -------
    dict
        One entry per key of ``features`` plus ``'sex'``, which is ``'female'``
        when the second-to-last split field (the actor number) is even and
        ``'male'`` otherwise.

    Raises
    ------
    KeyError, IndexError, ValueError
        If the name does not follow the expected code layout.
    """
    parts = _FNAME_SPLIT.split(fname)
    parsed = {feature: codes[parts[pos]]
              for pos, (feature, codes) in enumerate(features.items())}
    parsed['sex'] = 'female' if int(parts[-2]) % 2 == 0 else 'male'
    return parsed


# Sorted so that row order (and everything derived from it) is reproducible.
df = pd.DataFrame({'full_path': sorted(glob.glob(RAVDESS_PATTERN))})
if df.empty:
    print(f'No audio files matched {RAVDESS_PATTERN!r}; check RAVDESS_DIR.')

# os.path.basename also handles the backslash separators glob returns on Windows.
df['fname'] = [os.path.basename(path) for path in df['full_path']]

parsed_names = [parse_ravdess_fname(name) for name in df['fname']]
for column in list(features) + ['sex']:
    df[column] = [parsed[column] for parsed in parsed_names]

df.head()

In [None]:
# (rows, columns) of the metadata frame built in the previous cell.
df.shape

In [None]:
# Overview figure: one count plot per categorical metadata column.
fig = plt.figure(figsize=(30, 12), dpi=60)

grid = fig.add_gridspec(3, 3)
grid.update(wspace=0.2, hspace=0.5)

# Seven panels filling the 3x3 grid row by row; the last two cells stay empty.
axes = [fig.add_subplot(grid[slot // 3, slot % 3]) for slot in range(7)]

# Every column after `full_path` and `fname` is plotted.
columns = df.columns[2:]
for idx, panel in enumerate(axes):
    column = columns[idx]
    sns.countplot(x=column, data=df, ax=panel)
    panel.set_xticklabels(panel.get_xticklabels(), fontsize=11.5, fontweight='bold')
    panel.set_xlabel('')
    panel.set_yticklabels(panel.get_yticklabels(), fontsize=10, fontweight='bold')
    panel.set_ylabel('Count', fontsize=10, fontweight='bold')
    panel.set_title(column, size=18)

fig.patch.set_facecolor('white')

headline_font = {'font': 'Serif', 'size': '25', 'weight': 'bold', 'color': 'black'}
fig.text(0.42, 1, 'Variance Between Features', headline_font)
plt.show()

In [21]:
def create_waveplot(data, sr):
    """Draw the waveform of `data` (sampled at `sr` Hz) on a new 10x3 figure.

    The figure is left open so the notebook renders it at the end of the cell.
    """
    canvas = plt.figure(figsize=(10, 3))
    axis = canvas.gca()
    librosa.display.waveshow(data, sr=sr, ax=axis)
    axis.set_title('Waveplot')

def create_mfcc(data, sr):
    """Plot 30 MFCCs of `data` on a new 12x3 figure with a colour bar.

    Parameters
    ----------
    data : 1-D float array of audio samples.
    sr : sampling rate of `data` in Hz.
    """
    plt.figure(figsize=(12, 3))
    mfcc = librosa.feature.mfcc(y=data, sr=sr, n_mfcc=30)
    # Pass sr so the time axis is scaled with the clip's own sampling rate
    # rather than specshow's default.
    librosa.display.specshow(mfcc, sr=sr, x_axis='time')
    plt.colorbar()
    plt.title('MFCC')

def create_melspectrogram(data, sr):
    """Plot a 60-band log-mel spectrogram of `data` on a new 12x3 figure.

    Parameters
    ----------
    data : 1-D float array of audio samples.
    sr : sampling rate of `data` in Hz; used for the mel filter bank and for
        both plot axes.
    """
    plt.figure(figsize=(12, 3))
    # sr must reach melspectrogram, otherwise the filter bank is built for
    # librosa's default rate regardless of the clip.
    melspec = librosa.feature.melspectrogram(y=data, sr=sr, n_mels=60)
    # melspectrogram returns a power spectrogram by default, so the matching
    # dB conversion is power_to_db (amplitude_to_db would double the dB values).
    logspec = librosa.power_to_db(melspec)
    # Rows are mel bands, hence the mel-scaled frequency axis.
    librosa.display.specshow(logspec, sr=sr, x_axis='time', y_axis='mel')
    plt.title('Mel Spectrogram')
    plt.colorbar()

In [None]:
# Path of the first "Kids are talking by the door" clip for four emotions.
kids_clips = df[df.statement == 'Kids are talking by the door']
happy_audio = kids_clips[kids_clips.emotion == 'happy'].iloc[0].full_path
sad_audio = kids_clips[kids_clips.emotion == 'sad'].iloc[0].full_path
angry_audio = kids_clips[kids_clips.emotion == 'angry'].iloc[0].full_path
neutral_audio = kids_clips[kids_clips.emotion == 'neutral'].iloc[0].full_path

In [None]:
# Happy example: waveform, MFCC and mel-spectrogram views plus an inline player.
data, sampling_rate = librosa.load(happy_audio)
for render in (create_waveplot, create_mfcc, create_melspectrogram):
    render(data, sampling_rate)
ipd.Audio(happy_audio)

In [None]:
# Sad example: waveform, MFCC and mel-spectrogram views plus an inline player.
data, sampling_rate = librosa.load(sad_audio)
for render in (create_waveplot, create_mfcc, create_melspectrogram):
    render(data, sampling_rate)
ipd.Audio(sad_audio)

In [None]:
# Angry example: waveform, MFCC and mel-spectrogram views plus an inline player.
data, sampling_rate = librosa.load(angry_audio)
for render in (create_waveplot, create_mfcc, create_melspectrogram):
    render(data, sampling_rate)
ipd.Audio(angry_audio)

In [None]:
# Neutral example: waveform, MFCC and mel-spectrogram views plus an inline player.
data, sampling_rate = librosa.load(neutral_audio)
for render in (create_waveplot, create_mfcc, create_melspectrogram):
    render(data, sampling_rate)
ipd.Audio(neutral_audio)

In [None]:
# First "disgust" rendition of the "Kids ..." sentence at each intensity level.
disgust_kids = df[(df.emotion == 'disgust') & (df.statement == 'Kids are talking by the door')]
normal_audio = disgust_kids[disgust_kids['emotional intensity'] == 'normal'].iloc[0].full_path
strong_audio = disgust_kids[disgust_kids['emotional intensity'] == 'strong'].iloc[0].full_path

In [None]:
# Normal-intensity example: three views plus an inline player.
data, sampling_rate = librosa.load(normal_audio)
for render in (create_waveplot, create_mfcc, create_melspectrogram):
    render(data, sampling_rate)
ipd.Audio(normal_audio)

In [None]:
# Strong-intensity example: three views plus an inline player.
data, sampling_rate = librosa.load(strong_audio)
for render in (create_waveplot, create_mfcc, create_melspectrogram):
    render(data, sampling_rate)
ipd.Audio(strong_audio)

In [None]:
def get_audio_duration(path):
    """Return the length in seconds of the audio file at `path`.

    The file is decoded with `librosa.load` and the duration is the number of
    samples divided by the sampling rate it reports.
    """
    samples, rate = librosa.load(path)
    return len(samples) / rate


# One duration per clip, stored next to the metadata.
df['audio_duration'] = df['full_path'].map(get_audio_duration)

In [None]:
# Violin plot of clip duration per emotion.
fig = plt.figure(figsize=(30, 10), dpi=60)

grid = fig.add_gridspec(10, 24)
grid.update(wspace=1, hspace=0.05)

emotion_order = df['emotion'].unique()
ax1 = fig.add_subplot(grid[:])
sns.violinplot(x='emotion', y='audio_duration', data=df, order=emotion_order, ax=ax1)
axes = [ax1]

# Bold tick labels, white background, no frame.
ax1.set_xticklabels(emotion_order, fontsize=20, fontweight='bold')
ax1.set_xlabel('')
ax1.set_yticklabels(ax1.get_yticklabels(), fontsize=13, fontweight='bold')
ax1.set_ylabel('Audio Duration', fontweight='bold')
ax1.set_facecolor('white')
for side in ('left', 'right', 'top', 'bottom'):
    ax1.spines[side].set_visible(False)

fig.patch.set_facecolor('white')

headline_font = {'font': 'Serif', 'size': '25', 'weight': 'bold', 'color': 'black'}
caption_font = {'font': 'Serif', 'color': 'black', 'size': 20}
ax1.text(0, 6.4, 'Are the Audio Durations Different per Emotion?', headline_font)
ax1.text(
    0, 6,
    'The majority of the emotions mean audio durations are around 3.5 seconds, while disgust and angry are slightly longer. \n'
    'Neutral has the shortest durations while disgust and angry have the longest durations.',
    caption_font,
)
plt.show()

In [None]:
# Side-by-side violin plots of duration per emotion, split by intensity.
fig = plt.figure(figsize=(30, 10), dpi=60)

grid = fig.add_gridspec(10, 25)
grid.update(wspace=1, hspace=0.05)

ax1 = fig.add_subplot(grid[:, :12])
ax2 = fig.add_subplot(grid[:, 13:])
axes = [ax1, ax2]

emotion_order = df['emotion'].unique()
for ax, intensity in zip(axes, ('normal', 'strong')):
    subset = df[df['emotional intensity'] == intensity]
    sns.violinplot(x='emotion', y='audio_duration', data=subset, order=emotion_order, ax=ax)

    # Bold tick labels, white background, no frame.
    ax.set_xticklabels(emotion_order, fontsize=13, fontweight='bold')
    ax.set_xlabel('')
    ax.set_yticklabels(ax.get_yticklabels(), fontsize=13, fontweight='bold')
    ax.set_ylabel('Audio Duration', fontweight='bold')
    ax.set_facecolor('white')
    for side in ('left', 'right', 'top', 'bottom'):
        ax.spines[side].set_visible(False)

fig.patch.set_facecolor('white')

ax1.set_title('Normal', size=18, weight='bold')
ax2.set_title('Strong', size=18, weight='bold')

headline_font = {'font': 'Serif', 'size': '25', 'weight': 'bold', 'color': 'black'}
caption_font = {'font': 'Serif', 'color': 'black', 'size': 20}
ax1.text(0, 6.4, 'How does Emotional Intensity Effect Duration?', headline_font)
ax1.text(
    0, 6,
    'When the emotional intensity is "strong" the audio duration is longer. There is no neutral \nemotion when the emotional intensity is "strong".',
    caption_font,
)
plt.show()

In [None]:
# Violin plot of clip duration per speaker sex.
fig = plt.figure(figsize=(30, 10), dpi=60)

grid = fig.add_gridspec(10, 24)
grid.update(wspace=1, hspace=0.05)

ax1 = fig.add_subplot(grid[:])
sns.violinplot(x='sex', y='audio_duration', data=df, ax=ax1)
axes = [ax1]

# Bold tick labels on a white background; the frame is kept for this figure.
ax1.set_xticklabels(ax1.get_xticklabels(), fontsize=20, fontweight='bold')
ax1.set_xlabel('')
ax1.set_yticklabels(ax1.get_yticklabels(), fontsize=13, fontweight='bold')
ax1.set_ylabel('Audio Duration', fontweight='bold')
ax1.set_facecolor('white')

fig.patch.set_facecolor('white')

headline_font = {'font': 'Serif', 'size': '25', 'weight': 'bold', 'color': 'black'}
caption_font = {'font': 'Serif', 'color': 'black', 'size': 20}
ax1.text(-0.4, 6.4, 'Are the Audio Durations Different for the Sexes?', headline_font)
ax1.text(
    -0.4, 6,
    'Interestingly the male has the longest and shortest audio durations. The mean duration is the same for the sexes, \nbut there are more female audios that are around the mean.',
    caption_font,
)
plt.show()

In [None]:
# Keep only the columns used for modelling. errors='ignore' makes the cell
# safe to re-run: a second execution no longer fails on already-removed columns.
df = df.drop(
    columns=['modality', 'vocal channel', 'emotional intensity', 'statement', 'repetition'],
    errors='ignore',
)
df.head()

In [None]:
# %pip install resampy   (the package imported in the next cell is `resampy`, not `resample`)

In [None]:
import resampy
from sklearn.preprocessing import normalize, LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical


In [None]:
def padding_and_offset(path, sr=16000, input_length=48000):
    """Load an audio file and force it to exactly `input_length` samples.

    Parameters
    ----------
    path : file to load with `librosa.load`.
    sr : target sampling rate passed to the loader.
    input_length : number of samples in the returned array.

    Returns
    -------
    numpy array of length `input_length`. Longer clips are cropped at a random
    offset; shorter clips are zero-padded with a random amount of leading
    silence. Both offsets are drawn from numpy's global random state.
    """
    samples, _ = librosa.load(path, sr=sr, res_type='kaiser_fast')
    n_samples = len(samples)

    # Too long: keep a random window of the requested length.
    if n_samples > input_length:
        start = np.random.randint(n_samples - input_length)
        return samples[start:start + input_length]

    # Too short (or exact): split the missing samples between both ends.
    slack = input_length - n_samples
    left = np.random.randint(slack) if slack > 0 else 0
    return np.pad(samples, (left, slack - left), "constant")
def speed_pitch(data):
    """Return a sped-up copy of `data`, zero-padded to the original length.

    The signal is resampled by linear interpolation with a random factor
    between 1.2 and 1.5 (``1.2 / U(0.8, 1)``), which shortens it; the tail of
    the result is filled with zeros.

    The input array is left untouched. Previously the result was written back
    into `data`, silently corrupting any other reference to the same buffer.

    Parameters
    ----------
    data : 1-D numpy array of audio samples.

    Returns
    -------
    New array with the same shape and dtype as `data`.
    """
    length_change = np.random.uniform(low=0.8, high=1)
    speed_fac = 1.2 / length_change
    n_samples = len(data)
    sped_up = np.interp(np.arange(0, n_samples, speed_fac), np.arange(0, n_samples), data)
    minlen = min(n_samples, sped_up.shape[0])
    # Fresh output buffer instead of zeroing and overwriting the caller's array.
    out = np.zeros_like(data)
    out[:minlen] = sped_up[:minlen]
    return out

def noise(data):
    """Return `data` as float64 with Gaussian noise added.

    The noise amplitude is ``0.05 * U(0, 1) * max(data)``; both random draws
    come from numpy's global random state. The input array is not modified.
    """
    scale = np.random.uniform()
    noise_amp = 0.05 * scale * np.amax(data)
    jitter = np.random.normal(size=data.shape[0])
    return data.astype('float64') + noise_amp * jitter

In [None]:
# Load the first clip (no `sr` given, so librosa's default rate applies); the
# returned sampling rate is kept in `_`.
# NOTE(review): `_` is also IPython's "last output" variable - a descriptive
# name would be safer across cells.
data, _ = librosa.load(df.iloc[0].full_path)
create_waveplot(data, _)
ipd.Audio(data, rate=_)

In [None]:
# Pass a copy so `data` stays the original recording for the cells below,
# even if the augmentation writes into the buffer it receives.
speed_pitched = speed_pitch(data.copy())
create_waveplot(speed_pitched, _)
ipd.Audio(speed_pitched, rate=_)

In [None]:
# Noise-augmented version of the clip held in `data`, plotted and playable.
noised = noise(data)
create_waveplot(noised, _)
ipd.Audio(noised, rate=_)

In [None]:
# Build the clean dataset (x_data / y_data) and two augmented variants per
# clip (x_aug / y_aug: speed-changed, then noisy).
x_data, y_data = [], []
x_aug, y_aug = [], []

# Look the columns up by name so the loop does not depend on column order.
path_pos = df.columns.get_loc('full_path')
label_pos = [df.columns.get_loc('emotion'), df.columns.get_loc('sex')]

for r in tqdm(df.values):
    x = padding_and_offset(r[path_pos])
    labels = r[label_pos]  # [emotion, sex]
    x_data.append(x)
    y_data.append(labels)

    # Give the speed augmentation its own copy: it may write into its input,
    # and `x` is the very array stored in x_data above. Noise is then derived
    # from the clean clip rather than from the speed-changed one.
    x_aug.append(speed_pitch(x.copy()))
    x_aug.append(noise(x))

    y_aug.extend([labels, labels])


x_data, y_data = np.array(x_data), np.array(y_data)
x_aug, y_aug = np.array(x_aug), np.array(y_aug)

In [None]:
# 80/20 split of the clean clips with a fixed seed, stratified on the label
# rows of y_data (presumably [emotion, sex] pairs - see the dataset-building cell).
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, train_size=0.8, stratify=y_data, random_state=42, shuffle=True)

In [None]:
def encode(e_data, s_data):
    """One-hot encode emotion and sex labels.

    Both encoders are fitted on the unique values found in the global `df`,
    so the integer code of a label does not depend on which labels occur in
    the data being encoded.

    Parameters
    ----------
    e_data : sequence of emotion names.
    s_data : sequence of sex names.

    Returns
    -------
    (e_encoded, s_encoded) : two 2-D arrays with one row per label and one
        column per class known to the respective encoder.
    """
    e_encoder = LabelEncoder().fit(list(df.emotion.unique()))
    # num_classes pins the width; otherwise to_categorical infers it from the
    # largest code present and a batch missing the last class comes out narrower.
    e_encoded = to_categorical(e_encoder.transform(e_data),
                               num_classes=len(e_encoder.classes_))

    s_encoder = LabelEncoder().fit(list(df.sex.unique()))
    s_encoded = to_categorical(s_encoder.transform(s_data),
                               num_classes=len(s_encoder.classes_))

    return e_encoded, s_encoded

In [None]:
# Split every (emotion, sex) label pair into two parallel lists and one-hot
# encode them, for the training, test and augmented sets in turn.
e_y = [emotion for emotion, _sex in y_train]
s_y = [sex for _emotion, sex in y_train]
e_y_train, s_y_train = encode(e_y, s_y)

e_y = [emotion for emotion, _sex in y_test]
s_y = [sex for _emotion, sex in y_test]
e_y_test, s_y_test = encode(e_y, s_y)

e_y_train_aug, s_y_train_aug = encode(
    [emotion for emotion, _sex in y_aug],
    [sex for _emotion, sex in y_aug],
)

In [None]:
# Augmented training set = clean training clips followed by the augmented clips.
# NOTE(review): x_aug / y_aug were generated from every row of df, not only from
# the training rows, so augmented copies of test clips end up in the training
# data here - confirm whether that leakage is intended.
x_train_aug = np.concatenate((x_train, x_aug), axis=0)
e_y_train_aug = np.concatenate((e_y_train, e_y_train_aug), axis=0)
s_y_train_aug = np.concatenate((s_y_train, s_y_train_aug), axis=0)

# Re-seeding before each in-place shuffle applies the same permutation to all
# three equally long arrays, keeping samples and labels aligned.
np.random.seed(42)
np.random.shuffle(x_train_aug)
np.random.seed(42)
np.random.shuffle(e_y_train_aug)
np.random.seed(42)
np.random.shuffle(s_y_train_aug)

In [None]:
def get_mfcc(data):
    """Return 30 MFCCs of `data` (treated as 16 kHz audio) with a trailing channel axis."""
    coeffs = librosa.feature.mfcc(y=data, sr=16000, n_mfcc=30)
    return coeffs[..., np.newaxis]

def get_melspec(data):
    """Return a 60-band log-mel spectrogram of `data` with a trailing channel axis.

    `data` is treated as 16 kHz audio, the same rate `get_mfcc` uses.
    """
    # sr was previously omitted, so the mel filter bank was built for
    # librosa's default rate instead of the 16 kHz used elsewhere.
    melspec = librosa.feature.melspectrogram(y=data, sr=16000, n_mels=60)
    # melspectrogram yields power by default; power_to_db is the matching
    # conversion (amplitude_to_db would double the dB values).
    logspec = librosa.power_to_db(melspec)
    return np.expand_dims(logspec, axis=-1)

In [None]:
# MFCC features for the clean training, test and augmented training sets.
x_train_mfcc = np.array([get_mfcc(clip) for clip in x_train])
x_test_mfcc = np.array([get_mfcc(clip) for clip in x_test])
x_train_aug_mfcc = np.array([get_mfcc(clip) for clip in x_train_aug])

In [None]:
# Log-mel features for the clean training, test and augmented training sets.
x_train_melspec = np.array([get_melspec(clip) for clip in x_train])
x_test_melspec = np.array([get_melspec(clip) for clip in x_test])
x_train_aug_melspec = np.array([get_melspec(clip) for clip in x_train_aug])

In [None]:
from tensorflow.keras.layers import Input, Conv1D, Conv2D, MaxPool1D, MaxPool2D, GlobalMaxPool1D, Dropout, Dense, Flatten, BatchNormalization, Activation
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
def get_1d_model(input_len=48000, n_emotions=8, n_sex=2):
    """Build the two-headed 1-D CNN that works on raw waveforms.

    Parameters
    ----------
    input_len : number of samples per clip; the input shape is (input_len, 1).
    n_emotions : width of the softmax `emotion_output` head.
    n_sex : width of the sigmoid `sex_output` head.

    Returns
    -------
    Uncompiled Keras `Model` with outputs [emotion_output, sex_output].
    """
    inputs = Input(shape=(input_len, 1))
    x = inputs

    # (filters, pool size, dropout rate) of the three convolutional stages.
    # Written as a loop, which also removes the mis-indented MaxPool1D line
    # that made the original definition a syntax error.
    for filters, pool, drop in ((16, 16, 0.2), (32, 4, 0.3), (256, 4, 0.3)):
        x = Conv1D(filters, 3, activation='relu', padding='valid')(x)
        x = Conv1D(filters, 3, activation='relu', padding='valid')(x)
        x = MaxPool1D(pool)(x)
        x = Dropout(drop)(x)

    x = Flatten()(x)
    x = Dense(64, activation='relu')(x)
    x = Dense(256, activation='relu')(x)

    # Two heads sharing the same trunk.
    emotion_output = Dense(n_emotions, activation='softmax', name='emotion_output')(x)
    sex_output = Dense(n_sex, activation='sigmoid', name='sex_output')(x)

    model = Model(inputs, [emotion_output, sex_output])

    return model

def get_2d_mfcc_model(shape, n_emotions=8, n_sex=2):
    """Build the two-headed 2-D CNN used on MFCC inputs.

    Parameters
    ----------
    shape : input shape passed to `Input`.
    n_emotions : width of the softmax `emotion_output` head.
    n_sex : width of the sigmoid `sex_output` head.

    Returns
    -------
    Uncompiled Keras `Model` with outputs [emotion_output, sex_output].
    """
    inputs = Input(shape=shape)
    features_2d = inputs

    # Four identical stages: conv -> batch norm -> ReLU -> 2x2 max-pool -> dropout.
    for _ in range(4):
        features_2d = Conv2D(32, (4,10), padding="same")(features_2d)
        features_2d = BatchNormalization()(features_2d)
        features_2d = Activation("relu")(features_2d)
        features_2d = MaxPool2D()(features_2d)
        features_2d = Dropout(rate=0.2)(features_2d)

    trunk = Flatten()(features_2d)
    trunk = Dense(64, activation='relu')(trunk)
    trunk = Dense(256, activation='relu')(trunk)

    emotion_output = Dense(n_emotions, activation='softmax', name='emotion_output')(trunk)
    sex_output = Dense(n_sex, activation='sigmoid', name='sex_output')(trunk)

    return Model(inputs, [emotion_output, sex_output])

def get_2d_melspec_model(shape, n_emotions=8, n_sex=2):
    """Build the two-headed 2-D CNN used on mel-spectrogram inputs.

    Parameters
    ----------
    shape : input shape passed to `Input`.
    n_emotions : width of the softmax `emotion_output` head.
    n_sex : width of the sigmoid `sex_output` head.

    Returns
    -------
    Uncompiled Keras `Model` with outputs [emotion_output, sex_output].
    """
    inputs = Input(shape=shape)
    x = inputs

    # Four identical stages: conv -> batch norm -> ReLU -> 2x2 max-pool -> dropout.
    # The original spelled these out with two over-indented lines, which made
    # the definition fail with an IndentationError.
    for _ in range(4):
        x = Conv2D(32, (4,10), padding="same")(x)
        x = BatchNormalization()(x)
        x = Activation("relu")(x)
        x = MaxPool2D()(x)
        x = Dropout(rate=0.2)(x)

    x = Flatten()(x)
    x = Dense(64, activation='relu')(x)
    x = Dense(256, activation='relu')(x)

    # Two heads sharing the same trunk.
    emotion_output = Dense(n_emotions, activation='softmax', name='emotion_output')(x)
    sex_output = Dense(n_sex, activation='sigmoid', name='sex_output')(x)

    model = Model(inputs, [emotion_output, sex_output])

    return model

In [None]:
def get_callbacks(name_model):
    """Return the training callbacks: early stopping plus best-model checkpointing.

    Both callbacks watch `val_loss`. Training stops after 20 epochs without
    improvement, and the best model so far is written to `name_model`.
    """
    stopper = EarlyStopping(monitor="val_loss", mode="min", patience=20)
    saver = ModelCheckpoint(name_model, monitor='val_loss', verbose=1, save_best_only=True)
    return [stopper, saver]

In [None]:
# Raw-waveform model trained on the clean clips.
# NOTE(review): the test split doubles as validation data, so early stopping
# and checkpointing are driven by the same data used for the final evaluation.
model_1d = get_1d_model()
model_1d.compile(
    optimizer=Adam(),
    loss={"emotion_output" : "categorical_crossentropy", "sex_output":"binary_crossentropy"},
    metrics=['accuracy'],
)
history_1d = model_1d.fit(
    x_train,
    {'emotion_output':e_y_train, 'sex_output':s_y_train},
    validation_data=(x_test, {'emotion_output':e_y_test, 'sex_output':s_y_test}),
    callbacks=get_callbacks('best_1d.h5'),
    epochs=100,
    batch_size=32,
)

In [None]:
# Training and validation accuracy of both heads, one curve per metric.
curve_names = ['emotion_output_accuracy', 'sex_output_accuracy', 'val_emotion_output_accuracy', 'val_sex_output_accuracy']
for curve in curve_names:
    plt.plot(history_1d.history[curve])
plt.legend(curve_names)
plt.show()

In [None]:
# Raw-waveform model trained on the augmented training set.
model_1d_aug = get_1d_model()
model_1d_aug.compile(
    optimizer=Adam(),
    loss={"emotion_output" : "categorical_crossentropy", "sex_output":"binary_crossentropy"},
    metrics=['accuracy'],
)
history_1d_aug = model_1d_aug.fit(
    x_train_aug,
    {'emotion_output':e_y_train_aug, 'sex_output':s_y_train_aug},
    validation_data=(x_test, {'emotion_output':e_y_test, 'sex_output':s_y_test}),
    callbacks=get_callbacks('best_1d_aug.h5'),
    epochs=100,
    batch_size=32,
)

In [None]:
# Training and validation accuracy of both heads, one curve per metric.
curve_names = ['emotion_output_accuracy', 'sex_output_accuracy', 'val_emotion_output_accuracy', 'val_sex_output_accuracy']
for curve in curve_names:
    plt.plot(history_1d_aug.history[curve])
plt.legend(curve_names)
plt.show()

In [None]:
# MFCC model trained on the clean clips.
model_mfcc = get_2d_mfcc_model((30, 94, 1))
model_mfcc.compile(
    optimizer=Adam(),
    loss={"emotion_output" : "categorical_crossentropy", "sex_output":"binary_crossentropy"},
    metrics=['accuracy'],
)
history_mfcc = model_mfcc.fit(
    x_train_mfcc,
    {'emotion_output':e_y_train, 'sex_output':s_y_train},
    validation_data=(x_test_mfcc, {'emotion_output':e_y_test, 'sex_output':s_y_test}),
    callbacks=get_callbacks('best_mfcc.h5'),
    epochs=100,
    batch_size=32,
)

In [None]:
# Training and validation accuracy of both heads, one curve per metric.
curve_names = ['emotion_output_accuracy', 'sex_output_accuracy', 'val_emotion_output_accuracy', 'val_sex_output_accuracy']
for curve in curve_names:
    plt.plot(history_mfcc.history[curve])
plt.legend(curve_names)
plt.show()

In [None]:
# MFCC model trained on the augmented training set.
model_mfcc_aug = get_2d_mfcc_model((30, 94, 1))
model_mfcc_aug.compile(
    optimizer=Adam(),
    loss={"emotion_output" : "categorical_crossentropy", "sex_output":"binary_crossentropy"},
    metrics=['accuracy'],
)
history_mfcc_aug = model_mfcc_aug.fit(
    x_train_aug_mfcc,
    {'emotion_output':e_y_train_aug, 'sex_output':s_y_train_aug},
    validation_data=(x_test_mfcc, {'emotion_output':e_y_test, 'sex_output':s_y_test}),
    callbacks=get_callbacks('best_mfcc_aug.h5'),
    epochs=100,
    batch_size=32,
)

In [None]:
# Training and validation accuracy of both heads, one curve per metric.
curve_names = ['emotion_output_accuracy', 'sex_output_accuracy', 'val_emotion_output_accuracy', 'val_sex_output_accuracy']
for curve in curve_names:
    plt.plot(history_mfcc_aug.history[curve])
plt.legend(curve_names)
plt.show()

In [None]:
# Mel-spectrogram model trained on the clean clips.
model_melspec = get_2d_melspec_model((60, 94, 1))
model_melspec.compile(
    optimizer=Adam(),
    loss={"emotion_output" : "categorical_crossentropy", "sex_output":"binary_crossentropy"},
    metrics=['accuracy'],
)
history_melspec = model_melspec.fit(
    x_train_melspec,
    {'emotion_output':e_y_train, 'sex_output':s_y_train},
    validation_data=(x_test_melspec, {'emotion_output':e_y_test, 'sex_output':s_y_test}),
    callbacks=get_callbacks('best_melspec.h5'),
    epochs=100,
    batch_size=32,
)

In [None]:
# Training and validation accuracy of both heads, one curve per metric.
curve_names = ['emotion_output_accuracy', 'sex_output_accuracy', 'val_emotion_output_accuracy', 'val_sex_output_accuracy']
for curve in curve_names:
    plt.plot(history_melspec.history[curve])
plt.legend(curve_names)
plt.show()

In [None]:
# Mel-spectrogram model trained on the augmented training set.
model_melspec_aug = get_2d_melspec_model((60, 94, 1))
model_melspec_aug.compile(
    optimizer=Adam(),
    loss={"emotion_output" : "categorical_crossentropy", "sex_output":"binary_crossentropy"},
    metrics=['accuracy'],
)
history_melspec_aug = model_melspec_aug.fit(
    x_train_aug_melspec,
    {'emotion_output':e_y_train_aug, 'sex_output':s_y_train_aug},
    validation_data=(x_test_melspec, {'emotion_output':e_y_test, 'sex_output':s_y_test}),
    callbacks=get_callbacks('best_melspec_aug.h5'),
    epochs=100,
    batch_size=32,
)

In [None]:
# Training and validation accuracy of both heads, one curve per metric.
curve_names = ['emotion_output_accuracy', 'sex_output_accuracy', 'val_emotion_output_accuracy', 'val_sex_output_accuracy']
for curve in curve_names:
    plt.plot(history_melspec_aug.history[curve])
plt.legend(curve_names)
plt.show()

In [None]:
# pickle is part of the Python standard library, so there is nothing to install.

In [None]:
import pickle


Pkl_Filename = "Pickle_RL_Model.pkl"

# Every model gets its own file, named after Pkl_Filename plus a tag. The
# original loop reopened the same path on each pass, so only the last model
# survived on disk.
trained_models = {
    '1d': model_1d,
    '1d_aug': model_1d_aug,
    'mfcc': model_mfcc,
    'mfcc_aug': model_mfcc_aug,
    'melspec': model_melspec,
    'melspec_aug': model_melspec_aug,
}
pkl_stem, pkl_ext = os.path.splitext(Pkl_Filename)

# The loop variable keeps the name `model`; the final save cell reads it.
for tag, model in trained_models.items():
    with open(f"{pkl_stem}_{tag}{pkl_ext}", 'wb') as file:
        pickle.dump(model, file)

In [None]:
# Rebuild the one-hot test labels from the raw (emotion, sex) pairs.
e_y = [emotion for emotion, _sex in y_test]
s_y = [sex for _emotion, sex in y_test]

e_y_test, s_y_test = encode(e_y, s_y)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Collapse the one-hot test labels to integer class indices for the sklearn metrics.
# NOTE(review): this overwrites e_y_test / s_y_test, so the cell cannot be
# re-run without re-encoding the test labels first.
e_y_test, s_y_test = np.argmax(e_y_test, axis=1), np.argmax(s_y_test, axis=1)

def get_prediction(model, x):
    """Return the predicted class indices of both heads for inputs `x`.

    `model.predict(x)` must yield two score arrays (emotion, sex); each is
    reduced to the index of its highest score per row.
    """
    probs_e, probs_s = model.predict(x)
    return np.argmax(probs_e, axis=1), np.argmax(probs_s, axis=1)

def display_results(y_pred_e, y_pred_s, y_true_e, y_true_s):
    """Print a classification report and show a confusion-matrix heatmap per head.

    Parameters
    ----------
    y_pred_e, y_pred_s : predicted integer class indices (emotion, sex).
    y_true_e, y_true_s : true integer class indices (emotion, sex).

    Class names are taken from the global `df`.
    """
    # LabelEncoder numbers classes in sorted order of their names, so the
    # display names must be sorted too. The appearance order from unique()
    # attached the wrong name to most rows and columns.
    emotion_names = sorted(df.emotion.unique())
    sex_names = sorted(df.sex.unique())

    for y_true, y_pred, names in ((y_true_e, y_pred_e, emotion_names),
                                  (y_true_s, y_pred_s, sex_names)):
        # Explicit label list keeps the matrix square even when a class is
        # missing from both the true and the predicted labels.
        codes = list(range(len(names)))
        matrix = confusion_matrix(y_true, y_pred, labels=codes)
        frame = pd.DataFrame(matrix, index=names, columns=names)
        print(classification_report(y_true, y_pred, labels=codes, target_names=names))
        sns.heatmap(frame, annot=True, fmt='g')
        plt.show()

In [None]:
# Test-set predictions of the raw-waveform model trained on clean clips.
y_pred_e, y_pred_s = get_prediction(model_1d, x_test)

In [None]:
# Reports and confusion matrices for the predictions of the previous cell.
display_results(y_pred_e, y_pred_s, e_y_test, s_y_test)

In [None]:
# Test-set predictions of the raw-waveform model trained with augmentation.
y_pred_e, y_pred_s = get_prediction(model_1d_aug, x_test)

In [None]:
# Reports and confusion matrices for the predictions of the previous cell.
display_results(y_pred_e, y_pred_s, e_y_test, s_y_test)

In [None]:
# Test-set predictions of the MFCC model trained on clean clips.
y_pred_e, y_pred_s = get_prediction(model_mfcc, x_test_mfcc)

In [None]:
# Reports and confusion matrices for the predictions of the previous cell.
display_results(y_pred_e, y_pred_s, e_y_test, s_y_test)

In [None]:
# Test-set predictions of the MFCC model trained with augmentation.
y_pred_e, y_pred_s = get_prediction(model_mfcc_aug, x_test_mfcc)

In [None]:
# Reports and confusion matrices for the predictions of the previous cell.
display_results(y_pred_e, y_pred_s, e_y_test, s_y_test)

In [None]:
# Test-set predictions of the mel-spectrogram model trained on clean clips.
y_pred_e, y_pred_s = get_prediction(model_melspec, x_test_melspec)

In [None]:
# Reports and confusion matrices for the predictions of the previous cell.
display_results(y_pred_e, y_pred_s, e_y_test, s_y_test)

In [None]:
# Test-set predictions of the mel-spectrogram model trained with augmentation.
y_pred_e, y_pred_s = get_prediction(model_melspec_aug, x_test_melspec)

In [None]:
# Reports and confusion matrices for the predictions of the previous cell.
display_results(y_pred_e, y_pred_s, e_y_test, s_y_test)

In [None]:
# Name the model explicitly. The bare `model` used before was only the
# leftover loop variable of the pickling cell, whose last value is
# model_melspec_aug.
tf.keras.models.save_model(model_melspec_aug, 'my_model3.h5')
