In [1]:
import emotion_recognition
from utils import get_audio_config
import matplotlib.pyplot as plt
from keras.layers import Input, Dense, Conv2D, Conv1D, Convolution2D, concatenate, LSTM, Reshape
from keras.layers import Dropout, Flatten
from keras.models import Sequential, Model
import numpy as np
from keras.utils import plot_model
import create_csv
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.utils import to_categorical

Using TensorFlow backend.


# FEATURES TO BE USED

In [2]:
# Description CSVs (one train/test pair per dataset) consumed by load_data.
train_desc_files = [
    'train_custom.csv',
    'train_emo.csv',
    'train_tess_ravdess.csv',
]
test_desc_files = [
    'test_custom.csv',
    'test_emo.csv',
    'test_tess_ravdess.csv',
]

# Feature-name groups; only `non_image_features` is passed to
# get_audio_config in the visible cells.
ALL_FEATURES = ['mfcc', 'chroma', 'mel', 'contrast', 'tonnetz', 'rmse']
image_features = ['mel']
non_image_features = ['contrast', 'rmse', 'mfcc', 'tonnetz']

# Target classes; their list position becomes the class index.
emotions = ['happy', 'angry']

In [3]:
# (Re)write the train/test description CSVs for each dataset, restricted to
# the selected emotions.
create_csv.write_emodb_csv(
    emotions=emotions,
    train_name="train_emo.csv",
    test_name="test_emo.csv",
    train_size=0.8,
    verbose=1,
)
create_csv.write_tess_ravdess_csv(
    emotions=emotions,
    train_name="train_tess_ravdess.csv",
    test_name="test_tess_ravdess.csv",
    verbose=1,
)
create_csv.write_custom_csv(
    emotions=emotions,
    train_name="train_custom.csv",
    test_name="test_custom.csv",
    verbose=1,
)

# Two-way lookup between class index and emotion name, keyed by the position
# of each name in `emotions`.
int2emotions = dict(enumerate(emotions))
emotions2int = {name: index for index, name in enumerate(emotions)}

[EMO-DB] Total files to write: 251
[EMO-DB] Training samples: 200
[EMO-DB] Testing samples: 50
[TESS&RAVDESS] There are 655 training audio files for category:happy
[TESS&RAVDESS] There are 115 testing audio files for category:happy
[TESS&RAVDESS] There are 661 training audio files for category:angry
[TESS&RAVDESS] There are 115 testing audio files for category:angry


# EXTRACTING FEATURES (IMAGE AND NON-IMAGE)

In [4]:
# Load the non-image ("flat") features. shuffle=False keeps the sample order
# fixed so it can be matched against the image features loaded separately.
audio_config = get_audio_config(non_image_features)
data_flat = emotion_recognition.load_data(
    train_desc_files,
    test_desc_files,
    audio_config,
    classification=True,
    emotions=emotions,
    balance=True,
    shuffle=False,
)

In [5]:
# Load the image-like features with the same files and options as the flat
# features. Note that `audio_config` is rebound to the image configuration here.
audio_config = get_audio_config(['image'])
data_image = emotion_recognition.load_data(
    train_desc_files,
    test_desc_files,
    audio_config,
    classification=True,
    emotions=emotions,
    balance=True,
    shuffle=False,
)

In [6]:
# Training-array shapes: image features first, flat features second.
tuple(features["X_train"].shape for features in (data_image, data_flat))

((3802, 128, 1412), (3802, 54))

In [7]:
# The image and flat features come from two independent load_data calls, yet a
# single label vector (taken from data_image) is used for both model inputs.
# Fail loudly if the two loads are not aligned sample-for-sample.
for split_key in ("y_train", "y_test"):
    assert np.array_equal(data_flat[split_key], data_image[split_key]), (
        f"{split_key}: flat and image feature sets are not aligned"
    )

# Add the singleton axes the two model inputs expect:
#   flat  -> (samples, 1, 54, 1)
#   image -> (samples, 128, 1412, 1)
X_train_flat = data_flat["X_train"].reshape(data_flat["X_train"].shape[0], 1, 54, 1)
X_train_image = data_image["X_train"].reshape(data_image["X_train"].shape[0], 128, 1412, 1)
X_test_flat = data_flat["X_test"].reshape(data_flat["X_test"].shape[0], 1, 54, 1)
X_test_image = data_image["X_test"].reshape(data_image["X_test"].shape[0], 128, 1412, 1)

# Map each emotion name to its class index, then one-hot encode.
y_train = to_categorical([emotions2int[str(e)] for e in data_image['y_train'].squeeze()])
y_test = to_categorical([emotions2int[str(e)] for e in data_image['y_test'].squeeze()])

X_train_flat.shape, X_train_image.shape, y_train.shape

((3802, 1, 54, 1), (3802, 128, 1412, 1), (3802, 2))

In [8]:
# Per-class sample counts (column sums of the one-hot labels): test, then train.
y_test.sum(axis=0), y_train.sum(axis=0)

(array([431., 431.], dtype=float32), array([1901., 1901.], dtype=float32))

# MODEL HYPER PARAMETERS

In [9]:
# model constants
# Two LSTM layers are wanted in total. The model cell builds the first one
# explicitly and loops `n_rnn_layers` times for the rest, hence the "- 1".
n_rnn_layers = 2 - 1
rnn_units = 128
n_dense_layers = 2
dense_units = 64
dropout = 0.35
# One softmax output per emotion class.
output_dims = len(emotions)

# MODEL DEFINITION

In [11]:
# RAMS MODEL
# Two-input network: a Conv2D stack over the image-like features is
# concatenated channel-wise with the flat features, and the result is fed
# through LSTM and Dense layers to a softmax over the emotion classes.

# CNN branch. The shape is fixed by Input(), so no per-layer input_shape is
# needed. The recorded run shows this stack reducing (128, 1412, 1) to
# (1, 54, 64), which lines up with the (1, 54, 1) flat input for concatenation.
input_cnn = Input(shape=(128, 1412, 1))
CNN = Conv2D(8, (12, 13), activation='relu', strides=(1, 13))(input_cnn)
CNN = Conv2D(16, (16, 2), activation='relu', strides=(1, 2))(CNN)
CNN = Conv2D(32, (24, 1), activation='relu', strides=(1, 1))(CNN)
CNN = Conv2D(48, (32, 1), activation='relu', strides=(1, 1))(CNN)
CNN = Conv2D(64, (48, 1), activation='relu', strides=(1, 1))(CNN)
CNN = Model(inputs=input_cnn, outputs=CNN)

# Flat-feature branch: an identity model that passes its input straight through.
input_rnn = Input(shape=(1, 54, 1))
RNN = Model(inputs=input_rnn, outputs=input_rnn)

# Show the symbolic tensors so the shapes can be checked before merging.
print(CNN.output)
print(RNN.output)
RAMS = concatenate([CNN.output, RNN.output])
print(RAMS)

# Drop the singleton axis: (1, 54, 65) -> 54 time steps of 65 features.
RAMS = Reshape((54, 65))(RAMS)
# The previous input_shape=(48, 129) here contradicted the real (54, 65) input
# and was ignored in the functional API, so it has been removed.
RAMS = LSTM(rnn_units, return_sequences=True)(RAMS)
RAMS = Dropout(dropout)(RAMS)

# rnn layers
for i in range(n_rnn_layers):
    RAMS = LSTM(rnn_units, return_sequences=True)(RAMS)
    RAMS = Dropout(dropout)(RAMS)
RAMS = Flatten()(RAMS)

# dense layers
# NOTE(review): this builds 1 + n_dense_layers hidden Dense layers, whereas the
# LSTM count is pre-decremented to account for the explicit first layer.
# Confirm the extra Dense layer is intended.
RAMS = Dense(dense_units, activation="relu")(RAMS)
RAMS = Dropout(dropout)(RAMS)
for j in range(n_dense_layers):
    RAMS = Dense(dense_units, activation="relu")(RAMS)
    RAMS = Dropout(dropout)(RAMS)
RAMS = Dense(output_dims, activation="softmax")(RAMS)

model = Model(inputs=[input_cnn, input_rnn], outputs=RAMS)
model.compile(loss='categorical_crossentropy', metrics=["accuracy"], optimizer='adam')


Tensor("conv2d_10/Relu:0", shape=(?, 1, 54, 64), dtype=float32)
Tensor("input_4:0", shape=(?, 1, 54, 1), dtype=float32)
here
Tensor("concatenate_2/concat:0", shape=(?, 1, 54, 65), dtype=float32)


W0828 17:45:02.843023 140736179348352 deprecation_wrapper.py:119] From /usr/local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0828 17:45:02.849240 140736179348352 deprecation.py:506] From /usr/local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
W0828 17:45:03.154213 140736179348352 deprecation_wrapper.py:119] From /usr/local/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0828 17:45:03.178338 140736179348352 deprecation_wrapper.py:119] From /usr/local/lib/python3.7/site-packages/keras/backend/tenso

In [12]:
# Print the layer table, then render the model graph to model.svg.
model.summary()
plot_model(model=model, to_file='model.svg')

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 128, 1412, 1) 0                                            
__________________________________________________________________________________________________
conv2d_6 (Conv2D)               (None, 117, 108, 8)  1256        input_3[0][0]                    
__________________________________________________________________________________________________
conv2d_7 (Conv2D)               (None, 102, 54, 16)  4112        conv2d_6[0][0]                   
__________________________________________________________________________________________________
conv2d_8 (Conv2D)               (None, 79, 54, 32)   12320       conv2d_7[0][0]                   
__________________________________________________________________________________________________
conv2d_9 (

In [13]:
# Training settings passed to model.fit.
batch_size, epochs, verbose = 1024, 500, True

In [None]:
# Callbacks: write TensorBoard logs and save the model to "RAMS_trial_1" only
# when the monitored validation loss improves (save_best_only=True).
tensorboard = TensorBoard(log_dir="logs/RAMS_trial_1")
checkpointer = ModelCheckpoint("RAMS_trial_1", save_best_only=True, verbose=1)

# Both inputs are given in the order the model was built: image, then flat.
# The test split is used as validation data.
history = model.fit(
    x=[X_train_image, X_train_flat],
    y=y_train,
    validation_data=([X_test_image, X_test_flat], y_test),
    epochs=epochs,
    batch_size=batch_size,
    verbose=verbose,
    callbacks=[checkpointer, tensorboard],
)


W0828 17:48:01.965946 140736179348352 deprecation.py:323] From /usr/local/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 3802 samples, validate on 862 samples


W0828 17:48:04.575776 140736179348352 deprecation_wrapper.py:119] From /usr/local/lib/python3.7/site-packages/keras/callbacks.py:850: The name tf.summary.merge_all is deprecated. Please use tf.compat.v1.summary.merge_all instead.

W0828 17:48:04.576835 140736179348352 deprecation_wrapper.py:119] From /usr/local/lib/python3.7/site-packages/keras/callbacks.py:853: The name tf.summary.FileWriter is deprecated. Please use tf.compat.v1.summary.FileWriter instead.



Epoch 1/500

Epoch 00001: val_loss improved from inf to 0.67443, saving model to RAMS_trial_1
Epoch 2/500

Epoch 00002: val_loss improved from 0.67443 to 0.63671, saving model to RAMS_trial_1
Epoch 3/500

Epoch 00003: val_loss improved from 0.63671 to 0.60551, saving model to RAMS_trial_1
Epoch 4/500

In [9]:
def predict(audio_path):
    """Predict the emotion of a single audio file with the two-input model.

    Args:
        audio_path: Path of the audio file to classify.

    Returns:
        A tuple ``(label, probabilities)``. ``label`` is the emotion name
        looked up in ``int2emotions``. ``probabilities`` is the raw output of
        ``model.predict`` for this one sample.

    Raises:
        ValueError: If an extracted feature cannot be reshaped to the fixed
            input shape the model was built with.
    """
    # NOTE(review): `extract_feature` is not imported in the visible notebook.
    # It must be in scope before this is called (presumably from the project's
    # `utils` module, next to `get_audio_config`) — verify.
    flat_audio_config = get_audio_config(non_image_features)
    image_audio_config = get_audio_config(['image'])

    # Shapes must match the model inputs: image -> (1, 128, 1412, 1) and
    # flat -> (1, 1, 54, 1). Assumes extract_feature returns features of the
    # same size that load_data produced for training — TODO confirm.
    image_feature = extract_feature(audio_path, **image_audio_config).reshape(1, 128, 1412, 1)
    flat_feature = extract_feature(audio_path, **flat_audio_config).reshape(1, 1, 54, 1)

    # A functional Model has no predict_classes; take the argmax of the
    # softmax output instead. Input order matches Model(inputs=[cnn, rnn]).
    probabilities = model.predict([image_feature, flat_feature])
    predicted_index = int(np.argmax(probabilities, axis=1)[0])
    return int2emotions[predicted_index], probabilities
