# Neural Network for Emotion Recognition from Speech
Recognizes emotion by classifying binary valence/arousal values for given utterances.
Provide the input data by extracting 13 MFCC features (via openSMILE) and exporting them to binary (.npy) files.
The network consists of one convolution layer with max pooling, followed by dropout and a fully connected output layer.

In [None]:
import numpy as np
import tensorflow as tf
import random
import math

In [None]:
# load input data
# Root directory that holds one sub-directory of .npy files per corpus.
# NOTE(review): the directory name looks truncated ('Input/'?) - confirm
# against the actual layout on disk.
filepath = 'Inpu/'

# Every np.load below builds its file name from `path`, which was never
# defined (only `filepath` was), so this cell failed with a NameError.
# The trailing slash is stripped so the joined paths do not contain '//'.
path = filepath.rstrip('/')

# IEMOCAP data (English)

# load IEMOCAP input and labels to numpy arrays
samples_train_ENG = np.load(path + '/IEMOCAP/IEMOCAP_feature_train.npy')
labels_train_ENG = np.load(path + '/IEMOCAP/IEMOCAP_labels_train.npy')

# load IEMOCAP testset
# NOTE(review): the validation files use the prefix 'IEMOC_' while the
# training files use 'IEMOCAP_' - confirm the file names on disk.
samples_test_ENG = np.load(path + '/IEMOCAP/IEMOC_feature_valid.npy')
labels_test_ENG = np.load(path + '/IEMOCAP/IEMOC_labels_valid.npy')

# RECOLA data (French)

# load RECOLA input and labels to numpy arrays
samples_train_FR = np.load(path + '/RECOLA/RECOLA_feature_train.npy')
labels_train_FR = np.load(path + '/RECOLA/RECOLA_labels_train.npy')

# load RECOLA testset
# NOTE(review): same prefix mismatch here ('RECOL_' vs 'RECOLA_') - confirm.
samples_test_FR = np.load(path + '/RECOLA/RECOL_feature_valid.npy')
labels_test_FR = np.load(path + '/RECOLA/RECOL_labels_valid.npy')

In [None]:
# training hyperparameters
ITERATIONS = 50 * 1000  # presumably the overall training budget - confirm
BATCH_SIZE = 50         # multiplied by the step counter in the training loop
LEARN_RATE = 1e-2       # presumably the optimizer step size - confirm
DISPLAY_STEP = 10       # progress is reported every DISPLAY_STEP steps

In [None]:
# network parameters
# Frames per sample for each language: number of rows in the feature array
# divided by the number of labels.
# NOTE(review): assumes every utterance contributes the same number of rows
# to the feature array - confirm against the feature export.
n_frames_eng = len(samples_train_ENG) // len(labels_train_ENG)
n_frames_fr = len(samples_train_FR) // len(labels_train_FR)

# `n_frames` is used below, in conv_net and in the weight shapes, but it was
# never defined (only the per-language values were), which raised NameError.
# The graph has a single fixed-size input, so one value has to be chosen.
# NOTE(review): the English value is used because n_features is also taken
# from the English data; confirm that n_frames_fr is the same, otherwise the
# French samples cannot be fed through the same placeholder.
n_frames = n_frames_eng

# set the number of samples for both datasets
n_samples_ENG = len(labels_train_ENG)
n_samples_FR = len(labels_train_FR)

# width of one feature row, taken from the first English training row
n_features = len(samples_train_ENG[0])
# length of one flattened sample as fed to the input placeholder
n_convInput = n_frames * n_features
n_classes = 4
# value fed to `keep_prob` by the training loop
dropout = 0.5

# 2 gateways for our data: one for sound samples & one for labels
x = tf.placeholder(tf.float32, shape=(None, n_convInput), name="Input")
y = tf.placeholder(tf.float32, shape=(None, n_classes), name="Prediction")

# gateway for dropout
keep_prob = tf.placeholder(tf.float32, name="Dropout")

In [None]:
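# functions to generate each batch
# TODO: implement get_next_train_batch() (called by the training loop) and
# get_test_batch_eng() / get_test_batch_fr() (called by the test cell)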

In [None]:
# convolution building block: convolution -> bias -> ReLU -> max pooling
def conv2d(x, W, b, name="Convolution"):
    """Convolve `x` with `W`, add `b`, apply ReLU and max-pool the result.

    Args:
        x: input tensor handed to tf.nn.conv2d.
        W: convolution filter tensor.
        b: bias added to the convolution output.
        name: name scope under which the ops are created.

    Returns:
        The max-pooled activation tensor.
    """
    unit_stride = [1, 1, 1, 1]
    pool_window = [1, 30, 1, 1]
    pool_stride = [1, 3, 1, 1]
    with tf.name_scope(name):
        feature_map = tf.nn.conv2d(x, W, strides=unit_stride, padding='VALID')
        rectified = tf.nn.relu(feature_map + b)
        pooled = tf.nn.max_pool(
            rectified, ksize=pool_window, strides=pool_stride, padding='SAME'
        )
    return pooled

# build the model
def conv_net(x, weights, biases):
    """Assemble the network and return its unnormalized class scores.

    Args:
        x: flat input tensor; reshaped to [-1, n_frames, n_features, 1].
        weights: dict holding the 'wc1' (convolution) and 'out' (output
            layer) weight variables.
        biases: dict holding the 'bc1' and 'out' bias variables.

    Returns:
        The output-layer tensor (matmul plus bias, no softmax applied).
    """
    # view the flat input as [#samples, #frames, #features, 1]
    frames = tf.reshape(x, shape=[-1, n_frames, n_features, 1])

    # convolution layer with max pooling
    pooled = conv2d(frames, weights['wc1'], biases['bc1'])

    # flatten to the input width expected by the output weights
    fc_width = weights['out'].get_shape().as_list()[0]
    flat = tf.reshape(pooled, [-1, fc_width])

    # dropout, controlled through the module-level keep_prob placeholder
    dropped = tf.nn.dropout(flat, keep_prob)

    # output layer: class prediction
    logits = tf.matmul(dropped, weights['out'])
    return tf.add(logits, biases['out'])

In [None]:
# trainable weights
# 'wc1': convolution filter of shape [10, n_features, 1, 50]
# 'out': output layer; its input width is ceil((n_frames - 9) / 3) * 50
weights = dict(
    wc1=tf.Variable(tf.random_normal([10, n_features, 1, 50])),
    out=tf.Variable(
        tf.random_normal(
            [int(50 * math.ceil((n_frames - 9) / 3)), n_classes]
        )
    ),
)

# trainable biases, one per convolution filter and one per class
biases = dict(
    bc1=tf.Variable(tf.random_normal([50])),
    out=tf.Variable(tf.random_normal([n_classes])),
)

In [None]:
# construct model: `pred` holds the unnormalized class scores
pred = conv_net(x, weights, biases)

# define loss: mean softmax cross-entropy between the scores and the labels
cost = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(logits=pred, labels=y)
)

# define optimizer
# The learning-rate hyperparameter is named LEARN_RATE; the former reference
# to the undefined LEARNING_RATE raised a NameError.
optimizer = tf.train.AdamOptimizer(learning_rate=LEARN_RATE).minimize(cost)

# evaluate model: a prediction counts as correct when the arg-max of the
# scores equals the arg-max of the label row
correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [None]:
# launch the graph
# The session is kept open on purpose: the test cell reuses `sess`.
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

step = 1
# Keep training until step * BATCH_SIZE reaches the ITERATIONS budget.
# The loop previously compared against the undefined name TRAIN_ITERS,
# which raised a NameError; the hyperparameter is called ITERATIONS.
while step * BATCH_SIZE < ITERATIONS:
    # NOTE(review): get_next_train_batch is not defined in this file yet
    # (see the batch-generation TODO) and must be provided before running.
    batch_x, batch_y = get_next_train_batch()

    # Run optimization (backprop) with dropout enabled
    sess.run(optimizer, feed_dict={x: batch_x, y: batch_y, keep_prob: dropout})

    if step % DISPLAY_STEP == 0:
        # Calculate batch loss and accuracy with dropout disabled
        loss, acc = sess.run(
            [cost, accuracy],
            feed_dict={x: batch_x, y: batch_y, keep_prob: 1.},
        )
        print(
            "Iter {}, Minibatch Loss= {:.6f}, Training Accuracy= {:.5f}".format(
                step * BATCH_SIZE, loss, acc
            )
        )
    step += 1

print("Optimization Finished! Testing Model...")

In [None]:
# Evaluate the trained model on both held-out sets (dropout disabled)

# English test data
test_x = get_test_batch_eng(samples_test_ENG, labels_test_ENG)
test_y = labels_test_ENG
accuracy_eng = sess.run(
    accuracy,
    feed_dict={x: test_x, y: test_y, keep_prob: 1.0},
)
print("English Test Accuracy: %s" % str(accuracy_eng))

# French test data
test_x = get_test_batch_fr(samples_test_FR, labels_test_FR)
test_y = labels_test_FR
accuracy_fr = sess.run(
    accuracy,
    feed_dict={x: test_x, y: test_y, keep_prob: 1.0},
)
print("French Test Accuracy: %s" % str(accuracy_fr))