In [2]:
import numpy as np
import os
import sys

import wave
import copy
import math

from keras.models import Sequential, Model
from keras.layers.core import Dense, Activation, Dropout
from keras.layers import GRU, LSTM, Input, Flatten, Concatenate, Bidirectional
from keras.layers.wrappers import TimeDistributed
from keras.optimizers import SGD, Adam, RMSprop
from keras.layers.normalization import BatchNormalization
from sklearn.preprocessing import label_binarize
from keras.layers import add

from features import *
from helper import *


Using TensorFlow backend.


In [3]:

# Root of the IEMOCAP release. Set the IEMOCAP_ROOT environment variable to
# point at a local copy; the original author's location stays the default so
# existing setups keep working unchanged.
code_path = os.environ.get(
    "IEMOCAP_ROOT",
    "I:/UESTC/Masters Degree/Research Area/Speech Emotion Recognition/Datasets/IEMOCAP_full_release")
# Emotion labels used as classification targets.
emotions_used = np.array(['ang', 'exc', 'neu', 'sad'])
data_path = code_path + "/"
sessions = ['Session1', 'Session2', 'Session3', 'Session4', 'Session5']
# Sampling rate (samples per second) handed to the feature extractor.
framerate = 16000


In [4]:
import pickle
# Load the pre-collected records stored one directory above the dataset root.
# Later cells read the 'signal' and 'emotion' entries of each record.
# NOTE(review): pickle.load can execute arbitrary code -- only open a file you
# produced yourself or otherwise trust.
with open(data_path + '/../'+'data_collected.pickle', 'rb') as handle:
    data2 = pickle.load(handle)

In [5]:
def calculate_features(frames, freq, options, window_sec=0.2):
    """Extract short-term features for one signal and trim the edge columns.

    Args:
        frames: audio samples, forwarded unchanged to ``stFeatureExtraction``.
        freq: sampling rate in samples per second.
        options: unused; kept so existing callers do not break.
        window_sec: analysis window length in seconds. The fourth extractor
            argument (presumably the hop between windows -- confirm against
            ``stFeatureExtraction``) is half the window.

    Returns:
        A new 2-D float array with the same number of rows as the extractor
        output. With more than two columns the first and last are dropped;
        with one or two columns only the first is kept; with zero columns a
        single all-zero column is returned instead of raising IndexError.
    """
    window_n = int(freq * window_sec)
    # Floor division keeps the step an integer sample count under Python 3
    # (plain `/` would hand the extractor a float).
    step_n = window_n // 2

    # NOTE(review): assumes the extractor returns a 2-D array; a 1-D result
    # still raises IndexError on shape[1], as it did before.
    st_f = stFeatureExtraction(frames, freq, window_n, step_n)
    n_rows, n_cols = st_f.shape[0], st_f.shape[1]

    if n_cols > 2:
        # np.array copies, so the result never aliases the extractor's buffer.
        return np.array(st_f[:, 1:-1], dtype=float)
    if n_cols == 0:
        return np.zeros((n_rows, 1), dtype=float)
    return np.array(st_f[:, :1], dtype=float)

In [6]:
# Build one padded feature matrix per record in data2 and stack them.
x_train_speech = []

counter = 0
for ses_mod in data2:
    x_head = ses_mod['signal']
    # Pad/crop to 100 positions; the second return value is not needed here.
    st_features, _ = pad_sequence_into_array(
        calculate_features(x_head, framerate, None), maxlen=100)
    # Samples are stored transposed relative to the padded matrix.
    x_train_speech.append(st_features.T)
    counter += 1
    if not counter % 100:
        print(counter)  # progress marker every 100 records

x_train_speech = np.array(x_train_speech)
x_train_speech.shape

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900


(4936, 100, 34)

In [7]:
def lstm_model(optimizer='Adadelta'):
    """Build and compile the stacked-LSTM emotion classifier.

    Two LSTM layers (512 units returning the full sequence, then 256 units
    returning only the last output) feed a 512-unit ReLU layer and a 4-way
    softmax. The first layer declares an input shape of (100, 34).

    Args:
        optimizer: optimizer name or instance handed to ``compile``.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        accuracy metric).
    """
    net = Sequential()
    stack = (
        LSTM(512, return_sequences=True, input_shape=(100, 34)),
        LSTM(256, return_sequences=False),
        Dense(512),
        Activation('relu'),
        Dense(4),
        Activation('softmax'),
    )
    for layer in stack:
        net.add(layer)

    net.compile(loss='categorical_crossentropy',
                optimizer=optimizer,
                metrics=['accuracy'])
    return net

In [8]:
# Instantiate the stacked-LSTM classifier (rebinds the global `model`) and
# print its layer-by-layer summary.
model = lstm_model()
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 100, 512)          1120256   
_________________________________________________________________
lstm_2 (LSTM)                (None, 256)               787456    
_________________________________________________________________
dense_1 (Dense)              (None, 512)               131584    
_________________________________________________________________
activation_1 (Activation)    (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 2052      
_________________________________________________________________
activation_2 (Activation)    (None, 4)                 0         
Total params: 2,041,348
Trainable params: 2,041,348
Non-trainable params: 0
____________________________________________

In [9]:
# One-hot encode each record's emotion label against `emotions_used`.
# `classes` is passed by keyword: recent scikit-learn releases accept it only
# that way, and older releases accept the keyword as well.
Y = label_binarize([ses_mod['emotion'] for ses_mod in data2],
                   classes=emotions_used)

Y.shape

(4936, 4)

In [10]:
# Train for two epochs, holding out 20% of the samples for validation.
# `epochs` is the Keras 2 name of the argument formerly spelled `nb_epoch`;
# the old spelling only works through a deprecation shim that emits a warning.
hist = model.fit(x_train_speech, Y,
                 batch_size=100, epochs=2, verbose=1, shuffle=True,
                 validation_split=0.2)

  This is separate from the ipykernel package so we can avoid doing imports until



Train on 3948 samples, validate on 988 samples
Epoch 1/2
Epoch 2/2


In [11]:
def kaldi_model(optimizer='Adam'):
    """Build and compile the deep per-timestep dense classifier.

    Six identical stages of TimeDistributed(Dense(450)) -> BatchNormalization
    -> ReLU are applied to an input declared as (100, 34). The result is
    flattened and passed through Dense(100)/ReLU and a 4-way softmax.

    Args:
        optimizer: optimizer name or instance handed to ``compile``.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        accuracy metric).
    """
    net = Sequential()

    for stage in range(6):
        if stage == 0:
            # Only the first layer declares the input shape.
            net.add(TimeDistributed(Dense(450), input_shape=(100, 34)))
        else:
            net.add(TimeDistributed(Dense(450)))
        net.add(BatchNormalization())
        net.add(Activation('relu'))

    # Classification head on the flattened sequence.
    net.add(Flatten())
    net.add(Dense(100))
    net.add(Activation('relu'))
    net.add(Dense(4))
    net.add(Activation('softmax'))

    net.compile(loss='categorical_crossentropy',
                optimizer=optimizer,
                metrics=['accuracy'])
    return net

In [12]:
# Instantiate the per-timestep dense classifier (rebinds the global `model`)
# and print its layer-by-layer summary.
model = kaldi_model()
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
time_distributed_1 (TimeDist (None, 100, 450)          15750     
_________________________________________________________________
batch_normalization_1 (Batch (None, 100, 450)          1800      
_________________________________________________________________
activation_3 (Activation)    (None, 100, 450)          0         
_________________________________________________________________
time_distributed_2 (TimeDist (None, 100, 450)          202950    
_________________________________________________________________
batch_normalization_2 (Batch (None, 100, 450)          1800      
_________________________________________________________________
activation_4 (Activation)    (None, 100, 450)          0         
_________________________________________________________________
time_distributed_3 (TimeDist (None, 100, 450)         

In [13]:
# Train for two epochs, holding out 20% of the samples for validation.
# `epochs` is the Keras 2 name of the argument formerly spelled `nb_epoch`;
# the old spelling only works through a deprecation shim that emits a warning.
hist = model.fit(x_train_speech, Y,
                 batch_size=100, epochs=2, verbose=1, shuffle=True,
                 validation_split=0.2)

  This is separate from the ipykernel package so we can avoid doing imports until


Train on 3948 samples, validate on 988 samples
Epoch 1/2
Epoch 2/2


In [14]:
def linear_model(optimizer='Adadelta'):
    """Build and compile a fully connected baseline classifier.

    The (100, 34) input is flattened and passed through Dense/ReLU layers of
    1024, 512 and 256 units, followed by a 4-way softmax.

    Args:
        optimizer: optimizer name or instance handed to ``compile``.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        accuracy metric).
    """
    net = Sequential()
    net.add(Flatten(input_shape=(100, 34)))

    for width in (1024, 512, 256):
        net.add(Dense(width))
        net.add(Activation('relu'))

    net.add(Dense(4))
    net.add(Activation('softmax'))

    net.compile(loss='categorical_crossentropy',
                optimizer=optimizer,
                metrics=['accuracy'])
    return net

In [15]:
# Instantiate the fully connected baseline (rebinds the global `model`) and
# print its layer-by-layer summary.
model = linear_model()
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_2 (Flatten)          (None, 3400)              0         
_________________________________________________________________
dense_11 (Dense)             (None, 1024)              3482624   
_________________________________________________________________
activation_11 (Activation)   (None, 1024)              0         
_________________________________________________________________
dense_12 (Dense)             (None, 512)               524800    
_________________________________________________________________
activation_12 (Activation)   (None, 512)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 256)               131328    
_________________________________________________________________
activation_13 (Activation)   (None, 256)              

In [16]:
# Train for two epochs, holding out 20% of the samples for validation.
# `epochs` is the Keras 2 name of the argument formerly spelled `nb_epoch`;
# the old spelling only works through a deprecation shim that emits a warning.
hist = model.fit(x_train_speech, Y,
                 batch_size=100, epochs=2, verbose=1, shuffle=True,
                 validation_split=0.2)

  This is separate from the ipykernel package so we can avoid doing imports until


Train on 3948 samples, validate on 988 samples
Epoch 1/2
Epoch 2/2


In [17]:
def calculate_features_2(frames, freq, options, window_sec=0.4):
    """Extract short-term features with a 0.4 s window and trim the edges.

    Same processing as ``calculate_features`` but with double its 0.2 s
    window.

    Args:
        frames: audio samples, forwarded unchanged to ``stFeatureExtraction``.
        freq: sampling rate in samples per second.
        options: unused; kept so existing callers do not break.
        window_sec: analysis window length in seconds. The fourth extractor
            argument (presumably the hop between windows -- confirm against
            ``stFeatureExtraction``) is half the window.

    Returns:
        A new 2-D float array with the same number of rows as the extractor
        output. With more than two columns the first and last are dropped;
        with one or two columns only the first is kept; with zero columns a
        single all-zero column is returned instead of raising IndexError.
    """
    window_n = int(freq * window_sec)
    # Floor division keeps the step an integer sample count under Python 3
    # (plain `/` would hand the extractor a float).
    step_n = window_n // 2

    # NOTE(review): assumes the extractor returns a 2-D array; a 1-D result
    # still raises IndexError on shape[1], as it did before.
    st_f = stFeatureExtraction(frames, freq, window_n, step_n)
    n_rows, n_cols = st_f.shape[0], st_f.shape[1]

    if n_cols > 2:
        # np.array copies, so the result never aliases the extractor's buffer.
        return np.array(st_f[:, 1:-1], dtype=float)
    if n_cols == 0:
        return np.zeros((n_rows, 1), dtype=float)
    return np.array(st_f[:, :1], dtype=float)

In [18]:
# Same extraction as before, using the 0.4 s-window feature function.
x_train_speech2 = []
from sklearn.preprocessing import normalize
counter = 0
for ses_mod in data2:
    x_head = ses_mod['signal']
    # Pad/crop to 100 positions; the second return value is not needed here.
    st_features, _ = pad_sequence_into_array(
        calculate_features_2(x_head, framerate, None), maxlen=100)
    # Samples are stored transposed relative to the padded matrix.
    x_train_speech2.append(st_features.T)
    counter += 1
    if not counter % 100:
        print(counter)  # progress marker every 100 records

x_train_speech2 = np.array(x_train_speech2)
x_train_speech2.shape

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900


(4936, 100, 34)

In [19]:
def linear_model(optimizer='Adadelta'):
    """Build and compile a fully connected baseline classifier.

    The (100, 34) input is flattened and passed through Dense/ReLU layers of
    1024, 512 and 256 units, followed by a 4-way softmax.

    NOTE(review): this repeats the `linear_model` defined in an earlier cell
    and silently shadows it; one of the two definitions can be dropped.

    Args:
        optimizer: optimizer name or instance handed to ``compile``.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        accuracy metric).
    """
    net = Sequential()
    net.add(Flatten(input_shape=(100, 34)))

    for width in (1024, 512, 256):
        net.add(Dense(width))
        net.add(Activation('relu'))

    net.add(Dense(4))
    net.add(Activation('softmax'))

    net.compile(loss='categorical_crossentropy',
                optimizer=optimizer,
                metrics=['accuracy'])
    return net

In [20]:
# Fresh instance of the fully connected baseline (rebinds the global `model`)
# followed by its layer-by-layer summary.
model = linear_model()
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_3 (Flatten)          (None, 3400)              0         
_________________________________________________________________
dense_15 (Dense)             (None, 1024)              3482624   
_________________________________________________________________
activation_15 (Activation)   (None, 1024)              0         
_________________________________________________________________
dense_16 (Dense)             (None, 512)               524800    
_________________________________________________________________
activation_16 (Activation)   (None, 512)               0         
_________________________________________________________________
dense_17 (Dense)             (None, 256)               131328    
_________________________________________________________________
activation_17 (Activation)   (None, 256)              

In [21]:
# Train on the 0.4 s-window features for two epochs with a 20% validation
# hold-out. `epochs` is the Keras 2 name of the argument formerly spelled
# `nb_epoch`; the old spelling only works through a deprecation shim.
hist = model.fit(x_train_speech2, Y,
                 batch_size=100, epochs=2, verbose=1, shuffle=True,
                 validation_split=0.2)

  This is separate from the ipykernel package so we can avoid doing imports until


Train on 3948 samples, validate on 988 samples
Epoch 1/2
Epoch 2/2


In [22]:
def linear_model_combined(optimizer='Adadelta'):
    """Build and compile a two-input dense classifier.

    Each input of shape (100, 34) goes through its own Flatten -> Dense(1024)
    -> ReLU -> Dense(512) branch. The two branch outputs are summed
    element-wise and fed to a shared head: ReLU -> Dense(256) -> ReLU ->
    Dense(4) -> softmax.

    Args:
        optimizer: optimizer name or instance handed to ``compile``.

    Returns:
        The compiled functional ``Model`` taking ``[input_a, input_b]``
        (categorical cross-entropy loss, accuracy metric).
    """
    def _branch():
        # The last Dense is left un-activated; the shared head applies the
        # ReLU after the two branches have been summed.
        tower = Sequential()
        for layer in (Flatten(input_shape=(100, 34)),
                      Dense(1024),
                      Activation('relu'),
                      Dense(512)):
            tower.add(layer)
        return tower

    modela = _branch()
    modelb = _branch()
    fused = add([modela.output, modelb.output])

    head = Sequential()
    for layer in (Activation('relu'),
                  Dense(256),
                  Activation('relu'),
                  Dense(4),
                  Activation('softmax')):
        head.add(layer)

    model_combined = Model([modela.input, modelb.input], head(fused))
    model_combined.compile(loss='categorical_crossentropy',
                           optimizer=optimizer,
                           metrics=['accuracy'])
    return model_combined

In [23]:
# Instantiate the two-input dense classifier (rebinds the global `model`) and
# print its layer-by-layer summary.
model = linear_model_combined()
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
flatten_4_input (InputLayer)    (None, 100, 34)      0                                            
__________________________________________________________________________________________________
flatten_5_input (InputLayer)    (None, 100, 34)      0                                            
__________________________________________________________________________________________________
flatten_4 (Flatten)             (None, 3400)         0           flatten_4_input[0][0]            
__________________________________________________________________________________________________
flatten_5 (Flatten)             (None, 3400)         0           flatten_5_input[0][0]            
____________________________________________________________________________________________

In [24]:
# Train the two-input model on both feature sets for two epochs with a 20%
# validation hold-out. `epochs` is the Keras 2 name of the argument formerly
# spelled `nb_epoch`; the old spelling only works through a deprecation shim.
hist = model.fit([x_train_speech, x_train_speech2], Y,
                 batch_size=100, epochs=2, verbose=1, shuffle=True,
                 validation_split=0.2)

  This is separate from the ipykernel package so we can avoid doing imports until


Train on 3948 samples, validate on 988 samples
Epoch 1/2
Epoch 2/2


In [25]:
import tensorflow as tf
from keras import backend as K
from keras import regularizers, constraints, initializers, activations
from keras.layers.recurrent import Recurrent
from keras.engine import InputSpec
from keras.layers.core import Dense, Activation, Dropout, Flatten
#from keras.layers.wrappers import TimeDistributed

def tfPrint(d, T):
    """Debug helper: wrap tensor ``T`` in a ``tf.Print`` op labelled ``d``.

    The op is given ``T`` and ``tf.shape(T)`` as the data to print and ``T``
    as the tensor to pass through.
    """
    return tf.Print(input_=T, data=[T, tf.shape(T)], message=d)

def _time_distributed_dense(x, w, b=None, dropout=None,
                        input_dim=None, output_dim=None,
                        timesteps=None, training=None):
        """Apply `y . w + b` for every temporal slice y of x.
        # Arguments
            x: input tensor.
            w: weight matrix.
            b: optional bias vector.
            dropout: wether to apply dropout (same dropout mask
                for every temporal slice of the input).
            input_dim: integer; optional dimensionality of the input.
            output_dim: integer; optional dimensionality of the output.
            timesteps: integer; optional number of timesteps.
            training: training phase tensor or boolean.
        # Returns
            Output tensor.
        """
        if not input_dim:
            input_dim = K.shape(x)[2]
        if not timesteps:
            timesteps = K.shape(x)[1]
        if not output_dim:
            output_dim = K.shape(w)[1]

        if dropout is not None and 0. < dropout < 1.:
            # apply the same dropout pattern at every timestep
            ones = K.ones_like(K.reshape(x[:, 0, :], (-1, input_dim)))
            dropout_matrix = K.dropout(ones, dropout)
            expanded_dropout_matrix = K.repeat(dropout_matrix, timesteps)
            x = K.in_train_phase(x * expanded_dropout_matrix, x, training=training)

        # collapse time dimension and batch dimension together
        x = K.reshape(x, (-1, input_dim))
        x = K.dot(x, w)
        if b is not None:
            x = K.bias_add(x, b)
        # reshape to 3D tensor
        if K.backend() == 'tensorflow':
            x = K.reshape(x, K.stack([-1, timesteps, output_dim]))
            x.set_shape([None, None, output_dim])
        else:
            x = K.reshape(x, (-1, timesteps, output_dim))
        return x

class AttentionDecoder(Recurrent):
    """Recurrent decoder layer that attends over its whole input sequence.

    At every timestep the layer scores all positions of the input sequence
    against the previous hidden state, forms a context vector from those
    attention weights, updates a gated hidden state (reset gate, update gate
    and proposal) and emits a softmax output vector. See the reference cited
    in ``__init__``.
    """

    def __init__(self, units, output_dim,
                 activation='tanh',
                 return_probabilities=False,
                 name='AttentionDecoder',
                 kernel_initializer='glorot_uniform',
                 recurrent_initializer='orthogonal',
                 bias_initializer='zeros',
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 activity_regularizer=None,
                 kernel_constraint=None,
                 bias_constraint=None,
                 **kwargs):
        """
        Implements an AttentionDecoder that takes in a sequence encoded by an
        encoder and outputs the decoded states
        :param units: dimension of the hidden state and the attention matrices
        :param output_dim: the number of labels in the output space
        :param activation: activation identifier; stored on the layer, but
            the computations in ``step`` call tanh/sigmoid/softmax directly
        :param return_probabilities: if True, ``step`` emits the attention
            weights instead of the softmax output
        :param name: layer name, assigned after the base constructor runs
        :param kernel_initializer: initializer for V_a, W_a and U_a
        :param recurrent_initializer: initializer for the gate, output and
            initial-state matrices
        :param bias_initializer: initializer for all bias vectors
        :param kernel_regularizer: regularizer for the kernel weights; also
            reused as the recurrent regularizer
        :param bias_regularizer: regularizer for all bias vectors
        :param activity_regularizer: stored on the layer
        :param kernel_constraint: constraint for the kernel weights; also
            reused as the recurrent constraint
        :param bias_constraint: constraint for all bias vectors
        :param kwargs: forwarded to the ``Recurrent`` base constructor

        references:
            Bahdanau, Dzmitry, Kyunghyun Cho, and Yoshua Bengio.
            "Neural machine translation by jointly learning to align and translate."
            arXiv preprint arXiv:1409.0473 (2014).
        """
        self.units = units
        self.output_dim = output_dim
        self.return_probabilities = return_probabilities
        self.activation = activations.get(activation)
        self.kernel_initializer = initializers.get(kernel_initializer)
        self.recurrent_initializer = initializers.get(recurrent_initializer)
        self.bias_initializer = initializers.get(bias_initializer)

        self.kernel_regularizer = regularizers.get(kernel_regularizer)
        # There is no separate recurrent_regularizer argument; the kernel
        # regularizer is reused for the recurrent weights.
        self.recurrent_regularizer = regularizers.get(kernel_regularizer)
        self.bias_regularizer = regularizers.get(bias_regularizer)
        self.activity_regularizer = regularizers.get(activity_regularizer)

        self.kernel_constraint = constraints.get(kernel_constraint)
        # Likewise, the kernel constraint doubles as the recurrent constraint.
        self.recurrent_constraint = constraints.get(kernel_constraint)
        self.bias_constraint = constraints.get(bias_constraint)

        super(AttentionDecoder, self).__init__(**kwargs)
        # Set after the base constructor so the requested name is the one kept.
        self.name = name
        self.return_sequences = True  # must return sequences

    def build(self, input_shape):
        """
          Create the layer's weights for an input of shape
          (batch_size, timesteps, input_dim).

          See Appendix 2 of Bahdanau 2014, arXiv:1409.0473
          for model details that correspond to the matrices here.
        """

        self.batch_size, self.timesteps, self.input_dim = input_shape

        if self.stateful:
            super(AttentionDecoder, self).reset_states()

        self.states = [None, None]  # y, s

        """
            Matrices for creating the context vector
        """

        self.V_a = self.add_weight(shape=(self.units,),
                                   name='V_a',
                                   initializer=self.kernel_initializer,
                                   regularizer=self.kernel_regularizer,
                                   constraint=self.kernel_constraint)
        self.W_a = self.add_weight(shape=(self.units, self.units),
                                   name='W_a',
                                   initializer=self.kernel_initializer,
                                   regularizer=self.kernel_regularizer,
                                   constraint=self.kernel_constraint)
        self.U_a = self.add_weight(shape=(self.input_dim, self.units),
                                   name='U_a',
                                   initializer=self.kernel_initializer,
                                   regularizer=self.kernel_regularizer,
                                   constraint=self.kernel_constraint)
        self.b_a = self.add_weight(shape=(self.units,),
                                   name='b_a',
                                   initializer=self.bias_initializer,
                                   regularizer=self.bias_regularizer,
                                   constraint=self.bias_constraint)
        """
            Matrices for the r (reset) gate
        """
        self.C_r = self.add_weight(shape=(self.input_dim, self.units),
                                   name='C_r',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.U_r = self.add_weight(shape=(self.units, self.units),
                                   name='U_r',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.W_r = self.add_weight(shape=(self.output_dim, self.units),
                                   name='W_r',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.b_r = self.add_weight(shape=(self.units, ),
                                   name='b_r',
                                   initializer=self.bias_initializer,
                                   regularizer=self.bias_regularizer,
                                   constraint=self.bias_constraint)

        """
            Matrices for the z (update) gate
        """
        self.C_z = self.add_weight(shape=(self.input_dim, self.units),
                                   name='C_z',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.U_z = self.add_weight(shape=(self.units, self.units),
                                   name='U_z',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.W_z = self.add_weight(shape=(self.output_dim, self.units),
                                   name='W_z',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.b_z = self.add_weight(shape=(self.units, ),
                                   name='b_z',
                                   initializer=self.bias_initializer,
                                   regularizer=self.bias_regularizer,
                                   constraint=self.bias_constraint)
        """
            Matrices for the proposal
        """
        self.C_p = self.add_weight(shape=(self.input_dim, self.units),
                                   name='C_p',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.U_p = self.add_weight(shape=(self.units, self.units),
                                   name='U_p',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.W_p = self.add_weight(shape=(self.output_dim, self.units),
                                   name='W_p',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.b_p = self.add_weight(shape=(self.units, ),
                                   name='b_p',
                                   initializer=self.bias_initializer,
                                   regularizer=self.bias_regularizer,
                                   constraint=self.bias_constraint)
        """
            Matrices for making the final prediction vector
        """
        self.C_o = self.add_weight(shape=(self.input_dim, self.output_dim),
                                   name='C_o',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.U_o = self.add_weight(shape=(self.units, self.output_dim),
                                   name='U_o',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.W_o = self.add_weight(shape=(self.output_dim, self.output_dim),
                                   name='W_o',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.b_o = self.add_weight(shape=(self.output_dim, ),
                                   name='b_o',
                                   initializer=self.bias_initializer,
                                   regularizer=self.bias_regularizer,
                                   constraint=self.bias_constraint)

        # For creating the initial state:
        self.W_s = self.add_weight(shape=(self.input_dim, self.units),
                                   name='W_s',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)

        self.input_spec = [
            InputSpec(shape=(self.batch_size, self.timesteps, self.input_dim))]
        self.built = True

    def call(self, x):
        """Cache the input sequence and its attention projection, then
        delegate the recurrence to the ``Recurrent`` base class."""
        # store the whole sequence so we can "attend" to it at each timestep
        self.x_seq = x

        # apply a dense layer over the time dimension of the sequence
        # do it here because it doesn't depend on any previous steps
        # therefore we can save computation time:
        self._uxpb = _time_distributed_dense(self.x_seq, self.U_a, b=self.b_a,
                                             input_dim=self.input_dim,
                                             timesteps=self.timesteps,
                                             output_dim=self.units)

        return super(AttentionDecoder, self).call(x)

    def get_initial_state(self, inputs):
        """Return the initial states ``[y0, s0]``: a zero output vector per
        sample and a hidden state computed from the first input timestep."""
        # apply the matrix on the first time step to get the initial s0.
        s0 = activations.tanh(K.dot(inputs[:, 0], self.W_s))

        # from keras.layers.recurrent to initialize a vector of (batchsize,
        # output_dim)
        y0 = K.zeros_like(inputs)  # (samples, timesteps, input_dims)
        y0 = K.sum(y0, axis=(1, 2))  # (samples, )
        y0 = K.expand_dims(y0)  # (samples, 1)
        y0 = K.tile(y0, [1, self.output_dim])

        return [y0, s0]

    def step(self, x, states):
        """Run one decoding step.

        ``states`` holds the previous output and the previous hidden state.
        The per-step input ``x`` is not read here; attention is computed over
        the full sequence cached by ``call``. Returns the step output (the
        attention weights when ``return_probabilities`` is set, otherwise the
        softmax output) together with the new ``[output, hidden]`` states.
        """

        ytm, stm = states

        # repeat the hidden state to the length of the sequence
        _stm = K.repeat(stm, self.timesteps)

        # now multiply the weight matrix with the repeated hidden state
        _Wxstm = K.dot(_stm, self.W_a)

        # calculate the attention probabilities
        # this relates how much other timesteps contributed to this one.
        et = K.dot(activations.tanh(_Wxstm + self._uxpb),
                   K.expand_dims(self.V_a))
        # normalise exp(et) over the time axis (a softmax written by hand)
        at = K.exp(et)
        at_sum = K.sum(at, axis=1)
        at_sum_repeated = K.repeat(at_sum, self.timesteps)
        at /= at_sum_repeated  # vector of size (batchsize, timesteps, 1)

        # calculate the context vector
        context = K.squeeze(K.batch_dot(at, self.x_seq, axes=1), axis=1)
        # ~~~> calculate new hidden state
        # first calculate the "r" gate:

        rt = activations.sigmoid(
            K.dot(ytm, self.W_r)
            + K.dot(stm, self.U_r)
            + K.dot(context, self.C_r)
            + self.b_r)

        # now calculate the "z" gate
        zt = activations.sigmoid(
            K.dot(ytm, self.W_z)
            + K.dot(stm, self.U_z)
            + K.dot(context, self.C_z)
            + self.b_z)

        # calculate the proposal hidden state:
        s_tp = activations.tanh(
            K.dot(ytm, self.W_p)
            + K.dot((rt * stm), self.U_p)
            + K.dot(context, self.C_p)
            + self.b_p)

        # new hidden state:
        st = (1-zt)*stm + zt * s_tp

        # output distribution; note it is computed from the previous hidden
        # state stm, not from the freshly updated st
        yt = activations.softmax(
            K.dot(ytm, self.W_o)
            + K.dot(stm, self.U_o)
            + K.dot(context, self.C_o)
            + self.b_o)

        if self.return_probabilities:
            return at, [yt, st]
        else:
            return yt, [yt, st]

    def compute_output_shape(self, input_shape):
        """
            For Keras internal compatibility checking
        """
        if self.return_probabilities:
            return (None, self.timesteps, self.timesteps)
        else:
            return (None, self.timesteps, self.output_dim)

    def get_config(self):
        """
            For rebuilding models on load time.

            NOTE(review): only output_dim, units and return_probabilities are
            added to the base config; activation, initializers, regularizers
            and constraints are not serialized here.
        """
        config = {
            'output_dim': self.output_dim,
            'units': self.units,
            'return_probabilities': self.return_probabilities
        }
        base_config = super(AttentionDecoder, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [26]:
def attention_model(optimizer='Adadelta'):
    """Build and compile the LSTM + attention-decoder classifier.

    A 128-unit LSTM returning the full sequence feeds
    ``AttentionDecoder(128, 128)``. The decoder output is flattened and passed
    through Dense(512)/ReLU and a 4-way softmax. The first layer declares an
    input shape of (100, 34).

    Args:
        optimizer: optimizer name or instance handed to ``compile``.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        accuracy metric).
    """
    net = Sequential()
    stack = (
        LSTM(128, return_sequences=True, input_shape=(100, 34)),
        AttentionDecoder(128, 128),
        Flatten(),
        Dense(512),
        Activation('relu'),
        Dense(4),
        Activation('softmax'),
    )
    for layer in stack:
        net.add(layer)

    net.compile(loss='categorical_crossentropy',
                optimizer=optimizer,
                metrics=['accuracy'])
    return net

In [27]:
# Instantiate the LSTM + attention-decoder classifier (rebinds the global
# `model`) and print its layer-by-layer summary.
model = attention_model()
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 100, 128)          83456     
_________________________________________________________________
AttentionDecoder (AttentionD (None, 100, 128)          246528    
_________________________________________________________________
flatten_6 (Flatten)          (None, 12800)             0         
_________________________________________________________________
dense_25 (Dense)             (None, 512)               6554112   
_________________________________________________________________
activation_24 (Activation)   (None, 512)               0         
_________________________________________________________________
dense_26 (Dense)             (None, 4)                 2052      
_________________________________________________________________
activation_25 (Activation)   (None, 4)                

In [28]:
# Train for two epochs, holding out 20% of the samples for validation.
# `epochs` is the Keras 2 name of the argument formerly spelled `nb_epoch`;
# the old spelling only works through a deprecation shim that emits a warning.
hist = model.fit(x_train_speech, Y,
                 batch_size=100, epochs=2, verbose=1, shuffle=True,
                 validation_split=0.2)

  This is separate from the ipykernel package so we can avoid doing imports until


Train on 3948 samples, validate on 988 samples
Epoch 1/2
Epoch 2/2


In [29]:
def calculate_features_3(frames, freq, options):
    """Extract short-term features from a signal using a 0.08 s analysis window.

    The matrix returned by ``stFeatureExtraction`` is trimmed along axis 1
    (presumably the time-frame axis -- TODO confirm against ``features.py``):
    when it has more than two columns the first and last are dropped,
    otherwise only the first column is kept.

    Args:
        frames: Audio samples, passed unchanged to ``stFeatureExtraction``.
        freq: Sampling rate; determines the window length in samples.
        options: Unused. Kept so the signature stays compatible with callers.

    Returns:
        A new float ndarray with the same number of rows as the extracted
        feature matrix and ``max(columns - 2, 1)`` columns.

    Raises:
        IndexError: If ``stFeatureExtraction`` returns a matrix with no columns.
    """
    # NOTE(review): an earlier comment here said "double the window duration",
    # but 0.08 s is shorter than the 0.2 s window used by calculate_features
    # at the top of this notebook.
    window_sec = 0.08
    window_n = int(freq * window_sec)
    # Half-window step, kept integral: `window_n / 2` is a float on Python 3.
    step_n = window_n // 2

    st_f = stFeatureExtraction(frames, freq, window_n, step_n)

    if st_f.shape[1] > 2:
        # Drop the first and last columns in one slice.
        return np.array(st_f[:, 1:-1], dtype=float)

    # One or two columns: keep only the first. Fancy indexing with [0] keeps
    # the result 2-D and still fails loudly when there are no columns at all.
    return np.array(st_f[:, [0]], dtype=float)

In [30]:
from sklearn.preprocessing import normalize

# Build the feature tensor for the 0.08 s window variant: one padded,
# transposed feature matrix per utterance in data2 (padded to maxlen=200).
padded_features = []
counter = 0
for ses_mod in data2:
    raw_features = calculate_features_3(ses_mod['signal'], framerate, None)
    padded, _ = pad_sequence_into_array(raw_features, maxlen=200)
    padded_features.append(padded.T)

    # Progress report every 100 utterances.
    counter += 1
    if not counter % 100:
        print(counter)

x_train_speech3 = np.array(padded_features)
x_train_speech3.shape

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900


(4936, 200, 34)

In [31]:
def attention_model2(optimizer='Adadelta'):
    """Build and compile the LSTM + attention classifier for 200-step inputs.

    Same layer stack as the 100-step variant, but the first layer expects
    sequences of shape (200, 34).

    Args:
        optimizer: Keras optimizer (name or instance) handed to ``compile``.

    Returns:
        The compiled ``Sequential`` model with a 4-way softmax output.
    """
    net = Sequential()

    # Sequence encoder followed by the attention layer.
    for layer in (
        LSTM(128, return_sequences=True, input_shape=(200, 34)),
        AttentionDecoder(128, 128),
    ):
        net.add(layer)

    # Classification head on the flattened attention output.
    for layer in (
        Flatten(),
        Dense(512),
        Activation('relu'),
        Dense(4),
        Activation('softmax'),
    ):
        net.add(layer)

    net.compile(
        optimizer=optimizer,
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return net

In [32]:
# Instantiate the 200-step attention model (default optimizer) and print its layer table.
model = attention_model2()
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, 200, 128)          83456     
_________________________________________________________________
AttentionDecoder (AttentionD (None, 200, 128)          246528    
_________________________________________________________________
flatten_7 (Flatten)          (None, 25600)             0         
_________________________________________________________________
dense_27 (Dense)             (None, 512)               13107712  
_________________________________________________________________
activation_26 (Activation)   (None, 512)               0         
_________________________________________________________________
dense_28 (Dense)             (None, 4)                 2052      
_________________________________________________________________
activation_27 (Activation)   (None, 4)                

In [33]:
# Train for two epochs on the 200-step features, holding out 20% of the
# samples for validation (3948 train / 988 validation in the recorded run).
# `epochs` is the Keras 2 name of the argument formerly called `nb_epoch`.
hist = model.fit(x_train_speech3, Y,
                 batch_size=100, epochs=2, verbose=1, shuffle=True,
                 validation_split=0.2)

  This is separate from the ipykernel package so we can avoid doing imports until


Train on 3948 samples, validate on 988 samples
Epoch 1/2
Epoch 2/2


In [34]:
# Count the utterances that come before the first one whose id starts with
# "Ses05"; the count is the index at which Session 5 begins in data2.
counter = 0
for ses_mod in data2:
    if ses_mod['id'][:5] != "Ses05":
        counter += 1
        continue
    break
counter

3838

In [35]:
# Leave-Session-5-out split: everything before the first "Ses05" utterance is
# used for training and the rest is held out. The boundary is derived from the
# utterance ids instead of being hard-coded (it was 3838 in the recorded run).
# Assumes x_train_speech and Y follow the order of data2 -- TODO confirm for Y.
session5_start = next(
    (idx for idx, utterance in enumerate(data2) if utterance['id'][:5] == "Ses05"),
    len(x_train_speech),  # no Session 5 utterance: keep everything for training
)

xtrain_sp = x_train_speech[:session5_start]
xtest_sp = x_train_speech[session5_start:]
ytrain_sp = Y[:session5_start]
ytest_sp = Y[session5_start:]

In [36]:
def attention_model(optimizer='Adadelta'):
    """Build and compile the LSTM + attention classifier for (100, 34) inputs.

    Layer order: LSTM(128, full sequence) -> AttentionDecoder(128, 128) ->
    Flatten -> Dense(512) + relu -> Dense(4) + softmax.

    Args:
        optimizer: Keras optimizer (name or instance) handed to ``compile``.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        accuracy metric).
    """
    net = Sequential()
    for layer in (
        LSTM(128, return_sequences=True, input_shape=(100, 34)),
        AttentionDecoder(128, 128),
        Flatten(),
        Dense(512),
        Activation('relu'),
        Dense(4),
        Activation('softmax'),
    ):
        net.add(layer)

    net.compile(
        optimizer=optimizer,
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return net


# Fresh model instance for the Session 5 hold-out experiment.
model = attention_model()
model.summary()

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (None, 100, 128)          83456     
_________________________________________________________________
AttentionDecoder (AttentionD (None, 100, 128)          246528    
_________________________________________________________________
flatten_8 (Flatten)          (None, 12800)             0         
_________________________________________________________________
dense_29 (Dense)             (None, 512)               6554112   
_________________________________________________________________
activation_28 (Activation)   (None, 512)               0         
_________________________________________________________________
dense_30 (Dense)             (None, 4)                 2052      
_________________________________________________________________
activation_29 (Activation)   (None, 4)               

In [37]:
# Train for two epochs on the xtrain_sp/ytrain_sp portion and validate on the
# held-out xtest_sp/ytest_sp portion (3838 / 1098 samples in the recorded run).
# `epochs` is the Keras 2 name of the argument formerly called `nb_epoch`.
hist = model.fit(xtrain_sp, ytrain_sp,
                 batch_size=100, epochs=2, verbose=1, shuffle=True,
                 validation_data=(xtest_sp, ytest_sp))

  This is separate from the ipykernel package so we can avoid doing imports until


Train on 3838 samples, validate on 1098 samples
Epoch 1/2
Epoch 2/2


In [38]:
def attention_model(optimizer='Adadelta'):
    """Build and compile the bidirectional LSTM + attention classifier.

    Both the 128-unit LSTM and the ``AttentionDecoder(128, 128)`` layer are
    wrapped in ``Bidirectional``; the input shape (100, 34) is declared on the
    first wrapper. The head is Flatten -> Dense(512) + relu -> Dense(4) + softmax.

    Args:
        optimizer: Keras optimizer (name or instance) handed to ``compile``.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        accuracy metric).
    """
    layer_stack = [
        Bidirectional(LSTM(128, return_sequences=True), input_shape=(100, 34)),
        Bidirectional(AttentionDecoder(128, 128)),
        Flatten(),
        Dense(512),
        Activation('relu'),
        Dense(4),
        Activation('softmax'),
    ]
    net = Sequential(layer_stack)

    net.compile(
        optimizer=optimizer,
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return net

# Several models were already built in this kernel before this cell, and the
# recorded training run of this model ended in a GPU ResourceExhaustedError.
# Clearing the Keras session drops the graph state left behind by those
# earlier models before the new one is created.
# NOTE(review): this invalidates previously built models; only `model`, which
# is rebound below, appears to be used afterwards -- confirm.
from keras import backend as keras_backend
keras_backend.clear_session()

model = attention_model()
model.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 100, 256)          166912    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 100, 256)          689664    
_________________________________________________________________
flatten_9 (Flatten)          (None, 25600)             0         
_________________________________________________________________
dense_31 (Dense)             (None, 512)               13107712  
_________________________________________________________________
activation_30 (Activation)   (None, 512)               0         
_________________________________________________________________
dense_32 (Dense)             (None, 4)                 2052      
_________________________________________________________________
activation_31 (Activation)   (None, 4)               

In [40]:
# Train the bidirectional model for two epochs on xtrain_sp/ytrain_sp and
# validate on xtest_sp/ytest_sp.
# `epochs` is the Keras 2 name of the argument formerly called `nb_epoch`.
# NOTE(review): the recorded run of this cell failed with a GPU out-of-memory
# error; a smaller batch_size may be needed on the same hardware.
hist = model.fit(xtrain_sp, ytrain_sp,
                 batch_size=100, epochs=2, verbose=1, shuffle=True,
                 validation_data=(xtest_sp, ytest_sp))

  This is separate from the ipykernel package so we can avoid doing imports until


Train on 3838 samples, validate on 1098 samples
Epoch 1/2


ResourceExhaustedError: 2 root error(s) found.
  (0) Resource exhausted: OOM when allocating tensor with shape[25600,512] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node training_8/Adadelta/mul_292}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[metrics_8/accuracy/Identity/_2073]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

  (1) Resource exhausted: OOM when allocating tensor with shape[25600,512] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node training_8/Adadelta/mul_292}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

0 successful operations.
0 derived errors ignored.