## Load labels

In [1]:
from pathlib import Path

data_path = Path("../data")
data_path

WindowsPath('../data')

### List all WAV files

In [2]:
wav_files = data_path.glob("*.wav")
wav_files = list(wav_files)
wav_files[:5]

[WindowsPath('../data/03-01-01-01-01-01-01.wav'),
 WindowsPath('../data/03-01-01-01-01-01-02.wav'),
 WindowsPath('../data/03-01-01-01-01-01-03.wav'),
 WindowsPath('../data/03-01-01-01-01-01-04.wav'),
 WindowsPath('../data/03-01-01-01-01-01-05.wav')]

### Map file names to their classes.

Each emotion is labelled as 01 - 08, so we convert that to labels 0 - 7

In [3]:
def class_from_file_name(fname):
    return int(fname.split('-')[2]) - 1

labels = {
    f.name: class_from_file_name(f.name)
    for f in wav_files
}
[(k, v) for k, v in labels.items()][:5]

[('03-01-01-01-01-01-01.wav', 0),
 ('03-01-01-01-01-01-02.wav', 0),
 ('03-01-01-01-01-01-03.wav', 0),
 ('03-01-01-01-01-01-04.wav', 0),
 ('03-01-01-01-01-01-05.wav', 0)]

### Extract number of classes

In [4]:
NUM_CLASSES = len(set(labels.values()))
NUM_CLASSES

8

## Load Data

### Fix PYTHONPATH

Add the path to the vgg-related files to the pythonpath so that we can import the modules

In [5]:
import os
import sys
nb_dir = Path(os.getcwd())
vgg_dir = nb_dir.parent / 'vgg'
vgg_dir

WindowsPath('D:/Work/playground/vgg-emotion-classifier/vgg')

In [6]:
if str(vgg_dir) not in sys.path:
    sys.path.append(str(vgg_dir))
sys.path

['',
 'C:\\Users\\Sam\\Anaconda3\\envs\\vggec\\python36.zip',
 'C:\\Users\\Sam\\Anaconda3\\envs\\vggec\\DLLs',
 'C:\\Users\\Sam\\Anaconda3\\envs\\vggec\\lib',
 'C:\\Users\\Sam\\Anaconda3\\envs\\vggec',
 'C:\\Users\\Sam\\Anaconda3\\envs\\vggec\\lib\\site-packages',
 'C:\\Users\\Sam\\Anaconda3\\envs\\vggec\\lib\\site-packages\\IPython\\extensions',
 'C:\\Users\\Sam\\.ipython',
 'D:\\Work\\playground\\vgg-emotion-classifier\\vgg']

### Read WAV files

Read in the wav files and convert them into the correct shape for the VGGish model (this is thankfully taken care of already by the example code provided)

In [7]:
from vggish_input import wavfile_to_examples 

In [8]:
data = {
    f.name: wavfile_to_examples(str(f))
    for f in wav_files
}

In [9]:
data['03-01-01-01-01-01-01.wav'].shape

(3, 96, 64)

### Convert labels to one-hot vectors

For multi-class classification using categorical crossentropy we want the labels in one-hot encoded form.

E.g. label `1` becomes `[0, 1, 0, 0 ,0 ,0, 0, 0]`

In [71]:
import numpy as np

y = np.array(list(labels.values()))
y_one_hot = np.zeros(((y.size, y.max()+1)))
y_one_hot[np.arange(y.size), y] = 1

### Split dataset

Split into

* train: 70%
* val: 15%
* test: 15%

In [10]:
from sklearn.model_selection import train_test_split

In [72]:
seed = 987234871

x_train_keys, x_test_keys, y_train, y_test = train_test_split(list(labels.keys()),
                                                    y_one_hot,
                                                    test_size = 0.15,
                                                    random_state = seed)

x_train_keys, x_val_keys, y_train, y_val = train_test_split(x_train_keys,
                                                            y_train,
                                                            test_size = 0.15 / (1 - 0.15),
                                                            random_state = seed)

x_train = [data[key] for key in x_train_keys]
x_val = [data[key] for key in x_val_keys]
x_test = [data[key] for key in x_test_keys]

print(f"Training size: {len(x_train_keys)}")
print(f"Validation size: {len(x_val_keys)}")
print(f"Training size: {len(x_test_keys)}")

Training size: 1008
Validation size: 216
Training size: 216


In [80]:
y_train[:5]

array([[0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1.]])

### Compute balanced weights

The number of instances for each class isn't balanced, so we need to create weightings for each class to even things out during training

In [58]:
from sklearn.utils import class_weight
import numpy as np

class_weights = class_weight.compute_class_weight('balanced',
                                                         np.unique(list(labels.values())),
                                                         list(labels.values()))
# class_weights = {c: w for c, w in enumerate(scikit_class_weights)}
# class_weights
# scikit_class_weights

## Convert pretrained TF weights to Keras model checkpoint

Don't want to deal with the TF library, Keras is much easier to use imo. That means we first need to turn the TF checkpoint into a Keras checkpoint.

It should theoretically be possible to load the TF checkpoint as is, but this way I know it'll work.

### Define exact VGGish model in Keras

Code taken from https://github.com/SNeugber/vggish2Keras, which is a clone of https://github.com/antoinemrcr/vggish2Keras

In [13]:
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, Flatten
from keras.models import Model
import vggish_params

def get_vggish_keras():
    input_shape = (vggish_params.NUM_FRAMES,vggish_params.NUM_BANDS,1)

    img_input = Input( shape=input_shape)
    # Block 1
    x = Conv2D(64, (3, 3), activation='relu', padding='same', name='conv1')(img_input)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='pool1')(x)

    # Block 2
    x = Conv2D(128, (3, 3), activation='relu', padding='same', name='conv2')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='pool2')(x)

    # Block 3
    x = Conv2D(256, (3, 3), activation='relu', padding='same', name='conv3_1')(x)
    x = Conv2D(256, (3, 3), activation='relu', padding='same', name='conv3_2')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='pool3')(x)

    # Block 4
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='conv4_1')(x)
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='conv4_2')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='pool4')(x)

    # Block fc
    x = Flatten(name='flatten')(x)
    x = Dense(4096, activation='relu', name='fc1_1')(x)
    x = Dense(4096, activation='relu', name='fc1_2')(x)
    x = Dense(vggish_params.EMBEDDING_SIZE, activation='relu', name='fc2')(x)


    model = Model(img_input, x, name='vggish')
    return model

Using TensorFlow backend.


### Load weights

Need to turn TF layer names into Keras layer names

In [16]:
import tensorflow as tf
import vggish_slim

with tf.Graph().as_default(), tf.Session() as sess:
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, '../pretrained_models/vggish_model.ckpt')

    weights = {}
    operations = sess.graph.get_operations()
    for op in operations:
        name = op.name
        if 'read' in name:
            name2 = name.replace('vggish/','').replace('/read','').replace('conv3/','').replace('conv4/','').replace('/fc1','')
            name2_layer, name2_type = name2.split('/')
            if name2_type == 'weights':
                weights[name2_layer] = []
                weights[name2_layer].append(sess.run(op.values())[0])

    for op in operations:
        name = op.name
        if 'read' in name:
            name2 = name.replace('vggish/','').replace('/read','').replace('conv3/','').replace('conv4/','').replace('/fc1','')
            name2_layer, name2_type = name2.split('/')
            if name2_type == 'biases':
                weights[name2_layer].append(sess.run(op.values())[0])

INFO:tensorflow:Restoring parameters from ../pretrained_models/vggish_model.ckpt


In [17]:
weights.keys()

dict_keys(['conv1', 'conv2', 'conv3_1', 'conv3_2', 'conv4_1', 'conv4_2', 'fc1_1', 'fc1_2', 'fc2'])

### Save as Keras model

In [18]:
model = get_vggish_keras()
model.summary()
for layer in model.layers:
    if layer.name in list(weights.keys()):
        layer.set_weights(weights[layer.name])
model.save_weights('../pretrained_models/vgg_model.h5')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 96, 64, 1)         0         
_________________________________________________________________
conv1 (Conv2D)               (None, 96, 64, 64)        640       
_________________________________________________________________
pool1 (MaxPooling2D)         (None, 48, 32, 64)        0         
_________________________________________________________________
conv2 (Conv2D)               (None, 48, 32, 128)       73856     
_________________________________________________________________
pool2 (MaxPooling2D)         (None, 24, 16, 128)       0         
_________________________________________________________________
conv3_1 (Conv2D)             (None, 24, 16, 256)       295168    
_________________________________________________________________
conv3_2 (Conv2D)             (None, 24, 16, 256)       590080    
__________

## Load VGGish model wrapped in TimeDistributed layer

The normal model only works on a single time-frame. We want to train on multiple frames, so we need to wrap everything in Keras' [TimeDistributed](https://keras.io/layers/wrappers/#TimeDistributed)

In [19]:
from keras.layers import TimeDistributed

def get_vggish_keras_timedistributed():
    input_shape = (None, vggish_params.NUM_FRAMES,vggish_params.NUM_BANDS,1)

    img_input = Input(shape=input_shape)
    # Block 1
    x = TimeDistributed(Conv2D(64, (3, 3), activation='relu', padding='same'), name='conv1')(img_input)
    x = TimeDistributed(MaxPooling2D((2, 2), strides=(2, 2)), name='pool1')(x)

    # Block 2
    x = TimeDistributed(Conv2D(128, (3, 3), activation='relu', padding='same', name='conv2'))(x)
    x = TimeDistributed(MaxPooling2D((2, 2), strides=(2, 2)), name='pool2')(x)

    # Block 3
    x = TimeDistributed(Conv2D(256, (3, 3), activation='relu', padding='same', name='conv3_1'))(x)
    x = TimeDistributed(Conv2D(256, (3, 3), activation='relu', padding='same', name='conv3_2'))(x)
    x = TimeDistributed(MaxPooling2D((2, 2), strides=(2, 2)), name='pool3')(x)

    # Block 4
    x = TimeDistributed(Conv2D(512, (3, 3), activation='relu', padding='same', name='conv4_1'))(x)
    x = TimeDistributed(Conv2D(512, (3, 3), activation='relu', padding='same', name='conv4_2'))(x)
    x = TimeDistributed(MaxPooling2D((2, 2), strides=(2, 2)), name='pool4')(x)

    # Block fc
    x = TimeDistributed(Flatten(), name='flatten')(x)
    x = TimeDistributed(Dense(4096, activation='relu'), name='fc1_1')(x)
    x = TimeDistributed(Dense(4096, activation='relu'), name='fc1_2')(x)
    x = TimeDistributed(Dense(vggish_params.EMBEDDING_SIZE, activation='relu'), name='fc2')(x)

    model = Model(img_input, x, name='vggish_across_time')
    return model

In [20]:
model = get_vggish_keras_timedistributed()

In [21]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, None, 96, 64, 1)   0         
_________________________________________________________________
conv1 (TimeDistributed)      (None, None, 96, 64, 64)  640       
_________________________________________________________________
pool1 (TimeDistributed)      (None, None, 48, 32, 64)  0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, None, 48, 32, 128) 73856     
_________________________________________________________________
pool2 (TimeDistributed)      (None, None, 24, 16, 128) 0         
_________________________________________________________________
time_distributed_2 (TimeDist (None, None, 24, 16, 256) 295168    
_________________________________________________________________
time_distributed_3 (TimeDist (None, None, 24, 16, 256) 590080    
__________

In [22]:
model.load_weights('../pretrained_models/vgg_model.h5')

In [27]:
from keras.layers import CuDNNLSTM
from tensorflow import Tensor

x = CuDNNLSTM(127)(model.layers[-1].output)
x = Dense(8, activation='softmax', name='out')(x)

final_model = Model(inputs=model.input, outputs=[x])
final_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, None, 96, 64, 1)   0         
_________________________________________________________________
conv1 (TimeDistributed)      (None, None, 96, 64, 64)  640       
_________________________________________________________________
pool1 (TimeDistributed)      (None, None, 48, 32, 64)  0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, None, 48, 32, 128) 73856     
_________________________________________________________________
pool2 (TimeDistributed)      (None, None, 24, 16, 128) 0         
_________________________________________________________________
time_distributed_2 (TimeDist (None, None, 24, 16, 256) 295168    
_________________________________________________________________
time_distributed_3 (TimeDist (None, None, 24, 16, 256) 590080    
__________

## Train Model

Putting it all together

### Compile model

First we need to compile the model, for which we'll use the same parameters (for now) as used originally:
* Adam optimizer
* LR of 1e-4
* Adam Epsilon of 1e-8

In [39]:
from keras.optimizers import Adam
from keras.metrics import categorical_accuracy

model = final_model
optimizer = Adam(lr=vggish_params.LEARNING_RATE, epsilon=vggish_params.ADAM_EPSILON)
metrics = [categorical_accuracy]
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=metrics)

Make sure we're running on the GPU:

In [83]:
from keras import backend as K

config = tf.ConfigProto( device_count = {'GPU': 1 , 'CPU': 8} ) 
sess = tf.Session(config=config) 
K.set_session(sess)

Since we're dealing with var-length data, we can't just call `fit`, but instead need to train on each batch manually.

In [84]:
import matplotlib.pylab as plt
import pandas as pd
import numpy as np
import time
from IPython import display
%matplotlib inline

i = pd.date_range('2013-1-1',periods=100,freq='s')

losses = []
for x, y in zip(x_train, y_train):
    x_batch = np.reshape(x, (1, *x.shape, 1))
    y_batch = np.reshape(y, (1, *y.shape))
    loss, acc = model.train_on_batch(x_batch, y_batch) #, class_weight=class_weights)
    losses.append(loss)
    plt.plot(pd.Series(data=np.random.randn(100), index=i))
    display.display(plt.gcf())
    display.clear_output(wait=True)
    print(acc)

InvalidArgumentError: No OpKernel was registered to support Op 'CudnnRNN' with these attrs.  Registered devices: [CPU], Registered kernels:
  device='GPU'; T in [DT_HALF]
  device='GPU'; T in [DT_FLOAT]
  device='GPU'; T in [DT_DOUBLE]

	 [[node cu_dnnlstm_1/CudnnRNN (defined at C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\tensorflow\contrib\cudnn_rnn\python\ops\cudnn_rnn_ops.py:922)  = CudnnRNN[T=DT_FLOAT, direction="unidirectional", dropout=0, input_mode="linear_input", is_training=true, rnn_mode="lstm", seed=87654321, seed2=0](cu_dnnlstm_1/transpose, cu_dnnlstm_1/ExpandDims_1, cu_dnnlstm_1/ExpandDims_2, cu_dnnlstm_1/concat_1)]]

Caused by op 'cu_dnnlstm_1/CudnnRNN', defined at:
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\ipykernel\kernelapp.py", line 505, in start
    self.io_loop.start()
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\tornado\platform\asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\asyncio\base_events.py", line 427, in run_forever
    self._run_once()
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\asyncio\base_events.py", line 1440, in _run_once
    handle._run()
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\asyncio\events.py", line 145, in _run
    self._callback(*self._args)
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\tornado\ioloop.py", line 758, in _run_callback
    ret = callback()
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\tornado\stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\tornado\gen.py", line 1233, in inner
    self.run()
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\tornado\gen.py", line 1147, in run
    yielded = self.gen.send(value)
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\ipykernel\kernelbase.py", line 357, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\tornado\gen.py", line 326, in wrapper
    yielded = next(result)
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\ipykernel\kernelbase.py", line 267, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\tornado\gen.py", line 326, in wrapper
    yielded = next(result)
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\ipykernel\kernelbase.py", line 534, in execute_request
    user_expressions, allow_stdin,
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\tornado\gen.py", line 326, in wrapper
    yielded = next(result)
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\ipykernel\ipkernel.py", line 294, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\ipykernel\zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\IPython\core\interactiveshell.py", line 2819, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\IPython\core\interactiveshell.py", line 2845, in _run_cell
    return runner(coro)
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\IPython\core\async_helpers.py", line 67, in _pseudo_sync_runner
    coro.send(None)
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\IPython\core\interactiveshell.py", line 3020, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\IPython\core\interactiveshell.py", line 3185, in run_ast_nodes
    if (yield from self.run_code(code, result)):
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\IPython\core\interactiveshell.py", line 3267, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-7448c46f631a>", line 3, in <module>
    x = CuDNNLSTM(127)(model.layers[-1].output)
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\keras\layers\recurrent.py", line 532, in __call__
    return super(RNN, self).__call__(inputs, **kwargs)
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\keras\engine\base_layer.py", line 457, in __call__
    output = self.call(inputs, **kwargs)
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\keras\layers\cudnn_recurrent.py", line 90, in call
    output, states = self._process_batch(inputs, initial_state)
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\keras\layers\cudnn_recurrent.py", line 517, in _process_batch
    is_training=True)
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\tensorflow\contrib\cudnn_rnn\python\ops\cudnn_rnn_ops.py", line 1544, in __call__
    input_data, input_h, input_c, params, is_training=is_training)
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\tensorflow\contrib\cudnn_rnn\python\ops\cudnn_rnn_ops.py", line 1435, in __call__
    seed=self._seed)
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\tensorflow\contrib\cudnn_rnn\python\ops\cudnn_rnn_ops.py", line 922, in _cudnn_rnn
    outputs, output_h, output_c, _ = gen_cudnn_rnn_ops.cudnn_rnn(**args)
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\tensorflow\python\ops\gen_cudnn_rnn_ops.py", line 116, in cudnn_rnn
    is_training=is_training, name=name)
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\tensorflow\python\util\deprecation.py", line 488, in new_func
    return func(*args, **kwargs)
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\tensorflow\python\framework\ops.py", line 3274, in create_op
    op_def=op_def)
  File "C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\tensorflow\python\framework\ops.py", line 1770, in __init__
    self._traceback = tf_stack.extract_stack()

InvalidArgumentError (see above for traceback): No OpKernel was registered to support Op 'CudnnRNN' with these attrs.  Registered devices: [CPU], Registered kernels:
  device='GPU'; T in [DT_HALF]
  device='GPU'; T in [DT_FLOAT]
  device='GPU'; T in [DT_DOUBLE]

	 [[node cu_dnnlstm_1/CudnnRNN (defined at C:\Users\Sam\Anaconda3\envs\vggec\lib\site-packages\tensorflow\contrib\cudnn_rnn\python\ops\cudnn_rnn_ops.py:922)  = CudnnRNN[T=DT_FLOAT, direction="unidirectional", dropout=0, input_mode="linear_input", is_training=true, rnn_mode="lstm", seed=87654321, seed2=0](cu_dnnlstm_1/transpose, cu_dnnlstm_1/ExpandDims_1, cu_dnnlstm_1/ExpandDims_2, cu_dnnlstm_1/concat_1)]]


In [85]:
tf.test.is_gpu_available()

False