## Load labels

In [1]:
from pathlib import Path

data_path = Path("../data")
data_path

WindowsPath('../data')

### List all WAV files

In [2]:
wav_files = data_path.glob("*.wav")
wav_files = list(wav_files)
wav_files[:5]

[WindowsPath('../data/03-01-01-01-01-01-01.wav'),
 WindowsPath('../data/03-01-01-01-01-01-02.wav'),
 WindowsPath('../data/03-01-01-01-01-01-03.wav'),
 WindowsPath('../data/03-01-01-01-01-01-04.wav'),
 WindowsPath('../data/03-01-01-01-01-01-05.wav')]

### Map file names to their classes.

Each emotion is labelled as 01 - 08, so we convert that to labels 0 - 7

In [3]:
def class_from_file_name(fname):
    return int(fname.split('-')[2]) - 1

def speaker_from_file_name(fname):
    return int(fname.split('-')[6].split('.')[0]) - 1

labels = {
    f.name: {
        "class": class_from_file_name(f.name),
        "speaker": speaker_from_file_name(f.name)
    }
    for f in wav_files
}
[(k, v) for k, v in labels.items()][:5]

[('03-01-01-01-01-01-01.wav', {'class': 0, 'speaker': 0}),
 ('03-01-01-01-01-01-02.wav', {'class': 0, 'speaker': 1}),
 ('03-01-01-01-01-01-03.wav', {'class': 0, 'speaker': 2}),
 ('03-01-01-01-01-01-04.wav', {'class': 0, 'speaker': 3}),
 ('03-01-01-01-01-01-05.wav', {'class': 0, 'speaker': 4})]

### Extract number of classes

In [4]:
CLASSES = list(set([v["class"] for v in labels.values()]))
NUM_CLASSES = len(CLASSES)
NUM_CLASSES

8

## Load Data

### Fix PYTHONPATH

Add the path to the vgg-related files to the pythonpath so that we can import the modules

In [5]:
import os
import sys
nb_dir = Path(os.getcwd())
vgg_dir = nb_dir.parent / 'vgg'
vgg_dir

WindowsPath('D:/Work/playground/vgg-emotion-classifier/vgg')

In [6]:
if str(vgg_dir) not in sys.path:
    sys.path.append(str(vgg_dir))
sys.path

['',
 'C:\\Users\\Sam\\Anaconda3\\envs\\vggec\\python36.zip',
 'C:\\Users\\Sam\\Anaconda3\\envs\\vggec\\DLLs',
 'C:\\Users\\Sam\\Anaconda3\\envs\\vggec\\lib',
 'C:\\Users\\Sam\\Anaconda3\\envs\\vggec',
 'C:\\Users\\Sam\\Anaconda3\\envs\\vggec\\lib\\site-packages',
 'C:\\Users\\Sam\\Anaconda3\\envs\\vggec\\lib\\site-packages\\IPython\\extensions',
 'C:\\Users\\Sam\\.ipython',
 'D:\\Work\\playground\\vgg-emotion-classifier\\vgg']

### Read WAV files

Read in the wav files and convert them into the correct shape for the VGGish model (this is thankfully taken care of already by the example code provided)

In [7]:
from vggish_input import wavfile_to_examples 

In [8]:
data = {
    f.name: wavfile_to_examples(str(f))
    for f in wav_files
}

In [9]:
data['03-01-01-01-01-01-01.wav'].shape

(3, 96, 64)

### Split dataset

Split into

* train: 70%
* val: 15%
* test: 15%

Need to take into account that there are multiple files per speaker saying the same thing with slightly different intonation, so we should probably split by speaker.

In [10]:
from random import shuffle
import numpy as np

speakers = list(set([v["speaker"] for k, v in labels.items()]))

seed = 987234871
shuffle(speakers)

train_speakers_index = int(0.8 * len(speakers))
val_speakers_index = int(0.9 * len(speakers))
train_speakers = speakers[:train_speakers_index]
val_speakers = speakers[train_speakers_index:val_speakers_index]
test_speakers = speakers[val_speakers_index:]

x_train, y_train = zip(*[(data[key], value["class"]) for key, value in labels.items() if value["speaker"] in train_speakers])
x_val, y_val = zip(*[(data[key], value["class"]) for key, value in labels.items() if value["speaker"] in val_speakers])
x_test, y_test = zip(*[(data[key], value["class"]) for key, value in labels.items() if value["speaker"] in test_speakers])

x_train = np.array(x_train)
x_val = np.array(x_val)
x_test = np.array(x_test)

y_train = np.array(y_train)
y_val = np.array(y_val)
y_test = np.array(y_test)

print(f"Training size: {len(x_train)}")
print(f"Validation size: {len(x_val)}")
print(f"Test size: {len(x_test)}")


Training size: 1140
Validation size: 120
Test size: 180


### Convert labels to one-hot vectors

For multi-class classification using categorical crossentropy we want the labels in one-hot encoded form.

E.g. label `1` becomes `[0, 1, 0, 0 ,0 ,0, 0, 0]`

In [11]:
import numpy as np

def to_one_hot(y, num_classes):
    y_one_hot = np.zeros(((y.size, num_classes)))
    y_one_hot[np.arange(y.size), y] = 1
    return y_one_hot

y_train = to_one_hot(y_train, NUM_CLASSES)
y_val = to_one_hot(y_val, NUM_CLASSES)
y_test = to_one_hot(y_test, NUM_CLASSES)

In [12]:
y_train[:5]

array([[1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0.]])

### Compute balanced weights

The number of instances for each class isn't balanced, so we need to create weightings for each class to even things out during training

In [13]:
from sklearn.utils import class_weight

class_weights = class_weight.compute_class_weight('balanced',
                                                  CLASSES,
                                                  list([v["class"] for v in labels.values()]))
# class_weights = {c: w for c, w in enumerate(scikit_class_weights)}
# class_weights
# scikit_class_weights

Create function to convert class weights to sample weights

In [14]:
def sample_weights_from_class_weights(class_weights, labels):
    return np.array([class_weights[np.argmax(label)] for label in labels])

### Create function to get batches from data

In [15]:
from random import shuffle
def get_shuffled_batches(x, y):
    assert len(x) == len(y)
    indexes = list(range(len(x)))
    shuffle(indexes)
    for x, y in zip(x[indexes], y[indexes]):
        x_batch = np.reshape(x, (1, *x.shape, 1))
        y_batch = np.reshape(y, (1, *y.shape))
        yield x_batch, y_batch

## Convert pretrained TF weights to Keras model checkpoint

Don't want to deal with the TF library, Keras is much easier to use imo. That means we first need to turn the TF checkpoint into a Keras checkpoint.

It should theoretically be possible to load the TF checkpoint as is, but this way I know it'll work.

### Define exact VGGish model in Keras

Code taken from https://github.com/SNeugber/vggish2Keras, which is a clone of https://github.com/antoinemrcr/vggish2Keras

In [16]:
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, Flatten
from keras.models import Model
import vggish_params

def get_vggish_keras():
    input_shape = (vggish_params.NUM_FRAMES,vggish_params.NUM_BANDS,1)

    img_input = Input( shape=input_shape)
    # Block 1
    x = Conv2D(64, (3, 3), activation='relu', padding='same', name='conv1')(img_input)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='pool1')(x)

    # Block 2
    x = Conv2D(128, (3, 3), activation='relu', padding='same', name='conv2')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='pool2')(x)

    # Block 3
    x = Conv2D(256, (3, 3), activation='relu', padding='same', name='conv3_1')(x)
    x = Conv2D(256, (3, 3), activation='relu', padding='same', name='conv3_2')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='pool3')(x)

    # Block 4
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='conv4_1')(x)
    x = Conv2D(512, (3, 3), activation='relu', padding='same', name='conv4_2')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='pool4')(x)

    # Block fc
    x = Flatten(name='flatten')(x)
    x = Dense(4096, activation='relu', name='fc1_1')(x)
    x = Dense(4096, activation='relu', name='fc1_2')(x)
    x = Dense(vggish_params.EMBEDDING_SIZE, activation='relu', name='fc2')(x)


    model = Model(img_input, x, name='vggish')
    return model

Using TensorFlow backend.


### Load weights

Need to turn TF layer names into Keras layer names

In [17]:
import tensorflow as tf
import vggish_slim

with tf.Graph().as_default(), tf.Session() as sess:
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, '../pretrained_models/vggish_model.ckpt')

    weights = {}
    operations = sess.graph.get_operations()
    for op in operations:
        name = op.name
        if 'read' in name:
            name2 = name.replace('vggish/','').replace('/read','').replace('conv3/','').replace('conv4/','').replace('/fc1','')
            name2_layer, name2_type = name2.split('/')
            if name2_type == 'weights':
                weights[name2_layer] = []
                weights[name2_layer].append(sess.run(op.values())[0])

    for op in operations:
        name = op.name
        if 'read' in name:
            name2 = name.replace('vggish/','').replace('/read','').replace('conv3/','').replace('conv4/','').replace('/fc1','')
            name2_layer, name2_type = name2.split('/')
            if name2_type == 'biases':
                weights[name2_layer].append(sess.run(op.values())[0])

INFO:tensorflow:Restoring parameters from ../pretrained_models/vggish_model.ckpt


In [18]:
weights.keys()

dict_keys(['conv1', 'conv2', 'conv3_1', 'conv3_2', 'conv4_1', 'conv4_2', 'fc1_1', 'fc1_2', 'fc2'])

### Save as Keras model

In [None]:
model = get_vggish_keras()
model.summary()
for layer in model.layers:
    if layer.name in list(weights.keys()):
        layer.set_weights(weights[layer.name])
model.save_weights('../pretrained_models/vgg_model.h5')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 96, 64, 1)         0         
_________________________________________________________________
conv1 (Conv2D)               (None, 96, 64, 64)        640       
_________________________________________________________________
pool1 (MaxPooling2D)         (None, 48, 32, 64)        0         
_________________________________________________________________
conv2 (Conv2D)               (None, 48, 32, 128)       73856     
_________________________________________________________________
pool2 (MaxPooling2D)         (None, 24, 16, 128)       0         
_________________________________________________________________
conv3_1 (Conv2D)             (None, 24, 16, 256)       295168    
_________________________________________________________________
conv3_2 (Conv2D)             (None, 24, 16, 256)       590080    
__________

## Load VGGish model wrapped in TimeDistributed layer

The normal model only works on a single time-frame. We want to train on multiple frames, so we need to wrap everything in Keras' [TimeDistributed](https://keras.io/layers/wrappers/#TimeDistributed)

In [None]:
from keras.layers import TimeDistributed

def get_vggish_keras_timedistributed():
    input_shape = (None, vggish_params.NUM_FRAMES,vggish_params.NUM_BANDS,1)

    img_input = Input(shape=input_shape)
    # Block 1
    x = TimeDistributed(Conv2D(64, (3, 3), activation='relu', padding='same'), name='conv1')(img_input)
    x = TimeDistributed(MaxPooling2D((2, 2), strides=(2, 2)), name='pool1')(x)

    # Block 2
    x = TimeDistributed(Conv2D(128, (3, 3), activation='relu', padding='same', name='conv2'))(x)
    x = TimeDistributed(MaxPooling2D((2, 2), strides=(2, 2)), name='pool2')(x)

    # Block 3
    x = TimeDistributed(Conv2D(256, (3, 3), activation='relu', padding='same', name='conv3_1'))(x)
    x = TimeDistributed(Conv2D(256, (3, 3), activation='relu', padding='same', name='conv3_2'))(x)
    x = TimeDistributed(MaxPooling2D((2, 2), strides=(2, 2)), name='pool3')(x)

    # Block 4
    x = TimeDistributed(Conv2D(512, (3, 3), activation='relu', padding='same', name='conv4_1'))(x)
    x = TimeDistributed(Conv2D(512, (3, 3), activation='relu', padding='same', name='conv4_2'))(x)
    x = TimeDistributed(MaxPooling2D((2, 2), strides=(2, 2)), name='pool4')(x)

    # Block fc
    x = TimeDistributed(Flatten(), name='flatten')(x)
    x = TimeDistributed(Dense(4096, activation='relu'), name='fc1_1')(x)
    x = TimeDistributed(Dense(4096, activation='relu'), name='fc1_2')(x)
    x = TimeDistributed(Dense(vggish_params.EMBEDDING_SIZE, activation='relu'), name='fc2')(x)

    model = Model(img_input, x, name='vggish_across_time')
    return model

In [None]:
model = get_vggish_keras_timedistributed()

In [None]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, None, 96, 64, 1)   0         
_________________________________________________________________
conv1 (TimeDistributed)      (None, None, 96, 64, 64)  640       
_________________________________________________________________
pool1 (TimeDistributed)      (None, None, 48, 32, 64)  0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, None, 48, 32, 128) 73856     
_________________________________________________________________
pool2 (TimeDistributed)      (None, None, 24, 16, 128) 0         
_________________________________________________________________
time_distributed_2 (TimeDist (None, None, 24, 16, 256) 295168    
_________________________________________________________________
time_distributed_3 (TimeDist (None, None, 24, 16, 256) 590080    
__________

In [None]:
model.load_weights('../pretrained_models/vgg_model.h5')

In [None]:
from keras.layers import CuDNNLSTM
from tensorflow import Tensor

x = CuDNNLSTM(127)(model.layers[-1].output)
x = Dense(8, activation='softmax', name='out')(x)

final_model = Model(inputs=model.input, outputs=[x])
final_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, None, 96, 64, 1)   0         
_________________________________________________________________
conv1 (TimeDistributed)      (None, None, 96, 64, 64)  640       
_________________________________________________________________
pool1 (TimeDistributed)      (None, None, 48, 32, 64)  0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, None, 48, 32, 128) 73856     
_________________________________________________________________
pool2 (TimeDistributed)      (None, None, 24, 16, 128) 0         
_________________________________________________________________
time_distributed_2 (TimeDist (None, None, 24, 16, 256) 295168    
_________________________________________________________________
time_distributed_3 (TimeDist (None, None, 24, 16, 256) 590080    
__________

## Train Model

Putting it all together

### Compile model

First we need to compile the model, for which we'll use the same parameters (for now) as used originally:
* Adam optimizer
* LR of 1e-4
* Adam Epsilon of 1e-8

In [None]:
from keras.optimizers import Adam
from keras.metrics import categorical_accuracy

model = final_model
optimizer = Adam(lr=vggish_params.LEARNING_RATE, epsilon=vggish_params.ADAM_EPSILON)
metrics = [categorical_accuracy]
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=metrics)

Since we're dealing with var-length data, we can't just call `fit`, but instead need to train on each batch manually.

In [None]:
def train_epoch(model, x_train, y_train):
    train_loss = 0.0
    train_batches = 0.0
    train_accuracy = 0.0
    
    for x, y in get_shuffled_batches(x_train, y_train):
        loss, acc = model.train_on_batch(x, y, class_weight=class_weights)

        train_loss += loss
        train_accuracy += acc
        train_batches += 1
    return train_loss / train_batches, train_accuracy / train_batches

In [None]:
def validate_epoch(model, x_val, y_val, class_weights):
    val_loss = 0.0
    val_batches = 0.0
    val_accuracy = 0.0

    for x, y in get_shuffled_batches(x_val, y_val):
        sample_weights = sample_weights_from_class_weights(class_weights, y)
        loss, acc = model.evaluate(x, y, batch_size=1, sample_weight=sample_weights, verbose=0)
        val_loss += loss
        val_accuracy += acc
        val_batches += 1
    return val_loss / val_batches, val_accuracy / val_batches

In [None]:
from IPython import display

def plot_losses(epoch, train_loss, train_acc, val_loss, val_acc):
        fig = plt.figure()

        ax1 = fig.add_subplot(211)
        ax2 = ax1.twinx()
        ax3 = fig.add_subplot(212)

        fig.canvas.draw()
        
        epochs = list(range(epoch + 1))
        
        ax1.set_xlabel("Epoch")
        ax1.set_ylabel("Train Loss")
        ax2.set_ylabel("Val Loss")
        ax1.plot(epochs, train_loss)
        ax2.plot(epochs, val_loss)

        ax3.set_xlabel("Epoch")
        ax3.set_ylabel("Accuracy")
        ax3.plot(epochs, train_acc)
        ax3.plot(epochs, val_acc)
        ax3.legend()

        fig.tight_layout()
        fig.canvas.draw()
        
        display.display(fig)
        display.clear_output(wait=True)


In [None]:
from datetime import datetime
from keras.callbacks import ModelCheckpoint
import os

checkpoint_dir = "../model_checkpoints/"

if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

timestamp = datetime.utcnow().strftime('%Y-%m-%d_%H-%M-%S')
model_checkpoint_filepath = checkpoint_dir + timestamp + "_epoch-{epoch:02d}_val-{val_loss:.4f}.hdf5"
model_checkpointer = ModelCheckpoint(model_checkpoint_filepath,
                                     monitor='val_loss',
                                     verbose=0,
                                     save_best_only=True,
                                     save_weights_only=False,
                                     mode='auto',
                                     period=1)

In [None]:
import matplotlib.pylab as plt
import pandas as pd
%matplotlib inline

NUM_EPOCHS = 5

train_losses = []
train_accs = []
val_losses = []
val_accs = []

model_checkpointer.set_model(model)
model_checkpointer.on_train_begin()

for epoch in range(NUM_EPOCHS):
    
    model_checkpointer.on_epoch_begin(epoch)
    train_loss, train_acc = train_epoch(model, x_train, y_train)

    train_losses.append(train_loss)
    train_accs.append(train_acc)

    val_loss, val_acc = validate_epoch(model, x_train, y_train, class_weights)

    val_losses.append(val_loss)
    val_accs.append(val_acc)
    
    model_checkpointer.on_epoch_end(epoch, logs={
        "loss": train_loss,
        "val_loss": val_loss
    })
    
    plot_losses(epoch, train_losses, train_accs, val_losses, val_accs)
    

In [None]:
timestamp = "_".join(os.path.basename(callback.filepath).split("_")[:2])
model_path = [os.path.join(checkpoint_dir, i) for i in os.listdir(checkpoint_dir)
              if os.path.isfile(os.path.join(checkpoint_dir, i))
              and timestamp in i][0]
model.load_weights(model_path)

In [None]:
predictions = []
true_labels = []
sample_weights = []
for x, y in get_shuffled_batches(x_test, y_test):
    prediction = model.predict_on_batch(x)
    predictions.append(prediction)
    true_labels.append(y)
    sample_weights.append(sample_weights_from_class_weights(class_weights, y))

In [None]:
predictions = np.array(predictions)
true_labels = np.array(true_labels)
sample_weights = np.array(sample_weights)

In [None]:
predictions = np.argmax(predictions, axis=2)
true_labels = np.argmax(true_labels, axis=2)

In [None]:
predictions = np.reshape(predictions, predictions.shape[0])
true_labels = np.reshape(true_labels, true_labels.shape[0])
sample_weights = np.reshape(sample_weights, sample_weights.shape[0])

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix

metrics = dict(
    precision = precision_score(true_labels, predictions, average=None),
    recall = recall_score(true_labels, predictions, average=None),
    f1 = f1_score(true_labels, predictions, average=None),
    accuracy_raw = accuracy_score(true_labels, predictions),
    accuracy_weighted = accuracy_score(true_labels, predictions, sample_weight=sample_weights),
    confusion = confusion_matrix(true_labels, predictions)
)

In [None]:
metrics

In [None]:
import itertools

def plot_confusion_matrix(cm,
                          classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues,
                          block=False):
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = np.average(cm)
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    if block:
        plt.show()

In [None]:
emotions = ["neutral", "calm", "happy", "sad", "angry", "fearful", "disgust", "surprised"]

plot_confusion_matrix(metrics["confusion"], classes=emotions)

In [None]:
plot_confusion_matrix(metrics["confusion"], classes=emotions, normalize=True)