# Training the model

In [25]:
import os
import re
import random
import numpy as np
import pandas as pd
from tensorflow.data import Dataset
from keras.models import Sequential
from keras.layers import Dense, Input
from keras.utils import Sequence

### Loading data in batches

First, we want to build utility functions that load our data in batches so that it fits in memory. Let's start by creating a dictionary that holds the keys we will use when extracting arrays from the compressed NumPy files (`.npz`): these files hold multiple NumPy arrays, each representing a labelled sentence in our dataset.

In [26]:
# One archive key per stored array: "arr_0" ... "arr_20088".
np_idx = {"IDs": ["arr_" + str(position) for position in range(20089)]}
np_idx["IDs"][:10]

['arr_0',
 'arr_1',
 'arr_2',
 'arr_3',
 'arr_4',
 'arr_5',
 'arr_6',
 'arr_7',
 'arr_8',
 'arr_9']

We want to set apart some of those indices so we can use them as a validation set.

In [27]:
# Size of a 20% validation split, computed from the current number of IDs.
(len(np_idx["IDs"]) * 20) / 100

4017.8

In [28]:
# Hold out 20% of the IDs as a validation set.
# NOTE: this cell moves IDs out of np_idx["IDs"]; re-running it without
# rebuilding np_idx first shrinks the training set further.
VAL_PERCENT = 20
n_val = round(len(np_idx["IDs"]) * VAL_PERCENT / 100)

# A dedicated, seeded RNG makes the split reproducible across kernel restarts
# without altering the global `random` state.
val_ids = random.Random(42).sample(np_idx["IDs"], n_val)

# Drop the held-out IDs from the training list in place. Set membership keeps
# this linear (repeated list.remove is quadratic) and the relative order of
# the remaining training IDs is preserved.
held_out = set(val_ids)
np_idx["IDs"][:] = [ID for ID in np_idx["IDs"] if ID not in held_out]

In [29]:
# Number of IDs left for training after the validation IDs were removed.
len(np_idx["IDs"])

16071

In [30]:
# Number of IDs held out for validation.
len(val_ids)

4018

Let's define some variables that point to our folders and files.

In [31]:
# Directory that contains both archives, relative to the working directory.
path_to_file = os.path.join(*("drive", "MyDrive", "Colab Notebooks"))
# File names of the input and target archives inside that directory.
npz_inputs, npz_targets = "inputs.npz", "targets.npz"

Now we want to build a class that will handle all the unpacking, decompressing and building of dataloaders that we'll use to feed our model progressively.

This is strongly inspired by: https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly.

In [32]:
class DataGenerator(Sequence):
    """Keras ``Sequence`` that serves batches read from two ``.npz`` archives.

    Every ID in ``list_IDs`` is used as a key into both archives: the array
    stored under that key in ``npz_inputs`` is one sample, and the array stored
    under the same key in ``npz_targets`` is its label.

    Parameters
    ----------
    path_to_file : str
        Directory containing both archives.
    npz_inputs, npz_targets : str
        File names of the input and target archives inside ``path_to_file``.
    list_IDs : list of str
        Archive keys this generator serves.
    batch_size : int
        Number of samples per batch.
    dim : tuple
        Stored on the instance; not used by the generator itself.
    n_classes : int
        Stored on the instance; not used by the generator itself.
    shuffle : bool
        Whether to reshuffle the sample order at the end of every epoch.
    """

    def __init__(self, path_to_file, npz_inputs, npz_targets, list_IDs, batch_size=8, dim=(584, 100), n_classes=4, shuffle=True):
        # Keras 3 expects Sequence/PyDataset subclasses to call the parent
        # constructor; older versions define no constructor, so this is a no-op.
        super().__init__()
        self.path_to_file = path_to_file
        self.npz_inputs = npz_inputs
        self.npz_targets = npz_targets
        self.dim = dim
        self.batch_size = batch_size
        self.list_IDs = list_IDs
        self.n_classes = n_classes
        self.shuffle = shuffle
        # Build (and optionally shuffle) the initial sample order.
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch."""
        # Only full batches are served; a trailing partial batch is dropped.
        return len(self.list_IDs) // self.batch_size

    def __getitem__(self, index):
        """Generate the batch at position ``index`` as an ``(X, Y)`` tuple."""
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]

        # Translate positions in the (possibly shuffled) order into archive keys.
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        return self.__data_generation(list_IDs_temp)

    def on_epoch_end(self):
        """Reset the sample order, reshuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        """Load the samples and labels stored under ``list_IDs_temp``.

        Returns a tuple ``(X, Y)`` of float32 arrays whose first axis follows
        the order of ``list_IDs_temp``. Raises ``KeyError`` if an ID is missing
        from either archive.
        """
        inputs_path = os.path.join(self.path_to_file, self.npz_inputs)
        targets_path = os.path.join(self.path_to_file, self.npz_targets)

        # Open each archive once per batch instead of once per sample, and
        # close it afterwards: np.load on an .npz keeps the file open until
        # the returned object is closed. Handles are deliberately not cached
        # on the instance, so no open file is shared between batches.
        with np.load(inputs_path) as inputs, np.load(targets_path) as targets:
            X_temp = [np.array(inputs[ID]) for ID in list_IDs_temp]
            Y_temp = [np.array(targets[ID]) for ID in list_IDs_temp]

        return np.asarray(X_temp, dtype="float32"), np.asarray(Y_temp, dtype="float32")

In [33]:
# Settings shared by the training and validation generators; only the list of
# IDs differs between the two.
shared_gen_kwargs = dict(
    path_to_file=path_to_file,
    npz_inputs=npz_inputs,
    npz_targets=npz_targets,
    batch_size=64,
    dim=(584, 100),
    shuffle=True,
)
train_gen = DataGenerator(list_IDs=np_idx["IDs"], **shared_gen_kwargs)
val_gen = DataGenerator(list_IDs=val_ids, **shared_gen_kwargs)

In [34]:
# Shape of the input array of batch 4 (element 0 of the (X, Y) pair).
train_gen[4][0].shape

(64, 584, 100)

In [35]:
# Shape of the target array of batch 4 (element 1 of the (X, Y) pair).
train_gen[4][1].shape

(64, 4)

## Building the model

### Simple FNN model

In [36]:
from keras.models import Sequential
from keras.layers import Dense, Flatten

In [37]:
# Dense stack applied to the (584, 100) input, flattened before the final
# 4-unit softmax layer.
layers = [
    Dense(16, activation="relu", input_shape=(584, 100)),
    Dense(32, activation="relu"),
    Dense(4, activation="relu"),
    Flatten(),
    Dense(4, activation="softmax"),
]
# Empty container; the layers are attached to it in a later cell.
model = Sequential()

In [38]:
# Attach the layers to the model in declaration order.
for next_layer in layers:
    model.add(next_layer)

In [39]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 584, 16)           1616      
_________________________________________________________________
dense_5 (Dense)              (None, 584, 32)           544       
_________________________________________________________________
dense_6 (Dense)              (None, 584, 4)            132       
_________________________________________________________________
flatten_1 (Flatten)          (None, 2336)              0         
_________________________________________________________________
dense_7 (Dense)              (None, 4)                 9348      
Total params: 11,640
Trainable params: 11,640
Non-trainable params: 0
_________________________________________________________________


In [40]:
# Configure training: Adam optimizer, categorical cross-entropy loss,
# accuracy reported as the metric.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train for a single epoch on batches from train_gen. Validation uses val_gen
# and is limited to 10 batches; batch loading is spread over 8 workers with
# multiprocessing enabled.
model.fit(
    x=train_gen,
    validation_data=val_gen,
    validation_steps=10,
    epochs=1,
    workers=8,
    use_multiprocessing=True,
    verbose=1,
)

