Experiment: remove the pooling layers and check whether the vanishing-gradient problem improves.

In [1]:
import numpy as np
import random
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from tensorflow.keras.layers import Embedding, Flatten, Conv1D, BatchNormalization, LeakyReLU, Dropout, Dense, Add
from tensorflow.keras.layers import Concatenate, GlobalAveragePooling1D, AveragePooling1D, MaxPool1D
from keras import backend as K

In [3]:
# Shared cross-entropy object. reduction='none' keeps one loss value per
# sample (no averaging), which lets CNN.custom_loss / CNN.custom_loss_np
# scale each sample's loss before taking the mean themselves.
scce = tf.keras.losses.SparseCategoricalCrossentropy(
    reduction='none',
)

In [None]:
class CNN:
    
    def __init__(self, input_shape, seed, **kwargs):
        """Store the configuration, create the callbacks and build the models.

        Args:
            input_shape: Shape handed to ``keras.layers.Input`` for the
                convolutional branch.
            seed: Base random seed; model ``i`` is built with ``seed + i``.
            **kwargs: Copied verbatim onto the instance as attributes. This
                method reads ``plateau_patience``, ``train_patience`` and
                ``retrain_patience``; the other methods read further
                attributes (e.g. ``num_models``, ``filter_dims``,
                ``learning_rate``) that are presumably supplied here too,
                since nothing else in the visible class assigns them.
        """
        # Hyperparameters first, so the explicit arguments below always win
        # over a same-named keyword.
        vars(self).update(kwargs)

        self.input_shape = input_shape
        self.seed = seed

        def monitored_callbacks(stop_patience):
            # Halve the learning rate on a val_loss plateau (never below
            # 1e-3) and stop early, restoring the best weights seen.
            lr_schedule = keras.callbacks.ReduceLROnPlateau(
                monitor="val_loss", factor=0.5,
                patience=self.plateau_patience, min_lr=1e-3)
            early_stop = keras.callbacks.EarlyStopping(
                monitor="val_loss", patience=stop_patience,
                restore_best_weights=True, verbose=1)
            return [lr_schedule, early_stop]

        # Training and retraining differ only in the early-stopping patience.
        self.train_callback = monitored_callbacks(self.train_patience)
        self.retrain_callback = monitored_callbacks(self.retrain_patience)

        self.model_dict = self.build_model()
    
    def build_base_model(self, output_dim, seed):
        init = tf.keras.initializers.HeNormal(seed)
        
        input_layer = keras.layers.Input(self.input_shape)
        x = input_layer
        embedding_layer = Embedding(input_dim=self.num_of_tokens, output_dim=self.embedding_dim, 
                                    embeddings_regularizer=regularizers.L2(1e-3)
                                    )(self.sic_input)
        embedding_layer = Flatten()(embedding_layer)
        
        assert len(self.filter_dims) == len(self.kernel_sizes)
        # assert len(self.filter_dims) == len(self.pool_sizes)
        # assert len(self.filter_dims) == len(self.strides)

        for i in range(len(self.filter_dims)):
            x = Conv1D(filters=self.filter_dims[i], kernel_size=self.kernel_sizes[i], padding="valid", strides=self.strides[i],
                       kernel_initializer=init, 
                    #    bias_initializer='zeros'
                       use_bias = False
                       )(x)
            x = BatchNormalization()(x)
            x = LeakyReLU()(x)
            # x = Dropout(self.dropout_conv, seed=self.seed)(x)
            # x = AveragePooling1D(pool_size=self.pool_sizes[i], strides=self.strides[i], padding="valid")(x)
            # y = Conv1D(filters=self.filter_dims[i], kernel_size=self.kernel_sizes[i], padding="same",
            #            kernel_initializer=init, bias_initializer='zeros')(y)
            # y = BatchNormalization()(y)
            # y = LeakyReLU()(y)
            # x = Add()([x, y])
            # x = MaxPool1D(pool_size=self.pool_sizes[i], strides=self.strides[i], padding="valid")(x)
        
        x = Flatten()(x)
        x = Concatenate()([x, embedding_layer])
        # y = Dense(x.shape[1], activation=self.activation, kernel_initializer=init, bias_initializer='zeros')(x)
        # y = Dropout(self.dropout_dense, seed=self.seed)(y)
        # x = Add()([x, y])
        for layer_dim in self.layer_dims:
            x = Dense(layer_dim, activation=self.activation, kernel_initializer=init, bias_initializer='zeros')(x)
            x = Dropout(self.dropout_dense, seed=self.seed)(x)
        output_layer = Dense(output_dim, activation="softmax", kernel_initializer=init, use_bias=False)(x)
        model = keras.models.Model(inputs=[self.target, input_layer, self.ret_d, self.sic_input], outputs=output_layer)
        model.add_loss(self.custom_loss(self.target, output_layer, self.ret_d))

        # if model_type == 'mov':
        #     model = keras.models.Model(inputs=[self.target, input_layer, self.ret_d, self.sic_input], outputs=output_layer)
        #     model.add_loss(self.custom_loss(self.target, output_layer, self.ret_d))
        # elif model_type == 'dir':
        #     model = keras.models.Model(inputs=[self.direction, input_layer, self.sic_input], outputs=output_layer)
        #     model.add_loss(self.weighted_loss(self.direction, output_layer))
        # else:
        #     raise Exception('Wrong model type!')

        return model

    def build_model(self):
        
        self.model_dict = {}

        for i in range(self.num_models):
            
            self.model_dict[i] = self.build_base_model(5, self.seed+i)
        
            # Direction models
            # self.model_dict[f'dir_{i}'] = self.build_base_model(3, self.seed+i, 'dir')

        return self.model_dict
    
    def custom_loss(self, y_true, y_pred, ret_d):
        """Return the mean cross-entropy weighted by ``min(|ret_d|, 1)``.

        Args:
            y_true: Integer class labels (cast to float32 before use).
            y_pred: Predicted class probabilities.
            ret_d: Per-sample values whose magnitude, capped at 1, scales
                each sample's cross-entropy.

        Returns:
            A scalar tensor: the mean of the weighted per-sample losses.
        """
        y_true = tf.cast(y_true, dtype=tf.float32)
        per_sample = scce(y_true, y_pred)
        weights = tf.minimum(tf.abs(ret_d), 1)

        # If ret_d carries a trailing singleton axis, e.g. (batch, 1),
        # multiplying it with the (batch,) loss would broadcast to
        # (batch, batch) and the mean would no longer pair each sample with
        # its own weight. Drop that axis so the product stays element-wise.
        # NOTE(review): ret_d's shape is not visible here - confirm.
        loss_rank = per_sample.shape.rank
        if (loss_rank is not None
                and weights.shape.rank == loss_rank + 1
                and weights.shape[-1] == 1):
            weights = tf.squeeze(weights, axis=-1)

        return tf.reduce_mean(per_sample * weights)
    
    # Numpy version of loss
    def custom_loss_np(self, y_true, y_pred, ret_d):
        """Compute the same weighted loss as ``custom_loss`` with NumPy weights.

        Args:
            y_true: Integer class labels.
            y_pred: Predicted class probabilities.
            ret_d: Per-sample values whose magnitude, capped at 1, scales
                each sample's cross-entropy.

        Returns:
            The mean of the weighted per-sample losses.
        """
        per_sample = scce(y_true, y_pred)
        weights = np.minimum(np.abs(np.asarray(ret_d)), 1)

        # Same guard as in custom_loss: a trailing singleton axis on ret_d,
        # e.g. (n, 1), would broadcast against the (n,) loss into an (n, n)
        # matrix, so drop it to keep the product element-wise.
        if weights.ndim == len(per_sample.shape) + 1 and weights.shape[-1] == 1:
            weights = np.squeeze(weights, axis=-1)

        return np.mean(per_sample * weights)

    # def weighted_loss(self, y_true, y_pred):
    #     y_true = tf.cast(y_true, dtype=tf.int32)
    #     y_true = tf.one_hot(y_true, 3)
    #     return -tf.reduce_mean(y_true * self.class_weight_loss * tf.math.log(y_pred))
    
    # def weighted_loss_np(self, y_true, y_pred):
    #     y_true = tf.cast(y_true, dtype=tf.int32)
    #     y_true = tf.one_hot(y_true, 3)
    #     return -tf.reduce_mean(y_true * self.class_weight_loss * tf.math.log(y_pred))

    def compile_model(self):
        """Compile every ensemble member and print one model summary.

        Each model gets its own Adam optimizer created with
        ``self.learning_rate``.
        """
        for idx in range(self.num_models):
            optimizer = keras.optimizers.Adam(self.learning_rate)
            # loss=None: the objective was already attached to the model
            # with add_loss when it was built.
            self.model_dict[idx].compile(loss=None, optimizer=optimizer)

        # Every member shares one architecture, so one summary is enough.
        self.model_dict[0].summary()
    
    # Using the same code for training and retraining model
    def train_model(self, x_train, y_train, ret_d_train, sic_train):
        """Fit every ensemble member on the training data.

        Args:
            x_train: Features for the convolutional input.
            y_train: Class labels; passed as a model input because the loss
                is computed inside the model.
            ret_d_train: Per-sample values used to weight the loss.
            sic_train: Token ids for the embedding input.
        """
        # Seed TensorFlow, Python and NumPy so that runs are repeatable.
        for seed_fn in (tf.random.set_seed, random.seed, np.random.seed):
            seed_fn(self.seed)

        for idx in range(self.num_models):
            # Input order must match the order of the model's input list.
            model_inputs = [y_train, x_train, ret_d_train, sic_train]
            self.model_dict[idx].fit(
                x=model_inputs,
                y=None,  # targets travel inside `x`; see the docstring
                batch_size=self.batch_size,
                epochs=self.epochs,
                callbacks=self.train_callback,
                validation_split=self.validation_split,
                verbose=1,
            )
        
            # history = self.model_dict[f'dir_{i}'].fit(
            #     x=[y_train_dir, x_train, sic_train],
            #     y=None,
            #     batch_size=self.batch_size,
            #     epochs=self.epochs,
            #     callbacks=self.train_callback,
            #     validation_split=self.validation_split,
            #     verbose=1
            # )
    
    def evaluate_model(self, x_train, y_train, ret_d_train, sic_train, x_test, y_test, ret_d_test, sic_test, batch_size):
        """Print each model's weighted loss on the training and test sets.

        Predictions come from ``predict`` with the given ``batch_size`` and
        are scored with ``custom_loss_np``. Nothing is returned.
        """
        for idx in range(self.num_models):
            model = self.model_dict[idx]

            # The labels are part of the model's input list, so they have to
            # be supplied to predict() as well.
            train_pred = model.predict([y_train, x_train, ret_d_train, sic_train], batch_size=batch_size)
            train_loss = self.custom_loss_np(y_train, train_pred, ret_d_train)
            print(f'Model {idx} training loss {train_loss}')

            test_pred = model.predict([y_test, x_test, ret_d_test, sic_test], batch_size=batch_size)
            test_loss = self.custom_loss_np(y_test, test_pred, ret_d_test)
            print(f'Model {idx} test loss {test_loss}')

        # for i in range(self.num_models):
        #     y_pred = self.model_dict[f'dir_{i}'].predict([y_train, x_train, sic_train], batch_size=batch_size)
        #     print(f'Direction model {i} training loss {self.weighted_loss_np(y_train, y_pred)}')
        #     y_pred = self.model_dict[f'dir_{i}'].predict([y_test, x_test, sic_test], batch_size=batch_size)
        #     print(f'Direction model {i} test loss {self.weighted_loss_np(y_test, y_pred)}')
        
    def retrain_model(self, x_train, y_train, ret_d_train, sic_train):
        """Continue training every ensemble member on new data.

        Works like ``train_model`` but resets each optimizer's learning rate
        first, uses ``self.retrain_callback`` and runs a garbage collection
        after each model.

        Args:
            x_train: Features for the convolutional input.
            y_train: Class labels; passed as a model input because the loss
                is computed inside the model.
            ret_d_train: Per-sample values used to weight the loss.
            sic_train: Token ids for the embedding input.
        """
        # Imported locally because the visible top-of-file imports do not
        # bring `gc` into scope; without it gc.collect() below raises a
        # NameError after the first model finishes retraining.
        import gc

        tf.random.set_seed(self.seed)
        random.seed(self.seed)
        np.random.seed(self.seed)

        for i in range(self.num_models):
            # Set learning rate back to 1e-2, undoing any reduction made by
            # ReduceLROnPlateau during an earlier fit.
            # NOTE(review): hard-coded rather than self.learning_rate -
            # confirm that the two are meant to agree.
            K.set_value(self.model_dict[i].optimizer.learning_rate, 1e-2)

            # Retrain models
            self.model_dict[i].fit(
                x=[y_train, x_train, ret_d_train, sic_train],
                y=None,
                batch_size=self.batch_size,
                epochs=self.epochs,
                callbacks=self.retrain_callback,
                validation_split=self.validation_split,
                verbose=1
                )
            # Free what the finished fit left behind before the next model.
            gc.collect()
        
            # history = self.model_dict[f'dir_{i}'].fit(
            #     x=[y_train_dir, x_train, sic_train],
            #     y=None,
            #     batch_size=self.batch_size,
            #     epochs=self.epochs,
            #     callbacks=self.train_callback,
            #     validation_split=self.validation_split,
            #     verbose=0
            # )