# Task 2: Word2Vec - Sequence Classification Approach


In this notebook, word embeddings produced by Word2Vec are not aggregated into a single sentence embedding, but instead kept as a sequence of embeddings. Therefore, sequence classification models are applied.


----------------------------------------------

## Importing Libraries


In [None]:
#For dataset I/O
import pandas as pd
import numpy as np
import pickle, csv
from sklearn.utils import shuffle
import project2Lib

## for plotting
import matplotlib.pyplot as plt
import seaborn as sns

#for text pre-processing
import re, string
import nltk

# Word2Vec
from gensim.models import Word2Vec, KeyedVectors


#For Keras Deep Learning Models
from tensorflow.keras import models, layers, preprocessing, Sequential,  losses, Model
from tensorflow.keras import backend as K

#from tensorflow import keras
from tensorflow.keras import layers, backend as K
from tensorflow.keras.models import Sequential
import tensorflow as tf

from tensorflow.keras.utils import Sequence
from tensorflow.keras.layers import LSTM, Dense, Masking, Embedding, Input
import numpy as np
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint


#For Performance Metrics
from sklearn.metrics import classification_report, f1_score, accuracy_score, plot_confusion_matrix, confusion_matrix, ConfusionMatrixDisplay

np.random.seed(1)

### Checking for GPU

In [None]:
# Use the first GPU when TensorFlow reports exactly that device; anything else falls back to the CPU.
detected_device = tf.test.gpu_device_name()
device_name = detected_device if detected_device == "/device:GPU:0" else "/cpu:0"
print('Found device at: {}'.format(device_name))

## Loading preprocessed data

### Choosing one of the preprocessing options

In [None]:
# Index of the preprocessing variant whose files are loaded in the cells below.
mode = 1

# File-name suffix belonging to each variant; an unlisted mode leaves the suffix empty.
suffix_by_mode = {
    0: "lemmatization_noph",
    1: "lemmatization",
    2: "_noph",
    3: "_",
    4: "stemming_noph",
    5: "stemming",
}
suffix = suffix_by_mode.get(mode, "")


In [None]:
    
# Load the preprocessed train / dev / test frames for the selected preprocessing variant.
# NOTE: pd.read_pickle unpickles arbitrary objects - only point it at files you produced yourself.
train_data, dev_data, test_data = (
    pd.read_pickle(f"PreprocessedData/{split}_{suffix}_w2v.pkl")
    for split in ("train", "dev", "test")
)

In [None]:
# Preview the first three rows of the test frame.
test_data.head(3)

## To load existing model's keyed vectors:

In [None]:
# Load the trained Word2Vec vectors (binary word2vec format) for the selected variant.
w2v_path = f"./TrainedModels/w2v_200_{suffix}.bin"
kv = KeyedVectors.load_word2vec_format(w2v_path, binary=True)

## To prepare data:

The `X_*_lines` arrays are extracted for the models that use the relative line number as an auxiliary input.

In [None]:
def _to_model_arrays(frame):
    """Split a preprocessed frame into the arrays the Keras models consume.

    Returns a tuple of:
      * the per-sentence "idx" entries stacked into one array,
      * the "line_relative" column reshaped to a single column,
      * the "label" column as stored in the frame.
    """
    token_idx = np.stack(frame["idx"].values)
    line_pos = np.reshape(frame["line_relative"].values, (-1, 1))
    labels = frame['label'].values
    return token_idx, line_pos, labels


X_train_idx, X_train_lines, Y_train = _to_model_arrays(train_data)
X_dev_idx, X_dev_lines, Y_dev = _to_model_arrays(dev_data)
X_test_idx, X_test_lines, Y_test = _to_model_arrays(test_data)

In [None]:
# Show the index sequence stored for the first training sentence.
train_data["idx"].iloc[0]

Here we can see that our prepared data is padded to a standard sequence length. Sequential classifiers will be set to ignore the zero entries.

# Keras Based Sequential Classifiers

In this section, models that process Word2Vec embeddings of individual words sequentially to classify a sentence are explored. In most models, word embeddings are first processed by a Bidirectional LSTM, the rest of the model varies.  

-----------------------

# Bidirectional LSTM

## Without Line Numbers


### Hyperparameters

In [None]:
# Settings for the BiLSTM classifier without the line-number input.
dimension = kv.vector_size            # length of one Word2Vec vector
units = 32                            # LSTM units per direction, also the width of the dense layers
lr = 0.01                             # learning rate assigned to the optimizer after compile
epochs = 50
dropout = 0.2
max_sent_len = X_train_idx.shape[1]   # length of the padded index sequences

# Path the best weights are checkpointed to.
save_name = f"./TrainedModels/biLSTM_noline_{suffix}a.h5"

In [None]:
with tf.device(device_name):
    # Frozen Word2Vec lookup table. An all-zero row is stacked on top so that index 0
    # (masked via mask_zero) has its own row and every word row is shifted down by one.
    embedding_matrix = np.vstack((np.zeros((1, kv.vectors.shape[1])), kv.vectors))

    embed_layer = Embedding(input_dim=embedding_matrix.shape[0],
                            output_dim=embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            input_length=max_sent_len,
                            mask_zero=True,
                            trainable=False)

    # Embedding -> BiLSTM (final state only) -> two ReLU layers -> 5-way softmax.
    model = Sequential([
        embed_layer,
        layers.Bidirectional(LSTM(units=units, activation='tanh')),
        layers.Dense(units, activation='relu'),
        layers.Dense(units, activation='relu'),
        layers.Dense(5, activation='softmax'),
    ])

    model.compile(optimizer="adam",
                  loss=losses.sparse_categorical_crossentropy,
                  metrics=['accuracy'])

    # The optimizer is created from its string name, so the learning rate is assigned afterwards.
    K.set_value(model.optimizer.learning_rate, lr)

    # Stop / reduce the learning rate on stalled validation loss; checkpoint on best validation accuracy.
    callbacks_list = [
        EarlyStopping(monitor="val_loss", patience=10, verbose=1),
        ReduceLROnPlateau(monitor="val_loss", patience=5, verbose=1),
        ModelCheckpoint(filepath=save_name, save_weights_only=True,
                        monitor='val_accuracy', mode='max', save_best_only=True),
    ]

    model.fit(X_train_idx, Y_train,
              validation_data=(X_dev_idx, Y_dev),
              epochs=epochs, batch_size=1000, verbose=1,
              callbacks=callbacks_list)
    

In [None]:
# Predicted class = position of the largest softmax output for each test sentence.
Y_pred = np.argmax(model.predict(X_test_idx), axis=-1)
del model  # drop the reference to the trained network

# Test-set scores; the F1 score is the support-weighted average over the classes.
test_accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy: " ,test_accuracy)
test_f1 = f1_score(Y_test, Y_pred, average='weighted')
print("F1 Score: " ,test_f1)

# Confusion matrix normalised over the true labels (each row sums to 1).
cm = confusion_matrix(Y_test, Y_pred, normalize = "true")
cmd = ConfusionMatrixDisplay(confusion_matrix=cm)
cmd.plot()

-------------------------------------------------------------------------------------------

## With Line Numbers

In this section, a version of the Bidirectional LSTM based model that takes in relative line number as an auxiliary input is implemented. Since for all previous models, line number improves performance significantly, we expect this to be the case for this model too.

### Hyperparameters

In [None]:
# Settings for the BiLSTM classifier that also receives the relative line number.
dimension = kv.vector_size            # length of one Word2Vec vector
units = 32                            # LSTM units per direction, also the width of the dense layers
lr = 0.001                            # learning rate assigned to the optimizer after compile
epochs = 40
max_sent_len = X_train_idx.shape[1]   # length of the padded index sequences

# Path the best weights are checkpointed to.
save_name = f"./TrainedModels/biLSTM_line_{suffix}.h5"

In [None]:
with tf.device(device_name):

    # Two named inputs: the padded word-index sequence and the relative line number.
    input_sequence = Input(shape=(max_sent_len,), name='input_sequence')
    input_sent_num = Input(shape=(1,), name='input_sent_num')

    # ---- the sequential processing section
    # Frozen Word2Vec lookup table with an all-zero row stacked on top for the masked index 0.
    embedding_matrix = np.vstack((np.zeros((1, kv.vectors.shape[1])), kv.vectors))
    word_vectors = Embedding(input_dim=embedding_matrix.shape[0],
                             output_dim=embedding_matrix.shape[1],
                             weights=[embedding_matrix],
                             input_length=max_sent_len,
                             mask_zero=True,
                             trainable=False)(input_sequence)

    # The BiLSTM output for the sentence is concatenated with the line number input.
    sentence_summary = layers.Bidirectional(LSTM(units=units, activation='tanh'))(word_vectors)
    combined = layers.concatenate([sentence_summary, input_sent_num])

    # ---- the classification head: two ReLU layers followed by a 5-way softmax
    hidden = combined
    for _ in range(2):
        hidden = layers.Dense(units, activation='relu')(hidden)
    output = layers.Dense(5, activation='softmax')(hidden)

    aux_line_model = Model(inputs=[input_sequence, input_sent_num], outputs=output)

    print(aux_line_model.summary())

    aux_line_model.compile(optimizer="adam",
                           loss=losses.sparse_categorical_crossentropy,
                           metrics=['accuracy'])

    # The optimizer is created from its string name, so the learning rate is assigned afterwards.
    K.set_value(aux_line_model.optimizer.learning_rate, lr)

    # Stop / reduce the learning rate on stalled validation loss; checkpoint on best validation accuracy.
    callbacks_list = [
        EarlyStopping(monitor="val_loss", patience=10, verbose=1),
        ReduceLROnPlateau(monitor="val_loss", patience=5, verbose=1),
        ModelCheckpoint(filepath=save_name, save_weights_only=True,
                        monitor='val_accuracy', mode='max', save_best_only=True),
    ]

    # Inputs are passed by name so they line up with the two Input layers above.
    train_inputs = {'input_sequence': X_train_idx, 'input_sent_num': X_train_lines}
    dev_inputs = {'input_sequence': X_dev_idx, 'input_sent_num': X_dev_lines}

    aux_line_model.fit(train_inputs, Y_train,
                       validation_data=(dev_inputs, Y_dev),
                       epochs=epochs, batch_size=1000, verbose=1,
                       callbacks=callbacks_list)
    

In [None]:
# Predicted class = position of the largest softmax output for each test sentence.
test_inputs = {'input_sequence': X_test_idx, 'input_sent_num': X_test_lines}
Y_pred = np.argmax(aux_line_model.predict(test_inputs), axis=-1)
del aux_line_model  # drop the reference to the trained network

# Test-set scores; the F1 score is the support-weighted average over the classes.
test_accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy: " ,test_accuracy)
test_f1 = f1_score(Y_test, Y_pred, average='weighted')
print("F1 Score: " ,test_f1)

# Confusion matrix normalised over the true labels (each row sums to 1).
cm = confusion_matrix(Y_test, Y_pred, normalize = "true")
cmd = ConfusionMatrixDisplay(confusion_matrix=cm)
cmd.plot()

-----------------------

# Bidirectional LSTM + Conv1D layers

This model combines an initial Bidirectional LSTM that aggregates word vectors into a single embedding with an attention layer and one-dimensional convolutions. Given that Bidirectional LSTMs are commonly used in text classification, and convolutions are an effective way to get capacity, this model was deemed worth investigating. LSTM cell size was limited by computation time; larger hidden state sizes could create marginal improvements.

## Without Line Numbers

### Hyperparameters

In [None]:
# Settings for the BiLSTM + attention + Conv1D classifier.
dropout = 0                           # dropout rate applied after every convolution
units = 32                            # = filters
conv_layers = 3                       # number of Conv1D / Dropout / MaxPool1D stages
lr = 0.01                             # learning rate assigned to the optimizer after compile
epochs = 60
max_sent_len = X_train_idx.shape[1]   # length of the padded index sequences

# Path the best weights are checkpointed to.
save_name = f"./TrainedModels/biLSTM_conv_{suffix}.h5"

### Attention Module

In [None]:
class attention(layers.Layer):
    """Attention over the time axis of a 3-D ``(batch, time, features)`` input.

    A score is computed for every timestep as ``tanh(x . W + b)``, the scores are
    turned into weights with a softmax over the time axis, and the input is scaled
    by those weights.

    Args:
        return_sequences: if True, return the re-weighted sequence
            ``(batch, time, features)``; if False, sum it over the time axis and
            return ``(batch, features)``.
        **kwargs: standard Keras ``Layer`` arguments (e.g. ``name``).

    The bias has one entry per timestep, so the layer needs a fixed sequence length.

    NOTE(review): ``call`` takes no mask, so padded timesteps take part in the
    softmax like any other position - confirm this is intended.
    """

    def __init__(self, return_sequences=False, **kwargs):
        # The base class is reached through the imported `layers` namespace (the bare
        # `keras` name is not imported in this notebook) and is initialised before any
        # attribute is set on the layer.
        super(attention, self).__init__(**kwargs)
        self.return_sequences = return_sequences

    def build(self, input_shape):
        # W projects each feature vector to one score; b adds one offset per timestep.
        self.W = self.add_weight(name='attention_weight', shape=(input_shape[-1], 1),
                                 initializer='random_normal', trainable=True)
        self.b = self.add_weight(name='attention_bias', shape=(input_shape[1], 1),
                                 initializer='zeros', trainable=True)
        super(attention, self).build(input_shape)

    def call(self, x):
        # (batch, time, 1) scores -> (batch, time) -> softmax weights over the time axis.
        e = K.tanh(K.dot(x, self.W) + self.b)
        e = K.squeeze(e, axis=-1)
        alpha = K.softmax(e)
        alpha = K.expand_dims(alpha, axis=-1)

        # final context vector
        context = x * alpha

        if self.return_sequences:
            return context

        return K.sum(context, axis=1)

    def get_config(self):
        # Include the constructor argument so the layer can be re-created from its config.
        config = super(attention, self).get_config()
        config.update({"return_sequences": self.return_sequences})
        return config

In [None]:
with tf.device(device_name):
    blstm_conv_model = Sequential()

    # Frozen Word2Vec lookup table with an all-zero row stacked on top for the masked index 0.
    embed_layer = Embedding(input_dim=kv.vectors.shape[0]+1,
                            output_dim=kv.vectors.shape[1],
                            weights=[np.vstack((np.zeros((1, kv.vectors.shape[1])),kv.vectors))],
                            input_length=max_sent_len,
                            mask_zero=True,
                            trainable=False)
    blstm_conv_model.add(embed_layer)

    # The BiLSTM keeps one output per timestep so attention and the convolutions see a sequence.
    blstm_conv_model.add(layers.Bidirectional(
            LSTM(
                units=32,
                activation='tanh',
                return_sequences = True,
            )) )
    blstm_conv_model.add(attention(return_sequences = True))

    # Conv1D / Dropout / MaxPool1D stages. The layers come from the imported `layers`
    # namespace because the bare `keras` name is not imported in this notebook.
    for _ in range(conv_layers):

        blstm_conv_model.add(layers.Conv1D(filters=units, kernel_size=3, strides=1, padding="valid", activation='relu'))
        blstm_conv_model.add(layers.Dropout(rate=dropout))
        blstm_conv_model.add(layers.MaxPool1D(pool_size=2, strides=2, padding="valid"))

    blstm_conv_model.add(layers.Flatten())

    # Classification head: one ReLU layer followed by a 5-way softmax.
    blstm_conv_model.add(layers.Dense(units, activation='relu'))
    blstm_conv_model.add(layers.Dense(5, activation='softmax'))

    print(blstm_conv_model.summary())

    blstm_conv_model.compile(loss=losses.sparse_categorical_crossentropy,
                              optimizer="adam", metrics=['accuracy'])

    # The optimizer is created from its string name, so the learning rate is assigned afterwards.
    K.set_value(blstm_conv_model.optimizer.learning_rate, lr)

    # Stop / reduce the learning rate on stalled validation loss; checkpoint on best validation accuracy.
    early = EarlyStopping(monitor="val_loss", patience=10, verbose=1)
    redonplat = ReduceLROnPlateau(monitor="val_loss", patience=5, verbose=1)
    checkpoint = ModelCheckpoint( filepath=save_name, save_weights_only=True,
                                                 monitor='val_accuracy', mode='max', save_best_only=True)

    callbacks_list = [early, redonplat,checkpoint]

    blstm_conv_model.fit(X_train_idx, Y_train, epochs=epochs, batch_size=1000,
                                    verbose=1, validation_data=(X_dev_idx,Y_dev), callbacks=callbacks_list )

In [None]:
# Predicted class = position of the largest softmax output for each test sentence.
Y_pred = np.argmax(blstm_conv_model.predict(X_test_idx), axis=-1)
del blstm_conv_model  # drop the reference to the trained network

# Test-set scores; the F1 score is the support-weighted average over the classes.
test_accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy: " ,test_accuracy)
test_f1 = f1_score(Y_test, Y_pred, average='weighted')
print("F1 Score: " ,test_f1)

# Confusion matrix normalised over the true labels (each row sums to 1).
cm = confusion_matrix(Y_test, Y_pred, normalize = "true")
cmd = ConfusionMatrixDisplay(confusion_matrix=cm)
cmd.plot()

# Classification with Small Dataset

We run the versions of Bidirectional LSTM + fully connected layers on the small 20k dataset as well.

## Bidirectional LSTM without line numbers

In [None]:
# Load the preprocessed train / dev / test frames of the small dataset.
# NOTE: pd.read_pickle unpickles arbitrary objects - only point it at files you produced yourself.
train_data_small, dev_data_small, test_data_small = (
    pd.read_pickle(f"PreprocessedData/{split}_{suffix}_w2v_small.pkl")
    for split in ("train", "dev", "test")
)

In [None]:
def _to_model_arrays_small(frame):
    """Split a small-dataset frame into the arrays the Keras models consume.

    Returns a tuple of:
      * the per-sentence "idx" entries stacked into one array,
      * the "line_relative" column reshaped to a single column,
      * the "label" column as stored in the frame.
    """
    token_idx = np.stack(frame["idx"].values)
    line_pos = np.reshape(frame["line_relative"].values, (-1, 1))
    labels = frame['label'].values
    return token_idx, line_pos, labels


X_train_idx_small, X_train_lines_small, Y_train_small = _to_model_arrays_small(train_data_small)
X_dev_idx_small, X_dev_lines_small, Y_dev_small = _to_model_arrays_small(dev_data_small)
X_test_idx_small, X_test_lines_small, Y_test_small = _to_model_arrays_small(test_data_small)

In [None]:
# Load the Word2Vec vectors (binary word2vec format) saved for the small dataset.
w2v_small_path = f"./TrainedModels/w2v_200_{suffix}_small.bin"
kv_small = KeyedVectors.load_word2vec_format(w2v_small_path, binary=True)

In [None]:
# Settings shared by both small-dataset BiLSTM models.
dimension = kv_small.vector_size            # length of one Word2Vec vector
units = 32                                  # LSTM units per direction, also the width of the dense layers
lr = 0.001                                  # learning rate assigned to the optimizer after compile
epochs = 40
max_sent_len = X_train_idx_small.shape[1]   # length of the padded index sequences

# Path the best weights of the model without line numbers are checkpointed to.
save_name_small = f"./TrainedModels/biLSTM_noline_{suffix}_small.h5"

In [None]:
with tf.device(device_name):
    # Frozen Word2Vec lookup table. An all-zero row is stacked on top so that index 0
    # (masked via mask_zero) has its own row and every word row is shifted down by one.
    embedding_matrix_small = np.vstack((np.zeros((1, kv_small.vectors.shape[1])), kv_small.vectors))

    embed_layer = Embedding(input_dim=embedding_matrix_small.shape[0],
                            output_dim=embedding_matrix_small.shape[1],
                            weights=[embedding_matrix_small],
                            input_length=max_sent_len,
                            mask_zero=True,
                            trainable=False)

    # Embedding -> BiLSTM (final state only) -> two ReLU layers -> 5-way softmax.
    model_small = Sequential([
        embed_layer,
        layers.Bidirectional(LSTM(units=units, activation='tanh')),
        layers.Dense(units, activation='relu'),
        layers.Dense(units, activation='relu'),
        layers.Dense(5, activation='softmax'),
    ])

    model_small.compile(optimizer="adam",
                        loss=losses.sparse_categorical_crossentropy,
                        metrics=['accuracy'])

    # The optimizer is created from its string name, so the learning rate is assigned afterwards.
    K.set_value(model_small.optimizer.learning_rate, lr)

    # Stop / reduce the learning rate on stalled validation loss; checkpoint on best validation accuracy.
    callbacks_list = [
        EarlyStopping(monitor="val_loss", patience=10, verbose=1),
        ReduceLROnPlateau(monitor="val_loss", patience=5, verbose=1),
        ModelCheckpoint(filepath=save_name_small, save_weights_only=True,
                        monitor='val_accuracy', mode='max', save_best_only=True),
    ]

    model_small.fit(X_train_idx_small, Y_train_small,
                    validation_data=(X_dev_idx_small, Y_dev_small),
                    epochs=epochs, batch_size=1000, verbose=1,
                    callbacks=callbacks_list)
    

In [None]:
# Predicted class = position of the largest softmax output for each test sentence.
Y_pred_small = np.argmax(model_small.predict(X_test_idx_small), axis=-1)

# Test-set scores; the F1 score is the support-weighted average over the classes.
test_accuracy_small = accuracy_score(Y_test_small, Y_pred_small)
print("Accuracy: " ,test_accuracy_small)
test_f1_small = f1_score(Y_test_small, Y_pred_small, average='weighted')
print("F1 Score: " ,test_f1_small)

# Confusion matrix normalised over the true labels (each row sums to 1).
cm = confusion_matrix(Y_test_small, Y_pred_small, normalize = "true")
cmd = ConfusionMatrixDisplay(confusion_matrix=cm)
cmd.plot()

## Bidirectional LSTM with line numbers

In [None]:
# Path the best weights of the small-dataset model with line numbers are checkpointed to.
save_name_small = f"./TrainedModels/biLSTM_line_{suffix}_small.h5"

In [None]:
with tf.device(device_name):

    # Two named inputs: the padded word-index sequence and the relative line number.
    input_sequence = Input(shape=(max_sent_len,), name='input_sequence')
    input_sent_num = Input(shape=(1,), name='input_sent_num')

    # ---- the sequential processing section
    # Frozen Word2Vec lookup table with an all-zero row stacked on top for the masked index 0.
    embedding_matrix_small = np.vstack((np.zeros((1, kv_small.vectors.shape[1])), kv_small.vectors))
    word_vectors = Embedding(input_dim=embedding_matrix_small.shape[0],
                             output_dim=embedding_matrix_small.shape[1],
                             weights=[embedding_matrix_small],
                             input_length=max_sent_len,
                             mask_zero=True,
                             trainable=False)(input_sequence)

    # The BiLSTM output for the sentence is concatenated with the line number input.
    sentence_summary = layers.Bidirectional(LSTM(units=units, activation='tanh'))(word_vectors)
    combined = layers.concatenate([sentence_summary, input_sent_num])

    # ---- the classification head: two ReLU layers followed by a 5-way softmax
    hidden = combined
    for _ in range(2):
        hidden = layers.Dense(units, activation='relu')(hidden)
    output = layers.Dense(5, activation='softmax')(hidden)

    line_model_small = Model(inputs=[input_sequence, input_sent_num], outputs=output)

    print(line_model_small.summary())

    line_model_small.compile(optimizer="adam",
                             loss=losses.sparse_categorical_crossentropy,
                             metrics=['accuracy'])

    # The optimizer is created from its string name, so the learning rate is assigned afterwards.
    K.set_value(line_model_small.optimizer.learning_rate, lr)

    # Stop / reduce the learning rate on stalled validation loss; checkpoint on best validation accuracy.
    callbacks_list = [
        EarlyStopping(monitor="val_loss", patience=10, verbose=1),
        ReduceLROnPlateau(monitor="val_loss", patience=5, verbose=1),
        ModelCheckpoint(filepath=save_name_small, save_weights_only=True,
                        monitor='val_accuracy', mode='max', save_best_only=True),
    ]

    # Inputs are passed by name so they line up with the two Input layers above.
    train_inputs_small = {'input_sequence': X_train_idx_small, 'input_sent_num': X_train_lines_small}
    dev_inputs_small = {'input_sequence': X_dev_idx_small, 'input_sent_num': X_dev_lines_small}

    line_model_small.fit(train_inputs_small, Y_train_small,
                         validation_data=(dev_inputs_small, Y_dev_small),
                         epochs=epochs, batch_size=1000, verbose=1,
                         callbacks=callbacks_list)
    

In [None]:
# Predicted class = position of the largest softmax output for each test sentence.
test_inputs_small = {'input_sequence': X_test_idx_small, 'input_sent_num': X_test_lines_small}
Y_pred_small = np.argmax(line_model_small.predict(test_inputs_small), axis=-1)

# Test-set scores; the F1 score is the support-weighted average over the classes.
test_accuracy_small = accuracy_score(Y_test_small, Y_pred_small)
print("Accuracy: " ,test_accuracy_small)
test_f1_small = f1_score(Y_test_small, Y_pred_small, average='weighted')
print("F1 Score: " ,test_f1_small)

# Confusion matrix normalised over the true labels (each row sums to 1).
cm = confusion_matrix(Y_test_small, Y_pred_small, normalize = "true")
cmd = ConfusionMatrixDisplay(confusion_matrix=cm)
cmd.plot()

# Classification using Difference of  Averaged Sentence Embeddings

Since memory-related problems restricted our ability to use neighbouring sentences' word embeddings and aggregate results, we opted for using averaged sentence embeddings in a way that may carry some sequential information as well. This idea was inspired by the analogy examples explored in the Word2Vec_Embedding_Generation notebook.

For each sentence, the averaged sentence embeddings of the previous and next sentences are subtracted from its embedding to get an embedding representing semantic change. 

This model does not converge - averaged embeddings may not be semantically representative/ discriminative enough for their difference to carry enough information.

In [None]:
# Build the differenced sentence vectors for every split. project2Lib.get_diffed_vecs is
# defined outside this notebook; its result is unpacked here as (features, labels).
X_dev_diffed, Y_dev_diffed = project2Lib.get_diffed_vecs(dev_data)
X_test_diffed, Y_test_diffed = project2Lib.get_diffed_vecs(test_data)
X_train_diffed, Y_train_diffed = project2Lib.get_diffed_vecs(train_data)

# Release the full-dataset index arrays, labels and frames to free up memory.
# Any later cell that still needs one of these names has to rebuild it first.
del X_train_idx, Y_train
del X_dev_idx, Y_dev
del X_test_idx, Y_test
del train_data, dev_data, test_data

In [None]:
# Settings for the Conv1D model trained on the differenced sentence embeddings.
dimension = kv.vector_size   # length of one Word2Vec vector; used as the model's input length
units = 32
lr = 0.001                   # learning rate assigned to the optimizer after compile
epochs = 40
dropout = 0.2                # dropout rate applied after every convolution

# Path the best weights are checkpointed to.
save_name = f"./TrainedModels/diffed_{suffix}.h5"

In [None]:
with tf.device(device_name):
    diff_model = Sequential()

    # First stage: the vector is fed in as a length-`dimension` sequence with one channel.
    diff_model.add(layers.Conv1D(input_shape=(dimension, 1), filters=units // 2, kernel_size=3,
                                 strides=1, padding="valid", activation='relu'))
    diff_model.add(layers.Dropout(rate=dropout))
    diff_model.add(layers.MaxPool1D(pool_size=2, strides=2, padding="valid"))

    # Two further Conv1D / Dropout / MaxPool1D stages with `dimension` filters each.
    for _ in range(2):
        diff_model.add(layers.Conv1D(filters=dimension, kernel_size=3, strides=1,
                                     padding="valid", activation='relu'))
        diff_model.add(layers.Dropout(rate=dropout))
        diff_model.add(layers.MaxPool1D(pool_size=2, strides=2, padding="valid"))

    # Classification head: one ReLU layer followed by a 5-way softmax.
    diff_model.add(layers.Flatten())
    diff_model.add(layers.Dense(32, activation='relu'))
    diff_model.add(layers.Dense(5, activation='softmax'))

    print(diff_model.summary())

    diff_model.compile(optimizer="adam",
                       loss=losses.sparse_categorical_crossentropy,
                       metrics=['accuracy'])

    # The optimizer is created from its string name, so the learning rate is assigned afterwards.
    K.set_value(diff_model.optimizer.learning_rate, lr)

    # Stop / reduce the learning rate on stalled validation loss; checkpoint on best validation accuracy.
    callbacks_list = [
        EarlyStopping(monitor="val_loss", patience=10, verbose=1),
        ReduceLROnPlateau(monitor="val_loss", patience=5, verbose=1),
        ModelCheckpoint(filepath=save_name, save_weights_only=True,
                        monitor='val_accuracy', mode='max', save_best_only=True),
    ]

    # Add the trailing channel axis that Conv1D expects.
    train_inputs_diffed = X_train_diffed.reshape(-1, dimension, 1)
    dev_inputs_diffed = X_dev_diffed.reshape(-1, dimension, 1)

    diff_model.fit(train_inputs_diffed, Y_train_diffed,
                   validation_data=(dev_inputs_diffed, Y_dev_diffed),
                   epochs=epochs, batch_size=1000, verbose=1,
                   callbacks=callbacks_list)

In [None]:
# Predicted class = position of the largest softmax output for each differenced test vector.
Y_pred = diff_model.predict(X_test_diffed.reshape(-1,dimension,1))
Y_pred = np.argmax(Y_pred, axis=-1)

# Score against Y_test_diffed, the labels returned together with X_test_diffed.
# Y_test is deleted earlier in this section and is not the label array paired with these inputs.
print("Accuracy: " ,accuracy_score(Y_test_diffed, Y_pred))
print("F1 Score: " ,f1_score(Y_test_diffed, Y_pred, average='weighted') )

# Confusion matrix normalised over the true labels (each row sums to 1).
cm = confusion_matrix(Y_test_diffed, Y_pred, normalize = "true")
cmd = ConfusionMatrixDisplay(confusion_matrix=cm)
cmd.plot()

# CNN on Concatenated Embeddings - WAY TOO SLOW

The idea behind this model is to find an alternative to sequential processing. Word embeddings are concatenated and the padded sections of the sequence are masked in the matrix. The embedded sentence is therefore in the form of a 2D matrix that will not be treated sequentially but given to convolutional layers. 

- Model is too slow to run effectively, its design must have issues causing an unforeseen computational bottleneck

In [None]:
def _reload_indices_and_labels(split):
    """Re-read one preprocessed split and return its (stacked "idx" array, "label" values).

    NOTE: pd.read_pickle unpickles arbitrary objects - only point it at files you produced yourself.
    """
    frame = pd.read_pickle(f"PreprocessedData/{split}_{suffix}_w2v.pkl")
    return np.stack(frame["idx"].values), frame['label'].values


# The full-dataset index arrays and labels are deleted in the differenced-embeddings section
# to free memory, so they are rebuilt here before this section uses them.
X_train_idx, Y_train = _reload_indices_and_labels("train")
X_dev_idx, Y_dev = _reload_indices_and_labels("dev")
X_test_idx, Y_test = _reload_indices_and_labels("test")

# Settings for the Conv2D model on the stacked word embeddings.
dropout=0
units=32 # = filters
max_sent_len = len(X_train_idx[0])   # length of the padded index sequences
lr = 0.001
epochs = 40
dimension=kv.vector_size             # length of one Word2Vec vector

# Path the best weights are checkpointed to.
save_name = f"./TrainedModels/conv2d_{suffix}" + ".h5"

In [None]:
with tf.device(device_name):
    cnn_model = Sequential()

    # Frozen Word2Vec lookup table with an all-zero row stacked on top for the masked index 0.
    embed_layer = Embedding(input_dim=kv.vectors.shape[0]+1,
                            output_dim=kv.vectors.shape[1],
                            weights=[np.vstack((np.zeros((1, kv.vectors.shape[1])),kv.vectors))],
                            input_length=max_sent_len,
                            mask_zero=True,
                            trainable=False)
    cnn_model.add(embed_layer)

    # Treat the (max_sent_len, dimension) embedding matrix as a one-channel image.
    cnn_model.add(layers.Reshape((max_sent_len,dimension,1),input_shape=(max_sent_len,dimension)))

    # Five Conv2D / MaxPool2D stages; dropout follows the 2nd, 4th and 5th convolution.
    cnn_model.add(layers.Conv2D(8, kernel_size=5,    activation="relu"))
    cnn_model.add(layers.MaxPool2D(pool_size=2,  ))
    cnn_model.add(layers.Conv2D(8, kernel_size=3,   activation="relu"))
    cnn_model.add(layers.Dropout(rate=dropout))
    cnn_model.add(layers.MaxPool2D(pool_size=2, ))
    cnn_model.add(layers.Conv2D(16, kernel_size=3,  activation="relu"))
    cnn_model.add(layers.MaxPool2D(pool_size=2, ))
    cnn_model.add(layers.Conv2D(16, kernel_size=3,   activation="relu"))
    cnn_model.add(layers.Dropout(rate=dropout))
    cnn_model.add(layers.MaxPool2D(pool_size=2, ))
    cnn_model.add(layers.Conv2D(16, kernel_size=3,   activation="relu"))
    cnn_model.add(layers.Dropout(rate=dropout))
    cnn_model.add(layers.MaxPool2D(pool_size=2, ))

    # Flatten comes from the imported `layers` namespace; the bare `keras` name is not
    # imported in this notebook.
    cnn_model.add(layers.Flatten())

    # Classification head: one ReLU layer followed by a 5-way softmax.
    cnn_model.add(layers.Dense(units, activation='relu'))
    cnn_model.add(layers.Dense(5, activation='softmax'))

    print(cnn_model.summary())

    cnn_model.compile(loss=losses.sparse_categorical_crossentropy,
                              optimizer="adam", metrics=['accuracy'])

    # The optimizer is created from its string name, so the learning rate is assigned afterwards.
    K.set_value(cnn_model.optimizer.learning_rate, lr)

    # Stop / reduce the learning rate on stalled validation loss; checkpoint on best validation accuracy.
    early = EarlyStopping(monitor="val_loss", patience=10, verbose=1)
    redonplat = ReduceLROnPlateau(monitor="val_loss", patience=5, verbose=1)
    checkpoint = ModelCheckpoint( filepath=save_name, save_weights_only=True,
                                                 monitor='val_accuracy', mode='max', save_best_only=True)

    callbacks_list = [early, redonplat,checkpoint]

    cnn_model.fit(X_train_idx, Y_train, epochs=epochs, batch_size=500,
                                    verbose=1, validation_data=(X_dev_idx,Y_dev), callbacks=callbacks_list )

In [None]:
# Predicted class = position of the largest softmax output for each test sentence.
Y_pred = np.argmax(cnn_model.predict(X_test_idx), axis=-1)

# Test-set scores; the F1 score is the support-weighted average over the classes.
test_accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy: " ,test_accuracy)
test_f1 = f1_score(Y_test, Y_pred, average='weighted')
print("F1 Score: " ,test_f1)

# Confusion matrix normalised over the true labels (each row sums to 1).
cm = confusion_matrix(Y_test, Y_pred, normalize = "true")
cmd = ConfusionMatrixDisplay(confusion_matrix=cm)
cmd.plot()