## Dependencies

In [None]:
!pip install gensim

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import sklearn as sk
import gensim as gs
import ast
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
#! report which GPUs (if any) are visible to TensorFlow
gpus = tf.config.list_physical_devices('GPU')

if not gpus:
    print("TensorFlow is not using any GPUs.")
else:
    print(f"TensorFlow is using {len(gpus)} GPU(s).")
    for device in gpus:
        print(f"GPU: {device.name}")

## Load, vectorize and pad the training data with labels

In [4]:
def load_data_labels(data_path, labels_path):
    """Load the examples and their labels from two line-oriented text files.

    Every non-blank line of either file must contain one Python literal; it is
    parsed with ``ast.literal_eval``. Blank or whitespace-only lines are
    skipped instead of crashing the parser.

    Args:
        data_path: Path of the file holding one example per line.
        labels_path: Path of the file holding one label entry per line.

    Returns:
        A ``(data, labels)`` tuple of lists, each in file order.

    Raises:
        OSError: If one of the files cannot be opened.
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when a
            non-blank line is not a valid Python literal.
    """
    def _read_literals(path):
        # Iterate over the file object directly so the whole file is never
        # held in memory as a list of raw lines.
        with open(path, 'r') as f:
            stripped_lines = (line.strip() for line in f)
            return [ast.literal_eval(text) for text in stripped_lines if text]

    data = _read_literals(data_path)
    labels = _read_literals(labels_path)
    return data, labels

In [5]:
#! read the training sentences and their labels from disk
data, labels = load_data_labels(
    'training_data_processed.txt',
    'order_category_labels.txt',
)

In [None]:
#! peek at the first five sentences and the first five label entries
for preview in (data[:5], labels[:5]):
    print(preview)

In [7]:
#! load the pre-trained vectors stored in binary word2vec format as gensim KeyedVectors
model = gs.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
)


In [8]:
#! replace (in place) every training word that has no pre-trained vector with the unknown token;
#! 'a' is kept as-is because the embedding cell gives it the vector of 'one'
unk_token = "UNK"
for sentence in data:
    for i, word in enumerate(sentence):
        if word not in model and word != 'a':
            sentence[i] = unk_token
#! extract the vocabulary from the data
vocab = set()
for sentence in data:
    vocab.update(sentence)
#! always keep the unknown token in the vocabulary, even when no training word was replaced:
#! the dev-set cell maps unseen words to it and looks its index up in word_to_index
vocab.add(unk_token)


In [None]:
#! show the first five sentences again, now that words without a vector were replaced by the unknown token
print(data[:5])

In [10]:
#! get the embeddings matrix
embedding_dim = 300
#! sort the vocabulary so that every word gets the same index on every run
#! (the iteration order of a set of strings can change between kernel sessions)
#! assumes all tokens are strings, otherwise sorted() cannot compare them -- TODO confirm
word_to_index = {word: idx for idx, word in enumerate(sorted(vocab))}
#! start from zeros: any word that gets no vector below keeps an all-zero row,
#! so no random initialisation is needed
embedding_matrix = np.zeros((len(vocab), embedding_dim))
for word, idx in word_to_index.items():
    if word in model:
        embedding_matrix[idx] = model[word]  #! Use pre-trained embedding
    elif word == 'a':
        #! 'a' borrows the vector of 'one' (presumably 'a' is missing from the pre-trained vectors)
        embedding_matrix[idx] = model['one']


In [11]:
#! sizes shared by the padding cell and the model cell
embedding_dim = 300        #! width of every word vector
input_dim = len(vocab)     #! number of rows of the embedding layer (vocabulary size)
output_dim = 3             #! units of the softmax output layer (presumably labels 0, 1 and the padding label 2)
max_sequence_length = 100  #! length every sentence is padded / truncated to

In [12]:
#! encode every sentence as a list of vocabulary indices
sequences = []
for text in data:
    sequences.append([word_to_index[word] for word in text])
#! bring all sentences to the same length; -1 is used as filler because it is not a valid index
X = pad_sequences(sequences, maxlen=max_sequence_length, padding='post', value=-1)
#! bring the labels to the same length, filling with the "none" label 2
Y = pad_sequences(labels, maxlen=max_sequence_length, padding='post', value=2)

In [13]:
#! model: (masking) -> embedding -> bidirectional LSTM -> dropout -> per-token softmax
#! NOTE(review): Masking compares its input with mask_value along the last axis. For this
#! 2-D (batch, time) integer input that is the time axis, and the Embedding below is built
#! with mask_zero=False, so the -1 padding is probably NOT masked per time step and -1 is
#! passed to the embedding lookup. Padding with 0 and using mask_zero=True looks like the
#! intended setup -- confirm before relying on the masking.
sequence_layers = [
    tf.keras.layers.Masking(mask_value=-1),
    tf.keras.layers.Embedding(
        input_dim=input_dim,
        output_dim=embedding_dim,
        weights=[embedding_matrix],  #! initialise with the pre-trained vectors
        trainable=True,              #! the vectors are fine-tuned during training
        mask_zero=False,
    ),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences=True)),
    tf.keras.layers.Dropout(0.6),
    tf.keras.layers.Dense(output_dim, activation='softmax'),
]
Bidirectional_LSTM_model = tf.keras.Sequential(sequence_layers)

#! halve the learning rate when the monitored loss stops improving
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='loss',  #! training loss (fit is called without validation data)
    factor=0.5,      #! multiply the learning rate by this factor on a plateau
    patience=5,      #! epochs without improvement before the rate is reduced
    verbose=1,       #! print a message whenever the rate is reduced
    min_lr=1e-9,     #! never go below this learning rate
)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
Bidirectional_LSTM_model.compile(
    loss='sparse_categorical_crossentropy',  #! labels are integer class ids, one per token
    optimizer=optimizer,
    metrics=['accuracy'],
)


In [None]:
#! vocabulary size used as input_dim of the embedding layer
print(input_dim)

In [None]:
#! train on the padded sentences and labels
#! NOTE(review): lr_scheduler waits 5 epochs without improvement, so with epochs=2 it
#! cannot reduce the learning rate -- confirm whether more epochs were intended
Bidirectional_LSTM_model.fit(
    X,
    Y,
    epochs=2,
    batch_size=1024,
    callbacks=[lr_scheduler],
)

In [None]:
#! print the layer-by-layer summary of the model
Bidirectional_LSTM_model.summary()

## Training Set Evaluation

In [None]:
#! predict on (at most) the first 100000 training sentences
train_subset = X[:100000]
preds_train = Bidirectional_LSTM_model.predict(train_subset)

In [18]:
#! keep, for every token, the index of the highest-scoring class (arg-max over axis 2)
#! NOTE: this overwrites preds_train, so run this cell exactly once after the predict cell
preds_train = preds_train.argmax(axis=2)

In [None]:
#! exact-match accuracy on the evaluated training sentences:
#! a sentence counts as correct only when every non-padded token has the right label
n_eval = len(preds_train)  #! can be smaller than 100000 when the training set is smaller
count = 0
for i in range(n_eval):
    mask = X[i] != -1  #! padded positions are excluded from the comparison
    if np.array_equal(preds_train[i][mask], Y[i][mask]):
        count += 1
print(count / n_eval)

## Dev Set Evaluation

In [20]:
#! read the dev sentences and their labels with the same loader as the training data
dev_data, dev_labels = load_data_labels(
    'dev_data_processed.txt',
    'dev_order_category_labels.txt',
)

In [21]:
#! map (in place) every dev word that was never seen in training to the unknown token
for tokens in dev_data:
    for pos in range(len(tokens)):
        if tokens[pos] not in vocab:
            tokens[pos] = unk_token
#! encode the dev sentences as vocabulary indices
sequences_d = []
for tokens in dev_data:
    sequences_d.append([word_to_index[token] for token in tokens])
#! same padding as for the training set: -1 for the inputs ...
Xd = pad_sequences(sequences_d, maxlen=max_sequence_length, padding='post', value=-1)
#! ... and the "none" label 2 for the labels
Yd = pad_sequences(dev_labels, maxlen=max_sequence_length, padding='post', value=2)

In [None]:
#! run the trained model on the padded dev sentences
preds_dev=Bidirectional_LSTM_model.predict(Xd)

In [23]:
#! keep, for every token, the index of the highest-scoring class (arg-max over axis 2)
#! NOTE: this overwrites preds_dev, so run this cell exactly once after the predict cell
preds_dev = preds_dev.argmax(axis=2)

In [None]:
#! exact-match accuracy on the dev set, printing prediction and ground truth of every sentence;
#! a sentence counts as correct only when every non-padded token has the right label
count = 0
total = len(dev_data)
separator = "-----------------------------------------------------------"
for i in range(total):
    real_tokens = Xd[i] != -1  #! padded positions are excluded from the comparison
    predicted = preds_dev[i][real_tokens]
    expected = Yd[i][real_tokens]
    count += int(np.array_equal(predicted, expected))
    print(i)
    print(predicted)
    print(expected)
    print(separator)
print(count / total)