## Dependencies

In [41]:
!pip install gensim



In [42]:
import numpy as np
import tensorflow as tf
import gensim as gs
import ast
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [43]:
#! check if tensorflow is using GPU
gpus = tf.config.list_physical_devices('GPU')

if gpus:
    print(f"TensorFlow is using {len(gpus)} GPU(s).")
    for gpu in gpus:
        print(f"GPU: {gpu.name}")
else:
    print("TensorFlow is not using any GPUs.")

TensorFlow is using 1 GPU(s).
GPU: /physical_device:GPU:0


In [44]:
def load_data_labels(data_path, labels_path):
    with open(data_path, 'r') as f:
        data = [ast.literal_eval(line.strip()) for line in f]
    with open(labels_path, 'r') as f:
        labels = [ast.literal_eval(line.strip()) for line in f]
    return data, labels

In [45]:
data, labels = load_data_labels('training_data_processed.txt', 'train_order_category_labels.txt')
print(data[:5])
print(labels[:5])

KeyboardInterrupt: 

In [None]:
dev_data, dev_labels = load_data_labels('dev_data_processed.txt', 'dev_order_category_labels.txt')


In [None]:
pretrained_model = gs.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
#! get v Aand replace unknown words with unk token
def process_sentence(sentence, model):
    for i, word in enumerate(sentence):
        if word not in model and word  != 'a':
            sentence[i] = 'unk'
    return sentence

data = [process_sentence(sentence, pretrained_model) for sentence in data]
print(data[:5])
vocab=set()
for sentence in data:
    vocab.update(sentence)
#! get word index for each word in vocab
word2idx = {word: idx for idx, word in enumerate(vocab)}

[['i', 'like', 'a', 'pizza', 'with', 'carrot', 'ricotta', 'unk', 'green', 'olive', 'with', 'thin', 'crust'], ['i', 'like', 'three', 'pizza', 'no', 'american', 'cheese', 'unk', 'one', 'sprite', 'unk', 'three', 'san', 'unk'], ['a', 'sprite', 'unk', 'five', 'three', 'liter', 'coke', 'zero', 'unk', 'three', 'unk', 'ml', 'water'], ['four', 'seven', 'up', 'unk', 'a', 'medium', 'ginger', 'ale', 'unk', 'five', 'unk', 'fluid', 'ounce', '7', 'up'], ['i', 'like', 'a', 'pizza', 'with', 'caramelize', 'red', 'onion', 'arugula', 'unk', 'lettuce']]


In [None]:
embedding_dim=300
input_dim=len(vocab)
output_dim=3
max_length=100

In [None]:
for tokens in dev_data:
    for i,word in enumerate(tokens):
        if word not in vocab!= 'a':
            tokens[i] = 'unk'
X_d=[[word2idx[word] for word in sentence] for sentence in dev_data]
X_d=pad_sequences(X_d, maxlen=max_length, padding='post', value=-1)
Y_d=pad_sequences(dev_labels, maxlen=max_length, padding='post', value=2)

In [None]:
#! get embeddings matrix
def get_embeddings_matrix(model, vocab):
    vocab_size = len(vocab)
    embedding_dim = model.vector_size
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for i, word in enumerate(vocab):
        if word in model:
            embedding_matrix[i] = model[word]
        elif word == 'a':
            embedding_matrix[i] = model['one']
    return embedding_matrix
embedding_matrix = get_embeddings_matrix(pretrained_model, vocab)

In [None]:
#! replace words with their index in vocab and pad sentences
X=[[word2idx[word] for word in sentence] for sentence in data]
X=pad_sequences(X, maxlen=max_length, padding='post', value=-1)
Y=pad_sequences(labels, maxlen=max_length, padding='post', value=2)

In [None]:
#! model
Bidirectional_LSTM_model = tf.keras.Sequential([
    tf.keras.layers.Masking(mask_value=-1),  #! Masking layer to handle the padded values
    tf.keras.layers.Embedding(input_dim=input_dim, output_dim=embedding_dim, weights=[embedding_matrix], trainable=True,mask_zero=False),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences=True)),
    tf.keras.layers.Dropout(0.8),
    tf.keras.layers.Dense(output_dim, activation='softmax')
])
## dropout->.8 , layers->48 best acc->70%
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',           #! Monitor the validation loss (you can use 'accuracy' or another metric)
    factor=0.5,               #! Factor by which the learning rate will be reduced
    patience=5,               #! Number of epochs with no improvement after which learning rate will be reduced
    min_lr=1e-9,              #! Lower bound on the learning rate
    verbose=1                 #! Print a message when the learning rate is reduced
)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.003)
Bidirectional_LSTM_model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [46]:
Bidirectional_LSTM_model.fit(
    X, 
    Y, 
    validation_data=(X_d, Y_d),  # Add validation data
    callbacks=[lr_scheduler], 
    epochs=10,
    batch_size=512
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10

KeyboardInterrupt: 

In [None]:
Bidirectional_LSTM_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking_1 (Masking)         (None, 100)               0         
                                                                 
 embedding_1 (Embedding)     (None, 100, 300)          64800     
                                                                 
 bidirectional_1 (Bidirectio  (None, 100, 64)          85248     
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 100, 64)           0         
                                                                 
 dense_1 (Dense)             (None, 100, 3)            195       
                                                                 
Total params: 150,243
Trainable params: 150,243
Non-trainable params: 0
________________________________________________

## Evaluate model on training data

In [None]:
preds_train = Bidirectional_LSTM_model.predict(X)
preds_train = np.argmax(preds_train, axis=-1)



In [None]:
count = 0
for i in range(len(X)):
    mask=X[i]!=-1
    if np.all(preds_train[i][mask]==Y[i][mask]):
        count+=1
print(f"Accuracy on training data: {count/len(X)}")

Accuracy on training data: 0.9535


In [None]:
# Predict on the development set
preds_dev = Bidirectional_LSTM_model.predict(X)
preds_dev = np.argmax(preds_dev, axis=-1)

count = 0  # Count of completely correct sequences
last_index_error_count = 0  # Count of sequences with only last index error

for i in range(len(data)):
    # Get original sequence lengths before padding
    original_length = len(data[i])
    
    # Extract predictions and true labels for the original sequence
    pred_seq = preds_dev[i][:original_length]
    true_seq = labels[i]  # Original labels are unpadded
    
    # Check if the sequence is entirely correct
    if (pred_seq == true_seq).all():
        count += 1
    else:
        # Check if only the last index is incorrect
        if (pred_seq[:-1] == true_seq[:-1]).all() and (pred_seq[-1] != true_seq[-1]):
            last_index_error_count += 1
        
        # Print debug information for mismatches
        print(f"Index with mismatch: {i}")
        print(f"Predicted: {pred_seq}")
        print(f"True:      {true_seq}")
        print("--------------------------------------------")

# Print results
print(f"Sequences with only last index error: {last_index_error_count}")


Index with mismatch: 21
Predicted: [2 2 0 0 0 0 0 0 0 0 0 2]
True:      [2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
--------------------------------------------
Index with mismatch: 223
Predicted: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
True:      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
--------------------------------------------
Index with mismatch: 234
Predicted: [1 1 2 1 1 1 1 1 2 1 1 2]
True:      [1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1]
--------------------------------------------
Index with mismatch: 239
Predicted: [2 2 0 0 0 0 0 0 0 0 0 0]
True:      [2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0]
--------------------------------------------
Index with mismatch: 242
Predicted: [2 2 0 0 0 0 0 0 0 0 0 0 0]
True:      [2, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0]
--------------------------------------------
Index with mismatch: 254
Predicted: [2 2 0 0 0 0 2 0 0 0 2]
True:      [2, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0]
--------------------------------------------
Index with mismatch: 262
Predicted: [2 2 0 0 0 0 0 0 0 2

## Evaluate model on dev data

In [None]:
# dev_data, dev_labels = load_data_labels('dev_data_processed.txt', 'dev_order_category_labels.txt')
# for tokens in dev_data:
#     for i,word in enumerate(tokens):
#         if word not in vocab!= 'a':
#             tokens[i] = 'unk'
# X_d=[[word2idx[word] for word in sentence] for sentence in dev_data]
# X_d=pad_sequences(X_d, maxlen=max_length, padding='post', value=-1)
# Y_d=pad_sequences(dev_labels, maxlen=max_length, padding='post', value=2)

In [None]:
# Predict on the development set
preds_dev = Bidirectional_LSTM_model.predict(X_d)
preds_dev = np.argmax(preds_dev, axis=-1)

count = 0  # Count of completely correct sequences
last_index_error_count = 0  # Count of sequences with only last index error

for i in range(len(dev_data)):
    # Get original sequence lengths before padding
    original_length = len(dev_data[i])
    
    # Extract predictions and true dev_labels for the original sequence
    pred_seq = preds_dev[i][:original_length]
    true_seq = dev_labels[i]  # Original dev_labels are unpadded
    
    # Check if the sequence is entirely correct
    if (pred_seq == true_seq).all():
        count += 1
    else:
        # Check if only the last index is incorrect
        if (pred_seq[:-1] == true_seq[:-1]).all() and (pred_seq[-1] != true_seq[-1]):
            last_index_error_count += 1
        
        # Print debug information for mismatches
        print(f"Index with mismatch: {i}")
        print(f"Predicted: {pred_seq}")
        print(f"True:      {true_seq}")
        print("--------------------------------------------")

# Print results
print(f"Accuracy on dev_data: {count / len(dev_data):.4f}")
print(f"Sequences with only last index error: {last_index_error_count}")


Index with mismatch: 9
Predicted: [1 2 2 0 0 0 0 0 0 0]
True:      [2, 2, 2, 0, 0, 0, 0, 0, 0, 0]
--------------------------------------------
Index with mismatch: 14
Predicted: [0 2 0 0 0 0 0 0 0 0 0]
True:      [2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0]
--------------------------------------------
Index with mismatch: 23
Predicted: [0 2 0 0 0 0 0 0 0 0]
True:      [2, 2, 0, 0, 0, 0, 0, 0, 0, 0]
--------------------------------------------
Index with mismatch: 25
Predicted: [1 2 2 2 0 0 0 0 0 0 0 2]
True:      [2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2]
--------------------------------------------
Index with mismatch: 27
Predicted: [2 2 0 0 0 0 0 0 0]
True:      [2, 2, 0, 0, 0, 2, 0, 0, 0]
--------------------------------------------
Index with mismatch: 39
Predicted: [1 2 2 1 1 1 1 1 1 1 1]
True:      [2, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1]
--------------------------------------------
Index with mismatch: 44
Predicted: [0 2 0 0 0 0 0 0 0 0]
True:      [2, 2, 0, 0, 0, 0, 0, 0, 0, 0]
-----------------------