## Dependencies

In [1]:
!pip install gensim



In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Masking, Embedding, Bidirectional, LSTM, Attention, Dropout, Dense
from tensorflow.keras import Model, Input
import gensim as gs
import ast
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [3]:
#! check if tensorflow is using GPU
gpus = tf.config.list_physical_devices('GPU')

if gpus:
    print(f"TensorFlow is using {len(gpus)} GPU(s).")
    for gpu in gpus:
        print(f"GPU: {gpu.name}")
else:
    print("TensorFlow is not using any GPUs.")

TensorFlow is using 1 GPU(s).
GPU: /physical_device:GPU:0


In [52]:
def load_data_labels(data_path, labels_path):
    with open(data_path, 'r') as f:
        data = [ast.literal_eval(line.strip()) for line in f]
    with open(labels_path, 'r') as f:
        labels = [ast.literal_eval(line.strip()) for line in f]
    return data, labels

In [None]:
# data, labels = load_data_labels('training_data_processed.txt', 'train_order_category_labels.txt')
data, labels = load_data_labels('train_data_order_details.txt', 'train_labels_order_details.txt')
# data2, labels2 = load_data_labels('synthetic_orders.txt', 'synthetic_labels.txt')
# data.extend(data2)
# labels.extend(labels2)
print(len(data))
print(len(labels))
print(data[:5])
print(labels[:5])

51000
51000
[['i', 'like', 'one', 'pizza', 'with', 'red', 'onion', 'fry', 'onion', 'and', 'mozarella', 'without', 'thin', 'crust'], ['i', 'like', 'one', 'pizza', 'with', 'anchovy', 'caramelize', 'red', 'onion', 'and', 'roast', 'green', 'pepper', 'without', 'thin', 'crust'], ['i', 'like', 'one', 'pizza', 'with', 'applewood', 'bacon', 'grill', 'pineapple', 'and', 'shrimp', 'without', 'thin', 'crust'], ['i', 'like', 'one', 'pizza', 'with', 'pesto', 'sauce', 'roast', 'pepper', 'and', 'peppperonis', 'without', 'thin', 'crust'], ['i', 'like', 'one', 'pizza', 'with', 'peperronni', 'spicy', 'red', 'sauce', 'and', 'mushroom', 'without', 'thin', 'crust']]
[[0, 0, 1, 0, 0, 4, 4, 4, 4, 0, 4, 0, 7, 7], [0, 0, 1, 0, 0, 4, 4, 4, 4, 0, 4, 4, 4, 0, 7, 7], [0, 0, 1, 0, 0, 4, 4, 4, 4, 0, 4, 0, 7, 7], [0, 0, 1, 0, 0, 4, 4, 4, 4, 0, 4, 0, 7, 7], [0, 0, 1, 0, 0, 4, 4, 4, 4, 0, 4, 0, 7, 7]]


In [104]:
# dev_data, dev_labels = load_data_labels('dev_data_processed.txt', 'dev_order_category_labels.txt')
dev_data, dev_labels = load_data_labels('dev_data_order_details.txt', 'dev_labels_order_details.txt')

In [7]:
pretrained_model = gs.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [105]:
#! get v Aand replace unknown words with unk token
def process_sentence(sentence, model):
    for i, word in enumerate(sentence):
        if word not in model:
            sentence[i] = 'unk'
    return sentence

data = [process_sentence(sentence, pretrained_model) for sentence in data]
print(data[:5])
vocab=set()
for sentence in data:
    vocab.update(sentence)
#! get word index for each word in vocab
word2idx = {word: idx for idx, word in enumerate(vocab)}

[['i', 'like', 'one', 'pizza', 'with', 'red', 'onion', 'fry', 'onion', 'unk', 'mozarella', 'without', 'thin', 'crust'], ['i', 'like', 'one', 'pizza', 'with', 'anchovy', 'caramelize', 'red', 'onion', 'unk', 'roast', 'green', 'pepper', 'without', 'thin', 'crust'], ['i', 'like', 'one', 'pizza', 'with', 'applewood', 'bacon', 'grill', 'pineapple', 'unk', 'shrimp', 'without', 'thin', 'crust'], ['i', 'like', 'one', 'pizza', 'with', 'pesto', 'sauce', 'roast', 'pepper', 'unk', 'unk', 'without', 'thin', 'crust'], ['i', 'like', 'one', 'pizza', 'with', 'unk', 'spicy', 'red', 'sauce', 'unk', 'mushroom', 'without', 'thin', 'crust']]


In [106]:
embedding_dim=300
input_dim=len(vocab)
output_dim=11
max_length=100

In [107]:
for tokens in dev_data:
    for i,word in enumerate(tokens):
        if word not in vocab:
            tokens[i] = 'unk'
X_d=[[word2idx[word] for word in sentence] for sentence in dev_data]
X_d=pad_sequences(X_d, maxlen=max_length, padding='post', value=-1)
Y_d=pad_sequences(dev_labels, maxlen=max_length, padding='post', value=0)

In [108]:
#! get embeddings matrix
def get_embeddings_matrix(model, vocab):
    vocab_size = len(vocab)
    embedding_dim = model.vector_size
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for i, word in enumerate(vocab):
        if word in model:
            embedding_matrix[i] = model[word]
    return embedding_matrix
embedding_matrix = get_embeddings_matrix(pretrained_model, vocab)

In [109]:
#! replace words with their index in vocab and pad sentences
X=[[word2idx[word] for word in sentence] for sentence in data]
X=pad_sequences(X, maxlen=max_length, padding='post', value=-1)
Y=pad_sequences(labels, maxlen=max_length, padding='post', value=0)

In [110]:
# input_seq = Input(shape=(None,), dtype=tf.int32)
# masked_input = Masking(mask_value=-1)(input_seq)

# embedding = Embedding(input_dim=input_dim, output_dim=embedding_dim, weights=[embedding_matrix], trainable=True, mask_zero=False)(masked_input)

# lstm_output = Bidirectional(LSTM(32, return_sequences=True))(embedding)

# query = LSTM(32, return_sequences=True)(lstm_output)  
# key = LSTM(32, return_sequences=True)(lstm_output)    
# value = LSTM(32, return_sequences=True)(lstm_output)  

# attention_output = Attention(use_scale=True, causal=False)([query, key, value])

# dropout_output = Dropout(0.8)(attention_output)
# output = Dense(output_dim, activation='softmax')(dropout_output)

# model = Model(inputs=input_seq, outputs=output)

# optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)

# model.compile(
#     optimizer=optimizer,
#     loss='sparse_categorical_crossentropy',
#     metrics=['accuracy']
# )

# model.summary()

# lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
#     monitor='val_loss',
#     factor=0.5,
#     patience=5,
#     min_lr=1e-9,
#     verbose=1
# )

In [111]:
# model.fit(
#     X, 
#     Y, 
#     validation_data=(X_d, Y_d),  # Add validation data
#     callbacks=[lr_scheduler], 
#     epochs=25,
#     batch_size=1024
# )


In [112]:
#! model
model = tf.keras.Sequential([
    tf.keras.layers.Masking(mask_value=-1),  #! Masking layer to handle the padded values
    tf.keras.layers.Embedding(input_dim=input_dim, output_dim=embedding_dim, weights=[embedding_matrix], trainable=True,mask_zero=False),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Dropout(0.6),
    tf.keras.layers.Dense(output_dim, activation='softmax')
])
## dropout->.8 , layers->48 best acc->70%
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',           #! Monitor the validation loss (you can use 'accuracy' or another metric)
    factor=0.5,               #! Factor by which the learning rate will be reduced
    patience=5,               #! Number of epochs with no improvement after which learning rate will be reduced
    min_lr=1e-9,              #! Lower bound on the learning rate
    verbose=1                 #! Print a message when the learning rate is reduced
)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [113]:
# X was index of token in embedding matrix
# X -> 

model.fit(
    X, 
    Y, 
    validation_data=(X_d, Y_d),  # Add validation data
    callbacks=[lr_scheduler], 
    epochs=25,
    batch_size=512
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x2a08aff58e0>

In [114]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking_4 (Masking)         (None, 100)               0         
                                                                 
 embedding_4 (Embedding)     (None, 100, 300)          112800    
                                                                 
 bidirectional_4 (Bidirectio  (None, 100, 128)         186880    
 nal)                                                            
                                                                 
 dropout_4 (Dropout)         (None, 100, 128)          0         
                                                                 
 dense_4 (Dense)             (None, 100, 11)           1419      
                                                                 
Total params: 301,099
Trainable params: 301,099
Non-trainable params: 0
________________________________________________

## Evaluate model on training data

In [115]:
preds_train = model.predict(X)
preds_train = np.argmax(preds_train, axis=-1)



In [116]:
count = 0
for i in range(len(X)):
    mask=X[i]!=-1
    if np.all(preds_train[i][mask]==Y[i][mask]):
        count+=1
print(f"Accuracy on training data: {count/len(X)}")

Accuracy on training data: 0.9972745098039215


In [117]:
# # Predict on the development set
# preds_dev = Bidirectional_LSTM_model.predict(X)
# preds_dev = np.argmax(preds_dev, axis=-1)

# count = 0  # Count of completely correct sequences
# last_index_error_count = 0  # Count of sequences with only last index error

# for i in range(len(data)):
#     # Get original sequence lengths before padding
#     original_length = len(data[i])
    
#     # Extract predictions and true labels for the original sequence
#     pred_seq = preds_dev[i][:original_length]
#     true_seq = labels[i]  # Original labels are unpadded
    
#     # Check if the sequence is entirely correct
#     if (pred_seq == true_seq).all():
#         count += 1
#     else:
#         # Check if only the last index is incorrect
#         if (pred_seq[:-1] == true_seq[:-1]).all() and (pred_seq[-1] != true_seq[-1]):
#             last_index_error_count += 1
        
#         # Print debug information for mismatches
#         print(f"Index with mismatch: {i}")
#         print(f"Predicted: {pred_seq}")
#         print(f"True:      {true_seq}")
#         print("--------------------------------------------")

# # Print results
# print(f"Sequences with only last index error: {last_index_error_count}")


## Evaluate model on dev data

In [118]:
# dev_data, dev_labels = load_data_labels('dev_data_processed.txt', 'dev_order_category_labels.txt')
# for tokens in dev_data:
#     for i,word in enumerate(tokens):
#         if word not in vocab!= 'a':
#             tokens[i] = 'unk'
# X_d=[[word2idx[word] for word in sentence] for sentence in dev_data]
# X_d=pad_sequences(X_d, maxlen=max_length, padding='post', value=-1)
# Y_d=pad_sequences(dev_labels, maxlen=max_length, padding='post', value=2)

In [119]:
# Predict on the development set
preds_dev = model.predict(X_d)
preds_dev = np.argmax(preds_dev, axis=-1)

count = 0  # Count of completely correct sequences
last_index_error_count = 0  # Count of sequences with only last index error

for i in range(len(dev_data)):
    # Get original sequence lengths before padding
    original_length = len(dev_data[i])
    
    # Extract predictions and true dev_labels for the original sequence
    pred_seq = preds_dev[i][:original_length]
    true_seq = dev_labels[i]  # Original dev_labels are unpadded
    
    # Check if the sequence is entirely correct
    if (pred_seq == true_seq).all():
        count += 1
    else:
        # Check if only the last index is incorrect
        if (pred_seq[:-1] == true_seq[:-1]).all() and (pred_seq[-1] != true_seq[-1]):
            last_index_error_count += 1
        
        # Print debug information for mismatches
        print(f"Index with mismatch: {i}")
        
        print(f"Predicted: {pred_seq}")
        print(f"True:      {true_seq}")
        print("--------------------------------------------")

# Print results
print(f"Accuracy on dev_data: {count / len(dev_data):.4f}")
print(f"Sequences with only last index error: {last_index_error_count}")


Index with mismatch: 74
Predicted: [0 0 0 0 1 2 3 3 3 0 0 1 2 3 3 4 0]
True:      [0, 0, 0, 0, 1, 2, 3, 3, 4, 0, 0, 1, 2, 3, 3, 4, 0]
--------------------------------------------
Index with mismatch: 177
Predicted: [0 0 0 0 1 2 0 0 4 4 0 4 0 0 0 1 9 0 8 0 0]
True:      [0, 1, 0, 0, 0, 2, 0, 0, 4, 4, 0, 4, 0, 0, 0, 1, 9, 0, 8, 0, 0]
--------------------------------------------
Index with mismatch: 188
Predicted: [0 0 0 0 1 2 4 0 1 2 4 0 0 4 4 0 1 2 0 0 4 4 4 0 4 4]
True:      [0, 1, 0, 0, 0, 2, 4, 0, 1, 2, 4, 0, 0, 4, 4, 0, 1, 2, 0, 0, 4, 4, 4, 0, 4, 4]
--------------------------------------------
Index with mismatch: 192
Predicted: [0 0 0 0 0 1 2 0 0 4 0 1 3 3 0 0 0 0 0 0 6 0 0 0]
True:      [0, 0, 0, 0, 0, 1, 2, 0, 0, 4, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0]
--------------------------------------------
Index with mismatch: 193
Predicted: [0 0 0 0 1 2 2 3 0 0 1 2 8]
True:      [0, 1, 0, 0, 0, 2, 2, 3, 0, 0, 1, 2, 8]
--------------------------------------------
Index with mismatch: 