## Dependencies

In [6]:
!pip install gensim



In [7]:
import numpy as np
import tensorflow as tf
import gensim as gs
import ast
from tensorflow.keras.preprocessing.sequence import pad_sequences
import random


In [8]:
#! check if tensorflow is using GPU
gpus = tf.config.list_physical_devices('GPU')

if gpus:
    print(f"TensorFlow is using {len(gpus)} GPU(s).")
    for gpu in gpus:
        print(f"GPU: {gpu.name}")
else:
    print("TensorFlow is not using any GPUs.")

TensorFlow is using 1 GPU(s).
GPU: /physical_device:GPU:0


In [9]:
def load_dev(data_path, labels_path):  # load dev data
    with open(data_path, 'r') as f:
        data = [ast.literal_eval(line.strip()) for line in f]
    with open(labels_path, 'r') as f:
        labels = [ast.literal_eval(line.strip()) for line in f]
    return data, labels

def load_training(data_path, labels_path, num_lines, last_n_lines):
    def get_last_n_lines(file_path, n):
        with open(file_path, 'r') as file:
            file.seek(0, 2)  # Move to the end of the file
            file_size = file.tell()
            buffer_size = 1024
            lines = []

            while len(lines) <= n and file_size > 0:
                # Move backward by buffer size
                file_size -= buffer_size
                if file_size < 0:
                    file_size = 0
                file.seek(file_size)
                buffer = file.read(buffer_size)
                lines = buffer.splitlines() + lines
            
            # Take the last `n` lines
            return [ast.literal_eval(line.strip()) for line in lines[-n:]]
    
    def get_random_lines(file_path, indices):
        lines = []
        with open(file_path, 'r') as file:
            for i in indices:
                file.seek(0)  # Reset file pointer
                for current_line, line_content in enumerate(file):
                    if current_line == i:
                        lines.append(ast.literal_eval(line_content.strip()))
                        break
        return lines

    # Step 1: Load the last `last_n_lines` lines from both files
    data_last_lines = get_last_n_lines(data_path, last_n_lines)
    labels_last_lines = get_last_n_lines(labels_path, last_n_lines)
    print("Last lines loaded")
    # Step 2: Determine random indices for remaining lines
    remaining_lines = num_lines - len(data_last_lines)
    with open(data_path, 'r') as file:
        total_lines = sum(1 for _ in file)  # Total number of lines in the file

    random_indices = random.sample(range(total_lines), remaining_lines)

    # Step 3: Load the random lines from both files
    data_random_lines = get_random_lines(data_path, random_indices)
    labels_random_lines = get_random_lines(labels_path, random_indices)
    print("Random lines loaded")
    # Combine last lines and random lines
    data = data_last_lines + data_random_lines
    labels = labels_last_lines + labels_random_lines

    return data, labels





In [10]:
data, labels = load_training('training_data_processed1.txt', 'train_order_category_labels.txt',100,10)
print(data[:5])
print(labels[:5])

SyntaxError: unexpected EOF while parsing (<unknown>, line 1)

In [44]:
dev_data, dev_labels = load_dev('dev_data_processed.txt', 'dev_order_category_labels.txt')


In [7]:
pretrained_model = gs.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [73]:
#! get v Aand replace unknown words with unk token
def process_sentence(sentence, model):
    for i, word in enumerate(sentence):
        if word not in model and word  != 'a':
            sentence[i] = 'unk'
    return sentence

data = [process_sentence(sentence, pretrained_model) for sentence in data]
print(data[:5])
vocab=set()
for sentence in data:
    vocab.update(sentence)
#! get word index for each word in vocab
word2idx = {word: idx for idx, word in enumerate(vocab)}

[['two', 'unk', 'fl', 'ounce', 'diet', 'sprite', 'in', 'can', 'unk', 'three', 'ginger', 'ale', 'unk', 'four', 'unk', 'ml', 'san', 'unk'], ['three', 'pizza', 'no', 'american', 'cheese', 'unk', 'i', 'want', 'three', 'pie', 'no', 'bacon'], ['i', 'like', 'just', 'one', 'pizza', 'with', 'artichoke', 'unk', '2', '7', 'up'], ['i', 'like', 'a', 'personal', 'pizza', 'with', 'cherry', 'tomato', 'bbq', 'pull', 'pork', 'unk', 'roast', 'tomato'], ['i', 'like', 'a', 'big', 'new', 'yorker', 'pizza', 'with', 'american', 'cheese', 'unk', 'extra', 'mozzarella']]


In [74]:
embedding_dim=300
input_dim=len(vocab)
output_dim=5
max_length=100

In [75]:
for tokens in dev_data:
    for i,word in enumerate(tokens):
        if word not in vocab!= 'a':
            tokens[i] = 'unk'
X_d=[[word2idx[word] for word in sentence] for sentence in dev_data]
X_d=pad_sequences(X_d, maxlen=max_length, padding='post', value=-1)
Y_d=pad_sequences(dev_labels, maxlen=max_length, padding='post', value=4)

In [76]:
#! get embeddings matrix
def get_embeddings_matrix(model, vocab):
    vocab_size = len(vocab)
    embedding_dim = model.vector_size
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for i, word in enumerate(vocab):
        if word in model:
            embedding_matrix[i] = model[word]
        elif word == 'a':
            embedding_matrix[i] = model['one']
    return embedding_matrix
embedding_matrix = get_embeddings_matrix(pretrained_model, vocab)

In [77]:
#! replace words with their index in vocab and pad sentences
X=[[word2idx[word] for word in sentence] for sentence in data]
X=pad_sequences(X, maxlen=max_length, padding='post', value=-1)
Y=pad_sequences(labels, maxlen=max_length, padding='post', value=4)

In [78]:
#! model
Bidirectional_LSTM_model = tf.keras.Sequential([
    tf.keras.layers.Masking(mask_value=-1),  #! Masking layer to handle the padded values
    tf.keras.layers.Embedding(input_dim=input_dim, output_dim=embedding_dim, weights=[embedding_matrix], trainable=True,mask_zero=False),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences=True)),
    tf.keras.layers.Dropout(0.6),
    tf.keras.layers.Dense(output_dim, activation='softmax')
])
## dropout->.8 , layers->48 best acc->70%
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',           #! Monitor the validation loss (you can use 'accuracy' or another metric)
    factor=0.5,               #! Factor by which the learning rate will be reduced
    patience=5,               #! Number of epochs with no improvement after which learning rate will be reduced
    min_lr=1e-9,              #! Lower bound on the learning rate
    verbose=1                 #! Print a message when the learning rate is reduced
)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
Bidirectional_LSTM_model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [79]:
Bidirectional_LSTM_model.fit(
    X, 
    Y, 
    validation_data=(X_d, Y_d),  # Add validation data
    callbacks=[lr_scheduler], 
    epochs=10,
    batch_size=1024
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1d62b9e87c0>

In [80]:
Bidirectional_LSTM_model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking_5 (Masking)         (None, 100)               0         
                                                                 
 embedding_5 (Embedding)     (None, 100, 300)          64800     
                                                                 
 bidirectional_5 (Bidirectio  (None, 100, 64)          85248     
 nal)                                                            
                                                                 
 dropout_5 (Dropout)         (None, 100, 64)           0         
                                                                 
 dense_5 (Dense)             (None, 100, 5)            325       
                                                                 
Total params: 150,373
Trainable params: 150,373
Non-trainable params: 0
________________________________________________

## Evaluate model on training data

In [81]:
preds_train = Bidirectional_LSTM_model.predict(X)
preds_train = np.argmax(preds_train, axis=-1)



In [82]:
count = 0
for i in range(len(X)):
    mask=X[i]!=-1
    if np.all(preds_train[i][mask]==Y[i][mask]):
        count+=1
print(f"Accuracy on training data: {count/len(X)}")

Accuracy on training data: 0.76324


In [83]:
# # Predict on the development set
# preds_dev = Bidirectional_LSTM_model.predict(X)
# preds_dev = np.argmax(preds_dev, axis=-1)

# count = 0  # Count of completely correct sequences
# last_index_error_count = 0  # Count of sequences with only last index error

# for i in range(len(data)):
#     # Get original sequence lengths before padding
#     original_length = len(data[i])
    
#     # Extract predictions and true labels for the original sequence
#     pred_seq = preds_dev[i][:original_length]
#     true_seq = labels[i]  # Original labels are unpadded
    
#     # Check if the sequence is entirely correct
#     if (pred_seq == true_seq).all():
#         count += 1
#     else:
#         # Check if only the last index is incorrect
#         if (pred_seq[:-1] == true_seq[:-1]).all() and (pred_seq[-1] != true_seq[-1]):
#             last_index_error_count += 1
        
#         # Print debug information for mismatches
#         print(f"Predicted: {pred_seq}")
#         print(f"True:      {true_seq}")
#         print("--------------------------------------------")

# # Print results
# print(f"Sequences with only last index error: {last_index_error_count}")


## Evaluate model on dev data

In [84]:
# dev_data, dev_labels = load_dev('dev_data_processed.txt', 'dev_order_category_labels.txt')
# for tokens in dev_data:
#     for i,word in enumerate(tokens):
#         if word not in vocab!= 'a':
#             tokens[i] = 'unk'
# X_d=[[word2idx[word] for word in sentence] for sentence in dev_data]
# X_d=pad_sequences(X_d, maxlen=max_length, padding='post', value=-1)
# Y_d=pad_sequences(dev_labels, maxlen=max_length, padding='post', value=2)

In [85]:
# Predict on the development set
preds_dev = Bidirectional_LSTM_model.predict(X_d)
preds_dev = np.argmax(preds_dev, axis=-1)

count = 0  # Count of completely correct sequences
last_index_error_count = 0  # Count of sequences with only last index error

for i in range(len(dev_data)):
    # Get original sequence lengths before padding
    original_length = len(dev_data[i])
    
    # Extract predictions and true dev_labels for the original sequence
    pred_seq = preds_dev[i][:original_length]
    true_seq = dev_labels[i]  # Original dev_labels are unpadded
    
    # Check if the sequence is entirely correct
    if (pred_seq == true_seq).all():
        count += 1
    else:
        # Check if only the last index is incorrect
        if (pred_seq[:-1] == true_seq[:-1]).all() and (pred_seq[-1] != true_seq[-1]):
            last_index_error_count += 1
        
        # Print debug information for mismatches
        print(f"Predicted: {pred_seq}")
        print(f"True:      {true_seq}")
        print("--------------------------------------------")

# Print results
print(f"Accuracy on dev_data: {count / len(dev_data):.4f}")
print(f"Sequences with only last index error: {last_index_error_count}")


Predicted: [4 4 4 4 0 4 4 4 4 4]
True:      [4, 4, 4, 4, 0, 4, 4, 4, 4, 1]
--------------------------------------------
Predicted: [4 4 4 4 0 4 4 4 4 1 1 4 4 1 0 4 4 4 1 4 1 4 2 4 3]
True:      [4, 4, 4, 4, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 4, 2, 4, 3]
--------------------------------------------
Predicted: [4 4 4 4 4 0 4 4 4 4 4 4]
True:      [4, 4, 4, 4, 4, 0, 4, 4, 4, 4, 1, 4]
--------------------------------------------
Predicted: [4 4 0 4 4 4 4 4 4 1 4 4 1]
True:      [4, 4, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1]
--------------------------------------------
Predicted: [4 4 4 0 4 4 4 4 4 4 4 4]
True:      [4, 4, 4, 0, 4, 4, 4, 4, 4, 4, 1, 4]
--------------------------------------------
Predicted: [4 4 0 4 4 4 4 4 4 4 4 4 4 4 4 1 4 4 1]
True:      [4, 4, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1]
--------------------------------------------
Predicted: [4 4 4 4 0 4 4 4 4 4 2 4 3]
True:      [4, 4, 4, 4, 0, 4, 4, 4, 1, 4, 2, 4, 3]
-----------------------------------