In [None]:
!pip install gensim

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Masking, Embedding, Bidirectional, LSTM, Attention, Dropout, Dense
from tensorflow.keras import Model, Input
import gensim as gs
import ast
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [None]:
# Report which GPUs (if any) TensorFlow can see on this machine
gpus = tf.config.list_physical_devices('GPU')

if not gpus:
    print("TensorFlow is not using any GPUs.")
else:
    print(f"TensorFlow is using {len(gpus)} GPU(s).")
    # One line per visible device
    for gpu in gpus:
        print(f"GPU: {gpu.name}")

In [None]:
def _read_literal_lines(path):
    """Parse every non-blank line of the file at `path` as a Python literal."""
    with open(path, 'r') as f:
        stripped_lines = (line.strip() for line in f)
        # Blank lines (typically a trailing empty line) carry no sample and
        # would make ast.literal_eval raise SyntaxError, so they are skipped.
        return [ast.literal_eval(text) for text in stripped_lines if text]


def load_data_labels(data_path, labels_path):
    """Load samples and labels stored as one Python literal per line.

    Args:
        data_path: path to the text file holding one literal per line.
        labels_path: path to the text file holding one literal per line.

    Returns:
        A `(data, labels)` tuple of lists, one parsed object per non-blank
        line of the respective file. The two lists are not checked against
        each other for length.

    Raises:
        OSError: if either file cannot be opened.
        ValueError, SyntaxError: if a non-blank line is not a valid literal.
    """
    data = _read_literal_lines(data_path)
    labels = _read_literal_lines(labels_path)
    return data, labels

In [None]:
# Load the training sentences and their labels
data, labels = load_data_labels(
    data_path='training_data_processed.txt',
    labels_path='train_order_category_labels.txt',
)
# Show the number of entries in each list, then the first five of each
print(len(data), len(labels), sep='\n')
print(data[:5], labels[:5], sep='\n')

In [None]:
# Load the development split through the same loader as the training split
dev_data, dev_labels = load_data_labels(
    data_path='dev_data_processed.txt',
    labels_path='dev_order_category_labels.txt',
)

In [None]:
# Load pretrained word vectors from a local file in binary word2vec format
pretrained_model = gs.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [None]:
# Replace words unknown to the embedding model with the 'unk' token
def process_sentence(sentence, model):
    """Replace, in place, every token the model does not know with 'unk'.

    The token 'a' is always kept, even when `model` does not contain it.

    Args:
        sentence: mutable sequence of tokens; it is modified in place.
        model: any object supporting `token in model`.

    Returns:
        The same `sentence` object, after replacement.
    """
    for position in range(len(sentence)):
        token = sentence[position]
        # Known tokens and the special-cased 'a' are left untouched
        if token == 'a' or token in model:
            continue
        sentence[position] = 'unk'
    return sentence

# Replace out-of-vocabulary training tokens with 'unk'
# (process_sentence also edits each sentence list in place).
data = [process_sentence(sentence, pretrained_model) for sentence in data]
print(data[:5])

# Collect every distinct token that remains after the replacement
vocab = {word for sentence in data for word in sentence}
# Always reserve an entry for 'unk': the dev-set cell rewrites unseen words
# to 'unk' and then looks them up in word2idx, which would raise KeyError
# if no training token had happened to be replaced.
vocab.add('unk')

# Map each vocabulary word to an integer index.
# NOTE: the iteration order of a set of strings can differ between
# interpreter runs (hash randomization), so these indices are only valid
# together with structures built from `vocab` in the same session.
word2idx = {word: idx for idx, word in enumerate(vocab)}

In [None]:
# Model / preprocessing settings
embedding_dim = 300      # presumably the width of the pretrained word vectors — TODO confirm
input_dim = len(vocab)   # number of distinct tokens in the training vocabulary
output_dim = 3           # presumably the number of label classes — TODO confirm
max_length = 100         # length every sequence is padded to (maxlen for pad_sequences)

In [None]:
# Map dev-set words that never occurred in the training vocabulary to 'unk'.
# The previous condition `word not in vocab!= 'a'` was a chained comparison,
# i.e. `(word not in vocab) and (vocab != 'a')`; its second half is always
# true for a set, so only the membership test is kept. Every
# out-of-vocabulary word must become 'unk', otherwise the word2idx lookup
# below raises KeyError.
for tokens in dev_data:
    for i, word in enumerate(tokens):
        if word not in vocab:
            tokens[i] = 'unk'

# Encode each word as its vocabulary index
X_d = [[word2idx[word] for word in sentence] for sentence in dev_data]

# Pad at the end up to max_length: inputs with -1, labels with 2
# (presumably -1 is masked by the model and 2 is the filler class — TODO confirm).
# NOTE(review): pad_sequences truncates longer sequences from the front by
# default (truncating='pre') — confirm this is intended.
X_d = pad_sequences(X_d, maxlen=max_length, padding='post', value=-1)
Y_d = pad_sequences(dev_labels, maxlen=max_length, padding='post', value=2)

In [None]:
# Build the embedding matrix for a vocabulary
def get_embeddings_matrix(model, vocab):
    """Build a (len(vocab), model.vector_size) matrix of word vectors.

    Row i holds the vector of the i-th word produced by iterating `vocab`.
    A word missing from `model` keeps an all-zero row, except for 'a',
    which is given the vector stored under 'one'.

    Args:
        model: object exposing `vector_size`, `word in model` and `model[word]`.
        vocab: sized iterable of words; its iteration order defines the rows.

    Returns:
        A numpy array of shape (len(vocab), model.vector_size).
    """
    matrix = np.zeros((len(vocab), model.vector_size))
    for row, token in enumerate(vocab):
        if token in model:
            source = token
        elif token == 'a':
            # 'a' borrows the vector of 'one' when it has no entry of its own
            source = 'one'
        else:
            # Unknown word: leave the row at zero
            continue
        matrix[row] = model[source]
    return matrix
# Build the embedding matrix; rows follow the iteration order of `vocab`
embedding_matrix = get_embeddings_matrix(model=pretrained_model, vocab=vocab)