### Get embeddings
We are retraining embeddings based on the cbow architecture in word2vec. 

First, tokenize the combined text8 and ms-marco dataset.

In [1]:
from tokenizer import preprocess, create_lookup_tables
import pickle

# open the combined_text8_msmarco.txt file and read the text
with open('./data/combined_text8_msmarco.txt', 'r') as f:
    combined_text = f.read()

# tokenize the text and save as a .pkl file
combined_tokens = preprocess(combined_text)
with open('./data/temp/combined_corpus.pkl', 'wb') as f:
    pickle.dump(combined_tokens, f)

# create the lookup tables
vocab_to_int, int_to_vocab = create_lookup_tables(combined_tokens)

# save the lookup tables to a .pkl file
with open('./data/temp/combined_vocab_to_int.pkl', 'wb') as f:
    pickle.dump(vocab_to_int, f)

with open('./data/temp/combined_int_to_vocab.pkl', 'wb') as f:
    pickle.dump(int_to_vocab, f)


Number of words before filtering: 62266764
Number of words after filtering: 61436223
Number of words in vocab: 127665
Number of words in vocab_to_int: 127666
Number of words in int_to_vocab: 127666


Now that we have our new vocabulary from the combined text8 wiki data and ms-marco data, let's generate the embeddings using the CBOW architecture.

In [2]:
#print(corpus[:100])

# Generate the training data from the corpus
# The training data looks like a list of tuples,
# where each tuple contains a list of context words and the target word (not the IDs)


def generate_training_data(corpus):
   data = []

   # start from index 2 and end 2 positions before the last word
   # this ensures we always have 2 words before and after the target word
   # for a 5-len sliding window


   for i in range(2, len(corpus) - 2):
       # Get the context words
       # 'i' is the index of the target word
       # [i-2:i] gets the two words before the target word
       # [i+1:i+3] gets the two words after the target word
       context_words = corpus[i-2:i] + corpus[i+1:i+3]
      
       # Get the target word
       target_word = corpus[i]


       # Append the tuple to the data list
       data.append((context_words, target_word))


   return data

In [3]:
# usage

# load the corpus
#with open('./data/temp/combined_corpus.pkl', 'rb') as f:
    #corpus = pickle.load(f)

training_data = generate_training_data(combined_tokens)

print("CBOW training data generated")



CBOW training data generated


In [11]:
# save the training data to a .pkl file
with open('./data/temp/combined_training_data.pkl', 'wb') as f:
    pickle.dump(training_data, f)


In [13]:
del training_data  # Free up memory

NameError: name 'training_data' is not defined

In [9]:
# quick eyeball check to see if the training data is correct
# show the last 30 tuples in the training data
training_data[-30:]


[(['esp', 'magnetic', 'or', 'electric'], 'poles'),
 (['magnetic', 'poles', 'electric', 'charge'], 'or'),
 (['poles', 'or', 'charge', '<PERIOD>'], 'electric'),
 (['or', 'electric', '<PERIOD>', '3'], 'charge'),
 (['electric', 'charge', '3', '<PERIOD>'], '<PERIOD>'),
 (['charge', '<PERIOD>', '<PERIOD>', '<LEFT_PAREN>'], '3'),
 (['<PERIOD>', '3', '<LEFT_PAREN>', 'general'], '<PERIOD>'),
 (['3', '<PERIOD>', 'general', 'physics'], '<LEFT_PAREN>'),
 (['<PERIOD>', '<LEFT_PAREN>', 'physics', '<RIGHT_PAREN>'], 'general'),
 (['<LEFT_PAREN>', 'general', '<RIGHT_PAREN>', 'the'], 'physics'),
 (['general', 'physics', 'the', 'particular'], '<RIGHT_PAREN>'),
 (['physics', '<RIGHT_PAREN>', 'particular', 'state'], 'the'),
 (['<RIGHT_PAREN>', 'the', 'state', 'of'], 'particular'),
 (['the', 'particular', 'of', 'a'], 'state'),
 (['particular', 'state', 'a', 'part'], 'of'),
 (['state', 'of', 'part', 'of'], 'a'),
 (['of', 'a', 'of', 'a'], 'part'),
 (['a', 'part', 'a', 'body'], 'of'),
 (['part', 'of', 'body', 

In [14]:
print(len(vocab_to_int))

127666
