In [1]:
from gensim.models.word2vec import KeyedVectors
import numpy as np
import json

## Corpus Format Alignment

In [2]:
# Filepaths
# --- inputs: raw word lists, read line by line in the cells below ---
codenames_raw = '../data/raw_data/codenames_word_list.txt'
dictionary_raw = [
    '../data/raw_data/google-10000-english-usa-no-swears-long.txt',
    '../data/raw_data/google-10000-english-usa-no-swears-medium.txt',
    '../data/raw_data/google-10000-english-usa-no-swears-short.txt',
]

# --- outputs: JSON files mapping each word to its embedding vector ---
codenames_output = '../data/processed_data/codenames_vecs.json'
dictionary_output = '../data/processed_data/dictionary_vecs.json'

In [3]:
# Process codenames word list
# Every line of the file is treated as one word: surrounding whitespace is
# stripped and the word is lowercased. Blank lines are skipped so that no
# empty-string entries end up in the array. The encoding is given explicitly
# so the result does not depend on the platform's default locale.
with open(codenames_raw, encoding='utf-8') as f:
    raw_text = f.readlines()
codenames_word = np.array([x.strip().lower() for x in raw_text if x.strip()])

In [4]:
# Process dictionary corpus
# The files are read in the order listed in dictionary_raw and their lines are
# concatenated into one list. Every line is treated as one word: surrounding
# whitespace is stripped and the word is lowercased. Blank lines are skipped
# so that no empty-string entries end up in the array.
processed_text = []
for file in dictionary_raw:
    # Explicit encoding keeps the result independent of the platform default.
    with open(file, encoding='utf-8') as f:
        raw_text = f.readlines()
    # The file is already closed here; only the in-memory lines are used.
    processed_text += [x.strip().lower() for x in raw_text if x.strip()]
dict_word = np.array(processed_text)

## Match Vector Embeddings

In [5]:
def lowercase_corpus(kv):
    """Make every key of ``kv`` reachable through its lowercase spelling.

    For each key whose lowercase form is not yet present in ``kv``, the key's
    vector is registered under that lowercase form via ``kv.add_vectors``.
    Keys that already exist are never overwritten. When several keys share
    one missing lowercase form (e.g. ``"Paris"`` and ``"PARIS"``), the vector
    of the key that comes last in ``kv.index_to_key`` is used, and the
    lowercase form is added exactly once.

    Parameters
    ----------
    kv : KeyedVectors-like
        Object providing ``index_to_key``, ``key_to_index``, ``__getitem__``
        and ``add_vectors(keys, weights)``. It is modified in place.

    Returns
    -------
    The same ``kv`` object that was passed in.
    """
    existing = kv.key_to_index
    additions = {}
    for word in kv.index_to_key:
        lowered = word.lower()
        # Already-known spellings (including keys that are lowercase to begin
        # with) are left untouched, so they need not be collected at all.
        if lowered in existing:
            continue
        # A dict collapses case variants into one entry; later variants
        # overwrite earlier ones.
        additions[lowered] = kv[word]
    # Skip the call entirely when there is nothing new to add.
    if additions:
        kv.add_vectors(list(additions), list(additions.values()))
    return kv

In [6]:
# Load the binary word2vec-format model, then register lowercase spellings of
# its keys so the lowercased word lists can be looked up directly.
kv = lowercase_corpus(
    KeyedVectors.load_word2vec_format('../data/word_corpus/model.bin', binary=True)
)

# Keep only the words that have an embedding in the model; each vector is
# converted to a plain Python list.
codenames_vecs = dict(
    (word, kv[word].tolist())
    for word in codenames_word
    if word in kv.key_to_index
)
dictionary_vecs = dict(
    (word, kv[word].tolist())
    for word in dict_word
    if word in kv.key_to_index
)

In [7]:
# Write each word -> vector mapping to its JSON output file, codenames first.
for out_path, payload in (
    (codenames_output, codenames_vecs),
    (dictionary_output, dictionary_vecs),
):
    with open(out_path, 'w') as f:
        json.dump(payload, f)