In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

from collections import Counter
import spacy
# Load the 'en_core_web_md' spaCy pipeline; nlp.vocab is used further down
# to check for and look up word vectors.
nlp = spacy.load('en_core_web_md')

# Load doc data

In [2]:
# Every entry of the 'docs' column is one document stored as a single string;
# splitting on single spaces turns it into a list of tokens.
D = pd.read_hdf('data.hdf', key='data')
docs = [text.split(' ') for text in D['docs']]

# Get unigram distribution

In [3]:
# flatten all documents into one long list of tokens
tokens = [token for doc in docs for token in doc]

# term -> number of occurrences over all documents
term_counts = Counter(tokens)

# number of unique words
len(term_counts)

13812

In [4]:
# (term, term_count)
# Going through dict(...) keeps this cell re-runnable: it accepts the Counter
# built above as well as the list of pairs that this cell itself produces
# (indexing that list with a term, as before, raised a TypeError on re-run).
term_counts = list(dict(term_counts).items())

In [5]:
# most frequent terms first; the sort is stable, so terms with equal
# counts keep the relative order they had before
term_counts = sorted(term_counts, key=lambda pair: -pair[1])
term_counts[:10]

[('ax', 62552),
 ('know', 7385),
 ('good', 6652),
 ('like', 6573),
 ('people', 6473),
 ('don', 6457),
 ('think', 6206),
 ('time', 5747),
 ('use', 5531),
 ('max', 4747)]

In [6]:
# term_counts is sorted by descending count, so these are the rarest terms
term_counts[-10:]

[('terrify', 9),
 ('dividians', 8),
 ('brag', 8),
 ('winmarks', 8),
 ('penio', 8),
 ('ahhh', 8),
 ('harvest', 8),
 ('ithaca', 8),
 ('freewill', 8),
 ('encrypting', 7)]

In [7]:
# total number of token occurrences (sum of all term counts)
counts_sum = sum(count for _, count in term_counts)
counts_sum

1582513

In [8]:
# relative frequency of every term, keyed by its position in the sorted term_counts
unigram_dist = {rank: count/counts_sum for rank, (_, count) in enumerate(term_counts)}

# Encode documents

In [9]:
# encode words with integers
# 0 - the most common word, 1 - the second most common word, ...
decode = {code: term for code, (term, _) in enumerate(term_counts)}
# encode is the inverse mapping: word -> integer id
encode = {term: code for code, term in decode.items()}

In [10]:
# split every document into words and replace each word by its integer id
encoded_docs = D['docs'].apply(lambda text: [encode[word] for word in text.split(' ')])

In [11]:
# quick look at the first documents after encoding (lists of integer word ids)
encoded_docs.head()

0    [84, 2685, 542, 288, 3019, 791, 180, 39, 822, ...
1    [1206, 675, 70, 603, 344, 98, 53, 2157, 215, 4...
2    [754, 10, 1716, 10509, 13, 202, 97, 12, 3, 244...
3    [6, 253, 98, 2687, 757, 140, 253, 98, 2687, 75...
4    [104, 36, 8, 13, 16, 1281, 181, 1040, 108, 127...
Name: docs, dtype: object

# Create windows

In [12]:
# get_windows pairs every word with 2*hws context words
hws = 5 # half window_size

In [13]:
# doc: a list of words
def get_windows(doc, hws):
    """Pair every word of `doc` with the 2*hws words surrounding it.

    For a word far enough from both ends the context consists of the `hws`
    words before it and the `hws` words after it. Near the beginning or the
    end of `doc` the window is shifted inwards, so that the context still
    holds exactly 2*hws words (all taken from the first, respectively last,
    2*hws + 1 words of the document).

    Parameters
    ----------
    doc : list
        The words of one document, in order.
    hws : int
        Half window size (non-negative).

    Returns
    -------
    list of tuple
        One `(word, context)` tuple per position of `doc`, in document order,
        where `context` is a list of 2*hws words. If `doc` has 2*hws words or
        fewer, no full window exists: a message is printed and an empty list
        is returned, so callers can still iterate over the result.
    """
    length = len(doc)
    width = 2*hws  # number of context words per window

    if length <= width:
        print('Error! Not enough words in doc')
        return []

    # Left edge of the window is centred on i where possible and clamped so
    # the window [start, start + width] never leaves the document.
    last_start = length - width - 1

    windows = []
    for i, w in enumerate(doc):
        start = min(max(i - hws, 0), last_start)
        context = doc[start:i] + doc[(i + 1):(start + width + 1)]
        windows.append((w, context))
    return windows

In [14]:
# one list of (word, context) tuples per document; hws is forwarded to get_windows
windows = encoded_docs.apply(get_windows, args=(hws,))

# Prepare data for lda2vec

In [15]:
# Flatten the per-document windows into rows of the form
# [document id, word id, context word ids...].
data = []
# Series.items() is used instead of iteritems(), which newer pandas releases no longer provide.
for index, list_of_windows in tqdm(windows.items(), total=len(windows)):
    # index represents id of a document,
    # list_of_windows is a list of (word, window around this word),
    # where word is in the document
    data.extend([index, word] + context for word, context in list_of_windows)

16116it [00:02, 6612.25it/s] 


In [16]:
# turn the list of equally long rows into a 2-D integer array
data = np.array(data, dtype=np.int64)

In [17]:
# a row in the data contains:
# id of a document, id of a word in this document, a window around this word
# 1 + 1 + 2*hws = 1 + 1 + 10 columns
data.shape[1]

12

In [18]:
# number of windows: get_windows yields one window per word of a document,
# and the recorded output equals counts_sum computed earlier
data.shape[0]

1582513

# Prepare word vectors

In [19]:
def has_vector(word):
    """Report whether the `nlp.vocab` entry for `word` has a vector."""
    lexeme = nlp.vocab[word]
    return lexeme.has_vector

def get_vector(word):
    """Return the vector stored on the `nlp.vocab` entry for `word`."""
    lexeme = nlp.vocab[word]
    return lexeme.vector

In [20]:
# one embedding row per vocabulary term, in the same order as term_counts
vocab_size = len(term_counts)
# NOTE(review): presumably the vector width of the loaded spaCy model - confirm
embedding_dim = 300

word_vectors = np.zeros(shape=(vocab_size, embedding_dim), dtype='float32')

In [21]:
# split the (term, count) pairs by whether a vector is available for the term
with_vec, no_vec = [], []
for entry in term_counts:
    bucket = with_vec if has_vector(entry[0]) else no_vec
    bucket.append(entry)
len(no_vec)

939

In [22]:
def approximately_equal(x, y, tol=5):
    """Return True when `x` and `y` differ by strictly less than `tol`.

    Parameters
    ----------
    x, y : numbers to compare (word counts in this notebook).
    tol : exclusive upper bound on the absolute difference; the default of 5
        reproduces the previously hard-coded threshold.
    """
    return abs(x - y) < tol

In [23]:
def get_initial_word_vector(word_count):
    """Build a starting vector for a word that has no vector of its own.

    The result is the mean of the vectors of all words in `with_vec` whose
    count is approximately equal to `word_count`. When no such word exists,
    the words whose count is nearest to `word_count` are used instead, so the
    caller never receives NaN (the mean of an empty array).

    Parameters
    ----------
    word_count : int
        Number of occurrences of the word that needs a vector.

    Returns
    -------
    numpy.ndarray
        The averaged vector; a zero vector of length `embedding_dim` if
        `with_vec` is empty.
    """
    close_words = [t[0] for t in with_vec if approximately_equal(t[1], word_count)]
    if len(close_words) == 0:
        print('no words with similar count')
        if len(with_vec) == 0:
            # nothing to average at all: fall back to an all-zero vector
            return np.zeros(embedding_dim, 'float32')
        # fall back to the word(s) whose count is nearest to word_count
        nearest = min(abs(t[1] - word_count) for t in with_vec)
        close_words = [t[0] for t in with_vec if abs(t[1] - word_count) == nearest]
    return np.array([get_vector(w) for w in close_words]).mean(0)

In [24]:
# Fill the embedding matrix row by row: words that have a vector use it,
# the others get the mean vector of words with a similar count.
# A set gives constant-time membership tests; scanning the with_vec list
# for every term made this loop quadratic in the vocabulary size.
with_vec_set = set(with_vec)
for i, t in tqdm(enumerate(term_counts), total=len(term_counts)):
    if t in with_vec_set:
        word_vectors[i] = get_vector(t[0])
    else:
        word_vectors[i] = get_initial_word_vector(t[1])

13812it [00:27, 504.57it/s] 


# Save data

In [25]:
# Write the prepared arrays and lookup tables to disk.
# NOTE(review): unigram_dist and decode are dicts, so np.save presumably stores
# them as pickled object arrays; reading them back would then need
# np.load(..., allow_pickle=True).item() - confirm against the consumer.
for filename, obj in [
    ('window_data.npy', data),  # ~145 MB
    ('word_vectors.npy', word_vectors),  # ~16 MB
    ('unigram_distribution.npy', unigram_dist),
    ('decode.npy', decode),
]:
    np.save(filename, obj)