In [1]:
from sklearn.datasets import fetch_20newsgroups
import spacy
import numpy as np
from tqdm import tqdm
from utils import preprocess, get_windows

In [2]:
# Tunable preprocessing settings.
# MIN_COUNTS and MIN_LENGTH are forwarded to preprocess(); presumably they are
# the minimum token frequency and the minimum document length -- confirm in utils.
MIN_COUNTS = 5
MIN_LENGTH = 11
# Forwarded to get_windows(); with the value 5 each row of the window table
# carries 10 context ids.
HALF_WINDOW_SIZE = 5

# Enforce the constraint instead of only stating it in a comment, so that an
# incompatible edit of the values above fails here and not later in the run.
assert 2 * HALF_WINDOW_SIZE < MIN_LENGTH, (
    'it must be that 2*HALF_WINDOW_SIZE < MIN_LENGTH'
)

# Load NLP model

In [3]:
# 'en_core_web_md' is a big model.
nlp = spacy.load('en_core_web_md')


def _is_stop_word(text):
    """Return True when the lower-cased `text` is in spaCy's English STOP_WORDS."""
    return text.lower() in spacy.en.word_sets.STOP_WORDS


# Workaround for the stop-word bug reported in
# https://github.com/explosion/spaCy/issues/922: register the IS_STOP flag
# with a predicate that lower-cases the string before the membership test.
# The trailing ';' keeps the notebook from echoing the call's return value.
nlp.vocab.add_flag(_is_stop_word, spacy.attrs.IS_STOP);

# Load dataset

In [4]:
# Fetch the 'all' subset of 20 Newsgroups with headers, footers and quotes
# removed from every post.
dataset = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)

# The entries stored under the 'data' key are what gets preprocessed.
docs = dataset['data']

# Preprocess dataset and create windows

In [5]:
# preprocess() comes from the local utils module. The three results are used
# further down as:
#   encoded_docs -- iterated document by document and passed to get_windows
#   decoder      -- looked up by integer id to obtain a token
#   word_counts  -- indexed by the same ids to obtain a token's count
encoded_docs, decoder, word_counts = preprocess(
    docs, nlp, MIN_LENGTH, MIN_COUNTS
)

100%|██████████| 18846/18846 [00:21<00:00, 881.32it/s] 


In [6]:
data = []
# Wrap the iterable itself rather than the enumerate object: enumerate hides
# the length of `encoded_docs` from tqdm, which then cannot draw a progress
# bar or estimate the remaining time.
for index, doc in enumerate(tqdm(encoded_docs)):
    # index represents id of a document (its position in encoded_docs),
    # windows is a list of (word, window around this word),
    # where word is in the document
    windows = get_windows(doc, HALF_WINDOW_SIZE)
    # one row per window: [document id, word, *window]
    data.extend([index, w[0]] + w[1] for w in windows)

# Rebind `data` to the final int64 table so the intermediate list of rows
# can be garbage-collected.
data = np.array(data, dtype='int64')

15868it [00:03, 4334.40it/s]


In [7]:
# Layout of one row of 'data':
#   [id of a document, id of a word in this document, window around this word]
# which gives 1 + 1 + 10 columns.
row_width = data.shape[1]
row_width

12

In [8]:
# number of windows (one row of 'data' per window)
len(data)

1395141

# Unigram distribution

In [9]:
# Turn the per-token counts into relative frequencies.
word_counts = np.array(word_counts)
# ndarray.sum() adds the counts inside NumPy; the builtin sum() would iterate
# over the array one element at a time in Python. float() guarantees true
# division even where `/` on integers would otherwise floor.
unigram_distribution = word_counts / float(word_counts.sum())

# Prepare word vectors

In [10]:
def has_vector(word):
    """Return the `has_vector` attribute of the `nlp.vocab` entry for `word`.

    Relies on the module-level `nlp` object.
    """
    entry = nlp.vocab[word]
    return entry.has_vector

def get_vector(word):
    """Return the `vector` attribute of the `nlp.vocab` entry for `word`.

    Relies on the module-level `nlp` object.
    """
    entry = nlp.vocab[word]
    return entry.vector

In [11]:
# (token, count) pairs for every id in the decoder, most frequent first.
unique_tokens = sorted(
    ((decoder[token_id], word_counts[token_id]) for token_id in decoder),
    key=lambda pair: pair[1],
    reverse=True,
)

vocab_size = len(unique_tokens)
# Has to equal the length of the vectors that get copied into word_vectors.
embedding_dim = 300

# One float32 row per token id, all zeros for now.
word_vectors = np.zeros(shape=(vocab_size, embedding_dim), dtype='float32')

In [12]:
# Split the (token, count) pairs by whether the token has a pretrained vector.
# A single pass does one has_vector() lookup per token instead of two and
# keeps the frequency order of unique_tokens inside each list.
no_vec = []
with_vec = []
for entry in unique_tokens:
    if has_vector(entry[0]):
        with_vec.append(entry)
    else:
        no_vec.append(entry)

# number of words without pretrained word vector
len(no_vec)

2426

In [13]:
# Find the initialisation statistics for words without pretrained word
# vectors: estimate them from rare words that do have one.
# https://stackoverflow.com/questions/32277377/cnn-initializing-unknown-words-from-word2vec
RARE_WORD_MAX_COUNT = 50  # a word is "rare" when its count is below this

vectors = np.array([
    get_vector(token)
    for token, count in with_vec
    if count < RARE_WORD_MAX_COUNT
])
# With nothing selected, mean/std would be NaN and would silently end up in
# the embedding matrix; stop here with a clear message instead.
assert len(vectors) > 0, (
    'no rare word with a pretrained vector; increase RARE_WORD_MAX_COUNT'
)

# Per-dimension statistics (reduction over axis 0, i.e. over the words).
mean = vectors.mean(0)
std = vectors.std(0)

In [14]:
# Dedicated, seeded generator: the random initialisation below (and therefore
# the saved word_vectors) is identical on every run, and NumPy's global
# random state is left untouched.
rng = np.random.RandomState(0)

for i in tqdm(range(vocab_size)):
    token = decoder[i]
    if has_vector(token):
        # pretrained vector available: copy it into row i
        word_vectors[i] = get_vector(token)
    else:
        # no pretrained vector: each component is mean plus a uniform draw
        # from [-std, std) for that dimension
        word_vectors[i] = mean + rng.uniform(-std, std)

100%|██████████| 19117/19117 [00:00<00:00, 99436.74it/s] 


# Save data

In [15]:
# Write every artefact to the current working directory with np.save.
# NOTE(review): `decoder` is used above as an id -> token lookup rather than
# as an array, so NumPy presumably pickles it; loading it back may require
# allow_pickle=True -- confirm against the consumer of decoder.npy.
outputs = [
    ('data.npy', data),  # ~128 MB
    ('word_vectors.npy', word_vectors),  # ~22 MB
    ('unigram_distribution.npy', unigram_distribution),
    ('decoder.npy', decoder),
]
for filename, payload in outputs:
    np.save(filename, payload)