In [1]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
from gensim.models import KeyedVectors
from tqdm import tqdm
import spacy
from utils import preprocess, get_windows

Using TensorFlow backend.


In [2]:
# Notebook-wide preprocessing parameters.

# words with count < MIN_COUNTS
# will be removed
MIN_COUNTS = 10

# minimum document length
MIN_LENGTH = 11

# half-size of the window passed to get_windows
HALF_WINDOW_SIZE = 5

# it must be that 2*HALF_WINDOW_SIZE < MIN_LENGTH;
# fail here rather than somewhere downstream if the constants are edited
assert 2 * HALF_WINDOW_SIZE < MIN_LENGTH, (
    "2*HALF_WINDOW_SIZE must be smaller than MIN_LENGTH"
)

# Load NLP model

In [3]:
# Load the spaCy English pipeline; it is handed to preprocess() below.
# NOTE(review): 'en' is a model shortcut name -- newer spaCy releases may
# require the full package name instead; confirm against the installed version.
nlp = spacy.load('en')

# Load dataset

In [5]:
# Fetch every split of 20 Newsgroups, asking sklearn to strip
# headers, footers and quoted replies from each post.
dataset = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)
# raw documents
docs = dataset.data

# Preprocess dataset and create windows

In [6]:
# Run the project-local preprocessing helper over the raw documents.
# How the three results are used further down in this notebook:
#   encoded_docs -- iterated document by document and passed to get_windows
#   decoder      -- indexed by integer id to obtain a token (decoder[i])
#   word_counts  -- indexed by the same integer id to obtain a count
# MIN_LENGTH / MIN_COUNTS are the filtering thresholds defined in the config cell.
encoded_docs, decoder, word_counts = preprocess(docs, nlp, MIN_LENGTH, MIN_COUNTS)

100%|██████████| 18846/18846 [00:49<00:00, 377.13it/s]


In [7]:
# Build the training table: one row per (document, word, window) triple.
rows = []
# tqdm wraps the documents themselves rather than the enumerate() generator,
# so it can report a total and an ETA whenever encoded_docs has a length.
for index, doc in enumerate(tqdm(encoded_docs)):
    windows = get_windows(doc, HALF_WINDOW_SIZE)
    # index represents id of a document,
    # windows is a list of (word, window around this word),
    # where word is in the document
    rows.extend([index, w[0]] + w[1] for w in windows)

# keep the list and the array under different names so that
# 'data' always refers to the final int64 array
data = np.array(rows, dtype='int64')

16077it [00:03, 4597.01it/s]


In [8]:
# Row layout check. A row in 'data' contains:
#   id of a document, id of a word in this document, a window around this word
# so the expected number of columns is 1 + 1 + 10
# (the 10 presumably being 2*HALF_WINDOW_SIZE -- depends on get_windows)
data.shape[1]

12

In [9]:
# number of windows (one row of 'data' per window, over all documents)
data.shape[0]

1455008

# Unigram distribution

In [10]:
# Turn the raw per-word counts into relative frequencies.
word_counts = np.array(word_counts)
# ndarray.sum() is vectorized; the Python builtin sum() would iterate
# over the array element by element.
unigram_distribution = word_counts / word_counts.sum()

# Prepare word vectors

In [None]:
# Pretrained word vectors in the binary word2vec format, see
# https://code.google.com/archive/p/word2vec/
# NOTE(review): the path is hard-coded relative to the user's home directory;
# the file has to be downloaded there before this cell can run.
word2vec = KeyedVectors.load_word2vec_format('~/data/GoogleNews-vectors-negative300.bin', binary=True)  

def has_vector(word) -> bool:
    """Return whether `word` is contained in the loaded `word2vec` vectors."""
    return word in word2vec

def get_vector(word):
    """Return the entry stored for `word` in the loaded `word2vec` vectors.

    No membership check is done here; callers in this notebook test
    `has_vector(word)` first.
    """
    return word2vec[word]

In [12]:
# (token, count) pairs, one per id found in the decoder
unique_tokens = [
    (decoder[token_id], word_counts[token_id])
    for token_id in decoder
]

embedding_dim = 300
vocab_size = len(unique_tokens)

# embedding matrix, one float32 row per token; starts as all zeros
word_vectors = np.zeros(shape=(vocab_size, embedding_dim), dtype=np.float32)

In [13]:
# Split the (token, count) pairs by whether a pretrained vector exists.
no_vec = []
with_vec = []
for token_entry in unique_tokens:
    if has_vector(token_entry[0]):
        with_vec.append(token_entry)
    else:
        no_vec.append(token_entry)

# number of words without pretrained word vector
len(no_vec)

1347

In [14]:
# Statistics used to initialize the vectors of words that have
# no pretrained word vector: take the rare words that DO have one
# and compute their per-dimension mean and standard deviation.
# https://stackoverflow.com/questions/32277377/cnn-initializing-unknown-words-from-word2vec
RARE_COUNT_THRESHOLD = 20  # a word counts as rare when count < this value

vectors = np.array([
    get_vector(token)
    for token, count in with_vec
    if count < RARE_COUNT_THRESHOLD
])
mean = vectors.mean(axis=0)
std = vectors.std(axis=0)

In [15]:
# Fill the embedding matrix row by row.
# A dedicated, seeded generator makes the random initialization below
# reproducible: re-running the notebook yields the same word_vectors.
rng = np.random.RandomState(0)

for i in range(vocab_size):
    token = decoder[i]
    if has_vector(token):
        # a pretrained vector is available: copy it
        word_vectors[i] = get_vector(token)
    else:
        # otherwise draw each coordinate uniformly from
        # [mean - std, mean + std) of the rare-word vectors
        word_vectors[i] = mean + rng.uniform(-std, std)

# Save data

In [16]:
# Write every artifact produced above to the working directory.
outputs = [
    ('data.npy', data),  # ~134 MB
    ('word_vectors.npy', word_vectors),  # ~42 MB
    ('unigram_distribution.npy', unigram_distribution),
    ('decoder.npy', decoder),
]
for filename, payload in outputs:
    np.save(filename, payload)