In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
from gensim.models import KeyedVectors
from tqdm import tqdm
import spacy

import sys
sys.path.append('..')
from utils import preprocess, get_windows

Using TensorFlow backend.


In [3]:
MIN_COUNTS = 10
# vocabulary cutoff: words with count < MIN_COUNTS
# will be removed

MIN_LENGTH = 11
# minimum document length; it is passed to preprocess() together with MIN_COUNTS

HALF_WINDOW_SIZE = 5
# half-width of the context window handed to get_windows();
# each row of the training data stores 2*HALF_WINDOW_SIZE window entries

# Enforce the constraint instead of only documenting it, so that an
# inconsistent edit of the constants fails here and not deep inside the pipeline.
assert 2*HALF_WINDOW_SIZE < MIN_LENGTH, (
    'it must be that 2*HALF_WINDOW_SIZE < MIN_LENGTH'
)

# Load NLP model

In [4]:
# Load the spaCy English pipeline through the 'en' shortcut name;
# the resulting object is handed to preprocess() further down.
# NOTE(review): shortcut names such as 'en' are not available in spaCy v3+ —
# confirm the installed spaCy version, or load the model by its full package name.
nlp = spacy.load('en')

# Load dataset

In [5]:
# Fetch every post of the 20 Newsgroups corpus (subset='all'),
# with headers, footers and quoted replies stripped from the text.
dataset = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)
# the raw document texts
docs = dataset.data

In [6]:
# number of raw documents, before any filtering by preprocess()
len(docs)

18846

In [7]:
# Attach to every document its position in the original dataset, giving
# (original id, text) pairs; the id is what doc_decoder later maps back to.
# NOTE: this rebinds `docs`, so running the cell twice nests the pairs.
docs = list(enumerate(docs))

# Preprocess dataset and create windows

In [8]:
# Run the preprocessing helper from utils.py on the (id, text) pairs.
# From how the results are used below:
#   encoded_docs - iterable of (original doc id, encoded document) pairs,
#   decoder      - maps a token id to its token,
#   word_counts  - count per token id.
# NOTE(review): per the log this cell prints, short documents and rare/unknown
# tokens are removed — confirm the exact rules in utils.preprocess.
encoded_docs, decoder, word_counts = preprocess(docs, nlp, MIN_LENGTH, MIN_COUNTS)

100%|██████████| 18846/18846 [00:48<00:00, 388.97it/s]


number of removed short documents: 2769
total number of tokens: 1455008
number of unknown tokens to be removed: 117818
number of additionally removed short documents: 346
total number of tokens: 1333991

minimum word count number: 9
this number can be less than MIN_COUNTS because of document removal


In [9]:
# Documents removed during preprocessing leave gaps in the original ids.
# Number the surviving documents consecutively from 0 and remember, for
# each new index, the original document id it corresponds to.
doc_decoder = dict(enumerate(original_id for original_id, _ in encoded_docs))

In [10]:
# Build the training matrix: one row per window returned by get_windows().
data = []
# tqdm wraps the sequence itself (not enumerate(...)) so that it can read
# the length of encoded_docs and show a total / ETA instead of a bare counter.
for index, (_, doc) in enumerate(tqdm(encoded_docs)):
    windows = get_windows(doc, HALF_WINDOW_SIZE)
    # index is the gap-free id of the document,
    # windows is a list of (word, window around this word),
    # where word is in the document
    data.extend([index, w[0]] + w[1] for w in windows)

# rows are [document id, word id, window...]; stored as 64-bit integers
data = np.array(data, dtype='int64')

15731it [00:04, 3751.87it/s]


In [11]:
# Each row of 'data' contains:
#   [id of a document, id of a word in this document, the window around this word]
# i.e. 1 + 1 + 10 columns (10 = 2*HALF_WINDOW_SIZE)
data.shape[1]

12

In [12]:
# number of windows (equal to the total number of tokens)
data.shape[0]

1333991

# Get unigram distribution

In [13]:
# Turn the per-token counts into a probability distribution over the vocabulary.
word_counts = np.array(word_counts)
# ndarray.sum() is vectorized; the builtin sum() would iterate over the
# array element by element in Python. The result is the same.
unigram_distribution = word_counts/word_counts.sum()

# Prepare word vectors

In [14]:
# Pretrained Google News word2vec vectors, available from
# https://code.google.com/archive/p/word2vec/
# The file is read in the binary word2vec format (binary=True).
# NOTE(review): the path is relative to the user's home directory via '~' —
# adjust it to wherever the .bin file actually lives on this machine.
word2vec = KeyedVectors.load_word2vec_format('~/data/GoogleNews-vectors-negative300.bin', binary=True)  

def has_vector(word) -> bool:
    """Return True if `word` is present in the loaded `word2vec` model."""
    return word in word2vec

def get_vector(word):
    """Look up `word` in the loaded `word2vec` model and return its vector.

    No membership check is done here; callers test `has_vector(word)` first.
    """
    return word2vec[word]

In [15]:
# (token, count) pair for every token id known to the decoder
unique_tokens = [
    (decoder[token_id], word_counts[token_id])
    for token_id in decoder
]

embedding_dim = 300
vocab_size = len(unique_tokens)

# embedding matrix with one row per token; zero-filled until the rows are assigned
word_vectors = np.zeros(shape=(vocab_size, embedding_dim), dtype='float32')

# number of unique words
vocab_size

11920

In [16]:
# Split the vocabulary in one pass into tokens that have a pretrained
# vector and tokens that do not; the original order is kept in both lists.
no_vec, with_vec = [], []
for entry in unique_tokens:
    bucket = with_vec if has_vector(entry[0]) else no_vec
    bucket.append(entry)

# number of words without pretrained word vector
len(no_vec)

1346

In [17]:
# Find an initialization for the vectors of words that have no pretrained
# word vector: take the per-dimension mean and standard deviation of the
# pretrained vectors of rare words.
# https://stackoverflow.com/questions/32277377/cnn-initializing-unknown-words-from-word2vec

# a word with a pretrained vector counts as rare if its count is below this value
RARE_COUNT_THRESHOLD = 20

vectors = np.array([
    get_vector(token)
    for token, count in with_vec
    if count < RARE_COUNT_THRESHOLD
])

# Without this check an empty selection would silently give NaN statistics,
# which would then be written into the embedding matrix.
if vectors.size == 0:
    raise ValueError(
        'no word with a pretrained vector has count < %d; '
        'cannot estimate initialization statistics' % RARE_COUNT_THRESHOLD
    )

mean = vectors.mean(0)
std = vectors.std(0)

In [18]:
# Fill the embedding matrix row by row: the pretrained vector when one
# exists, otherwise a random vector drawn uniformly from [mean - std, mean + std)
# in every dimension.
# A dedicated, seeded generator makes the random rows (and therefore the saved
# word_vectors.npy) identical on every run and leaves the global
# numpy random state untouched.
INIT_SEED = 0
init_rng = np.random.RandomState(INIT_SEED)

for i in range(vocab_size):
    token = decoder[i]
    if has_vector(token):
        word_vectors[i] = get_vector(token)
    else:
        word_vectors[i] = mean + init_rng.uniform(-std, std)

# Save data

In [19]:
# Write all artifacts to the current working directory.
# rows of [document id, word id, window...]
np.save('data.npy', data) # ~123 MB
# embedding matrix with one row per token
np.save('word_vectors.npy', word_vectors) # ~14 MB
# normalized token counts
np.save('unigram_distribution.npy', unigram_distribution)
# doc_decoder is a dict (and decoder presumably is one too — verify), so
# np.save stores it as a pickled object array; reading it back needs
# np.load(..., allow_pickle=True) and unwrapping with [()] or .item().
np.save('decoder.npy', decoder)
np.save('doc_decoder.npy', doc_decoder)