In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
!pip list|grep spacy

spacy (1.9.0)
[33mYou are using pip version 9.0.3, however version 20.2.4 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
from tqdm import tqdm
import spacy
from gensim import corpora, models

import sys
sys.path.append('..')
from utils import preprocess, get_windows

In [3]:
MIN_COUNTS = 20
MAX_COUNTS = 1800
# words with count < MIN_COUNTS
# and count > MAX_COUNTS
# will be removed

MIN_LENGTH = 15
# minimum document length 
# (number of words)
# after preprocessing

# half the size of the context around a word
HALF_WINDOW_SIZE = 5
# it must be that 2*HALF_WINDOW_SIZE < MIN_LENGTH

# Load NLP model

In [4]:
nlp = spacy.load('en')

# Load dataset

In [5]:
dataset = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
docs = dataset['data']

In [6]:
# number of documents
len(docs)

18846

In [7]:
# store an index with a document
docs = [(i, doc) for i, doc in enumerate(docs)]

# Preprocess dataset and create windows

In [8]:
encoded_docs, decoder, word_counts = preprocess(
    docs, nlp, MIN_LENGTH, MIN_COUNTS, MAX_COUNTS
)

100%|██████████| 18846/18846 [00:35<00:00, 537.62it/s]


number of removed short documents: 3985
total number of tokens: 1439861
number of tokens to be removed: 393091
number of additionally removed short documents: 2032
total number of tokens: 1023189

minimum word count number: 14
this number can be less than MIN_COUNTS because of document removal


In [9]:
# new ids will be created for the documents.
# create a way of restoring initial ids:
doc_decoder = {i: doc_id for i, (doc_id, doc) in enumerate(encoded_docs)}

In [42]:
import pdb

In [43]:
data = []
# new ids are created here
for index, (_, doc) in tqdm(enumerate(encoded_docs)):
    windows = get_windows(doc, HALF_WINDOW_SIZE)
    # index represents id of a document, 
    # windows is a list of (word, window around this word),
    # where word is in the document
    pdb.set_trace()
    data += [[index, w[0]] + w[1] for w in windows]

data = np.array(data, dtype='int64')

0it [00:00, ?it/s]

> [0;32m<ipython-input-43-cbb6d20d8645>[0m(9)[0;36m<module>[0;34m()[0m
[0;32m      7 [0;31m    [0;31m# where word is in the document[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      8 [0;31m    [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m----> 9 [0;31m    [0mdata[0m [0;34m+=[0m [0;34m[[0m[0;34m[[0m[0mindex[0m[0;34m,[0m [0mw[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m][0m [0;34m+[0m [0mw[0m[0;34m[[0m[0;36m1[0m[0;34m][0m [0;32mfor[0m [0mw[0m [0;32min[0m [0mwindows[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     10 [0;31m[0;34m[0m[0m
[0m[0;32m     11 [0;31m[0mdata[0m [0;34m=[0m [0mnp[0m[0;34m.[0m[0marray[0m[0;34m([0m[0mdata[0m[0;34m,[0m [0mdtype[0m[0;34m=[0m[0;34m'int64'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  windows


[(3, [2390, 443, 203, 2929, 655, 93, 688, 2390, 880, 3624]), (2390, [3, 443, 203, 2929, 655, 93, 688, 2390, 880, 3624]), (443, [3, 2390, 203, 2929, 655, 93, 688, 2390, 880, 3624]), (203, [3, 2390, 443, 2929, 655, 93, 688, 2390, 880, 3624]), (2929, [3, 2390, 443, 203, 655, 93, 688, 2390, 880, 3624]), (655, [3, 2390, 443, 203, 2929, 93, 688, 2390, 880, 3624]), (93, [2390, 443, 203, 2929, 655, 688, 2390, 880, 3624, 48]), (688, [443, 203, 2929, 655, 93, 2390, 880, 3624, 48, 12]), (2390, [203, 2929, 655, 93, 688, 880, 3624, 48, 12, 12]), (880, [2929, 655, 93, 688, 2390, 3624, 48, 12, 12, 19]), (3624, [655, 93, 688, 2390, 880, 48, 12, 12, 19, 80]), (48, [93, 688, 2390, 880, 3624, 12, 12, 19, 80, 2660]), (12, [688, 2390, 880, 3624, 48, 12, 19, 80, 2660, 12]), (12, [2390, 880, 3624, 48, 12, 19, 80, 2660, 12, 3358]), (19, [880, 3624, 48, 12, 12, 80, 2660, 12, 3358, 2390]), (80, [3624, 48, 12, 12, 19, 2660, 12, 3358, 2390, 61]), (2660, [48, 12, 12, 19, 80, 12, 3358, 2390, 61, 1853]), (12, [12, 1

ipdb>  len(windows)


47


ipdb>  windows[0]


(3, [2390, 443, 203, 2929, 655, 93, 688, 2390, 880, 3624])


ipdb>  windows[1]


(2390, [3, 443, 203, 2929, 655, 93, 688, 2390, 880, 3624])


ipdb>  index


0


ipdb>  len(encoded_docs)


12829


ipdb>  doc


[3, 2390, 443, 203, 2929, 655, 93, 688, 2390, 880, 3624, 48, 12, 12, 19, 80, 2660, 12, 3358, 2390, 61, 1853, 32, 3392, 139, 791, 353, 1245, 2, 1304, 373, 972, 4957, 3392, 2, 1304, 289, 2390, 755, 2145, 2066, 232, 567, 791, 353, 2390, 170]


ipdb>  w[0]


*** invalid literal for int() with base 10: '[0]'


ipdb>  windows[0][0]


3


ipdb>  windows[0][1]


[2390, 443, 203, 2929, 655, 93, 688, 2390, 880, 3624]


ipdb>  q


0it [02:42, ?it/s]


BdbQuit: 

In [41]:
data.shape

(1023189, 12)

In [11]:
# a row in 'data' contains:
# id of a document, id of a word in this document, a window around this word
# 1 + 1 + 10
data.shape[1]

12

In [12]:
# number of windows (equals to the total number of tokens)
data.shape[0]

1023189

# Get unigram distribution

In [13]:
word_counts = np.array(word_counts)
unigram_distribution = word_counts/sum(word_counts)

# Prepare word vectors

In [14]:
%%time
vocab_size = len(decoder)
embedding_dim = 50

# train a skip-gram word2vec model
texts = [[str(j) for j in doc] for i, doc in encoded_docs]
model = models.Word2Vec(texts, size=embedding_dim, window=5, workers=4, sg=1, negative=15, iter=70)
model.init_sims(replace=True)

word_vectors = np.zeros((vocab_size, embedding_dim)).astype('float32')
for i in decoder:
    word_vectors[i] = model.wv[str(i)]

CPU times: user 34min 19s, sys: 1.31 s, total: 34min 20s
Wall time: 8min 42s


In [16]:
# number of unique words
vocab_size

7460

# Prepare initialization for document weights

In [17]:
texts = [[decoder[j] for j in doc] for i, doc in encoded_docs]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [18]:
%%time
n_topics = 25
lda = models.LdaModel(corpus, alpha=0.9, id2word=dictionary, num_topics=n_topics)
corpus_lda = lda[corpus]

CPU times: user 5.78 s, sys: 3 µs, total: 5.78 s
Wall time: 5.79 s


In [19]:
for i, topics in lda.show_topics(n_topics, formatted=False):
    print('topic', i, ':', ' '.join([t for t, _ in topics]))

topic 0 : dog sure hear gay friend turn long actually cop auto
topic 1 : bike pin cable lot fan battery buy sure cause little
topic 2 : phone chip encryption privacy clipper device escrow wire wiretap order
topic 3 : car fbi police tire engine belt buy dealer batf happen
topic 4 : kinsey appear book cover copy rider art sex bag comic
topic 5 : section entry code jpeg bit build rule input size output
topic 6 : earth launch orbit moon planet mission solar spacecraft space greek
topic 7 : reason probably son happen make actually far essence kind lot
topic 8 : window ripem message des pgp rsa public application copy bit
topic 9 : software version user display server ftp datum application format package
topic 10 : price technology computer model product cost old buy build ne
topic 11 : dos windows error giz modem window command master shell tcp
topic 12 : space nasa technology cost build design flight shuttle station nsa
topic 13 : israel jews israeli jewish arab kill human religion live ga

In [20]:
doc_weights_init = np.zeros((len(corpus_lda), n_topics))
for i in tqdm(range(len(corpus_lda))):
    topics = corpus_lda[i]
    for j, prob in topics:
        doc_weights_init[i, j] = prob

100%|██████████| 12829/12829 [00:05<00:00, 2374.01it/s]


# Save data

In [21]:
np.save('data.npy', data)
np.save('word_vectors.npy', word_vectors)
np.save('unigram_distribution.npy', unigram_distribution)
np.save('decoder.npy', decoder)
np.save('doc_decoder.npy', doc_decoder)
np.save('doc_weights_init.npy', doc_weights_init)