In [36]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np

# Load the 20 Newsgroups corpus; subset='all' requests the whole dataset
# rather than a single split.
raw_data = fetch_20newsgroups(subset='all')
# dtype=object stores a reference to each Python str. Without it NumPy builds
# a fixed-width unicode array ('<U{max_len}') in which every document is padded
# to the length of the longest one (4 bytes per character per row), which is
# needlessly expensive for a corpus of variable-length posts.
sentences = np.asarray(raw_data.data, dtype=object)
# Integer class id per document, aligned index-for-index with `sentences`.
labels = raw_data.target

In [42]:
# Number of documents held in `sentences` (displayed as the cell output).
len(sentences)

18846

In [37]:
from torchtext.data import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab


def my_build_vocab_from_iterator(iterator, max_tokens=None):
    """Build a ``Vocab`` from an iterable of token collections.

    Args:
        iterator: Iterable whose items are each passed to ``Counter.update``
            (e.g. one list of tokens per document).
        max_tokens: Forwarded unchanged to ``Vocab`` as ``max_size``.
            NOTE(review): in this notebook ``max_tokens=19998`` produced
            ``len(vocab) == 20000``, so ``Vocab`` presumably adds entries
            (e.g. special tokens) on top of ``max_size`` -- confirm.

    Returns:
        The ``Vocab`` constructed from the accumulated token frequencies.
    """
    token_frequencies = Counter()
    for token_seq in iterator:
        # Accumulate frequencies across every item of the stream.
        token_frequencies.update(token_seq)

    return Vocab(token_frequencies, max_size=max_tokens)


def yield_tokens(data_iter):
    """Lazily tokenize every text produced by ``data_iter``.

    Relies on the module-level ``tokenizer`` callable, which must be bound
    before the returned generator is consumed.

    Args:
        data_iter: Iterable of texts.

    Yields:
        ``tokenizer(text)`` for each text, in input order.
    """
    yield from (tokenizer(document) for document in data_iter)

# yield_tokens reads `tokenizer` as a global, so it has to be bound before the
# token generator below is consumed.
tokenizer = get_tokenizer("basic_english")
sentence_iter = iter(sentences)

# Count tokens over the whole corpus and cap the vocabulary via max_tokens.
vocab = my_build_vocab_from_iterator(
    yield_tokens(sentence_iter),
    max_tokens=19998,
)
print(len(vocab))

20000


In [38]:
from torchtext.vocab import GloVe

# Pretrained GloVe embeddings: the '840B' release at 300 dimensions.
# NOTE(review): presumably downloads and caches a large archive on first use -- confirm.
global_vectors = GloVe(name='840B', dim=300)

In [39]:
# Token list taken from the vocabulary (presumably index-to-token order).
vocab_list = vocab.itos
# Look up one pretrained vector per vocabulary token.
weights_matrix = global_vectors.get_vecs_by_tokens(vocab_list)

# Print the shape, then show the matrix itself as the cell output.
# NOTE(review): the all-zero rows in the output are presumably tokens that have
# no GloVe entry (e.g. special tokens) -- confirm before training on them.
print(weights_matrix.shape)
weights_matrix

torch.Size([20000, 300])


tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0120,  0.2075, -0.1258,  ...,  0.1387, -0.3605, -0.0350],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [40]:
import pickle

# Persist the embedding weight matrix to disk with pickle.
# NOTE: only unpickle this file from a trusted location; pickle.load can
# execute arbitrary code.
fname = 'news_vocab_glove_weight.pk'
with open(fname, 'wb') as f:
    pickle.dump(weights_matrix, f)

In [41]:
# Persist the vocabulary token list with pickle; relies on `pickle` having
# been imported by an earlier cell.
# NOTE: only unpickle this file from a trusted location.
vname = 'news_vocab_list.pk'
with open(vname, 'wb') as f:
    pickle.dump(vocab_list, f)