In [1]:
import logging
import re
from collections import Counter

import numpy as np
import torch
from sklearn.datasets import fetch_20newsgroups
from torch.autograd import Variable
from torch.utils.data import Dataset
from tqdm import tqdm

# Hyperparameters.
# NOTE(review): in the cells visible here only RIGHT_WINDOW is referenced;
# train_model() is called without any of the other values — confirm whether
# fast_glove reads its own copies of them.
N_EMBEDDING = 300
BASE_STD = 0.01
BATCH_SIZE = 512
NUM_EPOCH = 10
MIN_WORD_OCCURENCES = 10  # (sic) spelling kept so the existing name stays unchanged
X_MAX = 100
ALPHA = 0.75
BETA = 0.0001
RIGHT_WINDOW = 4  # passed to GloveDataset(right_window=...)

# Fixed seed so that a Restart & Run All draws the same random numbers from
# NumPy's and PyTorch's global generators.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Detect CUDA instead of assuming it, so the flag is also correct on
# CPU-only machines.
USE_CUDA = torch.cuda.is_available()

# Log line layout: timestamp (left-aligned, minimum width 15) then the message.
FORMAT = '%(asctime)-15s %(message)s'
logging.basicConfig(level=logging.DEBUG, format=FORMAT)

In [2]:
# Load the 20 Newsgroups corpus through scikit-learn, asking it to strip the
# 'headers', 'footers' and 'quotes' parts of every post.
logging.info("Fetching data")
newsgroup = fetch_20newsgroups(
    remove=('headers', 'footers', 'quotes'),
)

2022-02-15 17:20:53,023 Fetching data


In [3]:
# Sanity check: show the raw text of the document at index 1 of the fetched corpus.
print(newsgroup.data[1])

A fair number of brave souls who upgraded their SI clock oscillator have
shared their experiences for this poll. Please send a brief message detailing
your experiences with the procedure. Top speed attained, CPU rated speed,
add on cards and adapters, heat sinks, hour of usage per day, floppy disk
functionality with 800 and 1.4 m floppies are especially requested.

I will be summarizing in the next two days, so please add to the network
knowledge base if you have done the clock upgrade and haven't answered this
poll. Thanks.


In [4]:
from fast_glove import GloveDataset, train_model

In [5]:
# Build a GloveDataset from the first 1000 documents only, using the
# RIGHT_WINDOW setting from the configuration cell.
logging.info("Build dataset")
corpus_subset = newsgroup.data[:1000]
glove_data = GloveDataset(corpus_subset, right_window=RIGHT_WINDOW)

# Report the dataset size: word count from its indexer, then its length.
vocab_size = glove_data.indexer.n_words
logging.info("#Words: %s", vocab_size)
ngram_count = len(glove_data)
logging.info("#Ngrams: %s", ngram_count)

# NOTE(review): train_model() receives only the dataset; the hyperparameter
# constants from the configuration cell are not passed here — confirm that
# fast_glove picks up the intended values.
logging.info("Start training")
train_model(glove_data)

2022-02-15 17:20:59,617 Build dataset
2022-02-15 17:21:00,672 #Words: 2141
2022-02-15 17:21:00,673 #Ngrams: 168558
2022-02-15 17:21:00,674 Start training
  0%|          | 0/10 [00:00<?, ?it/s]2022-02-15 17:21:00,678 Start epoch 0

  0%|          | 0/329 [00:00<?, ?it/s][A
100%|██████████| 329/329 [00:02<00:00, 111.30it/s]
2022-02-15 17:21:03,641 Average loss for epoch 1: 61.20724
 10%|█         | 1/10 [00:02<00:26,  2.96s/it]2022-02-15 17:21:03,642 Start epoch 1

100%|██████████| 329/329 [00:00<00:00, 411.25it/s]
2022-02-15 17:21:04,448 Average loss for epoch 2: 21.61349
 20%|██        | 2/10 [00:03<00:13,  1.70s/it]2022-02-15 17:21:04,449 Start epoch 2

100%|██████████| 329/329 [00:00<00:00, 420.72it/s]
2022-02-15 17:21:05,236 Average loss for epoch 3: 13.78170
 30%|███       | 3/10 [00:04<00:08,  1.28s/it]2022-02-15 17:21:05,238 Start epoch 3

100%|██████████| 329/329 [00:00<00:00, 430.63it/s]
2022-02-15 17:21:06,008 Average loss for epoch 4: 11.91761
 40%|████      | 4/10 [00:05<00