In [1]:
# Notebook-wide IPython setup. Comments are kept on their own lines because
# anything written after a line magic is handed to the magic as an argument.
# autoreload mode 2: re-import edited modules before each cell is executed.
%load_ext autoreload
%autoreload 2
# lab_black: format code cells with Black when they are run.
%load_ext lab_black

In [2]:
import numpy as np
import pandas as pd
import sys
import yaml
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from torch.utils.data import DataLoader
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer, ngrams_iterator

import nltk

sys.path.append("../")

from utils.model import CBOW_Model
from utils.dataset import CBOW_Dataset, SkipGram_Dataset
from utils.constants import MIN_WORD_FREQUENCY, EMBED_DIMENSION
from utils.trainer import Trainer

In [3]:
# Read the YAML config that lives one directory above this notebook.
# The file is only held open for the read; parsing happens afterwards.
with open("../config.yaml", "r") as config_file:
    config_text = config_file.read()

# safe_load builds plain Python containers only (no arbitrary object tags).
config = yaml.safe_load(config_text)

# Bare last expression so the notebook renders the parsed settings.
config

{'dataset': 'WikiText2',
 'data_dir': 'data/',
 'train_batch_size': 1000,
 'val_batch_size': 1000,
 'learning_rate': 0.025,
 'epochs': 20,
 'train_steps': None,
 'val_steps': None,
 'checkpoint_frequency': 5,
 'model_dir': 'weights/'}

In [36]:
# Restore the saved CBOW checkpoint and the vocabulary stored next to it.
#
# SECURITY: torch.load unpickles arbitrary Python objects, so only open
# checkpoint files that come from a trusted source.
#
# map_location="cpu" lets a checkpoint that was written from a GPU load on a
# CPU-only machine; this notebook only inspects the weights through NumPy, so
# nothing here needs the model to sit on an accelerator.
#
# NOTE(review): these files hold whole pickled objects rather than state
# dicts. Recent PyTorch releases default to weights_only=True, which rejects
# such files -- pass weights_only=False there if loading fails (confirm against
# the installed version).
model = torch.load("../weights/cbow_model_010.pt", map_location="cpu")
vocab = torch.load("../weights/vocab.pt")

In [37]:
# Take the model's first parameter tensor and turn it into a NumPy array
# (moved to host memory and detached from autograd first).
embeddings_raw = list(model.parameters())[0].cpu().detach().numpy()

# L2 norm of every column; dividing by it gives unit-length columns, so the
# dot product of two columns is their cosine similarity.
column_norms = np.sqrt(np.square(embeddings_raw).sum(axis=0))
embeddings = embeddings_raw / column_norms

# Displayed as the cell result; other cells index columns by token id.
embeddings.shape

(300, 4122)

In [35]:
# Nearest neighbours of "father": score every token by the dot product of its
# embedding column with the query column, then list the best matches.
# NOTE(review): the cell executed as In [38] repeats this code verbatim but
# its saved output differs, so the two were presumably run against different
# `embeddings` -- consider keeping only one of them.
main_word_id = vocab["father"]

# Query column as a (1, dim) row vector so it can multiply the full matrix.
word_vec = embeddings[:, main_word_id]
word_vec = word_vec.reshape(1, len(word_vec))

# One score per token; flatten() drops the leading singleton axis.
dists = (word_vec @ embeddings).flatten()

print(vocab.lookup_token(main_word_id))
print()

# Sorting the negated scores ranks the best match first. Despite its name,
# `top5` keeps the ten highest-scoring token ids (the query word included).
top5 = (-dists).argsort()[:10]

for word_id in top5:
    token = vocab.lookup_token(word_id)
    print("{}: {:.3f}".format(token, dists[word_id]))

father

father: 1.000
mother: 0.359
husband: 0.350
keamy: 0.346
homer: 0.319
childhood: 0.312
successor: 0.312
experiences: 0.309
birth: 0.308
relationship: 0.303


In [38]:
# Rank the vocabulary by similarity to "father" using the column-normalised
# `embeddings` matrix.
# NOTE(review): same code as the cell executed as In [35]; only the saved
# output differs. One of the two copies is probably redundant.
main_word_id = vocab["father"]

# Lift the query column to shape (1, dim) for the matrix product.
word_vec = embeddings[:, main_word_id]
word_vec = word_vec[np.newaxis, :]

# Dot product of the query with every column -> one score per token id.
scores_row = word_vec @ embeddings
dists = scores_row.flatten()

print(vocab.lookup_token(main_word_id))
print()

# argsort is ascending, so sort the negated scores to get best-first order.
# `top5` actually holds ten ids, and the query word itself comes first.
top5 = np.argsort(-dists)[:10]

for word_id in top5:
    score = dists[word_id]
    print("{}: {:.3f}".format(vocab.lookup_token(word_id), score))

father

father: 1.000
mother: 0.360
wife: 0.315
brother: 0.310
son: 0.273
parents: 0.242
career: 0.239
daughter: 0.233
journey: 0.229
friend: 0.213


In [20]:
# Vector-arithmetic probe: "leading" - "lead" + "do", then look for the
# tokens whose embedding columns score highest against the result.
emb1, emb2, emb3 = (
    embeddings[:, vocab[word]] for word in ("leading", "lead", "do")
)

# NOTE(review): the combined vector is not re-normalised, so the scores below
# are plain dot products rather than cosine similarities, and the input words
# are not filtered out of the ranking.
emb4 = emb1 - emb2 + emb3
emb4 = emb4.reshape(1, len(emb4))
dists = (emb4 @ embeddings).flatten()

# Negated scores sort best-first; keep the five top token ids.
top5 = (-dists).argsort()[:5]

for word_id in top5:
    token = vocab.lookup_token(word_id)
    print("{}: {:.3f}".format(token, dists[word_id]))

do: 0.915
leading: 0.817
did: 0.455
does: 0.428
1939: 0.349


In [80]:
# Build the training dataset from the notebook config and wrap it in a
# shuffling DataLoader that drops the final incomplete batch.
dataset_class = CBOW_Dataset

# NOTE(review): config["data_dir"] is a relative path, so it resolves against
# this notebook's working directory, while the config and weights are read
# via "../" -- confirm that this is the intended data location.
# vocab=None hands no pre-built vocabulary to the dataset (presumably it then
# builds its own -- confirm in utils/dataset.py).
dataset_kwargs = {
    "name": config["dataset"],
    "set_type": "train",
    "data_dir": config["data_dir"],
    "vocab": None,
}
train_dataset = dataset_class(**dataset_kwargs)

loader_kwargs = {
    "batch_size": config["train_batch_size"],
    "shuffle": True,
    "drop_last": True,
}
train_dataloader = DataLoader(train_dataset, **loader_kwargs)

wikitext-2-v1.zip: 100%|██████████| 4.48M/4.48M [00:00<00:00, 8.50MB/s]


In [84]:
# Display the vocab attribute of the dataset held by the training DataLoader.
train_dataloader.dataset.vocab

Vocab()