# import

In [10]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from torch.optim.lr_scheduler import LambdaLR
import torch.optim as optim

from torchtext.vocab import build_vocab_from_iterator
from torchtext.data import to_map_style_dataset
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import WikiText2

from functools import partial

from tqdm.auto import tqdm

import numpy as np

# Model
![](https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FCzgg5%2FbtqEttXkz91%2FLK5RqukCujicrxQ2kRWt0k%2Fimg.png)

# 1. CBOW Model
![](https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FbTF4tu%2FbtqErKyNHfS%2FNUbfNSKkCF2ktHwVleDknK%2Fimg.png)

In [2]:
# nn.Embedding
# https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html

EMBEDING_DIM = 300  # size of every word vector
EMBEDING_MAX_NORM = 1  # passed to nn.Embedding as max_norm (upper bound on the norm of looked-up rows)


class CBOW(nn.Module):
    """Continuous bag-of-words word2vec model.

    The embeddings of the context words are averaged and a linear layer maps
    that average to one logit per vocabulary entry.
    """

    def __init__(self, vocab_size: int):
        """Create the embedding table and the output projection.

        :param vocab_size: number of tokens in the vocabulary; used both as the
            number of embedding rows and as the number of output logits.
        """
        super().__init__()

        # One EMBEDING_DIM-sized vector per token.
        self.embedding = nn.Embedding(vocab_size, EMBEDING_DIM, max_norm=EMBEDING_MAX_NORM)
        # Averaged context vector -> vocabulary logits.
        self.linear = nn.Linear(in_features=EMBEDING_DIM, out_features=vocab_size)

    def forward(self, x):
        """Return vocabulary logits for each row of context token ids.

        :param x: integer tensor of token ids; dimension 1 is averaged away, so
            it is expected to index the context words of one sample.
        :return: tensor of logits whose last dimension has vocab_size entries.
        """
        context_vectors = self.embedding(x)
        pooled = context_vectors.mean(dim=1)  # average over the context words
        return self.linear(pooled)

# 2. Skip-Gram Model
![](https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FdPRcWU%2FbtqEt5nV6nt%2FgdkM3YokcxtAQZVq5unMp1%2Fimg.png)

In [3]:
class SkipGram(nn.Module):
    """Skip-gram word2vec model.

    Each input token is embedded and a linear layer maps that embedding to one
    logit per vocabulary entry.
    """

    def __init__(self, vocab_size: int):
        """Create the embedding table and the output projection.

        :param vocab_size: number of tokens in the vocabulary; used both as the
            number of embedding rows and as the number of output logits.
        """
        super().__init__()

        # One EMBEDING_DIM-sized vector per token.
        self.embedding = nn.Embedding(vocab_size, EMBEDING_DIM, max_norm=EMBEDING_MAX_NORM)
        # Word vector -> vocabulary logits.
        self.linear = nn.Linear(in_features=EMBEDING_DIM, out_features=vocab_size)

    def forward(self, x):
        """Return vocabulary logits for every token id in ``x``.

        :param x: integer tensor of token ids.
        :return: tensor of logits whose last dimension has vocab_size entries.
        """
        word_vectors = self.embedding(x)
        return self.linear(word_vectors)

# 3. Data

In [4]:
MIN_WORD_FREQUENCY = 50  # passed as min_freq: tokens seen fewer times are left out of the vocabulary


# Arguments of torchtext's build_vocab_from_iterator:
#   iterator      - iterator used to build the vocab; must yield a list or iterator of tokens.
#   min_freq      - minimum frequency needed to include a token in the vocabulary.
#   specials      - special symbols to add; the order of the supplied tokens is preserved.
#   special_first - whether to insert the special symbols at the beginning or at the end.
#   max_tokens    - if provided, builds the vocab from the max_tokens - len(specials) most frequent tokens.

def build_vocab(data_iter, tokenizer):
    """Build a vocabulary from raw texts.

    :param data_iter: iterable of raw text items.
    :param tokenizer: callable turning one text item into a sequence of tokens.
    :return: the vocab object, with '<UNK>' registered as a special token and
        its index set as the default index for unknown tokens.
    """
    unk_token = '<UNK>'
    token_stream = (tokenizer(text) for text in data_iter)

    vocab = build_vocab_from_iterator(
        token_stream,
        min_freq=MIN_WORD_FREQUENCY,
        specials=[unk_token],
    )

    # Any token missing from the vocab is looked up as '<UNK>'.
    vocab.set_default_index(vocab[unk_token])
    return vocab

In [5]:
# CBOW Collate_fn

CBOW_N_WORDS = 4  # number of context words taken on EACH side of the target word
MAX_SEQUENCE_LENGTH = 256  # token sequences are cut to this length; a falsy value disables the cut


def collate_CBOW(batch, text_pipeline):
    """collate_fn for the CBOW DataLoader.

    Every text is converted to token ids and scanned with a sliding window of
    CBOW_N_WORDS * 2 + 1 tokens. The middle token of a window is the target;
    the CBOW_N_WORDS tokens before it and the CBOW_N_WORDS tokens after it
    form the context. Texts shorter than one window are skipped, and longer
    texts are truncated to MAX_SEQUENCE_LENGTH tokens.

    :param batch: iterable of raw texts.
    :param text_pipeline: callable mapping a text to a list of token ids.
    :return: (batch_input, batch_output) long tensors; batch_input holds the
        CBOW_N_WORDS * 2 context ids of each window, batch_output the middle id.
    """
    window_size = CBOW_N_WORDS * 2 + 1
    contexts, targets = [], []

    for raw_text in batch:
        token_ids = text_pipeline(raw_text)

        # The length check uses the untruncated sequence.
        if len(token_ids) < window_size:
            continue

        if MAX_SEQUENCE_LENGTH:
            token_ids = token_ids[:MAX_SEQUENCE_LENGTH]

        for start in range(len(token_ids) - window_size + 1):
            window = token_ids[start:start + window_size]
            targets.append(window[CBOW_N_WORDS])
            # Context = the window without its middle element.
            contexts.append(window[:CBOW_N_WORDS] + window[CBOW_N_WORDS + 1:])

    return (
        torch.tensor(contexts, dtype=torch.long),
        torch.tensor(targets, dtype=torch.long),
    )

In [6]:
# Skip-Gram Collate_fn

SKIPGRAM_N_WORDS = 4  # number of context words taken on EACH side of the middle word


def collate_SkipGram(batch, text_pipeline):
    """collate_fn for the Skip-Gram DataLoader.

    Every text is converted to token ids and scanned with a sliding window of
    SKIPGRAM_N_WORDS * 2 + 1 tokens. For each window, one (middle word,
    context word) pair is produced per context word. Texts shorter than one
    window are skipped, and longer texts are truncated to the module-level
    MAX_SEQUENCE_LENGTH.

    :param batch: iterable of raw texts.
    :param text_pipeline: callable mapping a text to a list of token ids.

    :return: 'batch_input' : middle word, 'batch_output' : context words,
        both 1-D long tensors of equal length. They are empty when no text in
        the batch is long enough.
    """
    window_size = SKIPGRAM_N_WORDS * 2 + 1
    batch_input, batch_output = [], []

    for text in batch:
        text_token_ids = text_pipeline(text)

        # The length check uses the untruncated sequence.
        if len(text_token_ids) < window_size:
            continue

        if MAX_SEQUENCE_LENGTH:
            text_token_ids = text_token_ids[:MAX_SEQUENCE_LENGTH]

        for idx in range(len(text_token_ids) - window_size + 1):
            window = text_token_ids[idx:idx + window_size]
            middle = window[SKIPGRAM_N_WORDS]
            context = window[:SKIPGRAM_N_WORDS] + window[SKIPGRAM_N_WORDS + 1:]

            # One training pair per context word, all sharing the same input.
            batch_input.extend([middle] * len(context))
            batch_output.extend(context)

    # Build the tensors only after EVERY text has been processed. Doing this
    # inside the loop returned after the first usable text (dropping the rest
    # of the batch) and returned None when no text was long enough.
    batch_input = torch.tensor(batch_input, dtype=torch.long)
    batch_output = torch.tensor(batch_output, dtype=torch.long)

    return batch_input, batch_output

In [7]:
def get_data_iterator(dataset_name, dataset_type, dataset_dir):
    """Load one split of a supported dataset as a map-style dataset.

    :param dataset_name: name of the dataset; only 'WikiText2' is supported.
    :param dataset_type: split name, forwarded to WikiText2 as ``split``.
    :param dataset_dir: directory forwarded to WikiText2 as ``root``.
    :return: the result of to_map_style_dataset applied to the loaded split.
    :raises ValueError: if dataset_name is not supported.
    """
    # Fail with a clear message instead of hitting an unbound local below.
    if dataset_name != 'WikiText2':
        raise ValueError(f"dataset_name must be 'WikiText2', got {dataset_name!r}")

    data_iter = WikiText2(root=dataset_dir, split=dataset_type)
    return to_map_style_dataset(data_iter)


def get_dataloader_and_vocab(model_name, dataset_name, dataset_type, dataset_dir, batch_size, shuffle, vocab=None):
    """Build a DataLoader for the requested model and return it with its vocab.

    :param model_name: 'CBOW' or 'SkipGram'; selects the collate function.
    :param dataset_name: forwarded to get_data_iterator.
    :param dataset_type: forwarded to get_data_iterator (dataset split).
    :param dataset_dir: forwarded to get_data_iterator (dataset location).
    :param batch_size: number of raw texts handed to the collate function per batch.
    :param shuffle: forwarded to DataLoader.
    :param vocab: existing vocabulary to reuse; when None a new one is built
        from the loaded data.
    :return: (dataloader, vocab)
    :raises ValueError: if model_name is neither 'CBOW' nor 'SkipGram'.
    """
    # Validate first so a wrong model name fails before the dataset is loaded
    # and the vocabulary is built.
    if model_name == 'CBOW':
        collate_fn = collate_CBOW
    elif model_name == 'SkipGram':
        collate_fn = collate_SkipGram
    else:
        raise ValueError('model_name must be CBOW or SkipGram')

    data_iter = get_data_iterator(dataset_name, dataset_type, dataset_dir)
    tokenizer = get_tokenizer('basic_english', language='en')

    # Explicit None check: only a missing vocab triggers a rebuild.
    if vocab is None:
        vocab = build_vocab(data_iter, tokenizer)

    def text_pipeline(text):
        """Tokenize a raw text and map its tokens to vocabulary ids."""
        return vocab(tokenizer(text))

    dataloader = DataLoader(
        data_iter,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=partial(collate_fn, text_pipeline=text_pipeline),
    )

    return dataloader, vocab

# 4. Train / Validate

In [8]:
def get_lr_scheduler(optimizer, total_epochs: int, verbose: bool = True):
    """Return a LambdaLR scheduler that decays the learning rate linearly to 0.

    In the paper the learning rate starts at 0.025 and decreases linearly every
    epoch so that it reaches 0 at the end. Here the optimizer's initial
    learning rate is multiplied by (total_epochs - epoch) / total_epochs.

    :param optimizer: optimizer whose learning rate is scheduled.
    :param total_epochs: number of epochs over which the rate decays to 0;
        must be positive.
    :param verbose: when True, ask the scheduler to print every update.
    :return: the LambdaLR scheduler.
    :raises ValueError: if total_epochs is not positive.
    """
    if total_epochs <= 0:
        raise ValueError('total_epochs must be a positive integer')

    def linear_decay(epoch):
        # Clamp at 0 so stepping past total_epochs never gives a negative rate.
        return max(0.0, (total_epochs - epoch) / total_epochs)

    # `verbose` is deprecated in recent PyTorch releases, so it is forwarded
    # only when actually requested; leaving it out is equivalent to False.
    extra_kwargs = {'verbose': True} if verbose else {}

    return LambdaLR(optimizer, lr_lambda=linear_decay, **extra_kwargs)

In [9]:
# Training split: no vocab is passed (the parameter defaults to None), so the
# vocabulary is built from this split.
train_dataloader, vocab = get_dataloader_and_vocab(
    'CBOW',
    'WikiText2',
    'train',
    'data/WikiText2',
    batch_size=32,
    shuffle=True,
)

# Validation split: reuses the vocabulary returned above; the vocab returned
# by this call is discarded.
val_dataloader, _ = get_dataloader_and_vocab(
    'CBOW',
    'WikiText2',
    'valid',
    'data/WikiText2',
    batch_size=32,
    shuffle=False,
    vocab=vocab,
)

In [13]:
# Training configuration.
epochs = 5

# Use the GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

# Number of entries in the vocab's token -> index mapping.
vocab_size = len(vocab.get_stoi())
print(f'Vocabulary size: {vocab_size}')

# Model, loss, optimiser and the per-epoch linear learning-rate decay.
model = CBOW(vocab_size)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.025)
lr_scheduler = get_lr_scheduler(optimizer, total_epochs=epochs, verbose=True)

cuda
Vocabulary size: 4100
Adjusting learning rate of group 0 to 2.5000e-02.


In [16]:
# Per-epoch mean losses. The per-batch loss tensor is deliberately named
# `batch_loss` so that it does not overwrite this history dict.
loss = {'train': [], 'valid': []}

for epoch in tqdm(range(epochs)):
    ######################## Train #########################
    model.train()
    running_loss = []

    for batch_data in train_dataloader:
        inputs = batch_data[0].to(device)
        labels = batch_data[1].to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        batch_loss = criterion(outputs, labels)
        batch_loss.backward()
        optimizer.step()

        running_loss.append(batch_loss.item())

    train_loss = np.mean(running_loss)
    loss['train'].append(train_loss)

    ######################### Validate #############################
    model.eval()
    running_loss = []

    # No gradients are needed for validation.
    with torch.no_grad():
        for batch_data in val_dataloader:
            inputs = batch_data[0].to(device)
            labels = batch_data[1].to(device)

            outputs = model(inputs)
            batch_loss = criterion(outputs, labels)

            running_loss.append(batch_loss.item())

    val_loss = np.mean(running_loss)
    loss['valid'].append(val_loss)

    print(
        'Epoch: {}/{}, Train Loss={:.5f}, Val Loss={:.5f}'.format(
            epoch + 1,
            epochs,
            train_loss,
            val_loss,
        )
    )

    # Advance the linear learning-rate decay once per epoch.
    lr_scheduler.step()

    ####################### Save checkpoint ##############################
    # NOTE: this saves the whole model object (not just its state_dict) and
    # overwrites the same file every epoch.
    model_path = 'checkpoint.pt'
    torch.save(model, model_path)

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch: 1/5, Train Loss=5.01915, Val Loss=5.03971
Adjusting learning rate of group 0 to 2.0000e-02.
Epoch: 2/5, Train Loss=4.95284, Val Loss=5.02065
Adjusting learning rate of group 0 to 1.5000e-02.
Epoch: 3/5, Train Loss=4.88210, Val Loss=4.94576
Adjusting learning rate of group 0 to 1.0000e-02.
Epoch: 4/5, Train Loss=4.78571, Val Loss=4.88225
Adjusting learning rate of group 0 to 5.0000e-03.
Epoch: 5/5, Train Loss=4.64527, Val Loss=4.77131
Adjusting learning rate of group 0 to 0.0000e+00.


# 5. Inference

In [17]:
import numpy as np
import pandas as pd
import torch
import sys

from sklearn.manifold import TSNE
import plotly.graph_objects as go

In [18]:
# embedding from first model layer
# Take the model's first parameter tensor (the embedding weights, since the
# embedding layer is the first one the model defines) as a NumPy array.
embeddings = next(iter(model.parameters())).cpu().detach().numpy()

# Scale every row to unit L2 norm; `norms` is kept as a column so the
# division broadcasts row-wise.
norms = (embeddings ** 2).sum(axis=1, keepdims=True) ** 0.5
embeddings_norm = embeddings / norms
embeddings_norm.shape

(4100, 300)

In [19]:
# get embeddings
# Wrap the (un-normalised) embedding matrix in a DataFrame.
embeddings_df = pd.DataFrame(embeddings)

# Project the embeddings to 2-D with t-SNE and label each row with its token,
# using the vocab's index -> token order.
tsne = TSNE(n_components=2)
embeddings_df_trans = pd.DataFrame(
    tsne.fit_transform(embeddings_df),
    index=vocab.get_itos(),
)

# Boolean mask: True where the token is numeric.
is_numeric = embeddings_df_trans.index.str.isnumeric()



In [20]:
# Numeric tokens are drawn in green, all other tokens in black.
color = np.where(is_numeric, "green", "black")

# Text-only scatter plot: each token is written at its 2-D t-SNE position.
fig = go.Figure(
    data=[
        go.Scatter(
            x=embeddings_df_trans[0],
            y=embeddings_df_trans[1],
            mode="text",
            text=embeddings_df_trans.index,
            textposition="middle center",
            textfont={"color": color},
        )
    ]
)
fig.write_html("word2vec_visualization.html")

In [21]:
def get_top_similar(word: str, topN: int = 10):
    """Return the topN vocabulary words most similar to ``word``.

    Similarity is the dot product between rows of the module-level
    ``embeddings_norm`` matrix (cosine similarity, given that its rows are
    unit-normalised).

    :param word: query word, looked up in the module-level ``vocab``.
    :param topN: number of neighbours to return.
    :return: dict mapping each similar word to its similarity, ordered from
        most to least similar; None (after printing a message) when the word
        maps to index 0.
    """
    word_id = vocab[word]
    # Index 0 is treated as the unknown-word index
    # (presumably '<UNK>', the only special token — verify against build_vocab).
    if word_id == 0:
        print("Out of vocabulary word")
        return

    word_vec = embeddings_norm[word_id].reshape(-1, 1)
    dists = np.matmul(embeddings_norm, word_vec).flatten()

    # Exclude the query word by id rather than assuming it ranks first: on a
    # tie another word could take rank 0 and the query would leak into the result.
    ranked_ids = np.argsort(-dists)
    topN_ids = ranked_ids[ranked_ids != word_id][:topN]

    return {
        vocab.lookup_token(int(sim_word_id)): dists[sim_word_id]
        for sim_word_id in topN_ids
    }

In [23]:
# Print the nearest neighbours of "germany" with their similarity scores.
germany_neighbours = get_top_similar("germany")
for word, sim in germany_neighbours.items():
    print("{}: {:.3f}".format(word, sim))

italy: 0.497
spain: 0.449
states: 0.405
1962: 0.404
australia: 0.400
kingdom: 0.398
ranging: 0.390
1927: 0.385
mediterranean: 0.384
france: 0.383


In [24]:
# Word-analogy probe: king - man + woman, computed on the raw embeddings.
emb1, emb2, emb3 = (embeddings[vocab[token]] for token in ("king", "man", "woman"))

# Combine the vectors, scale the result to unit length and make it a column
# so it can be multiplied with the normalised embedding matrix.
emb4 = emb1 - emb2 + emb3
emb4_norm = (emb4 ** 2).sum() ** 0.5
emb4 = (emb4 / emb4_norm).reshape(-1, 1)

# Similarity of the combined vector to every vocabulary word.
dists = (embeddings_norm @ emb4).flatten()

# Five most similar words; the query words themselves are not filtered out.
top5 = np.argsort(-dists)[:5]

for word_id in top5:
    print("{}: {:.3f}".format(vocab.lookup_token(word_id), dists[word_id]))

king: 0.735
woman: 0.437
treaty: 0.402
spain: 0.386
bones: 0.378
