In [0]:
import numpy as np
import torch
import torch.autograd as autograd
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [0]:
# Toy training corpus: seven short sentences, each containing the token
# '노인종합복지관'. Sentences are later split on whitespace into tokens.
corpus = [
    '노인종합복지관 안내서비스가 있다던데',
    '내가 알고 싶은 건 노인종합복지관 안내야',
    '노인종합복지관 안내 이게 뭐야',
    '노인종합복지관 안내가 뭔지 궁금하다',
    '노인종합복지관 안내 말해',
    '노인종합복지관 안내가 궁금합니다',
    '노인종합복지관 안내 알려줘',
]

In [13]:
def tokenize_corpus(corpus):
    """Split every sentence of ``corpus`` on whitespace.

    Args:
        corpus: iterable of sentences (objects exposing ``.split()``).

    Returns:
        A list holding one list of tokens per input sentence, in input order.
    """
    tokenized = []
    for sentence in corpus:
        tokenized.append(sentence.split())
    return tokenized

# Split every corpus sentence on whitespace and print the resulting token lists.
tokenized_corpus = tokenize_corpus(corpus)
print(tokenized_corpus)

[['노인종합복지관', '안내서비스가', '있다던데'], ['내가', '알고', '싶은', '건', '노인종합복지관', '안내야'], ['노인종합복지관', '안내', '이게', '뭐야'], ['노인종합복지관', '안내가', '뭔지', '궁금하다'], ['노인종합복지관', '안내', '말해'], ['노인종합복지관', '안내가', '궁금합니다'], ['노인종합복지관', '안내', '알려줘']]


In [14]:
# Unique tokens in first-seen order. dict.fromkeys preserves insertion order
# and deduplicates with a hash lookup, avoiding the quadratic cost of scanning
# a list with `in` for every token.
vocabulary = list(dict.fromkeys(
    token for sentence in tokenized_corpus for token in sentence
))

# Bidirectional lookup tables between tokens and their integer ids.
word2idx = {word: idx for idx, word in enumerate(vocabulary)}
idx2word = dict(enumerate(vocabulary))

vocabulary_size = len(vocabulary)
print(vocabulary)
print(word2idx)
print(idx2word)

['노인종합복지관', '안내서비스가', '있다던데', '내가', '알고', '싶은', '건', '안내야', '안내', '이게', '뭐야', '안내가', '뭔지', '궁금하다', '말해', '궁금합니다', '알려줘']
{'노인종합복지관': 0, '안내서비스가': 1, '있다던데': 2, '내가': 3, '알고': 4, '싶은': 5, '건': 6, '안내야': 7, '안내': 8, '이게': 9, '뭐야': 10, '안내가': 11, '뭔지': 12, '궁금하다': 13, '말해': 14, '궁금합니다': 15, '알려줘': 16}
{0: '노인종합복지관', 1: '안내서비스가', 2: '있다던데', 3: '내가', 4: '알고', 5: '싶은', 6: '건', 7: '안내야', 8: '안내', 9: '이게', 10: '뭐야', 11: '안내가', 12: '뭔지', 13: '궁금하다', 14: '말해', 15: '궁금합니다', 16: '알려줘'}


In [0]:
window_size = 2
idx_pairs = []
# Build (center, context) index pairs for every sentence.
for sentence in tokenized_corpus:
    indices = [word2idx[word] for word in sentence]
    sentence_len = len(indices)
    # Treat each word in turn as the center word.
    for center_word_pos, center_word_idx in enumerate(indices):
        # Clamp the context window so it never leaves the sentence.
        window_start = max(0, center_word_pos - window_size)
        window_stop = min(sentence_len, center_word_pos + window_size + 1)
        for context_word_pos in range(window_start, window_stop):
            # The center word is not its own context.
            if context_word_pos != center_word_pos:
                idx_pairs.append((center_word_idx, indices[context_word_pos]))

# Convert to a NumPy array so rows can be iterated as (center, context) pairs.
idx_pairs = np.array(idx_pairs)

In [0]:
def get_input_layer(word_idx):
    """Return a one-hot float32 vector for ``word_idx``.

    The vector has length ``vocabulary_size`` (module-level), is zero
    everywhere, and holds 1.0 at position ``word_idx``.
    """
    one_hot = torch.zeros(vocabulary_size, dtype=torch.float32)
    one_hot[word_idx] = 1.0
    return one_hot

In [0]:
embedding_dims = 5
# Input-to-hidden weights. Passing requires_grad=True to the factory call
# replaces the deprecated autograd.Variable wrapper; tensors track gradients
# directly.
W1 = torch.randn(embedding_dims, vocabulary_size, dtype=torch.float32, requires_grad=True)
# All-zero placeholder input: z1 is therefore all zeros and has
# embedding_dims entries.
x = torch.zeros(vocabulary_size, dtype=torch.float32)
z1 = torch.matmul(W1, x)

In [0]:
# Hidden-to-output weights. requires_grad=True on the factory call replaces
# the deprecated autograd.Variable wrapper.
W2 = torch.randn(vocabulary_size, embedding_dims, dtype=torch.float32, requires_grad=True)
# Project the hidden vector z1 to one score per vocabulary entry.
z2 = torch.matmul(W2, z1)

In [19]:
embedding_dims = 5
# W1 maps a one-hot word vector to its embedding; W2 maps the embedding to one
# score per vocabulary entry. Tensors created with requires_grad=True are
# tracked by autograd directly (the Variable wrapper is deprecated).
# NOTE: no RNG seed is set, so initial weights and printed losses vary per run.
W1 = torch.randn(embedding_dims, vocabulary_size, dtype=torch.float32, requires_grad=True)
W2 = torch.randn(vocabulary_size, embedding_dims, dtype=torch.float32, requires_grad=True)
num_epochs = 100
learning_rate = 0.001

for epo in range(num_epochs):
    # Running sum of per-pair losses for this epoch, kept as a Python float.
    loss_val = 0.0
    for data, target in idx_pairs:
        x = get_input_layer(data)
        y_true = torch.tensor([int(target)], dtype=torch.long)

        z1 = torch.matmul(W1, x)
        z2 = torch.matmul(W2, z1)

        log_softmax = F.log_softmax(z2, dim=0)

        # nll_loss expects a (batch, classes) input, hence the view to (1, -1).
        loss = F.nll_loss(log_softmax.view(1, -1), y_true)
        # .item() extracts the scalar value without keeping a tensor around.
        loss_val += loss.item()
        loss.backward()

        # Plain SGD step. no_grad keeps the in-place updates out of the
        # autograd graph instead of bypassing it through `.data`.
        with torch.no_grad():
            W1 -= learning_rate * W1.grad
            W2 -= learning_rate * W2.grad

            # Gradients accumulate across backward() calls, so reset them.
            W1.grad.zero_()
            W2.grad.zero_()
    if epo % 10 == 0:
        print(f'Loss at epo {epo}: {loss_val/len(idx_pairs)}')

Loss at epo 0: 5.395770072937012
Loss at epo 10: 4.814876556396484
Loss at epo 20: 4.393234729766846
Loss at epo 30: 4.082814693450928
Loss at epo 40: 3.8434722423553467
Loss at epo 50: 3.651871681213379
Loss at epo 60: 3.4944581985473633
Loss at epo 70: 3.362623929977417
Loss at epo 80: 3.250505208969116
Loss at epo 90: 3.1539103984832764
