<a href="https://colab.research.google.com/github/UDICatNCHU/PyTorch-SocialNetwork/blob/master/word2vec_skip_gram_social_network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 手刻版 Hand-Crafted Word2Vec (Skip-gram version)

In [0]:
import torch
import numpy as np
import torch.functional as F
import torch.nn.functional as F

In [0]:
corpus = [
    'he is a king',
    'she is a queen',
    'he is a man',
    'she is a woman',
    'warsaw is poland capital',
    'berlin is germany capital',
    'paris is france capital',   
]

In [0]:
def tokenize_corpus(corpus):
    tokens = [x.split() for x in corpus]
    return tokens

tokenized_corpus = tokenize_corpus(corpus)

In [0]:
vocabulary = []
for sentence in tokenized_corpus:
    for token in sentence:
        if token not in vocabulary:
            vocabulary.append(token)

word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}

vocabulary_size = len(vocabulary)

In [237]:
idx2word

{0: 'he',
 1: 'is',
 2: 'a',
 3: 'king',
 4: 'she',
 5: 'queen',
 6: 'man',
 7: 'woman',
 8: 'warsaw',
 9: 'poland',
 10: 'capital',
 11: 'berlin',
 12: 'germany',
 13: 'paris',
 14: 'france'}

In [0]:
window_size = 2
idx_pairs = []
# for each sentence
for sentence in tokenized_corpus:
    indices = [word2idx[word] for word in sentence]
    # for each word, threated as center word
    for center_word_pos in range(len(indices)):
        # for each window position
        for w in range(-window_size, window_size + 1):
            context_word_pos = center_word_pos + w
            # make soure not jump out sentence
            if context_word_pos < 0 or context_word_pos >= len(indices) or center_word_pos == context_word_pos:
                continue
            context_word_idx = indices[context_word_pos]
            idx_pairs.append((indices[center_word_pos], context_word_idx))

idx_pairs = np.array(idx_pairs) # it will be useful to have this as numpy array

In [240]:
idx_pairs[:5]

array([[0, 1],
       [0, 2],
       [1, 0],
       [1, 2],
       [1, 3]])

In [242]:
for item in idx_pairs[:10]:
  print( idx2word[item[0]], idx2word[item[1]])

he is
he a
is he
is a
is king
a he
a is
a king
king is
king a


In [0]:
def get_input_layer(word_idx):
    x = torch.zeros(vocabulary_size, dtype=torch.float)
    x[word_idx] = 1.0
    return x

In [99]:
embedding_dims = 5
W1 = torch.rand(vocabulary_size, embedding_dims, requires_grad=True)
W2 = torch.rand(embedding_dims, vocabulary_size, requires_grad=True)
num_epochs = 100
learning_rate = 0.0001

for epo in range(num_epochs):
    loss_val = 0
    for data, target in idx_pairs:
        x = get_input_layer(data)
        y_true = torch.tensor(np.array([target]), dtype=torch.long)
        z1 = torch.matmul(x, W1)
        z2 = torch.matmul(z1, W2)
        loss = F.cross_entropy(z2.view(1,-1), y_true)
        loss_val += loss.item()
        loss.backward()
        W1.data -= learning_rate * W1.grad
        W2.data -= learning_rate * W2.grad
        W1.grad.data.zero_()
        W2.grad.data.zero_()
    if epo % 10 == 0:    
        print(f'Loss at epo {epo}: {loss.item()}')

Loss at epo 0: 3.1296887397766113
Loss at epo 10: 3.1275291442871094
Loss at epo 20: 3.1254820823669434
Loss at epo 30: 3.123535633087158
Loss at epo 40: 3.1216793060302734
Loss at epo 50: 3.1199183464050293
Loss at epo 60: 3.1182594299316406
Loss at epo 70: 3.116689682006836
Loss at epo 80: 3.1152024269104004
Loss at epo 90: 3.1138038635253906


In [0]:
y_true = torch.tensor(np.array([1,2]), dtype=torch.long)


tensor([[1, 2]])

# 使用Pytorch Framework版 Word2Vec (Skip-gram version)

In [94]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x7fe7974536b0>

In [95]:
len(vocabulary)

15

In [235]:
class Word2Vec_SkipGram(nn.Module):

    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec_SkipGram, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)


    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        out = self.linear(embeds)
        return out


losses = []
EMBEDDING_DIM = 10
loss_function = nn.CrossEntropyLoss()
model = Word2Vec_SkipGram(len(vocabulary), EMBEDDING_DIM)
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(5):
    total_loss = 0
    for data, target in idx_pairs:
        data_input = torch.tensor([data], dtype=torch.long)
        predicted = model(data_input)
        label = torch.tensor([target], dtype=torch.long)   
        
        optimizer.zero_grad()     
        loss = F.cross_entropy(predicted, label)
        loss.backward()
        optimizer.step()

    print(loss.item())

2.471090316772461
2.433135747909546
2.3987107276916504
2.3689889907836914
2.3439688682556152


In [183]:
idx2word[3]

'king'

In [184]:
idx2word[5]

'queen'

In [196]:
word2idx["king"]

3

In [0]:
lookup_tensors = torch.tensor([word2idx["king"], word2idx["queen"]] ,dtype=torch.long)

In [206]:
lookup_tensors

tensor([3, 5])

In [209]:
model(lookup_tensor)

tensor([[-1.2777,  5.4239,  5.4351, -1.2188, -1.1558, -1.3492, -1.5852, -1.3467,
         -1.3630, -0.2604, -2.4192, -3.2783,  0.0565, -2.5430, -5.3278],
        [-2.0963,  5.9476,  5.9539, -2.5179, -2.4708, -2.5742, -3.3025, -1.9582,
         -2.0920, -2.2185,  0.8063, -0.5378, -0.7538,  0.8653, -1.0555]],
       grad_fn=<AddmmBackward>)

In [0]:
vector_for_king = model(lookup_tensor).data[0]
vector_for_queen = model(lookup_tensor).data[1]

In [218]:
vector_for_king

tensor([-1.2777,  5.4239,  5.4351, -1.2188, -1.1558, -1.3492, -1.5852, -1.3467,
        -1.3630, -0.2604, -2.4192, -3.2783,  0.0565, -2.5430, -5.3278])

In [225]:
vector_for_queen

tensor([-2.0963,  5.9476,  5.9539, -2.5179, -2.4708, -2.5742, -3.3025, -1.9582,
        -2.0920, -2.2185,  0.8063, -0.5378, -0.7538,  0.8653, -1.0555])

In [231]:
F.cosine_similarity(vector_for_king.view(1,-1), vector_for_queen.view(1,-1))

tensor([0.7455])

# Excercise  Word2vec based on Word2vec

In [233]:
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

# By deriving a set from `raw_text`, we deduplicate the array
vocab = set(raw_text)
vocab_size = len(vocab)

word_to_ix = {word: i for i, word in enumerate(vocab)}
data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))
print(data[:5])


class CBOW(nn.Module):

    def __init__(self):
        pass

    def forward(self, inputs):
        pass

# create your model and train.  here are some functions to help you make
# the data ready for use by your module


def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.tensor(idxs, dtype=torch.long)


make_context_vector(data[0][0], word_to_ix)  # example

[(['We', 'are', 'to', 'study'], 'about'), (['are', 'about', 'study', 'the'], 'to'), (['about', 'to', 'the', 'idea'], 'study'), (['to', 'study', 'idea', 'of'], 'the'), (['study', 'the', 'of', 'a'], 'idea')]


tensor([29, 28, 15, 25])