In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Seed PyTorch's global RNG so randomly initialised tensors (embedding weights, torch.randn) are repeatable across runs.
torch.manual_seed(1)

<torch._C.Generator at 0x1111f9bd0>

In [4]:
# Minimal nn.Embedding demo: a two-word vocabulary mapped to 5-dimensional vectors.
embd_dim = 5
word_to_ix = {"hello": 0, "world": 1}
# One embedding row per vocabulary entry, each of width embd_dim.
embeds = nn.Embedding(num_embeddings=len(word_to_ix), embedding_dim=embd_dim)
# Embedding layers are indexed with integer (long) ids; fetch the id of "hello".
lookup_tensor = torch.tensor([word_to_ix["hello"]], dtype=torch.long)
print(embeds, lookup_tensor)
hello_embed = embeds(lookup_tensor)
print(hello_embed)

Embedding(2, 5) tensor([0])
tensor([[ 3.5870, -1.8313,  1.5987, -1.2770,  0.3255]], grad_fn=<EmbeddingBackward>)


## Create custom embeddings for my corpus from a list of [center, context] word pairs

In [5]:
# Toy corpus: one whitespace-separated, lower-case sentence per entry.
corpus = [
    # person / role sentences
    "he is a king",
    "she is a queen",
    "he is a man",
    "she is a woman",
    # city / country sentences
    "warsaw is poland capital",
    "berlin is germany capital",
    "paris is france capital",
]

In [6]:
def tokenize_corpus(corpus):
    """Split every sentence in ``corpus`` into whitespace-delimited tokens.

    Args:
        corpus: iterable of sentence strings.

    Returns:
        A list holding one list of tokens per sentence, in input order.
    """
    tokenized = []
    for sentence in corpus:
        # str.split() with no argument splits on any run of whitespace.
        tokenized.append(sentence.split())
    return tokenized

# Tokenize the toy corpus and print the resulting list of token lists.
tokenized_corpus = tokenize_corpus(corpus)
print(tokenized_corpus)

[['he', 'is', 'a', 'king'], ['she', 'is', 'a', 'queen'], ['he', 'is', 'a', 'man'], ['she', 'is', 'a', 'woman'], ['warsaw', 'is', 'poland', 'capital'], ['berlin', 'is', 'germany', 'capital'], ['paris', 'is', 'france', 'capital']]


In [7]:
# Build the vocabulary: every distinct token, in order of first appearance.
# dict.fromkeys de-duplicates while preserving insertion order, which avoids
# the quadratic `token not in vocabulary` scan over a growing list.
vocabulary = list(dict.fromkeys(
    token for sentence in tokenized_corpus for token in sentence
))

# Two-way lookup tables between tokens and their integer ids
# (the id is the token's position in `vocabulary`).
word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}

print(word2idx)
print(idx2word)

{'he': 0, 'is': 1, 'a': 2, 'king': 3, 'she': 4, 'queen': 5, 'man': 6, 'woman': 7, 'warsaw': 8, 'poland': 9, 'capital': 10, 'berlin': 11, 'germany': 12, 'paris': 13, 'france': 14}
{0: 'he', 1: 'is', 2: 'a', 3: 'king', 4: 'she', 5: 'queen', 6: 'man', 7: 'woman', 8: 'warsaw', 9: 'poland', 10: 'capital', 11: 'berlin', 12: 'germany', 13: 'paris', 14: 'france'}


In [19]:
import numpy as np

# Number of neighbours on each side of a center word that count as context.
window_size = 2
idx_pairs = []
for tokens in tokenized_corpus:
    indices = [word2idx[token] for token in tokens]
    sentence_len = len(indices)
    # Treat every position in turn as the center word.
    for center_pos, center_idx in enumerate(indices):
        # Clip the context window to the sentence boundaries.
        first = max(0, center_pos - window_size)
        last = min(sentence_len, center_pos + window_size + 1)
        for context_pos in range(first, last):
            # A word is never its own context.
            if context_pos != center_pos:
                idx_pairs.append((center_idx, indices[context_pos]))

idx_pairs = np.array(idx_pairs)  # it will be useful to have this as numpy array
print(idx_pairs)

[[ 0  1]
 [ 0  2]
 [ 1  0]
 [ 1  2]
 [ 1  3]
 [ 2  0]
 [ 2  1]
 [ 2  3]
 [ 3  1]
 [ 3  2]
 [ 4  1]
 [ 4  2]
 [ 1  4]
 [ 1  2]
 [ 1  5]
 [ 2  4]
 [ 2  1]
 [ 2  5]
 [ 5  1]
 [ 5  2]
 [ 0  1]
 [ 0  2]
 [ 1  0]
 [ 1  2]
 [ 1  6]
 [ 2  0]
 [ 2  1]
 [ 2  6]
 [ 6  1]
 [ 6  2]
 [ 4  1]
 [ 4  2]
 [ 1  4]
 [ 1  2]
 [ 1  7]
 [ 2  4]
 [ 2  1]
 [ 2  7]
 [ 7  1]
 [ 7  2]
 [ 8  1]
 [ 8  9]
 [ 1  8]
 [ 1  9]
 [ 1 10]
 [ 9  8]
 [ 9  1]
 [ 9 10]
 [10  1]
 [10  9]
 [11  1]
 [11 12]
 [ 1 11]
 [ 1 12]
 [ 1 10]
 [12 11]
 [12  1]
 [12 10]
 [10  1]
 [10 12]
 [13  1]
 [13 14]
 [ 1 13]
 [ 1 14]
 [ 1 10]
 [14 13]
 [14  1]
 [14 10]
 [10  1]
 [10 14]]


In [23]:
from torch.autograd import Variable

def get_input_layer(word_idx):
    """Build a one-hot float32 vector for a word id.

    The vector length is read from the module-level ``vocabulary_size`` at
    call time, so that name must be defined before the first call.

    Args:
        word_idx: integer position to set to 1.0.

    Returns:
        A 1-D ``torch.float32`` tensor that is zero everywhere except at
        ``word_idx``.
    """
    one_hot = torch.zeros(vocabulary_size, dtype=torch.float32)
    one_hot[word_idx] = 1.0
    return one_hot

# Skip-gram style training with two plain weight matrices and manual SGD.
embedding_dims = 5
vocabulary_size = len(vocabulary)
# W1 projects a one-hot word vector into the embedding space;
# W2 maps the embedding back to one score per vocabulary entry.
# Plain tensors with requires_grad=True replace the deprecated Variable wrapper.
W1 = torch.randn(embedding_dims, vocabulary_size, dtype=torch.float32, requires_grad=True)
W2 = torch.randn(vocabulary_size, embedding_dims, dtype=torch.float32, requires_grad=True)
num_epochs = 100
learning_rate = 0.001

for epo in range(num_epochs):
    loss_val = 0.0
    for data, target in idx_pairs:
        x = get_input_layer(data)
        # nll_loss expects a batch of class ids; this is a batch of one.
        y_true = torch.tensor([int(target)], dtype=torch.long)

        z1 = torch.matmul(W1, x)
        z2 = torch.matmul(W2, z1)

        log_softmax = F.log_softmax(z2, dim=0)

        loss = F.nll_loss(log_softmax.view(1, -1), y_true)
        # loss is a 0-dim tensor: indexing it with [0] raises on current
        # PyTorch, so extract the Python float with .item().
        loss_val += loss.item()
        loss.backward()

        # Manual SGD step. no_grad keeps the in-place updates out of the
        # autograd graph (replaces direct mutation through .data).
        with torch.no_grad():
            W1 -= learning_rate * W1.grad
            W2 -= learning_rate * W2.grad
            # Gradients accumulate across backward() calls, so reset them.
            W1.grad.zero_()
            W2.grad.zero_()
    if epo % 10 == 0:
        print(f'Loss at epo {epo}: {loss_val/len(idx_pairs)}')



Loss at epo 0: 4.187984466552734
Loss at epo 10: 3.8171966075897217
Loss at epo 20: 3.5643041133880615
Loss at epo 30: 3.379520893096924
Loss at epo 40: 3.238473892211914
Loss at epo 50: 3.1269023418426514
Loss at epo 60: 3.035698652267456
Loss at epo 70: 2.9588944911956787
Loss at epo 80: 2.892542600631714
Loss at epo 90: 2.8339884281158447


## Using GloVe pre-trained embeddings in PyTorch

In [36]:
import bcolz
import pickle

In [37]:
# Parse the GloVe text file once and cache it on disk: the vectors go into a
# bcolz array, the word list and word->row index into pickle files.
words = []
idx = 0
# NOTE(review): this rebinds `word2idx`, replacing the toy-corpus mapping built earlier.
word2idx = {}
# NOTE(review): absolute, machine-specific path; point this at your local GloVe directory.
glove_path = '/Users/santanubhattacharjee/GitProjects/LearningPyTorch/glove'
# Seed the on-disk array with a single placeholder zero; it is dropped below.
vectors = bcolz.carray(np.zeros(1), rootdir=glove_path + '/6B.50.dat', mode='w')

with open(glove_path + '/glove.6B.50d.txt', 'rb') as f:
    for l in f:
        line = l.decode().split()
        if not line:
            # Skip blank lines instead of failing on line[0].
            continue
        word = line[0]
        words.append(word)
        word2idx[word] = idx
        idx += 1
        # np.float was removed from NumPy; np.float64 is the dtype it aliased.
        vect = np.array(line[1:]).astype(np.float64)
        vectors.append(vect)

if not words:
    raise ValueError('no word vectors found in ' + glove_path + '/glove.6B.50d.txt')

# Drop the placeholder element, then shape to (number of words, vector dim).
# Both sizes come from the parsed data rather than being hard-coded.
vectors = bcolz.carray(vectors[1:].reshape((len(words), -1)), rootdir=glove_path + '/6B.50.dat', mode='w')
vectors.flush()
# Context managers make sure the pickle files are flushed and closed.
with open(glove_path + '/6B.50_words.pkl', 'wb') as words_file:
    pickle.dump(words, words_file)
with open(glove_path + '/6B.50_idx.pkl', 'wb') as idx_file:
    pickle.dump(word2idx, idx_file)

In [38]:
# Reload the cached GloVe data; [:] materialises the on-disk bcolz array in memory.
vectors = bcolz.open(glove_path + '/6B.50.dat')[:]
# NOTE(review): pickle.load can execute arbitrary code; only load pickle
# files you created yourself or otherwise trust.
# Context managers close the file handles the original left open.
with open(glove_path + '/6B.50_words.pkl', 'rb') as words_file:
    words = pickle.load(words_file)
with open(glove_path + '/6B.50_idx.pkl', 'rb') as idx_file:
    word2idx = pickle.load(idx_file)

# Map each word to its embedding row.
glove = {w: vectors[word2idx[w]] for w in words}

In [39]:
# Sanity check: display the embedding vector stored for the word 'the'.
glove['the']

array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
       -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
        2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
        1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
       -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
       -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
        4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
       -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
        1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01])