In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import nltk
import time
from nltk.corpus import gutenberg
import gensim
import json
import numpy as np
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '2'



### Get all sentences in Project Gutenberg

In [30]:
# sents = []
# for file in gutenberg.fileids():
#     for sent in gutenberg.sents(file):
#         sents.append(sent)
# len(sents)

### Train a word2vec model using gensim for comparison

In [29]:
# model = gensim.models.Word2Vec(sents, size=100, window=5, min_count=1, workers=4)
# model.save('word2vec_gensim.model')

### Create Vocabulary for all the words in Project Gutenberg

In [4]:
# words = []
# for file in gutenberg.fileids():
#     for word in gutenberg.words(file):
#         words.append(word)
# # words = list(set(words))
# len(words)

In [5]:
# fd = nltk.FreqDist(words)
# vocab = sorted(fd, key=fd.get, reverse=True)
# vocab[:10]

In [6]:
# with open('vocab', 'wb') as f:
#     for i in range(len(vocab)):
#         f.write('{} {}\n'.format(vocab[i], i).encode('utf-8'))

In [7]:
# with open('vocab.json', 'w') as f:
#     json.dump(vocab, f)

In [8]:
with open('vocab.json', 'r') as f:
    vocab = json.load(f)

###### Since we only have 2.6 million words, Skip-Gram should performs better

### Set hyperparameters 

In [9]:
settings = {
    'vocab_size': len(vocab),
    'window_size': 5,
    'num_epochs': 100,
    'embedding_dim': 50,
    'batch_size': 512,
    'num_heads': 12,
    'dim_head': 128,
    'learning_rate': 1e-5,
    'is_training': False
}

In [10]:
class myDataset(Dataset):
    
    def __init__(self, settings):
        self.window_size = settings['window_size']
        self.dim = settings['embedding_dim']
        # read from project gutenberg
        sents = []
        list(map(sents.extend, list(map(gutenberg.sents, gutenberg.fileids()))))
        print('\n{} sentences fetched.'.format(len(sents)))
        # load vocabulary file
        with open('vocab.json', 'r') as f:
            vocab = json.load(f)
        print('\n{} unique words found in corpus'.format(len(vocab)))
        self.word2id = dict((vocab[i], i) for i in range(len(vocab)))
        self.data = []
        for sent in sents:
            for i in range(len(sent)):
                try:
                    context = [self.word2id[word] for word in sent[max(0, i - self.window_size):i] + sent[i+1:min(
                        len(sent), i + 1 + self.window_size)]]
                    target = self.word2id[sent[i]]
                    while len(context) < 2*self.window_size:
                        context.append(0)
                    self.data.append((target, context))
                except KeyError:
                    print(sent[max(0, i - self.window_size):min(len(sent), i + 1 + self.window_size)])
        print('{} pairs found for training'.format(self.__len__()))
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        target = torch.Tensor([self.data[index][0]])
        context = torch.Tensor(self.data[index][1])
        return target, context

In [11]:
dataset = myDataset(settings)
dataloader = DataLoader(dataset, batch_size=settings['batch_size'], shuffle=True)


98552 sentences fetched.

51156 unique words found in corpus
['"*', 'The', 'saying', 'become', 'proverbial', 'in']
['"*', 'The', 'saying', 'become', 'proverbial', 'in', 'the']
['"*', 'The', 'saying', 'become', 'proverbial', 'in', 'the', 'village']
['"*', 'The', 'saying', 'become', 'proverbial', 'in', 'the', 'village', '.']
['"*', 'The', 'saying', 'become', 'proverbial', 'in', 'the', 'village', '.']
['"*', 'The', 'saying', 'become', 'proverbial', 'in', 'the', 'village', '.']
['"*']
['"*', 'Each', 'who', 'answers', '"', 'A']
['"*', 'Each', 'who', 'answers', '"', 'A', 'Talbotite']
['"*', 'Each', 'who', 'answers', '"', 'A', 'Talbotite', ',"']
['"*', 'Each', 'who', 'answers', '"', 'A', 'Talbotite', ',"', 'Rory']
['"*', 'Each', 'who', 'answers', '"', 'A', 'Talbotite', ',"', 'Rory', 'shakes']
['"*', 'Each', 'who', 'answers', '"', 'A', 'Talbotite', ',"', 'Rory', 'shakes', 'by']
['"*']
['"*']
['\'".']
['"*']
['nd', 'desart', 'ways', 'with', '?', 'oeril', 'gone', 'All', '?', 'might', ',?']
['de

In [12]:
class w2v_model(nn.Module):
    def __init__(self, settings):
        super(w2v_model, self).__init__()
        self.vocab_size = settings['vocab_size']
        self.batch_size = settings['batch_size']
        self.num_heads = settings['num_heads']
        self.dim_head = settings['dim_head']
        self.num_hidden = self.dim_head * self.num_heads
        self.seq_len = settings['window_size'] * 2
        self.embed_dim = settings['embedding_dim']

        self.embedding = nn.Embedding(self.vocab_size, self.embed_dim)
#         self.embedding = torch.randn([self.vocab_size, self.embed_dim], requires_grad=True)
        self.W_Q = nn.Linear(self.embed_dim, self.num_hidden)
        self.W_K = nn.Linear(self.embed_dim, self.num_hidden)
        self.W_V = nn.Linear(self.embed_dim, self.num_hidden)
        self.cos_sim = nn.CosineSimilarity(dim=-1)

    def attention(self, target, context):
        Q = self.W_Q(target).view(self.batch_size, self.num_heads, self.dim_head)
        W = torch.zeros([self.batch_size, self.seq_len, self.num_heads, self.num_heads]).to(target.device)
        V = torch.zeros([self.batch_size, self.seq_len, self.num_hidden]).to(target.device)
        
        # zero-padding
        for i in range(self.batch_size):
            K_t = self.W_K(target[i]).view(self.num_heads, self.dim_head).transpose(0,1)
            for j in range(self.seq_len):
                W[i][j] = torch.matmul(Q[i], K_t) / (self.dim_head ** 0.5)
                V[i][j] = self.W_V(target[j])
        W = nn.Softmax(dim=-1)(W)
        V = V.view(self.batch_size, self.seq_len, self.num_heads, self.dim_head)
        tmp = torch.matmul(W, V).view(self.batch_size, self.seq_len, self.num_hidden)
        context_vector = torch.sum(tmp, dim=1).view(self.batch_size, self.num_hidden)
        target_vector = self.W_V(target).view(self.batch_size, self.num_hidden)
        return target_vector, context_vector
    
    def forward(self, t, c):
        target = self.embedding(t.long())
        context = self.embedding(c.long())
        v_t, v_c = self.attention(target, context)
        return v_t, v_c
#         sim = self.cos_sim(v_t, v_c)
#         return sim

In [13]:
if torch.cuda.is_available() and settings['is_training']:
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

cpu


In [14]:
model = w2v_model(settings).to(device)
lossfunc = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=settings['learning_rate'], momentum=0.9)

In [16]:
model.load_state_dict(torch.load('MSE_SGD/epoch_50.pt', map_location='cpu'))
model.eval()

w2v_model(
  (embedding): Embedding(51156, 50)
  (W_Q): Linear(in_features=50, out_features=1536, bias=True)
  (W_K): Linear(in_features=50, out_features=1536, bias=True)
  (W_V): Linear(in_features=50, out_features=1536, bias=True)
  (cos_sim): CosineSimilarity()
)

In [32]:
ori_model = gensim.models.KeyedVectors.load('word2vec_gensim.model')

In [35]:
ori_model.wv.most_similar(['men'])

[('ladies', 0.778356671333313),
 ('women', 0.7729319930076599),
 ('whales', 0.7110983729362488),
 ('nations', 0.7061511278152466),
 ('people', 0.698354959487915),
 ('children', 0.6776918172836304),
 ('ones', 0.6673187017440796),
 ('those', 0.6658756732940674),
 ('cities', 0.6576958298683167),
 ('heathen', 0.6557672023773193)]

In [23]:
model.embedding(torch.Tensor([0]).long().to(device)).cpu().detach().numpy()

array([[-0.7288056 , -0.7894503 ,  0.2997309 , -0.634888  ,  1.1730815 ,
         1.0118821 ,  1.5643903 ,  0.84447783, -0.03041133, -1.0454463 ,
         2.1104195 ,  0.17344235, -1.0437306 , -0.11676285,  0.19619858,
         0.69553906,  0.22397263,  0.43834323, -1.1550168 , -1.3721713 ,
        -0.2732976 ,  0.63333327,  0.568821  , -0.55983514, -0.09155396,
         0.21911587,  1.5157677 , -1.0443634 ,  0.20765564,  0.11520106,
         0.0727839 , -0.00374739,  1.6814423 , -1.8273432 , -0.77540773,
        -0.9494474 , -1.5475677 ,  0.05887098, -0.36490342,  0.4780317 ,
         1.4733663 ,  1.0463972 ,  0.19754106,  0.38320884, -0.9215026 ,
        -0.6607583 , -0.22892128,  0.09999726, -0.29728153,  0.0046324 ]],
      dtype=float32)

In [24]:
cos_sim = nn.CosineSimilarity(dim=1, eps=1e-6)
def get_embed(token):
    return model.embedding(torch.Tensor([vocab.index(token)]).long().to(device))
def most_similar(token, num_return):
    v_w1 = get_embed(token)
    word_sim = {}
    for i in range(len(vocab)):
        word = vocab[i]
        v_w2 = get_embed(word)
        theta = cos_sim(v_w1, v_w2)
        word_sim[word] = theta.detach().numpy()
    words_sorted = sorted(word_sim.items(), key=lambda kv: kv[1], reverse=True)
    for word, sim in words_sorted[:num_return]:
        yield (word, sim)

In [25]:
cos_sim(get_embed('men'),get_embed('women'))

tensor([0.0534], grad_fn=<DivBackward0>)

In [26]:
list(most_similar('men',10))

[('men', array([1.], dtype=float32)),
 ('builders', array([0.59341156], dtype=float32)),
 ('louing', array([0.5396602], dtype=float32)),
 ('Shibboleth', array([0.52968055], dtype=float32)),
 ('Hampton', array([0.527706], dtype=float32)),
 ('Obseruers', array([0.50242555], dtype=float32)),
 ('savourest', array([0.4790864], dtype=float32)),
 ('GENEROUS', array([0.47566143], dtype=float32)),
 ('Redeemer', array([0.4740326], dtype=float32)),
 ('patchy', array([0.47305432], dtype=float32))]

In [27]:
list(most_similar('women',10))

[('women', array([1.], dtype=float32)),
 ('Brazon', array([0.5396867], dtype=float32)),
 ('nails', array([0.5330049], dtype=float32)),
 ('overcoming', array([0.52965105], dtype=float32)),
 ('seate', array([0.5219931], dtype=float32)),
 ('loungingly', array([0.5204232], dtype=float32)),
 ('noting', array([0.51352113], dtype=float32)),
 ('cleared', array([0.5008819], dtype=float32)),
 ('Conduit', array([0.49569836], dtype=float32)),
 ('vnprouokes', array([0.4861741], dtype=float32))]

In [None]:
model.train()
start = time.time()
for epoch in range(settings['num_epochs']):
    for step in range(dataset.__len__()//settings['batch_size']):
        (t, c) = next(iter(dataloader))
        t, c = t.to(device), c.to(device)
        optimizer.zero_grad()
        v_t, v_c = model(t, c)
        loss = lossfunc(v_t, v_c.to(device))
        loss.backward()
        optimizer.step()
        if step % 10 == 9:
            print('epoch {} step {} loss: {:.6f} time used for 10 steps {:6f}'.format(
                epoch, step, loss.tolist(), time.time()-start))
            start = time.time()
    torch.save(model.state_dict(), 'MSE_SGD/epoch_{}.pt'.format(epoch))

epoch 0 step 9 loss: 0.971289 time used for 10 steps 26.969929
epoch 0 step 19 loss: 0.777529 time used for 10 steps 27.684524
epoch 0 step 29 loss: 0.610333 time used for 10 steps 27.541535
epoch 0 step 39 loss: 0.718405 time used for 10 steps 27.704566
epoch 0 step 49 loss: 0.669113 time used for 10 steps 27.596583
epoch 0 step 59 loss: 0.693762 time used for 10 steps 27.376994
epoch 0 step 69 loss: 0.769555 time used for 10 steps 27.946891
epoch 0 step 79 loss: 0.885623 time used for 10 steps 26.890848
epoch 0 step 89 loss: 0.669561 time used for 10 steps 27.529937
epoch 0 step 99 loss: 0.717460 time used for 10 steps 27.764776
epoch 0 step 109 loss: 0.715162 time used for 10 steps 27.423823
epoch 0 step 119 loss: 0.613195 time used for 10 steps 26.284391
epoch 0 step 129 loss: 0.732654 time used for 10 steps 25.617761
epoch 0 step 139 loss: 0.933855 time used for 10 steps 25.654312
epoch 0 step 149 loss: 0.814789 time used for 10 steps 26.032262
epoch 0 step 159 loss: 0.721976 time