In [25]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import nltk
import time
from nltk.corpus import gutenberg
import gensim
import json
import random
import numpy as np
from tensorboardX import SummaryWriter
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [26]:
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)

### Get all sentences in Project Gutenberg

In [2]:
# sents = []
# for file in gutenberg.fileids():
#     for sent in gutenberg.sents(file):
#         sents.append(sent)
# len(sents)

In [54]:
(gutenberg.fileids())

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

### Train a word2vec model using gensim for comparison

In [3]:
# model = gensim.models.Word2Vec(sents, size=100, window=5, min_count=1, workers=4)
# model.save('word2vec_gensim.model')

### Create Vocabulary for all the words in Project Gutenberg

In [4]:
# words = []
# for file in gutenberg.fileids():
#     for word in gutenberg.words(file):
#         words.append(word)
# # words = list(set(words))
# len(words)

In [5]:
# fd = nltk.FreqDist(words)
# vocab = sorted(fd, key=fd.get, reverse=True)
# vocab[:10]

In [6]:
# with open('vocab', 'wb') as f:
#     for i in range(len(vocab)):
#         f.write('{} {}\n'.format(vocab[i], i).encode('utf-8'))

In [7]:
# with open('vocab.json', 'w') as f:
#     json.dump(vocab, f)

In [8]:
with open('vocab.json', 'r') as f:
    vocab = json.load(f)

### Set hyperparameters 

In [9]:
settings = {
    'vocab_size': len(vocab),
    'window_size': 5,
    'num_epochs': 100,
    'embedding_dim': 50,
    'batch_size': 512,
    'num_heads': 12,
    'dim_head': 128,
    'learning_rate': 1e-5,
    'is_training': False
}

In [35]:
class myDataset(Dataset):
    
    def __init__(self, settings):
        self.window_size = settings['window_size']
        self.dim = settings['embedding_dim']
        # read from project gutenberg
        sents = []
        list(map(sents.extend, list(map(gutenberg.sents, gutenberg.fileids()))))
        print('\n{} sentences fetched.'.format(len(sents)))
        # load vocabulary file
        with open('vocab.json', 'r') as f:
            vocab = json.load(f)
        print('\n{} unique words found in corpus'.format(len(vocab)))
        self.word2id = dict((vocab[i], i) for i in range(len(vocab)))
        self.data = []
        for sent in sents:
            for i in range(len(sent)):
                try:
                    context = [self.word2id[word] for word in sent[max(0, i - self.window_size):i] + sent[i+1:min(
                        len(sent), i + 1 + self.window_size)]]
                    target = self.word2id[sent[i]]
                    while len(context) < 2*self.window_size:
                        context.append(0)
                    self.data.append((target, context))
                except KeyError:
                    pass
        print('\n{} pairs found for training'.format(self.__len__()))
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        target = torch.Tensor([self.data[index][0]])
        context = torch.Tensor(self.data[index][1])
        return target, context

In [36]:
dataset = myDataset(settings)


98552 sentences fetched.

51156 unique words found in corpus

2621762 pairs found for training


In [42]:
uni_leng = dataset.__len__() // 10
leng = dataset.__len__()
train_set, test_set, dev_set = torch.utils.data.random_split(dataset, [uni_leng*8, uni_leng, leng-9*uni_leng])

In [45]:
train_loader = DataLoader(train_set, batch_size=settings['batch_size'], shuffle=True)
test_loader = DataLoader(test_set, batch_size=settings['batch_size'], shuffle=True)
dev_loader = DataLoader(dev_set, batch_size=settings['batch_size'], shuffle=True)

In [46]:
class w2v_model(nn.Module):
    def __init__(self, settings):
        super(w2v_model, self).__init__()
        self.vocab_size = settings['vocab_size']
        self.batch_size = settings['batch_size']
        self.num_heads = settings['num_heads']
        self.dim_head = settings['dim_head']
        self.num_hidden = self.dim_head * self.num_heads
        self.seq_len = settings['window_size'] * 2
        self.embed_dim = settings['embedding_dim']

        self.embedding = nn.Embedding(self.vocab_size, self.embed_dim)
#         self.embedding = torch.randn([self.vocab_size, self.embed_dim], requires_grad=True)
        self.W_Q = nn.Linear(self.embed_dim, self.num_hidden)
        self.W_K = nn.Linear(self.embed_dim, self.num_hidden)
        self.W_V = nn.Linear(self.embed_dim, self.num_hidden)
        self.cos_sim = nn.CosineSimilarity(dim=-1)

    def attention(self, target, context):
        Q = self.W_Q(target).view(self.batch_size, self.num_heads, self.dim_head)
        W = torch.zeros([self.batch_size, self.seq_len, self.num_heads, self.num_heads]).to(target.device)
        V = torch.zeros([self.batch_size, self.seq_len, self.num_hidden]).to(target.device)

        for i in range(self.batch_size):
            for j in range(self.seq_len):
                K_t = self.W_K(context[i][j]).view(self.num_heads, self.dim_head).transpose(0, 1)
                W[i][j] = torch.matmul(Q[i], K_t) / (self.dim_head ** 0.5)
                V[i][j] = self.W_V(context[i][j])
        W = nn.Softmax(dim=-1)(W)
        V = V.view(self.batch_size, self.seq_len, self.num_heads, self.dim_head)
        tmp = torch.matmul(W, V).view(self.batch_size, self.seq_len, self.num_hidden)
        context_vector = torch.sum(tmp, dim=1).view(self.batch_size, self.num_hidden)
        target_vector = self.W_V(target).view(self.batch_size, self.num_hidden)
        return target_vector, context_vector
    
    def forward(self, t, c):
        target = self.embedding(t.long())
        context = self.embedding(c.long())
        v_t, v_c = self.attention(target, context)
        return v_t, v_c
#         sim = self.cos_sim(v_t, v_c)
#         return sim

In [47]:
if torch.cuda.is_available() and settings['is_training']:
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

cpu


In [48]:
model = w2v_model(settings).to(device)
lossfunc = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=settings['learning_rate'], momentum=0.9)

In [24]:
cos_sim = nn.CosineSimilarity(dim=1, eps=1e-6)
def get_embed(token):
    return model.embedding(torch.Tensor([vocab.index(token)]).long().to(device))
def most_similar(token, num_return):
    v_w1 = get_embed(token)
    word_sim = {}
    for i in range(len(vocab)):
        word = vocab[i]
        v_w2 = get_embed(word)
        theta = cos_sim(v_w1, v_w2)
        word_sim[word] = theta.detach().numpy()
    words_sorted = sorted(word_sim.items(), key=lambda kv: kv[1], reverse=True)
    for word, sim in words_sorted[:num_return]:
        yield (word, sim)

In [22]:
writer = SummaryWriter()

In [51]:
model.train()
num_steps = train_set.__len__()//settings['batch_size']
for epoch in range(settings['num_epochs']):
    for step in range(train_set.__len__()//settings['batch_size']):
        start = time.time()
        (t, c) = next(iter(train_loader))
        t, c = t.to(device), c.to(device)
        optimizer.zero_grad()
        v_t, v_c = model(t, c)
        loss = lossfunc(v_t, v_c.to(device))
        loss.backward()
        optimizer.step()
        if step % 10 == 0:
            print('epoch {} step {} loss: {:.6f} time used for 10 steps {:6f}'.format(
                epoch, step, loss.tolist(), time.time()-start))
            writer.add_scalar('speed', time.time()-start, epoch*num_steps+step)
            
            model.eval()
            (t, c) = next(iter(test_loader))
            t, c = t.to(device), c.to(device)
            v_t, v_c = model(t, c)
            test_loss = lossfunc(v_t, v_c.to(device))
            (t, c) = next(iter(dev_loader))
            t, c = t.to(device), c.to(device)
            v_t, v_c = model(t, c)
            dev_loss = lossfunc(v_t, v_c.to(device))
            writer.add_scalars('loss', {'train': loss.tolist(),
                                        'test': test_loss.tolist(),
                                        'dev': dev_loss.tolist()
                                       }, epoch*num_steps+step)
            model.train()
    torch.save(model.state_dict(), 'MSE_SGD/epoch_{}.pt'.format(epoch))

epoch 0 step 0 loss: 0.000053 time used for 10 steps 42.297522
epoch 1 step 0 loss: 0.000052 time used for 10 steps 47.269611


In [114]:
class w2v_model_CBoW(nn.Module):
    def __init__(self, settings):
        super(w2v_model_CBoW, self).__init__()
        self.vocab_size = settings['vocab_size']
        self.batch_size = settings['batch_size']
        self.num_heads = settings['num_heads']
        self.dim_head = settings['dim_head']
        self.num_hidden = self.dim_head * self.num_heads
        self.seq_len = settings['window_size'] * 2
        self.embed_dim = settings['embedding_dim']

        self.embedding = nn.Embedding(self.vocab_size, self.embed_dim)
        self.W_Q = nn.Linear(self.embed_dim, self.num_hidden)
        self.W_K = nn.Linear(self.embed_dim, self.num_hidden)
        self.W_V = nn.Linear(self.embed_dim, self.num_hidden)
        self.W_out = nn.Linear(self.num_hidden, self.vocab_size)
        self.cos_sim = nn.CosineSimilarity(dim=-1)

    def attention(self, target, context):
        Q = self.W_Q(target).view(self.batch_size, self.num_heads, self.dim_head)
        W = torch.zeros([self.batch_size, self.seq_len, self.num_heads, self.num_heads]).to(target.device)
        V = torch.zeros([self.batch_size, self.seq_len, self.num_hidden]).to(target.device)

        for i in range(self.batch_size):
            for j in range(self.seq_len):
                K_t = self.W_K(context[i][j]).view(self.num_heads, self.dim_head).transpose(0, 1)
                W[i][j] = torch.matmul(Q[i], K_t) / (self.dim_head ** 0.5)
                V[i][j] = self.W_V(context[i][j])
        W = nn.Softmax(dim=-1)(W)
        V = V.view(self.batch_size, self.seq_len, self.num_heads, self.dim_head)
        tmp = torch.matmul(W, V).view(self.batch_size, self.seq_len, self.num_hidden)
        context_vector = torch.sum(tmp, dim=1).view(self.batch_size, self.num_hidden)
        return context_vector

    def forward(self, t, c):
        target = self.embedding(t.long())
        context = self.embedding(c.long())
        v_c = self.attention(target, context)
        pred = nn.Softmax(dim=1)(self.W_out(v_c))
        return pred

In [115]:
tmp_model = w2v_model_CBoW(settings)

In [116]:
pred = tmp_model(t,c)

In [117]:
pred.shape

torch.Size([512, 51156])

In [119]:
CELoss = nn.CrossEntropyLoss()

In [124]:
CELoss(pred, t.long().view(-1))

tensor(10.8427, grad_fn=<NllLossBackward>)