In [1]:
import torch
import torch.utils.data.dataloader as dataloader
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
import torch.optim as optim
import os
import re
import sys
import gc
import time
import copy
import numpy as np
from collections import Counter
from tqdm import tqdm

In [2]:
# Load every file of the Holmes corpus and keep one cleaned, non-empty string
# per line of text.
DATA_DIR = 'Holmes_Training_Data'
# Anything that is not a space, form feed, ASCII letter, digit or '_' is
# removed. This already strips '*', so no separate replace('*', '') is needed.
strip_pattern = re.compile('[^ \fA-Za-z0-9_]')

text = []
# os.listdir() returns entries in arbitrary order; sorting makes the corpus
# (and everything derived from it) come out in the same order on every run.
for file in sorted(os.listdir(DATA_DIR)):
    # Explicit encoding instead of the platform default; undecodable bytes are
    # still dropped, as before.
    with open(os.path.join(DATA_DIR, file), 'r', encoding='utf-8', errors='ignore') as f:
        # Clean and filter in a single pass rather than three passes over the
        # full list of lines.
        for line in f.read().splitlines():
            line = strip_pattern.sub('', line)
            if line != '':
                text.append(line)
print(len(text))
# text = text[:1000000]

4121929


In [3]:
# Flatten the cleaned lines into one list of tokens, splitting on single
# spaces and skipping the empty strings that consecutive spaces produce.
raw_text = [token for line in text for token in line.split(' ') if token != '']
# Drop the per-line list and reclaim its memory.
del text
gc.collect()

0

In [4]:
# Corpus statistics: occurrence count per token, the set of distinct tokens,
# and the number of distinct tokens.
freqs = Counter(raw_text)
vocab = {*raw_text}
vocab_size = len(vocab)

In [5]:
def make_context_vector(context, word_to_ix):
    """Map a sequence of words to a 1-D ``torch.long`` tensor of their indices.

    Each word in ``context`` is looked up in ``word_to_ix``; a word that is
    missing from the mapping raises ``KeyError``.
    """
    return torch.tensor(list(map(word_to_ix.__getitem__, context)), dtype=torch.long)
# Two-way lookup between tokens and integer ids. Both maps are filled from the
# same pass over `vocab`, so they are exact inverses of each other.
word_to_ix = {}
ix_to_word = {}
for idx, token in enumerate(vocab):
    word_to_ix[token] = idx
    ix_to_word[idx] = token

# One (context, target) pair for every position that has two tokens on each
# side: context is [w-2, w-1, w+1, w+2] and target is the centre token.
data = [
    ([raw_text[pos - 2], raw_text[pos - 1], raw_text[pos + 1], raw_text[pos + 2]],
     raw_text[pos])
    for pos in range(2, len(raw_text) - 2)
]
print(data[:5])

[(['The', 'Project', 'Etext', 'of'], 'Gutenberg'), (['Project', 'Gutenberg', 'of', 'Reminiscences'], 'Etext'), (['Gutenberg', 'Etext', 'Reminiscences', 'of'], 'of'), (['Etext', 'of', 'of', 'Tolstoy'], 'Reminiscences'), (['of', 'Reminiscences', 'Tolstoy', 'Copyright'], 'of')]


In [6]:
# Negative-sampling distribution, indexed by vocabulary id: each token's count
# raised to the 0.75 power, then normalised by the total.
freqs_pow = torch.Tensor([freqs[ix_to_word[idx]] for idx in range(vocab_size)])
freqs_pow = freqs_pow.pow(0.75)
dist = freqs_pow.div(freqs_pow.sum())
# Drop the token list and reclaim its memory.
del raw_text
gc.collect()

0

In [7]:
def neg_sample(num_samples, positives=()):
    """Draw negative word ids from the module-level ``dist`` distribution.

    Args:
        num_samples: number of negative ids to draw per positive example.
        positives: batch of positive examples; only its length and, when it is
            a tensor, its device are used. The drawn ids are NOT checked
            against the positives, so a negative may coincide with one.

    Returns:
        A ``torch.long`` tensor of shape ``(len(positives), num_samples)``,
        placed on the same device as ``positives`` when that is a tensor and
        on the CPU otherwise.
    """
    w = np.random.choice(len(dist), (len(positives), num_samples), p=dist.numpy())
    negs = torch.as_tensor(w, dtype=torch.long)
    # Follow the input's own device instead of a global `device` variable, and
    # tolerate non-tensor inputs (the old default `[]` failed on `.is_cuda`).
    if torch.is_tensor(positives):
        negs = negs.to(positives.device)
    return negs

In [8]:
class CBOW(nn.Module):
    """CBOW-style word-embedding model trained with a negative-sampling loss.

    A single embedding table is shared by target, context and negative words.
    """

    def __init__(self, vocab_size, embedding_dim, context_size):
        """Create the embedding table.

        Args:
            vocab_size: number of rows in the embedding table.
            embedding_dim: size of each embedding vector.
            context_size: accepted for interface compatibility; not used.
        """
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Small symmetric initialisation whose range shrinks with vocabulary size.
        self.embeddings.weight.data.uniform_(-0.5 / vocab_size, 0.5 / vocab_size)

    def forward(self, inputs, label, negs=None):
        """Return the scalar loss for one batch.

        Args:
            inputs: long tensor ``(batch, n_context)`` of context word ids.
            label: long tensor ``(batch,)`` of target word ids.
            negs: optional long tensor ``(batch, n_neg)`` of negative word ids.
                When omitted, 5 negatives per example are drawn with
                ``neg_sample``.

        Returns:
            ``mean(-log sigmoid(u . v_pos)) + mean(-log sigmoid(-u . v_neg))``
            where ``u`` is the target embedding, ``v_pos`` the mean context
            embedding and ``v_neg`` the mean negative embedding.
        """
        if negs is None:
            negs = neg_sample(5, label)
        u_embeds = self.embeddings(label).view(len(label), -1)
        v_embeds_pos = self.embeddings(inputs).mean(dim=1)
        # NOTE(review): negatives are averaged before scoring, so each example
        # contributes one negative term rather than one per sampled word.
        # Kept as in the original objective; confirm this is intended.
        v_embeds_neg = self.embeddings(negs).mean(dim=1)
        # Row-wise dot products. Equivalent to diag(u @ v.T) but without
        # materialising the (batch x batch) product.
        pos_score = (u_embeds * v_embeds_pos).sum(dim=1)
        neg_score = (u_embeds * v_embeds_neg).sum(dim=1)
        # logsigmoid stays finite where -log(1 / (1 + exp(x))) overflows to inf.
        loss_pos = -F.logsigmoid(pos_score)
        loss_neg = -F.logsigmoid(-neg_score)
        return loss_pos.mean() + loss_neg.mean()

In [9]:
# Training configuration and model / optimiser construction.
CONTEXT_SIZE = 2
EMBEDDING_DIM = 100
SEED = 0
batch_size = 4096

# Seed both RNGs used here: torch (embedding initialisation) and numpy
# (negative sampling draws from np.random).
torch.manual_seed(SEED)
np.random.seed(SEED)

# Prefer the first GPU, but fall back to the CPU instead of failing on
# machines without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
losses = []
model = CBOW(vocab_size, embedding_dim=EMBEDDING_DIM,
             context_size=CONTEXT_SIZE * 2)
model.to(device)
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [10]:
# Batches of (context, target) pairs, served in the order they appear in
# `data` (no shuffling) and collated by two worker processes.
# NOTE(review): shuffle=False feeds windows in corpus order; confirm intended.
data_iter = torch.utils.data.DataLoader(
    dataset=data,
    num_workers=2,
    shuffle=False,
    batch_size=batch_size,
)

In [11]:
# Analogy probe: cosine similarity between (king - man + woman) and queen.
emb_device = model.embeddings.weight.device  # follow the model instead of assuming CUDA
with torch.no_grad():  # read-only probe, no autograd graph needed
    king, queen, man, woman = (
        model.embeddings(torch.tensor(word_to_ix[word], device=emb_device))
        for word in ('king', 'queen', 'man', 'woman')
    )
    analogy = king - man + woman
    # Plain dot / norms (no epsilon clamp) so that very small embedding norms
    # do not distort the ratio.
    analogy_cos = torch.dot(analogy, queen) / (analogy.norm() * queen.norm())
analogy_cos

tensor(-0.0886, device='cuda:0', grad_fn=<DivBackward1>)

In [12]:
NUM_EPOCHS = 20

for epoch in range(NUM_EPOCHS):
    total_loss = torch.zeros(1)  # running sum of batch losses for this epoch
    num = 0                      # number of batches seen
    for context, target in tqdm(data_iter):
        num += 1
        # `context` is indexed as context[position][example]. Build one
        # (positions, batch) id tensor and transpose it to (batch, positions),
        # instead of creating one small tensor per example and stacking them.
        context_ids = torch.tensor(
            [[word_to_ix[word] for word in position] for position in context],
            dtype=torch.long,
        ).t().contiguous().to(device)
        label = make_context_vector(target, word_to_ix).to(device)

        model.zero_grad()
        loss = model(context_ids, label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # `losses` stores the summed loss per epoch; the printed figure is the
    # mean per batch (guarded so an empty loader cannot divide by zero).
    losses.append(total_loss)
    print('epoch %d loss %.4f' % (epoch, total_loss.item() / max(num, 1)))
print(losses)

100%|██████████| 10087/10087 [08:14<00:00, 20.41it/s]
  0%|          | 0/10087 [00:00<?, ?it/s]

epoch 0 loss 1.3864


100%|██████████| 10087/10087 [08:25<00:00, 19.94it/s]
  0%|          | 0/10087 [00:00<?, ?it/s]

epoch 1 loss 1.3864


100%|██████████| 10087/10087 [08:38<00:00, 19.45it/s]
  0%|          | 0/10087 [00:00<?, ?it/s]

epoch 2 loss 1.3737


100%|██████████| 10087/10087 [08:46<00:00, 19.17it/s]
  0%|          | 0/10087 [00:00<?, ?it/s]

epoch 3 loss 1.3394


100%|██████████| 10087/10087 [08:35<00:00, 19.58it/s]
  0%|          | 0/10087 [00:00<?, ?it/s]

epoch 4 loss 1.3260


100%|██████████| 10087/10087 [09:08<00:00, 18.40it/s]
  0%|          | 0/10087 [00:00<?, ?it/s]

epoch 5 loss 1.3208


100%|██████████| 10087/10087 [09:07<00:00, 18.44it/s]
  0%|          | 0/10087 [00:00<?, ?it/s]

epoch 6 loss 1.3181


100%|██████████| 10087/10087 [08:26<00:00, 19.93it/s]
  0%|          | 0/10087 [00:00<?, ?it/s]

epoch 7 loss 1.3164


100%|██████████| 10087/10087 [08:49<00:00, 19.05it/s]
  0%|          | 0/10087 [00:00<?, ?it/s]

epoch 8 loss 1.3137


100%|██████████| 10087/10087 [08:49<00:00, 19.04it/s]
  0%|          | 0/10087 [00:00<?, ?it/s]

epoch 9 loss 1.3079


100%|██████████| 10087/10087 [08:39<00:00, 19.43it/s]
  0%|          | 0/10087 [00:00<?, ?it/s]

epoch 10 loss 1.3017


100%|██████████| 10087/10087 [08:55<00:00, 18.82it/s]
  0%|          | 0/10087 [00:00<?, ?it/s]

epoch 11 loss 1.2965


100%|██████████| 10087/10087 [08:44<00:00, 19.22it/s]
  0%|          | 0/10087 [00:00<?, ?it/s]

epoch 12 loss 1.2919


100%|██████████| 10087/10087 [08:46<00:00, 19.18it/s]
  0%|          | 0/10087 [00:00<?, ?it/s]

epoch 13 loss 1.2880


100%|██████████| 10087/10087 [09:05<00:00, 18.48it/s]
  0%|          | 0/10087 [00:00<?, ?it/s]

epoch 14 loss 1.2847


100%|██████████| 10087/10087 [08:40<00:00, 19.39it/s]
  0%|          | 0/10087 [00:00<?, ?it/s]

epoch 15 loss 1.2818


100%|██████████| 10087/10087 [08:43<00:00, 19.26it/s]
  0%|          | 0/10087 [00:00<?, ?it/s]

epoch 16 loss 1.2792


100%|██████████| 10087/10087 [08:48<00:00, 19.09it/s]
  0%|          | 0/10087 [00:00<?, ?it/s]

epoch 17 loss 1.2770


100%|██████████| 10087/10087 [08:38<00:00, 19.45it/s]
  0%|          | 0/10087 [00:00<?, ?it/s]

epoch 18 loss 1.2749


100%|██████████| 10087/10087 [08:55<00:00, 18.85it/s]

epoch 19 loss 1.2730
[tensor([13985.0820]), tensor([13984.8545]), tensor([13856.2217]), tensor([13510.5332]), tensor([13375.6387]), tensor([13322.5732]), tensor([13295.5137]), tensor([13278.3076]), tensor([13251.7588]), tensor([13192.3027]), tensor([13130.3662]), tensor([13077.4775]), tensor([13031.2695]), tensor([12992.3320]), tensor([12958.3672]), tensor([12929.6260]), tensor([12903.6025]), tensor([12880.6699]), tensor([12859.5586]), tensor([12841.1924])]





In [13]:
# Analogy probe: cosine similarity between (king - man + woman) and queen.
emb_device = model.embeddings.weight.device  # follow the model instead of assuming CUDA
with torch.no_grad():  # read-only probe, no autograd graph needed
    king, queen, man, woman = (
        model.embeddings(torch.tensor(word_to_ix[word], device=emb_device))
        for word in ('king', 'queen', 'man', 'woman')
    )
    analogy = king - man + woman
    # Plain dot / norms (no epsilon clamp) so that very small embedding norms
    # do not distort the ratio.
    analogy_cos = torch.dot(analogy, queen) / (analogy.norm() * queen.norm())
analogy_cos

tensor(0.9994, device='cuda:0', grad_fn=<DivBackward1>)