In [1]:
import re
import nltk
nltk.download('brown')
import itertools
from nltk.corpus import brown

corpus = []

for cat in ['news']:
    for text_id in brown.fileids(cat):
        raw_text = list(itertools.chain.from_iterable(brown.sents(text_id)))
        text = ' '.join(raw_text)
        text = text.lower()
        text.replace('\n', ' ')
        text = re.sub('[^a-z ]+', '', text)
        corpus.append([w for w in text.split() if w != ''])

[nltk_data] Downloading package brown to /home/maxim/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [2]:
from collections import Counter
import random, math

def subsample_frequent_words(corpus):
    filtered_corpus = []
    word_counts = dict(Counter(list(itertools.chain.from_iterable(corpus))))
    sum_word_counts = sum(list(word_counts.values()))
    word_counts = {word: word_counts[word]/float(sum_word_counts) for word in word_counts}
    for text in corpus:
        filtered_corpus.append([])
        for word in text:
            if random.random() < (1+math.sqrt(word_counts[word] * 1e3)) * 1e-3 / float(word_counts[word]):
                filtered_corpus[-1].append(word)
    return filtered_corpus


In [3]:
corpus = subsample_frequent_words(corpus)
vocabulary = set(itertools.chain.from_iterable(corpus))

word_to_index = {w: idx for (idx, w) in enumerate(vocabulary)}
index_to_word = {idx: w for (idx, w) in enumerate(vocabulary)}

In [4]:
import torch
import torch.nn  as  nn
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F
import numpy as np

In [5]:
class EarlyStopping():
    def __init__(self, patience=5, min_percent_gain=0.1):
        self.patience = patience
        self.loss_list = []
        self.min_percent_gain = min_percent_gain / 100.
        
    def update_loss(self, loss):
        self.loss_list.append(loss)
        if len(self.loss_list) > self.patience:
            del self.loss_list[0]
    
    def stop_training(self):
        if len(self.loss_list) == 1:
            return False
        gain = (max(self.loss_list) - min(self.loss_list)) / max(self.loss_list)
        print("Loss gain: {}%".format(round(100*gain,2)))
        if gain < self.min_percent_gain:
            return True
        else:
            return False

In [6]:
import random

def get_batches(context_tuple_list, batch_size=64):
    random.shuffle(context_tuple_list)
    batches = []
    batch_target, batch_context, batch_negative = [], [], []
    for i in range(len(context_tuple_list)):
        batch_target.append(word_to_index[context_tuple_list[i][0]])
        batch_context.append(word_to_index[context_tuple_list[i][1]])
        batch_negative.append([word_to_index[w] for w in context_tuple_list[i][2]])
        if (i+1) % batch_size == 0 or i == len(context_tuple_list)-1:
            tensor_target = autograd.Variable(torch.from_numpy(np.array(batch_target)).long())
            tensor_context = autograd.Variable(torch.from_numpy(np.array(batch_context)).long())
            tensor_negative = autograd.Variable(torch.from_numpy(np.array(batch_negative)).long())
            batches.append((tensor_target, tensor_context, tensor_negative))
            batch_target, batch_context, batch_negative = [], [], []
    return batches

In [7]:
from numpy.random import multinomial

def sample_negative(sample_size):
    sample_probability = {}
    word_counts = dict(Counter(list(itertools.chain.from_iterable(corpus))))
    normalizing_factor = sum([v**0.75 for v in word_counts.values()])
    for word in word_counts:
        sample_probability[word] = word_counts[word]**0.75 / normalizing_factor
    words = np.array(list(word_counts.keys()))
    while True:
        word_list = []
        sampled_index = np.array(multinomial(sample_size, list(sample_probability.values())))
        for index, count in enumerate(sampled_index):
            for _ in range(count):
                 word_list.append(words[index])
        yield word_list

In [8]:
import numpy as np

context_tuple_list = []
w = 2
negative_samples = sample_negative(20)

for text in corpus:
    for i, word in enumerate(text):
        # print(word)
        first_context_word_index = max(0,i-w-1)
        last_context_word_index = min(i+w, len(text))
        for j in range(first_context_word_index, last_context_word_index):
            if i!=j:
                # print('target ' + word)
                # print('context ' + text[j])
                context_tuple_list.append((word, text[j], next(negative_samples)))
        # print('\n')
print("There are {} pairs of target and context words".format(len(context_tuple_list)))

There are 271500 pairs of target and context words


In [9]:
from datetime import datetime

In [10]:
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import torch.nn.functional as F

class Word2Vec(nn.Module):
    def __init__(self, embedding_size, vocab_size):
        super(Word2Vec, self).__init__()
        self.embeddings_target = nn.Embedding(vocab_size, embedding_size)
        self.embeddings_context = nn.Embedding(vocab_size, embedding_size)

    def forward(self, target_word, context_word, negative_example):
        emb_target = self.embeddings_target(target_word)
        emb_context = self.embeddings_context(context_word)
        emb_product = torch.mul(emb_target, emb_context)
        emb_product = torch.sum(emb_product, dim=1)
        out = torch.sum(F.logsigmoid(emb_product))
        emb_negative = self.embeddings_context(negative_example)
        emb_product = torch.bmm(emb_negative, emb_target.unsqueeze(2))
        emb_product = torch.sum(emb_product, dim=1)
        out += torch.sum(F.logsigmoid(-emb_product))

        return -out

In [11]:
vocabulary_size = len(vocabulary)
print(vocabulary_size)

loss_function = nn.CrossEntropyLoss()
net = Word2Vec(embedding_size=300, vocab_size=vocabulary_size)
net = torch.load("checkpoint|time: 18|02|2024 21:52:48.pt")
optimizer = optim.Adam(net.parameters(), lr=0.01) # optimize lr
early_stopping = EarlyStopping(patience=5, min_percent_gain=1)

_ = 0
while True:
    losses = []
    context_tuple_batches = get_batches(context_tuple_list, batch_size=64)
    for i in range(len(context_tuple_batches)):
        net.zero_grad()
        target_tensor, context_tensor, negative_tensor = context_tuple_batches[i]
        # print("target" + target_tensor.__str__())
        # print("context" + context_tensor.__str__())
        # print("negative" + negative_tensor.__str__())
        
        net = net.cuda()
        target_tensor = target_tensor.cuda()
        context_tensor = context_tensor.cuda()
        negative_tensor = negative_tensor.cuda()
        loss = net(target_tensor, context_tensor, negative_tensor)
        loss.backward()
        optimizer.step()
        losses.append(loss.cpu().data)
    print("Loss: ", np.mean(losses))

    if _%30==0:
        now = datetime.now()
        dt_string = now.strftime("%d|%m|%Y %H:%M:%S")

        torch.save(net, "checkpoint" + "|time: " + dt_string +  ".pt")
    # print(i)
    _ += 1

    early_stopping.update_loss(np.mean(losses))
    if early_stopping.stop_training():
        break

12132
Loss:  5364.673
Loss:  1742.4702
Loss gain: 67.52%
Loss:  647.22943
Loss gain: 87.94%
Loss:  301.57928
Loss gain: 94.38%
Loss:  174.70119
Loss gain: 96.74%
Loss:  121.84628
Loss gain: 93.01%
Loss:  93.503975
Loss gain: 85.55%
Loss:  75.0067
Loss gain: 75.13%
Loss:  62.018265
Loss gain: 64.5%
Loss:  52.972736
Loss gain: 56.52%
Loss:  45.25982
Loss gain: 51.6%
Loss:  37.301075
Loss gain: 50.27%
Loss:  36.135548
Loss gain: 41.73%
Loss:  30.50519
Loss gain: 42.41%


KeyboardInterrupt: 

In [None]:
import numpy as np

def get_closest_word(word, topn=10):
    word_distance = []
    emb = net.cpu().embeddings_target
    pdist = nn.PairwiseDistance()
    i = word_to_index[word]
    lookup_tensor_i = torch.tensor([i], dtype=torch.long).cpu()
    v_i = emb(lookup_tensor_i)
    for j in range(len(vocabulary)):
        if j != i:
            lookup_tensor_j = torch.tensor([j], dtype=torch.long)
            v_j = emb(lookup_tensor_j)

            # if index_to_word[j].__str__() == 'woman':
            #     print(index_to_word[j].__str__() + (v_i - v_j).__str__())

            word_distance.append((index_to_word[j], float(pdist(v_i, v_j))))
    word_distance.sort(key=lambda x: x[1])
    return word_distance[:topn]

141

In [None]:
get_closest_word('court')

[('experiment', 48.783164978027344),
 ('affixed', 49.36464309692383),
 ('brookss', 49.70392990112305),
 ('decried', 49.862693786621094),
 ('prospective', 49.94652557373047),
 ('visibly', 50.006126403808594),
 ('kerrville', 50.097084045410156),
 ('splitlevel', 50.10708236694336),
 ('withdraw', 50.30921173095703),
 ('kindergarten', 50.422882080078125)]