In [5]:
import numpy as np
import torch
import scipy
import math
import time
import sys
from collections import Counter

def getCorpus(size):
    with open('corpus.txt', "r") as f:
        text = f.read()
        text = text.lower().split()
        text = text[: min(len(text), size)]
        vocab_dict = dict(Counter(text).most_common(MAX_VOCAB_SIZE - 1))
        vocab_dict['<unk>'] = len(text) - sum(list(vocab_dict.values()))
        idx_to_word = list(vocab_dict.keys())
        word_to_idx = {word:ind for ind, word in enumerate(idx_to_word)}
        word_counts = np.array(list(vocab_dict.values()), dtype=np.float32)
        word_freqs = word_counts / sum(word_counts)
        print("Words list length:{}".format(len(text)))
        print("Vocab size:{}".format(len(idx_to_word)))
    return text, idx_to_word, word_to_idx, word_counts, word_freqs

def buildCooccuranceMatrix(text, word_to_idx):
    vocab_size = len(word_to_idx)
    maxlength = len(text)
    text_ids = [word_to_idx.get(word, word_to_idx["<unk>"]) for word in text]
    cooccurance_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int32)
    print("Co-Matrix consumed mem:%.2fMB" % (sys.getsizeof(cooccurance_matrix)/(1024*1024)))
    for i, center_word_id in enumerate(text_ids):
        window_indices = list(range(i - WINDOW_SIZE, i)) + list(range(i + 1, i + WINDOW_SIZE + 1))
        window_indices = [i % maxlength for i in window_indices]
        window_word_ids = [text_ids[index] for index in window_indices]
        for context_word_id in window_word_ids:
            cooccurance_matrix[center_word_id][context_word_id] += 1
        if (i+1) % 1000000 == 0:
            print(">>>>> Process %dth word" % (i+1))
    print(">>>>> Save co-occurance matrix completed.")
    return cooccurance_matrix

def buildWeightMatrix(co_matrix):
    xmax = 100.0
    weight_matrix = np.zeros_like(co_matrix, dtype=np.float32)
    print("Weight-Matrix consumed mem:%.2fMB" % (sys.getsizeof(weight_matrix) / (1024 * 1024)))
    for i in range(co_matrix.shape[0]):
        for j in range(co_matrix.shape[1]):
            weight_matrix[i][j] = math.pow(co_matrix[i][j] / xmax, 0.75) if co_matrix[i][j] < xmax else 1
        if (i+1) % 1000 == 0:
            print(">>>>> Process %dth weight" % (i+1))
    print(">>>>> Save weight matrix completed.")
    return weight_matrix

class WordEmbeddingDataset(torch.utils.data.Dataset):
    def __init__(self, co_matrix, weight_matrix):
        self.co_matrix = co_matrix
        self.weight_matrix = weight_matrix
        self.train_set = []

        for i in range(self.weight_matrix.shape[0]):
            for j in range(self.weight_matrix.shape[1]):
                if weight_matrix[i][j] != 0:
                    self.train_set.append((i, j))

    def __len__(self):
        '''
        :return: the size of train_set
        '''
        return len(self.train_set)

    def __getitem__(self, index):
        '''
        :return: one of the sample
        '''
        (i, j) = self.train_set[index]
        return i, j, torch.tensor(self.co_matrix[i][j], dtype=torch.float), self.weight_matrix[i][j]

class GloveModelForBGD(torch.nn.Module):
    def __init__(self, vocab_size, embed_size):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size

        self.v = torch.nn.Embedding(vocab_size, embed_size)
        self.w = torch.nn.Embedding(vocab_size, embed_size)
        self.biasv = torch.nn.Embedding(vocab_size, 1)
        self.biasw = torch.nn.Embedding(vocab_size, 1)

        initrange = 0.5 / self.embed_size
        self.v.weight.data.uniform_(-initrange, initrange)
        self.w.weight.data.uniform_(-initrange, initrange)

    def forward(self, i, j, co_occur, weight):
        vi = self.v(i)
        wj = self.w(j)
        bi = self.biasv(i)
        bj = self.biasw(j)

        similarity = torch.mul(vi, wj)
        similarity = torch.sum(similarity, dim=1)

        loss = similarity + bi + bj - torch.log(co_occur)
        loss = 0.5 * weight * loss * loss

        return loss.sum().mean()

    def gloveMatrix(self):
        return self.v.weight.data.numpy() + self.w.weight.data.numpy()

EMBEDDING_SIZE = 50
MAX_VOCAB_SIZE = 4000
WINDOW_SIZE = 5

NUM_EPOCHS = 10
BATCH_SIZE = 10
LEARNING_RATE = 0.1
TEXT_SIZE = 20000000
WEIGHT_FILE = "weight.txt"

text, idx_to_word, word_to_idx, word_counts, word_freqs = getCorpus(size=TEXT_SIZE)
co_matrix = buildCooccuranceMatrix(text, word_to_idx)
weight_matrix = buildWeightMatrix(co_matrix)
dataset = WordEmbeddingDataset(co_matrix, weight_matrix)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
model = GloveModelForBGD(MAX_VOCAB_SIZE, EMBEDDING_SIZE)
optimizer = torch.optim.Adagrad(model.parameters(), lr=LEARNING_RATE)

print_every = 10000
save_every = 50000
epochs = NUM_EPOCHS
iters_per_epoch = int(dataset.__len__() / BATCH_SIZE)
total_iterations = iters_per_epoch * epochs
print("Iterations: %d per one epoch, Total iterations: %d " % (iters_per_epoch, total_iterations))
start = time.time()
for epoch in range(epochs):
    loss_print_avg = 0
    iteration = iters_per_epoch * epoch
    for i, j, co_occur, weight in dataloader:
        iteration += 1
        optimizer.zero_grad()
        loss = model(i, j, co_occur, weight)
        loss.backward()
        optimizer.step()
        loss_print_avg += loss.item()
torch.save(model.state_dict(), WEIGHT_FILE)

Words list length:20000000
Vocab size:4000
Co-Matrix consumed mem:61.04MB
>>>>> Process 1000000th word
>>>>> Process 2000000th word
>>>>> Process 3000000th word
>>>>> Process 4000000th word
>>>>> Process 5000000th word
>>>>> Process 6000000th word
>>>>> Process 7000000th word
>>>>> Process 8000000th word
>>>>> Process 9000000th word
>>>>> Process 10000000th word
>>>>> Process 11000000th word
>>>>> Process 12000000th word
>>>>> Process 13000000th word
>>>>> Process 14000000th word
>>>>> Process 15000000th word
>>>>> Process 16000000th word
>>>>> Process 17000000th word
>>>>> Process 18000000th word
>>>>> Process 19000000th word
>>>>> Process 20000000th word
>>>>> Save co-occurance matrix completed.
Weight-Matrix consumed mem:61.04MB
>>>>> Process 1000th weight
>>>>> Process 2000th weight
>>>>> Process 3000th weight
>>>>> Process 4000th weight
>>>>> Save weight matrix completed.
Iterations: 551696 per one epoch, Total iterations: 5516960 


KeyboardInterrupt: ignored

In [6]:
def find_nearest(word, embedding_weights):
  index = word_to_idx[word]
  embedding = embedding_weights[index]
  cos_dis = np.array([scipy.spatial.distance.cosine(e, embedding) for e in embedding_weights])
  return [idx_to_word[i] for i in cos_dis.argsort()[:10]]

In [7]:
glove_matrix = model.gloveMatrix()
for word in ["good", "one", "green", "like", "america", "queen", "better", "work", "computer", "language"]:
  print(word, find_nearest(word, glove_matrix))

good ['good', 'very', 'because', 'bad', 'not', 'work', 'how', 'so', 'give', 'did']
one ['one', 'only', 'has', 'all', 'two', 'also', 'which', 'for', 'this', 'that']
green ['green', 'blue', 'white', 'yellow', 'dark', 'black', 'red', 'brown', 'orange', 'color']
like ['like', 'some', 'other', 'often', 'such', 'very', 'make', 'these', 'have', 'because']
america ['america', 'europe', 'america,', 'north', 'africa', 'central', 'western', 'america.', 'south', 'europe,']
queen ['queen', 'elizabeth', 'king', 'daughter', 'prince', 'wife', 'anne', 'mary', 'maria', 'charles']
better ['better', 'good', 'bad', 'way', 'find', 'get', 'because', 'without', 'so', 'much']
work ['work', 'well', 'not', 'but', 'did', 'because', 'good', 'often', 'their', 'own']
computer ['computer', 'software', 'uses', 'data', 'program', 'device', 'using', 'computers', 'use', 'programs']
language ['language', 'language.', 'spoken', 'language,', 'languages', 'speak', 'word', 'programming', 'means', 'latin']


In [8]:
glove_matrix = model.gloveMatrix()
print("text", find_nearest("text", glove_matrix))

text ['text', 'sound', 'type', '\\na', 'referred', 'translation', 'commonly', 'sounds', 'code', '"a']


In [9]:
glove_matrix = model.gloveMatrix()
print("physics", find_nearest("physics", glove_matrix))

physics ['physics', 'chemistry', 'medicine', 'nobel', 'prize', 'quantum', 'physiology', 'mathematics', 'sciences', 'molecular']


In [10]:
glove_matrix = model.gloveMatrix()
print("north", find_nearest("north", glove_matrix))

north ['north', 'south', 'central', 'east', 'west', 'western', 'southern', 'africa', 'america', 'australia.']


In [11]:
glove_matrix = model.gloveMatrix()
print("queen", find_nearest("queen", glove_matrix))

queen ['queen', 'elizabeth', 'king', 'daughter', 'prince', 'wife', 'anne', 'mary', 'maria', 'charles']


In [12]:
glove_matrix = model.gloveMatrix()
print("car", find_nearest("car", glove_matrix))

car ['car', 'race', 'cars', 'motor', 'driver', 'company', 'produced', 'model', 'production', 'france.\\n\\nreferences\\ninsee\\n\\ncommunes']


In [15]:
def cosine_similarity(word_a, word_b, embedding_weights):
  idx_a = word_to_idx[word_a]
  idx_b = word_to_idx[word_b]
  emb_a = embedding_weights[idx_a]
  emb_b = embedding_weights[idx_b]
  cos_dis = scipy.spatial.distance.cosine(emb_a, emb_b)
  return cos_dis

In [19]:
glove_matrix = model.gloveMatrix()
print("France vs. Spain", cosine_similarity("france", "spain", glove_matrix))

France vs. Spain 0.273925244808197


In [20]:
glove_matrix = model.gloveMatrix()
print("tree vs. water", cosine_similarity("tree", "water", glove_matrix))

tree vs. water 0.4943550229072571


In [21]:
glove_matrix = model.gloveMatrix()
print("water vs. sky", cosine_similarity("water", "sky", glove_matrix))

water vs. sky 0.7244596779346466


In [22]:
glove_matrix = model.gloveMatrix()
print("sky vs. bird", cosine_similarity("sky", "bird", glove_matrix))

sky vs. bird 0.6768551766872406
