# GloVe

In [1]:
from itertools import combinations
import re

import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy.sparse import dok_matrix

In [2]:
data = list(pd.read_csv('./data/nyt-ingredients-snapshot-2015.csv')['input'].dropna())
data

['1 1/4 cups cooked and pureed fresh butternut squash, or 1 10-ounce package frozen squash, defrosted',
 '1 cup peeled and cooked fresh chestnuts (about 20), or 1 cup canned, unsweetened chestnuts',
 '1 medium-size onion, peeled and chopped',
 '2 stalks celery, chopped coarse',
 '1 1/2 tablespoons vegetable oil',
 '2 tablespoons unflavored gelatin, dissolved in 1/2 cup water',
 'Salt',
 '1 cup canned plum tomatoes with juice',
 '6 cups veal or beef stock',
 '1/3 cup Worcestershire sauce',
 '1 tablespoon Louisiana hot sauce',
 '1/2 teaspoon hot red pepper flakes',
 '4 bay leaves',
 '6 cloves garlic, crushed and chopped',
 '2 carrots, peeled and diced',
 '2 medium onions, diced',
 '6 tablespoons butter',
 '1 tablespoon Creole seasoning, or other seasonings of your choice',
 '3 pounds beef brisket',
 '1/2 cup fine dry bread crumbs',
 'Freshly ground black pepper',
 '12 oysters and their liquor',
 '3 tablespoons minced scallions',
 '1 tablespoon flour',
 '6 ounces button mushrooms, thinly 

In [3]:
def tokenize(sentence):
    return re.findall(r'[\w]+', sentence)

In [4]:
tokenized_corpus = [tokenize(sentence.lower()) for sentence in data]

In [5]:
len(tokenized_corpus)

179063

In [6]:
word2id = {}
for sentence in tokenized_corpus:
    for word in sentence:
        if word not in word2id.keys():
            word2id[word] = 0
        word2id[word] += 1

sorted_word2id = sorted(word2id.items(), key=lambda x: x[1], reverse=True)
for i, (word, count) in enumerate(sorted_word2id):
    word2id[word] = i

In [7]:
id2word = {j: i for i, j in word2id.items()}

In [8]:
encoded_corpus = []
for sentence in tokenized_corpus:
    helper = []
    for word in sentence:
        helper.append(word2id[word])
    encoded_corpus.append(helper)

In [9]:
def make_coocurence_matrix(len_vocab, encoded_corpus):
    
    vocab = np.arange(len_vocab)
    matrix = dok_matrix((len_vocab, len_vocab))
    
    for doc in tqdm(encoded_corpus):
        combs = combinations(set(doc), 2)
        for i, j in combs:
            matrix[i, j] += 1
            matrix[j, i] += 1
    return matrix

In [10]:
%%time
N = make_coocurence_matrix(len(word2id), encoded_corpus)

100%|████████████████████████████████████████████████████████████████████████| 179063/179063 [02:34<00:00, 1157.31it/s]

CPU times: total: 2min 32s
Wall time: 2min 34s





In [11]:
from abc import ABC, abstractmethod

class BaseOptimizer(ABC):
    @abstractmethod
    def __init__(self) -> None:
        pass

    @abstractmethod
    def set_weight(self, weight: np.array) -> None:
        pass

    @abstractmethod
    def step(self, grad: np.array) -> np.array:
        pass

class ADAM(BaseOptimizer):
    """
    Implements Adam algorithm.

    learning_rate (float, optional) – learning rate (default: 1e-3)
    beta1, beta2 (Tuple[float, float], optional) –
    coefficients used for computing running averages of gradient and its square (default: (0.9, 0.999))
    eps (float, optional) – term added to the denominator to improve numerical stability (default: 1e-8)
    """

    def __init__(self, beta1: float = 0.9, beta2: float = 0.999, eps: float = 1e-8,
                 learning_rate: float = 3e-4, weight_decay: float = 0) -> None:
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay

        self.EMA1 = None
        self.EMA2 = None

        self.weight = None

    def set_weight(self, weight: np.array) -> None:
        self.weight = weight.copy()
        self.EMA1 = np.zeros(shape=self.weight.shape)
        self.EMA2 = np.zeros(shape=self.weight.shape)

    def step(self, grad: np.array) -> np.array:
        assert self.weight is not None, 'You should set the weight'
        grad = grad.copy() + self.weight_decay * self.weight
        self.EMA1 = (1 - self.beta1) * grad + self.beta1 * self.EMA1
        self.EMA2 = (1 - self.beta2) * grad ** 2 + self.beta2 * self.EMA2
        self.weight -= self.learning_rate * self.EMA1 / (np.sqrt(self.EMA2) + self.eps)

        return self.weight.copy()

In [12]:
def f(x, x_max, alpha):
    """Весовая функция 𝑓(𝑥𝑖𝑗)"""
    return np.where(x <= x_max, (x/x_max)**alpha, 1)

def update_glove_weights(x, w, d, alpha, max_x, optimizer_w, optimizer_d):
    """
    x - square integer matrix VocabSize x VocabSize - coocurrence matrix
    w - VocabSize x EmbSize - first word vectors
    d - VocabSize x EmbSize - second word vectors
    alpha - float - power in weight smoothing function f
    max_x - int - maximum coocurrence count in weight smoothing function f
    learning_rate - positive float - size of gradient step
    """
    x = np.asarray(x)
    w = np.asarray(w)
    d = np.asarray(d)
    
    f_x = f(x, max_x, alpha)
    gloveloss = np.sum(f_x * (np.log(1+x) - w.dot(d.T)) ** 2)
    glove_loss_deriv = -2 * (f_x * (np.log(1+x) - w.dot(d.T)))
    
    w_grad = glove_loss_deriv.dot(d)
    d_grad = glove_loss_deriv.T.dot(w)
    
    w = optimizer_w.step(w_grad)
    d = optimizer_d.step(d_grad)
    return w, d, gloveloss

In [13]:
# w = np.random.uniform(-1.0/100, 1.0/100,
#                       size=(len(word2id), 100))
# d = np.random.uniform(-1.0/100, 1.0/100,
#                       size=(len(word2id), 100))
# optimizer_w = ADAM(learning_rate=0.001)
# optimizer_w.set_weight(w)

# optimizer_d = ADAM(learning_rate=0.001)
# optimizer_d.set_weight(d)
# alpha = 0.75
# x_max = 100

# for i in range(10000):
#     w, d, loss = update_glove_weights(N.toarray(), w, d, alpha, x_max, optimizer_w, optimizer_d)
#     print(loss)

In [14]:
w = np.load('./data/w.npy')

In [15]:
def negative_euclidean(a, b):
    return -np.sqrt(np.sum((a-b)**2, axis=1))

def l2_norm_vector(x):
    return x / np.sqrt(np.sum(x**2, axis=1, keepdims=True))

def get_nearest(embeddings, query_word_id, get_n):
    """
    embeddings - VocabSize x EmbSize - word embeddings
    query_word_id - integer - id of query word to find most similar to
    get_n - integer - number of most similar words to retrieve

    returns list of `get_n` tuples (word_id, similarity) sorted by descending order of similarity value
    """
    embeddings = np.asarray(embeddings)
    
    # Нормирование
    embeddings = l2_norm_vector(embeddings)
    
    distances = negative_euclidean(embeddings[query_word_id], embeddings)
    
    similars = [(word, distance) for word, distance in enumerate(distances)]
    similars = sorted(similars, key=lambda pair: pair[1], reverse=True)

    return similars[:get_n]

In [16]:
for i, j in get_nearest(w, word2id['salt'], 20):
    print(id2word[i], j)

salt -0.0
kosher -0.8478311577734389
aste -0.9087465510793785
sea -0.9100766384185061
grounded -0.9298566604450065
taste -0.9419398667792018
freshly -0.9550678821123476
pepper -0.967164374015822
ground -0.9770381881350795
frsehly -0.9927840393609063
plenty -0.9968474317840569
to -0.9988405729058228
coarse -0.9988961514808067
black -1.0138186828084585
desired -1.0362869521672309
more -1.0442771224000031
dotting -1.0471271416530998
tpepper -1.0485695064143907
cayenne -1.056628416112558
added -1.0625433400270485


In [17]:
import bokeh.models as bm, bokeh.plotting as pl
from bokeh.io import output_notebook
output_notebook()

def draw_vectors(x, y, radius=10, alpha=0.25, color='blue',
                 width=600, height=400, show=True, **kwargs):
    """ draws an interactive plot for data points with auxilirary info on hover """
    if isinstance(color, str): color = [color] * len(x)
    data_source = bm.ColumnDataSource({ 'x' : x, 'y' : y, 'color': color, **kwargs })

    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height)
    fig.scatter('x', 'y', size=radius, color='color', alpha=alpha, source=data_source)

    fig.add_tools(bm.HoverTool(tooltips=[(key, "@" + key) for key in kwargs.keys()]))
    if show: pl.show(fig)
    return fig

In [18]:
from sklearn.manifold import TSNE

# map word vectors onto 2d plane with TSNE. hint: don't panic it may take a minute or two to fit.
# normalize them as just lke with pca
words = [i for i, j in sorted_word2id[:1000]]
vectors = np.array([w[i] for i in [word2id[word] for word in words]])

word_tsne = TSNE(n_components=2).fit_transform(vectors)#YOUR CODE
word_tsne = (word_tsne - word_tsne.mean(axis=0)) / word_tsne.std(axis=0)



In [19]:
draw_vectors(word_tsne[:, 0], word_tsne[:, 1], color='green', token=words)