# Word2Vec + FastText

In [1]:
import collections
import copy
import re
from itertools import product
import traceback

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
full_dataset = list(pd.read_csv('./data/nyt-ingredients-snapshot-2015.csv')['input'].dropna())
np.random.shuffle(full_dataset)

print("Размер выборки", len(full_dataset))

print('\n'.join(full_dataset[:10]))

Размер выборки 179063
1 cup plus a splash orange liqueur
1/2 cup beer
1 pound potatoes, preferably fingerling, Ruby Crescent or Yellow Finn
1 branch fresh tarragon or thyme separated into sprigs, or 1/2 teaspoon dried thyme or tarragon
3 pints boiling water
1/2 cup plus 1 tablespoon extra-virgin olive oil
1/2 cup flour
1/4 teaspoon salt, plus more to taste
1 medium yellow squash, cut into small dice
2 teaspoons rice vinegar


In [3]:
TOKEN_RE = re.compile(r'[\w\d]+')


def tokenize_text_simple_regex(txt, min_token_size=4):
    txt = txt.lower()
    all_tokens = TOKEN_RE.findall(txt)
    return [token for token in all_tokens if len(token) >= min_token_size]

def tokenize_corpus(texts, tokenizer=tokenize_text_simple_regex, **tokenizer_kwargs):
    return [tokenizer(text, **tokenizer_kwargs) for text in texts]

def build_vocabulary(tokenized_texts, max_size=1000000, max_doc_freq=0.8, min_count=5, pad_word=None):
    word_counts = collections.defaultdict(int)
    doc_n = 0

    # посчитать количество документов, в которых употребляется каждое слово
    # а также общее количество документов
    for txt in tokenized_texts:
        doc_n += 1
        unique_text_tokens = set(txt)
        for token in unique_text_tokens:
            word_counts[token] += 1

    # убрать слишком редкие и слишком частые слова
    word_counts = {word: cnt for word, cnt in word_counts.items()
                   if cnt >= min_count and cnt / doc_n <= max_doc_freq}

    # отсортировать слова по убыванию частоты
    sorted_word_counts = sorted(word_counts.items(),
                                reverse=True,
                                key=lambda pair: pair[1])

    # добавим несуществующее слово с индексом 0 для удобства пакетной обработки
    if pad_word is not None:
        sorted_word_counts = [(pad_word, 0)] + sorted_word_counts

    # если у нас по прежнему слишком много слов, оставить только max_size самых частотных
    if len(word_counts) > max_size:
        sorted_word_counts = sorted_word_counts[:max_size]

    # нумеруем слова
    word2id = {word: i for i, (word, _) in enumerate(sorted_word_counts)}

    # нормируем частоты слов
    word2freq = np.array([cnt / doc_n for _, cnt in sorted_word_counts], dtype='float32')

    return word2id, word2freq

def texts_to_token_ids(tokenized_texts, word2id):
    return [[word2id[token] for token in text if token in word2id]
            for text in tokenized_texts]

In [4]:
tokenized_texts = tokenize_corpus(full_dataset)
word2id, word2freq = build_vocabulary(tokenized_texts, max_doc_freq=0.9, min_count=5, pad_word='<PAD>')

token_ids = texts_to_token_ids(tokenized_texts, word2id)

print('\n'.join(' '.join(str(t) for t in sent)
                for sent in token_ids[:10]))

45 872 97 411
596
12 92 79 1095 1020 150 1981
1270 9 206 63 244 29 82 2 41 63 206
603 371 39
45 8 40 44 13
38
2 4 45 47 10
35 150 233 29 36 171
17 72 48


In [5]:
class SequenceDataset:
    def __init__(self, texts, pad_value=0):
        self.texts = texts
        self.pad_value = pad_value

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        txt = self.texts[item]

        txt = np.array(txt)

        return txt

In [6]:
dataset = SequenceDataset(token_ids)

print(dataset[0])

[ 45 872  97 411]


In [7]:
def make_diag_mask(size, radius):
    """Квадратная матрица размера Size x Size с двумя полосами ширины radius вдоль главной диагонали"""
    idxs = np.arange(size)
    abs_idx_diff = np.abs(np.arange(size)[np.newaxis, :] - np.arange(size)[:, np.newaxis])
    mask = ((abs_idx_diff <= radius // 2) & (abs_idx_diff > 0)).astype(int)
    return mask

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def sigmoid_derivative(z):
    s = 1 / (1 + np.exp(-z))
    return s * (1 - s)

def log_loss(y_true, a_pred):
    '''
    Compute log loss, a_pred - vector of size n_objects
    '''
    return np.mean(-y_true * np.log(a_pred) - (1 - y_true) * np.log(1 - a_pred))

def log_loss_derivative(y_true, a_pred):
    '''
    Compute detivative of log loss
    '''
    return (-y_true / a_pred + (1 - y_true) / (1 - a_pred)) / len(y_true)

In [8]:
class Word2VecVanilla:
    def __init__(self, vocabulary, emb_size, window_size, ns_rate, pad_idx=0):
        self.vocabulary = vocabulary
        self.vocab_size = len(vocabulary)
        self.emb_size = emb_size
        self.window_size = window_size
        self.ns_rate = ns_rate
        self.pad_idx = pad_idx
        
        self.center_embeddings = np.random.uniform(-1.0/self.emb_size, 1.0/self.emb_size,
                                                   size=(self.vocab_size, self.emb_size))
        self.center_embeddings[pad_idx] = 0
        
        self.context_embeddings = np.random.uniform(-1.0/self.emb_size, 1.0/self.emb_size,
                                                    size=(self.vocab_size, self.emb_size))
        self.context_embeddings[pad_idx] = 0
        
        self.id2word = {i: w for w, i in self.vocabulary.items()}
        
        self.embeddings_normed = None
        
    
    def norm_embeddings(self):
        self.embeddings_normed = self.center_embeddings / (np.linalg.norm(
            self.center_embeddings, ord=2, axis=-1, keepdims=True) + 1e-4)

    def generate_w2v_sgns_samples(self, text):
        """
        text - list of integer numbers - ids of tokens in text
        window_size - odd integer - width of window
        vocab_size - positive integer - number of tokens in vocabulary
        ns_rate - positive integer - number of negative tokens to sample per one positive sample

        returns list of training samples (CenterWord, CtxWord, Label)
        """
        text = np.array(text)
        size = len(text)
        context_words_idxs = make_diag_mask(size, self.window_size)
        center_words_idx = np.eye(N=size, dtype=int)

        result = []

        for i in range(size):
            context = context_words_idxs[i]
            center = center_words_idx[i]

            context_idx = np.where(context != 0)[0]
            center_idx = np.where(center != 0)[0]

            center_word = text[center_idx]
            context_words = text[context_idx]

            if center_word == self.pad_idx:
                continue

            positive_examples = [list(i) + [1] for i in product(center_word, context_words)]
            positive_examples = [
                positive_example for positive_example in positive_examples if self.pad_idx not in positive_example
            ]

            # чтобы не совпадало с самим собой
            all_words = np.arange(self.vocab_size)
            center_word_idx = np.argwhere(all_words == center_word)[0][0]
            all_words = list(all_words)
            del all_words[center_word_idx], all_words[self.pad_idx]
            all_words = np.array(all_words)
            negative_word = np.random.choice(all_words)

            # выбрать одно из двух, либо убрать совпадение с самим собой, либо нет
            negative_words = np.random.randint(self.vocab_size, size=self.ns_rate*len(positive_examples))    
            negative_examples = [list(i) + [0] for i in product(center_word, negative_words)]

            positive_negative_examples = positive_examples + negative_examples    
            result.extend(positive_negative_examples)

        return np.random.permutation(result)
    

    def update_w2v_weights(self, center_word, context_word, label, learning_rate):
        """
        center_embeddings - VocabSize x EmbSize
        context_embeddings - VocabSize x EmbSize
        center_word - int - identifier of center word
        context_word - int - identifier of context word
        label - 1 if context_word is real, 0 if it is negative
        learning_rate - float > 0 - size of gradient step
        """

        # Выберем центральное и контекстное слово
        center_embedding = self.center_embeddings[center_word]
        context_embedding = self.context_embeddings[context_word]

        # Считаем "оценку" сходства и ее вероятность по формуле 𝑃(𝐶𝑡𝑥𝑊𝑜𝑟𝑑|𝐶𝑒𝑛𝑡𝑒𝑟𝑊𝑜𝑟𝑑)=𝜎(𝑊𝐶𝑒𝑛𝑡𝑒𝑟𝑊𝑜𝑟𝑑,:⋅𝐷𝐶𝑡𝑥𝑊𝑜𝑟𝑑,:)
        score = center_embedding.dot(context_embedding)
        prob = sigmoid(score)

        loss = log_loss(label, prob)

        # Считаем производные для правила цепочки: производная лосса * производную сигмоиды
        # * производную 𝑊𝐶𝑒𝑛𝑡𝑒𝑟𝑊𝑜𝑟𝑑,:⋅𝐷𝐶𝑡𝑥𝑊𝑜𝑟𝑑,: по 𝑊𝐶𝑒𝑛𝑡𝑒𝑟𝑊𝑜𝑟𝑑 и 𝐷𝐶𝑡𝑥𝑊𝑜𝑟𝑑
        log_loss_deriv = log_loss_derivative(np.array([label]), np.array([prob]))
        sigmoid_deriv = sigmoid_derivative(score)

        # Считаем производные по цетральному слову и контекстному для обновления весов
        center_grad = log_loss_deriv * sigmoid_deriv * context_embedding
        context_grad = log_loss_deriv * sigmoid_deriv * center_embedding

        self.center_embeddings[center_word] -= learning_rate * center_grad
        self.context_embeddings[context_word] -= learning_rate * context_grad

        return loss
    
    def fit(self, dataset, learning_rate=1e-2, n_epochs=10):
        for n in tqdm(range(n_epochs)):
            try:
                idxs = np.random.permutation(len(dataset))

                for i, idx in enumerate(idxs):
                    data = dataset[idx]
                    samples = self.generate_w2v_sgns_samples(text=data)

                    cur_idx = 1
                    cur_loss = 0

                    for sample in samples:
                        center_word, context_word, label = sample

                        loss = self.update_w2v_weights(
                            center_word=center_word,
                            context_word=context_word,
                            label=label,
                            learning_rate=learning_rate
                        )

                        cur_loss += loss
                        cur_idx += 1

                print(f"LOSS: {cur_loss / cur_idx}")

            except KeyboardInterrupt:
                print('Досрочно остановлено пользователем')
                return self

            except Exception as ex:
                print('Ошибка при обучении: {}\n{}'.format(ex, traceback.format_exc()))
                break
        return self
    
    def most_similar(self, word, topk=10):
        assert self.embeddings_normed is not None, 'Необходимо нормировать эмбеддинги self.norm_embeddings'
        return self.most_similar_by_vector(self.get_vector(word), topk=topk)

    def analogy(self, a1, b1, a2, topk=10):
        assert self.embeddings_normed is not None, 'Необходимо нормировать эмбеддинги self.norm_embeddings'
        a1_v = self.get_vector(a1)
        b1_v = self.get_vector(b1)
        a2_v = self.get_vector(a2)
        query = b1_v - a1_v + a2_v
        return self.most_similar_by_vector(query, topk=topk)

    def most_similar_by_vector(self, query_vector, topk=10):
        assert self.embeddings_normed is not None, 'Необходимо нормировать эмбеддинги self.norm_embeddings'
        similarities = (self.embeddings_normed * query_vector).sum(-1)
        best_indices = np.argpartition(-similarities, topk, axis=0)[:topk]
        result = [(self.id2word[i], similarities[i]) for i in best_indices]
        result.sort(key=lambda pair: -pair[1])
        return result

    def get_vector(self, word):
        assert self.embeddings_normed is not None, 'Необходимо нормировать эмбеддинги self.norm_embeddings'
        if word not in self.vocabulary:
            raise ValueError('Неизвестное слово "{}"'.format(word))
        return self.embeddings_normed[self.vocabulary[word]]

    def get_vectors(self, *words):
        assert self.embeddings_normed is not None, 'Необходимо нормировать эмбеддинги self.norm_embeddings'
        word_ids = [self.vocabulary[i] for i in words]
        vectors = np.stack([self.embeddings_normed[i] for i in word_ids], axis=0)
        return vectors

In [9]:
word2vec = Word2VecVanilla(
    vocabulary=word2id,
    emb_size=100,
    window_size=5,
    ns_rate=10,
    pad_idx=0
)

In [10]:
word2vec.fit(dataset, n_epochs=3)

 33%|███████████████████████████▋                                                       | 1/3 [12:46<25:32, 766.44s/it]

LOSS: 0.027821838962048352


 67%|███████████████████████████████████████████████████████▎                           | 2/3 [25:30<12:45, 765.12s/it]

LOSS: 0.11800157154489815


100%|███████████████████████████████████████████████████████████████████████████████████| 3/3 [38:12<00:00, 764.21s/it]

LOSS: 0.0





<__main__.Word2VecVanilla at 0x23685e530d0>

In [11]:
word2vec.norm_embeddings()
print('chicken\n', word2vec.most_similar('chicken'), '\n')
print('wine\n', word2vec.most_similar('wine'), '\n')
print('salt\n', word2vec.most_similar('salt'), '\n')

chicken
 [('chicken', 0.9999628414370786), ('duck', 0.8375601140226202), ('turkey', 0.8134993126393183), ('thighs', 0.7932530122474649), ('legs', 0.7838325180714862), ('breasts', 0.7802486438501787), ('breast', 0.7801002868214305), ('wings', 0.753420181955752), ('drumsticks', 0.7485258358195042), ('veal', 0.7318013286752812)] 

wine
 [('wine', 0.9999613666041658), ('sherry', 0.8561270823775503), ('grigio', 0.8388384086243729), ('sauvignon', 0.8283005488345155), ('champagne', 0.8236668280750943), ('balsamic', 0.8190846118740012), ('shao', 0.8018726020419095), ('pinot', 0.7912309005701603), ('bottle', 0.7877572708844414), ('hsing', 0.7865292393456697)] 

salt
 [('salt', 0.9999665549606125), ('fleur', 0.8062762320131163), ('kosher', 0.7799918669723506), ('nblack', 0.7522017818029882), ('cayenne', 0.7462796715024314), ('diamond', 0.7399968981457564), ('maldon', 0.7319423615480875), ('lots', 0.7245726593757553), ('passover', 0.71830744908613), ('asafetida', 0.7182918669222995)] 



In [12]:
def get_n_gramms(word, n):
    size = len(word) - n + 1
    return [word[i:i+n] for i in range(size)]

def make_ngramms_dict(vocabulary, n_list):
    max_num = max(vocabulary.values())+1
    ngramms = []
    
    for word in vocabulary.keys():
        for n in n_list:
            gramms = get_n_gramms(word, n)
            ngramms.extend(gramms)
            
    ngramms = set(ngramms)
    ngramm2id = {ngramm:i+max_num for i, ngramm in enumerate(ngramms)}
    return ngramm2id

def make_token2subwords(words_voc, ngramm_voc, n_list):
    token2subwords = {}
    
    for word in words_voc.keys():
        
        gramms_ids = []
        for n in n_list:
            for gramm in get_n_gramms(word, n):
                gramms_ids.append(ngramm_voc[gramm])
                
        token2subwords[words_voc[word]] = gramms_ids
    return token2subwords

In [13]:
n_list = [2]

ngramm2id = make_ngramms_dict(word2id, n_list=n_list)
ngramm2id

{'ab': 2632,
 'bl': 2633,
 '27': 2634,
 'pc': 2635,
 'on': 2636,
 'dj': 2637,
 'lc': 2638,
 'zo': 2639,
 'ua': 2640,
 'fl': 2641,
 'ax': 2642,
 'de': 2643,
 'nn': 2644,
 'ue': 2645,
 'ay': 2646,
 'kk': 2647,
 'ld': 2648,
 'od': 2649,
 'zc': 2650,
 'bb': 2651,
 'ku': 2652,
 'oc': 2653,
 'rè': 2654,
 'af': 2655,
 'te': 2656,
 'ag': 2657,
 'va': 2658,
 'ex': 2659,
 'wo': 2660,
 'mt': 2661,
 'ib': 2662,
 'ff': 2663,
 'ca': 2664,
 'îc': 2665,
 '17': 2666,
 'by': 2667,
 'ém': 2668,
 'ev': 2669,
 'kp': 2670,
 'oe': 2671,
 'zz': 2672,
 'yè': 2673,
 'ik': 2674,
 'ow': 2675,
 '05': 2676,
 'un': 2677,
 'ón': 2678,
 'oy': 2679,
 'pr': 2680,
 'ks': 2681,
 'gt': 2682,
 'mi': 2683,
 'sm': 2684,
 'it': 2685,
 'ws': 2686,
 'ch': 2687,
 'dc': 2688,
 'sc': 2689,
 '64': 2690,
 '41': 2691,
 'hl': 2692,
 'ud': 2693,
 'do': 2694,
 'ph': 2695,
 'ze': 2696,
 'sh': 2697,
 'ge': 2698,
 'mo': 2699,
 'lp': 2700,
 'ei': 2701,
 'ôn': 2702,
 'si': 2703,
 'su': 2704,
 'da': 2705,
 'tp': 2706,
 '56': 2707,
 'xs': 2708,

In [14]:
word2id

{'<PAD>': 0,
 'tablespoons': 1,
 'teaspoon': 2,
 'chopped': 3,
 'salt': 4,
 'pepper': 5,
 'cups': 6,
 'ground': 7,
 'tablespoon': 8,
 'fresh': 9,
 'taste': 10,
 'freshly': 11,
 'pound': 12,
 'olive': 13,
 'garlic': 14,
 'peeled': 15,
 'finely': 16,
 'teaspoons': 17,
 'large': 18,
 'minced': 19,
 'butter': 20,
 'ounces': 21,
 'black': 22,
 'sugar': 23,
 'about': 24,
 'pounds': 25,
 'juice': 26,
 'sliced': 27,
 'white': 28,
 'into': 29,
 'inch': 30,
 'lemon': 31,
 'cloves': 32,
 'leaves': 33,
 'onion': 34,
 'medium': 35,
 'small': 36,
 'grated': 37,
 'flour': 38,
 'water': 39,
 'extra': 40,
 'dried': 41,
 'parsley': 42,
 'wine': 43,
 'virgin': 44,
 'plus': 45,
 'chicken': 46,
 'more': 47,
 'vinegar': 48,
 'diced': 49,
 'unsalted': 50,
 'cream': 51,
 'ounce': 52,
 'optional': 53,
 'whole': 54,
 'sauce': 55,
 'tomatoes': 56,
 'kosher': 57,
 'green': 58,
 'eggs': 59,
 'milk': 60,
 'grams': 61,
 'vegetable': 62,
 'thyme': 63,
 'cheese': 64,
 'thinly': 65,
 'coarsely': 66,
 'pieces': 67,
 'gi

In [15]:
token2subwords = make_token2subwords(word2id, ngramm2id, n_list=n_list)
token2subwords

{0: [2912, 2796, 2759, 3081],
 1: [2949, 2632, 2633, 2967, 2839, 2763, 3061, 2745, 2636, 2988],
 2: [2656, 2783, 3000, 2763, 3061, 2745, 2636],
 3: [2687, 2851, 2941, 3090, 2843, 3027],
 4: [3091, 3024, 2724],
 5: [2843, 2808, 3090, 2843, 3006],
 6: [2922, 2749, 2756],
 7: [2785, 3035, 3100, 2677, 2861],
 8: [2949, 2632, 2633, 2967, 2839, 2763, 3061, 2745, 2636],
 9: [3109, 3021, 2839, 2697],
 10: [2949, 3000, 2894, 2656],
 11: [3109, 3021, 2839, 2697, 2692, 2846],
 12: [3061, 3100, 2677, 2861],
 13: [2771, 3020, 2932, 2961],
 14: [3080, 2898, 3028, 3020, 2857],
 15: [2843, 2899, 2809, 2967, 3027],
 16: [2938, 2841, 2830, 2809, 2846],
 17: [2656, 2783, 3000, 2763, 3061, 2745, 2636, 2988],
 18: [2844, 2898, 2886, 2698],
 19: [2683, 2841, 2906, 2814, 3027],
 20: [2842, 2813, 2973, 2656, 3006],
 21: [3100, 2677, 2906, 2814, 2839],
 22: [2633, 2844, 2977, 2903],
 23: [2704, 2731, 3080, 2898],
 24: [2632, 2869, 3100, 2813],
 25: [3061, 3100, 2677, 2861, 2787],
 26: [2963, 2974, 2857, 2814],

In [16]:
class FastTextVanilla:
    def __init__(self, vocabulary, ngramm2id, token2subwords, emb_size, window_size, ns_rate, pad_idx=0):
        self.vocabulary = vocabulary
        self.vocab_size = len(vocabulary)
        
        self.ngramm2id = ngramm2id
        self.ngramm_size = len(ngramm2id)
        
        self.token2subwords = token2subwords
        
        self.emb_size = emb_size
        self.window_size = window_size
        self.ns_rate = ns_rate
        self.pad_idx = pad_idx
        
        self.center_embeddings = np.random.uniform(-1.0/self.emb_size, 1.0/self.emb_size,
                                                   size=(self.vocab_size+self.ngramm_size, self.emb_size))
        self.center_embeddings[pad_idx] = 0
        
        self.context_embeddings = np.random.uniform(-1.0/self.emb_size, 1.0/self.emb_size,
                                                    size=(self.vocab_size, self.emb_size))
        self.context_embeddings[pad_idx] = 0
        
        self.id2word = {i: w for w, i in self.vocabulary.items()}
        
        self.embeddings_normed = None
        
    
    def norm_embeddings(self):
        self.embeddings_normed = self.center_embeddings / (np.linalg.norm(
            self.center_embeddings, ord=2, axis=-1, keepdims=True) + 1e-4)

    def generate_ft_sgns_samples(self, text):
        """
        text - list of integer numbers - ids of tokens in text
        window_size - odd integer - width of window
        vocab_size - positive integer - number of tokens in vocabulary
        ns_rate - positive integer - number of negative tokens to sample per one positive sample
        token2subwords - list of lists of int - i-th sublist contains list of identifiers of n-grams for token #i (list of subword units)

        returns list of training samples (CenterSubwords, CtxWord, Label)
        """
        text = np.array(text)
        size = len(text)
        context_words_idxs = make_diag_mask(size, self.window_size)
        center_words_idx = np.eye(N=size, dtype=int)

        result = []

        for i, word in enumerate(text):
            context = context_words_idxs[i]
            center = center_words_idx[i]

            context_idx = np.where(context != 0)[0]
            center_idx = np.where(center != 0)[0]

            center_word = text[center_idx].tolist()
            context_words = text[context_idx]

            center_word.extend(self.token2subwords[word])

            positive_examples = [tuple(list(j) + [1]) for j in product([center_word], context_words)]
            positive_examples = [
                positive_example for positive_example in positive_examples if self.pad_idx not in positive_example
            ]

            # чтобы не совпадало с самим собой
            all_words = np.arange(self.vocab_size)
            center_word_idx = np.argwhere(all_words == text[center_idx].tolist())[0][0]
            all_words = list(all_words)
            del all_words[center_word_idx], all_words[self.pad_idx]
            all_words = np.array(all_words)
            negative_word = np.random.choice(all_words)

            # выбрать одно из двух, либо убрать совпадение с самим собой, либо нет
            negative_words = np.random.randint(self.vocab_size, size=self.ns_rate*len(positive_examples))    
            negative_examples = [tuple(list(j) + [0]) for j in product([center_word], negative_words)]

            positive_negative_examples = positive_examples + negative_examples
            result.extend(positive_negative_examples)

        return result
    
    def update_ft_weights(self, center_subwords, context_word, label, learning_rate):
        """
        center_embeddings - VocabSize x EmbSize
        context_embeddings - VocabSize x EmbSize
        center_subwords - list of ints - list of identifiers of n-grams contained in center word
        context_word - int - identifier of context word
        label - 1 if context_word is real, 0 if it is negative
        learning_rate - float > 0 - size of gradient step
        """
        # Выберем центральное и контекстное слово
        center_embedding = self.center_embeddings[center_subwords]
        context_embedding = self.context_embeddings[[context_word]]

        # Считаем "оценку" сходства и ее вероятность по формуле
        # 𝑃(𝐶𝑡𝑥𝑊𝑜𝑟𝑑|𝐶𝑒𝑛𝑡𝑒𝑟𝑆𝑢𝑏𝑤𝑜𝑟𝑑𝑠)=𝜎((∑𝑙𝑒𝑛(𝐶𝑒𝑛𝑡𝑒𝑟𝑆𝑢𝑏𝑤𝑜𝑟𝑑𝑠)𝑤∈𝐶𝑒𝑛𝑡𝑒𝑟𝑆𝑢𝑏𝑤𝑜𝑟𝑑𝑠𝑊𝑤,:𝑙𝑒𝑛(𝐶𝑒𝑛𝑡𝑒𝑟𝑆𝑢𝑏𝑤𝑜𝑟𝑑𝑠))⋅𝐷𝐶𝑡𝑥𝑊𝑜𝑟𝑑,:)
        score = np.mean(center_embedding, axis=0, keepdims=True).dot(context_embedding.T)
        prob = sigmoid(score)
        loss = log_loss(label, prob)

        # Считаем производные для правила цепочки: производная лосса * производную сигмоиды
        # * производную 𝑊𝐶𝑒𝑛𝑡𝑒𝑟𝑊𝑜𝑟𝑑,:⋅𝐷𝐶𝑡𝑥𝑊𝑜𝑟𝑑,: по 𝑊𝐶𝑒𝑛𝑡𝑒𝑟𝑊𝑜𝑟𝑑 и 𝐷𝐶𝑡𝑥𝑊𝑜𝑟𝑑
        log_loss_deriv = log_loss_derivative(np.array([label]), np.array(prob))
        sigmoid_deriv = sigmoid_derivative(score)

        # Считаем производные по цетральному слову и контекстному для обновления весов
        center_grad = log_loss_deriv * sigmoid_deriv * context_embedding / len(center_subwords)
        context_grad = log_loss_deriv * sigmoid_deriv * np.mean(center_embedding, axis=0, keepdims=True)

        self.center_embeddings[center_subwords] -= learning_rate * center_grad
        self.context_embeddings[[context_word]] -= learning_rate * context_grad

        return loss

    
    def fit(self, dataset, learning_rate=1e-2, n_epochs=10):
        for n in tqdm(range(n_epochs)):
            try:
                idxs = np.random.permutation(len(dataset))

                for i, idx in enumerate(idxs):
                    data = dataset[idx]
                    samples = self.generate_ft_sgns_samples(text=data)

                    cur_idx = 1
                    cur_loss = 0

                    for sample in samples:
                        center_subwords, context_word, label = sample

                        loss = self.update_ft_weights(
                            center_subwords=center_subwords,
                            context_word=context_word,
                            label=label,
                            learning_rate=learning_rate
                        )

                        cur_loss += loss
                        cur_idx += 1

                print(f"LOSS: {cur_loss / cur_idx}")

            except KeyboardInterrupt:
                print('Досрочно остановлено пользователем')
                return self

            except Exception as ex:
                print('Ошибка при обучении: {}\n{}'.format(ex, traceback.format_exc()))
                break
        return self
    
    def most_similar(self, word, topk=10):
        assert self.embeddings_normed is not None, 'Необходимо нормировать эмбеддинги self.norm_embeddings'
        return self.most_similar_by_vector(self.get_vector(word), topk=topk)

    def analogy(self, a1, b1, a2, topk=10):
        assert self.embeddings_normed is not None, 'Необходимо нормировать эмбеддинги self.norm_embeddings'
        a1_v = self.get_vector(a1)
        b1_v = self.get_vector(b1)
        a2_v = self.get_vector(a2)
        query = b1_v - a1_v + a2_v
        return self.most_similar_by_vector(query, topk=topk)

    def most_similar_by_vector(self, query_vector, topk=10):
        assert self.embeddings_normed is not None, 'Необходимо нормировать эмбеддинги self.norm_embeddings'
        similarities = (self.embeddings_normed[:self.vocab_size] * query_vector).sum(-1)
        best_indices = np.argpartition(-similarities, topk, axis=0)[:topk]
        result = [(self.id2word[i], similarities[i]) for i in best_indices]
        result.sort(key=lambda pair: -pair[1])
        return result

    def get_vector(self, word):
        assert self.embeddings_normed is not None, 'Необходимо нормировать эмбеддинги self.norm_embeddings'
        if word not in self.vocabulary:
            raise ValueError('Неизвестное слово "{}"'.format(word))
        return self.embeddings_normed[self.vocabulary[word]]

    def get_vectors(self, *words):
        assert self.embeddings_normed is not None, 'Необходимо нормировать эмбеддинги self.norm_embeddings'
        word_ids = [self.vocabulary[i] for i in words]
        vectors = np.stack([self.embeddings_normed[i] for i in word_ids], axis=0)
        return vectors

In [17]:
fasttext = FastTextVanilla(
    vocabulary=word2id,
    ngramm2id=ngramm2id,
    token2subwords=token2subwords,
    emb_size=100,
    window_size=5,
    ns_rate=10,
    pad_idx=0
)

In [18]:
fasttext.fit(dataset, n_epochs=3)

 33%|███████████████████████████▎                                                      | 1/3 [27:32<55:05, 1652.93s/it]

LOSS: 0.1542979294302138


 67%|██████████████████████████████████████████████████████▋                           | 2/3 [55:11<27:36, 1656.17s/it]

LOSS: 0.0


100%|████████████████████████████████████████████████████████████████████████████████| 3/3 [1:22:38<00:00, 1652.99s/it]

LOSS: 0.22443739081415162





<__main__.FastTextVanilla at 0x23686559b50>

In [19]:
fasttext.norm_embeddings()
print('chicken\n', fasttext.most_similar('chicken'), '\n')
print('wine\n', fasttext.most_similar('wine'), '\n')
print('salt\n', fasttext.most_similar('salt'), '\n')

chicken
 [('chicken', 0.9999438705060457), ('broth', 0.8522845357240816), ('stock', 0.8264915899448626), ('homemade', 0.8163790970614407), ('fish', 0.789068851834661), ('sodium', 0.7542037407875364), ('quarts', 0.7486070354691838), ('skinless', 0.7427085342574133), ('beef', 0.7375763240696203), ('vegetable', 0.7254080895618105)] 

wine
 [('wine', 0.9999497635735426), ('sherry', 0.8516056872629647), ('champagne', 0.8140638736784306), ('vinegar', 0.7976674277249974), ('ruby', 0.7823168028606087), ('shaoxing', 0.7780367584946614), ('cider', 0.7679885671184234), ('shao', 0.7415824977598999), ('cognac', 0.7305820144955263), ('full', 0.7167917985935811)] 

salt
 [('salt', 0.9999687565909968), ('black', 0.8621077213765054), ('taste', 0.8393914102384562), ('tomatoes', 0.8266827619283045), ('brisket', 0.8257603749587378), ('pepper', 0.8181771661170533), ('bell', 0.8020285316227762), ('cayenne', 0.8006072539935507), ('nblack', 0.7966581208768948), ('dozen', 0.7947899873440986)] 



In [1]:
from IPython.display import HTML

with open('./style.css') as f:
    style = f.read()
HTML(style)