# Натянуть сову на линейное пространство

Создайте эмбеддинги слов и визуализируйте векторные операции над ними: сложение, вычитание, поиск ближайшего и самого дальнего слова и прочее. Сравните качество представлений gensim и BERT с точки зрения операций над словами и подтвердите выводы примерами.

Для создания эмбеддингов с gensim обучите модель на нормализованных текстовых данных. Данные найдите на kaggle или выберите один из предложенных датасетов. Для создания эмбеддингов с BERT используйте предобученные модели.

Предлагаемые датасеты:
 - [sentiment твитов про ковид](https://www.kaggle.com/datatattle/covid-19-nlp-text-classification)
 - [Amazon product reviews](https://www.kaggle.com/kashnitsky/hierarchical-text-classification)
 - [Отзывы интернет-магазина](https://www.kaggle.com/shymammoth/shopee-reviews)
 - [Тексты статей конференции NIPS](https://www.kaggle.com/rowhitswami/nips-papers-1987-2019-updated?select=papers.csv)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random

import torch
import torch.nn as nn
from tqdm.notebook import tqdm, trange

# Seed every random number generator in use so that repeated runs of the
# notebook produce the same results.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
# Seed all CUDA devices, not only the current one (ignored when CUDA is absent).
torch.cuda.manual_seed_all(0)
# cuDNN: pick deterministic kernels and switch off the auto-tuner, whose
# algorithm selection may otherwise differ from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Prefer the GPU when one is available.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(DEVICE)

In [None]:
# Load the dataset from a CSV file; the path is relative to the working directory.
data = pd.read_csv('data/corona.csv')

In [None]:
# Keep only the tweet text column, as a one-column DataFrame.
txt_data = data.OriginalTweet.to_frame()

# Print a summary of the frame, then show its first rows as the cell output.
txt_data.info()
txt_data.head()

In [None]:
from typing import List, Tuple
from collections import Counter
from itertools import chain
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

# Keep NLTK resources in the project-local ./data directory: register it as a
# search path, download the stopword corpus there, and read the English list.
nltk.data.path.append('./data')
nltk.download('stopwords', download_dir = './data')
eng_stopwords = stopwords.words('english')

In [None]:
from gensim.parsing.preprocessing import *

def build_vocab(texts: List[List[str]]) -> Counter:
    """Count how often every token occurs across all tokenized texts.

    Args:
        texts: An iterable of token lists, one list per document.

    Returns:
        A ``Counter`` mapping each token to its total number of occurrences.
    """
    # Feed the flattened token stream straight into Counter instead of first
    # materializing a list that holds every token of the corpus.
    return Counter(chain.from_iterable(texts))

# Imported explicitly: this cell previously had `re` in scope only through
# the `from gensim.parsing.preprocessing import *` star import.
import re

# A URL-like token: "http" followed by any run of non-whitespace characters.
_URL_RE = re.compile(r'http\S+')

# Set membership keeps the per-token stopword test constant-time.
_STOPWORD_SET = frozenset(eng_stopwords)


def _lowercase(text: str) -> str:
    """Return ``text`` converted to lower case."""
    return text.lower()


def _strip_urls(text: str) -> str:
    """Return ``text`` with every URL-like token removed."""
    return _URL_RE.sub('', text)


def _drop_stopwords(text: str) -> str:
    """Return ``text`` without the English stopwords loaded from NLTK."""
    return remove_stopwords(text, stopwords = _STOPWORD_SET)


# Filters are applied to the raw string in this order; each one takes a
# string and returns a string.
custom_filters = [_lowercase,
                  _strip_urls,
                  strip_tags,
                  strip_non_alphanum,
                  strip_punctuation,
                  strip_multiple_whitespaces,
                  strip_numeric,
                  _drop_stopwords,
                  stem_text]

def preprocess(text: str) -> List[str]:
    """Normalize ``text`` with ``custom_filters`` and return its tokens.

    The string is passed through every filter in ``custom_filters`` in order
    by gensim's ``preprocess_string``, which (per its documentation) returns
    the result as a list of string tokens.
    """
    return preprocess_string(text, filters = custom_filters)

# Tokenize every tweet, then count token frequencies over the whole corpus.
texts = txt_data.OriginalTweet.apply(preprocess)
vocab = build_vocab(texts)

In [None]:
# Vocabulary size and the ten most frequent tokens.
print(len(vocab))
print(vocab.most_common(10))

# Number of tokenized texts, followed by a preview of the first ones.
print(len(texts))
texts.head()

In [None]:
from gensim.models import Word2Vec

# Training code, kept for reference and disabled: the trained model is loaded
# from disk below instead of being retrained on every run.
# gs = Word2Vec(sentences = texts,
#               vector_size = 256,
#               seed = 0,
#               min_count = 1,
#               workers = 8,
#               sg = False,
#               epochs = 10
#              )

# Load the previously saved word2vec model.
gs = Word2Vec.load("data/models/gs_mod.model")

# Shape of the word-vector matrix.
print(gs.wv.vectors.shape)

In [None]:
import umap

# Project the first 500 word2vec vectors with UMAP for plotting.
# `n_jobs` is deliberately not passed: with a fixed `random_state` UMAP runs
# single-threaded anyway (recent umap-learn releases override an explicit
# `n_jobs` to 1 and emit a warning), so the argument had no effect.
points = umap.UMAP(random_state = 0).fit_transform(gs.wv.vectors[:500])

In [None]:
# Scatter the projected points and label each one with its word.
fig, ax = plt.subplots(figsize=(20, 15))
xs, ys = points[:, 0], points[:, 1]
ax.scatter(xs, ys)

# Row i of `points` belongs to the i-th key of the word2vec vocabulary.
for word, x, y in zip(gs.wv.index_to_key[:500], xs, ys):
    ax.annotate(word, (x, y))

In [None]:
import torch
from transformers import BertTokenizerFast, BertModel

# Load the pretrained uncased BERT-base encoder and its matching tokenizer.
bert = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # also return the hidden states of all layers
                                  )
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Inference only: switch the model to evaluation mode.
bert.eval()

In [None]:
@torch.no_grad()
def make_sentence_embedding_bert(text: str) -> torch.Tensor:
    """Embed ``text`` with BERT by summing its last-layer token vectors.

    Args:
        text: A word or sentence to embed.

    Returns:
        A 1-D tensor: the sum, over every token the tokenizer produced for
        ``text``, of the model's last hidden layer output.
    """
    encoded = tokenizer(text, return_tensors = 'pt')

    # Pass the tokenizer outputs by keyword. Unpacking `.values()` positionally
    # depended on the key order of the encoding and bound the attention mask
    # to a variable named `segments_tensors`.
    outputs = bert(**encoded)
    last_layer_embs = outputs.last_hidden_state

    # Drop the batch axis (a single text is encoded), then sum over tokens.
    sentence_embedding = last_layer_embs.squeeze(0).sum(dim=0)

    return sentence_embedding

In [None]:
# Convert every entry of `texts` to a string and start with an empty list of
# BERT embeddings.
# NOTE(review): `texts` holds token lists at this point, so astype(str)
# presumably yields their textual representation — confirm this is intended.
texts = texts.astype(str)
embs_bert = []

In [None]:
# Disabled one-off generation step: embeds every vocabulary entry with BERT,
# in vocabulary iteration order, and collects the results in `embs_bert`.
# for text in tqdm(vocab):
#     emb = make_sentence_embedding_bert(text)
#     embs_bert.append(emb)

In [None]:
# Disabled: code that stacked the BERT embeddings into one array and saved
# it, together with the word2vec model, to disk.
# to_save = np.stack([ten.detach().numpy() for ten in embs_bert])
# np.save("data/models/bert2", to_save)
# gs.save("data/models/gs_mod.model")
# Load the saved BERT embeddings instead of recomputing them.
embs_bert = np.load("data/models/bert2.npy")
# gs = Word2Vec.load("data/models/gs_mod.model")


In [None]:
# Disabled: code that saved the vocabulary keys to disk.
# np.save("data/models/vocab2", list(vocab.keys()))
# fixed_vocab = list(vocab.keys())
# Load the saved word list.
# NOTE(review): presumably entry i corresponds to row i of `embs_bert` — confirm.
fixed_vocab = np.load("data/models/vocab2.npy")

In [None]:
# Compare the shapes of the word2vec and BERT embedding matrices.
print(gs.wv.vectors.shape)
print(embs_bert.shape)

In [None]:
# Wrap both embedding matrices as torch tensors.
ten_bert = torch.from_numpy(embs_bert)
ten_gs = torch.from_numpy(gs.wv.vectors)

In [None]:
# Sizes of the two embedding tensors.
print(ten_bert.size())
print(ten_gs.size())

In [None]:
def bert_sum(query: str) -> torch.Tensor:
    """Embed each whitespace-separated word of ``query`` and sum the vectors.

    Unlike embedding the whole query at once, every word is passed through
    BERT on its own, so the result does not depend on word order.

    Args:
        query: One or more words separated by whitespace.

    Returns:
        The element-wise sum of the per-word BERT embeddings.

    Raises:
        ValueError: If ``query`` contains no words. Summing an empty sequence
            would otherwise yield the integer ``0`` instead of a tensor.
    """
    words = query.split()
    if not words:
        raise ValueError("query must contain at least one word")
    return sum(make_sentence_embedding_bert(word) for word in words)

def best_gs(query: str) -> List[Tuple[str, float]]:
    """Return the word2vec neighbours most similar to ``query``.

    The query is preprocessed like the training corpus and its tokens are
    passed to ``most_similar`` as positive examples.

    Args:
        query: Raw query text.

    Returns:
        ``(word, similarity)`` pairs from ``most_similar``, or ten
        ``("None", 0)`` placeholders when the query cannot be looked up.
    """
    fallback = [("None", 0)] * 10
    tokens = preprocess(query)
    if not tokens:
        # Nothing survived preprocessing (e.g. the query was only stopwords).
        # gensim is expected to reject an empty key list with an error other
        # than KeyError, so use the same placeholder rows as for unknown words.
        return fallback
    try:
        return gs.wv.most_similar(positive = tokens)
    except KeyError:
        # At least one token is missing from the word2vec vocabulary.
        return fallback

def best_bert(query: str) -> List[Tuple[str, float]]:
    """Return the ten vocabulary words closest to ``query`` in BERT space.

    The whole query is embedded in one pass and compared, by cosine
    similarity, against every stored vocabulary embedding.
    """
    query_vec = make_sentence_embedding_bert(query)
    scores = torch.nn.functional.cosine_similarity(query_vec, ten_bert, dim = -1)
    # Pair each word with its score and order from most to least similar.
    ranked = sorted(
        zip(fixed_vocab, scores.detach().numpy()),
        key = lambda pair: pair[1],
        reverse = True,
    )
    return ranked[:10]

def best_bert_sum(query: str) -> pd.DataFrame:
    """Return the ten words closest to the summed per-word BERT embedding.

    The query vector comes from ``bert_sum``; the result is a frame with
    the columns ``bert_word`` and ``bert_sim``.
    """
    query_vec = bert_sum(query)
    scores = torch.nn.functional.cosine_similarity(query_vec, ten_bert, dim = -1)
    # Pair each word with its score and order from most to least similar.
    ranked = sorted(
        zip(fixed_vocab, scores.detach().numpy()),
        key = lambda pair: pair[1],
        reverse = True,
    )
    top_rows = ranked[:10]
    return pd.DataFrame(top_rows, columns = ['bert_word', 'bert_sim'])

def bests(query: str) -> pd.DataFrame:
    """Show the nearest neighbours of ``query`` for both models side by side.

    The word2vec results fill the ``gs_*`` columns and the BERT results the
    ``bert_*`` columns of the returned frame.
    """
    frames = [
        pd.DataFrame(best_gs(query), columns = ['gs_word', 'gs_sim']),
        pd.DataFrame(best_bert(query), columns = ['bert_word', 'bert_sim']),
    ]
    return pd.concat(frames, axis = 1)

def worst_gs(query: str) -> List[Tuple[str, float]]:
    """Return the word2vec results for ``query`` used as a negative example.

    The query is preprocessed like the training corpus and its tokens are
    passed to ``most_similar`` as negative examples.

    Args:
        query: Raw query text.

    Returns:
        ``(word, similarity)`` pairs from ``most_similar``, or ten
        ``("None", 0)`` placeholders when the query cannot be looked up.
    """
    fallback = [("None", 0)] * 10
    tokens = preprocess(query)
    if not tokens:
        # Nothing survived preprocessing (e.g. the query was only stopwords).
        # gensim is expected to reject an empty key list with an error other
        # than KeyError, so use the same placeholder rows as for unknown words.
        return fallback
    try:
        return gs.wv.most_similar(negative = tokens)
    except KeyError:
        # At least one token is missing from the word2vec vocabulary.
        return fallback

def worst_bert(query: str) -> List[Tuple[str, float]]:
    """Return the ten vocabulary words farthest from ``query`` in BERT space.

    The whole query is embedded in one pass and compared, by cosine
    similarity, against every stored vocabulary embedding.
    """
    query_vec = make_sentence_embedding_bert(query)
    scores = torch.nn.functional.cosine_similarity(query_vec, ten_bert, dim = -1)
    # Pair each word with its score and order from least to most similar.
    ranked = sorted(
        zip(fixed_vocab, scores.detach().numpy()),
        key = lambda pair: pair[1],
    )
    return ranked[:10]

def worsts(query: str) -> pd.DataFrame:
    """Show the least similar words to ``query`` for both models side by side.

    The word2vec results fill the ``gs_*`` columns and the BERT results the
    ``bert_*`` columns of the returned frame.
    """
    frames = [
        pd.DataFrame(worst_gs(query), columns = ['gs_word', 'gs_sim']),
        pd.DataFrame(worst_bert(query), columns = ['bert_word', 'bert_sim']),
    ]
    return pd.concat(frames, axis = 1)

In [None]:
from IPython.display import display

# Nearest neighbours in both models for a few single-word queries.
for word in ('covid', 'death', 'cure'):
    print(f'{word}: ')
    display(bests(word))

In [None]:
# Nearest neighbours in both models for one more single-word query.
print("schizophrenia: ")
display(bests('schizophrenia'))

In [None]:
# The same two words in both orders, each embedded as a whole query.
q1 = "covid vaccine"
q2 = "vaccine covid"
for query in (q1, q2):
    display(bests(query))

In [None]:
# Same two queries, but with the per-word BERT embeddings summed.
for query in (q1, q2):
    display(best_bert_sum(query))

In [None]:
# word2vec vectors of the two words being compared.
cgs = gs.wv.get_vector('covid')
dgs = gs.wv.get_vector('death')

# Nearest words to each difference vector: covid - death, then death - covid.
for offset in (cgs - dgs, dgs - cgs):
    neighbours = gs.wv.similar_by_vector(offset)
    display(pd.DataFrame(neighbours, columns = ['gs_word', 'gs_sim']))


In [None]:
# BERT embeddings of the two words being compared.
cbert = make_sentence_embedding_bert('covid')
dbert = make_sentence_embedding_bert('death')


def _nearest_bert(vec: torch.Tensor, topn: int = 10) -> List[Tuple[str, float]]:
    """Return the ``topn`` vocabulary words most cosine-similar to ``vec``."""
    similarities = torch.nn.functional.cosine_similarity(vec, ten_bert, dim = -1)
    ranked = sorted(zip(fixed_vocab, similarities.detach().numpy()),
                    key = lambda tup: tup[1],
                    reverse = True)
    return ranked[:topn]


# Rank in descending order so these are the NEAREST words to each difference
# vector, as in the word2vec cell that uses `similar_by_vector`. The ascending
# sort used before listed the farthest words instead, which mirrored the two
# tables and produced negative similarities.
v1 = _nearest_bert(cbert - dbert)
v2 = _nearest_bert(dbert - cbert)

display(pd.DataFrame(v1, columns = ['bert_word', 'bert_sim']))
display(pd.DataFrame(v2, columns = ['bert_word', 'bert_sim']))

# Projector

В прошлом задании вы выяснили, какая модель лучше всего представляет связи между словами. Спроецируйте ~2–3 тысячи наиболее популярных слов из выбранного корпуса в TensorFlow Projector.

In [None]:
from torch.utils.tensorboard import SummaryWriter

# TensorBoard writer with default settings; it receives the embeddings below.
writer = SummaryWriter()

In [None]:
# Number of words to send to the projector.
N_PROJECTED = 2000

# Select the most frequent words rather than the first entries of the stored
# vocabulary: `fixed_vocab` was saved in `vocab` key order, which follows
# first occurrence in the corpus, not frequency.
# NOTE(review): assumes `vocab` is the Counter the saved word list was built
# from; words absent from it count as 0 and sort last.
vocab_words = np.asarray(fixed_vocab)
counts = np.array([vocab[str(word)] for word in vocab_words])
# A stable sort keeps the original relative order among equally frequent words.
top_idx = np.argsort(-counts, kind = 'stable')[:N_PROJECTED]

embs = np.asarray(embs_bert)[top_idx]
writer.add_embedding(embs,
                     metadata = [str(word) for word in vocab_words[top_idx]])
writer.close()