# Demo of How Embeddings Are Used

In [None]:
# Enable autoreload in mode 2 so edited modules are re-imported before each cell runs.
%load_ext autoreload
%autoreload 2
# Load the lab_black extension (presumably formats code cells with Black — confirm it is installed).
%load_ext lab_black

In [None]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')
import os

Mounted at /content/drive


In [None]:
# List the starting directory, switch into the project folder on the mounted drive,
# then list its contents and print the working directory. Later cells load files
# through paths relative to this directory.
%ls
%cd drive/MyDrive/nlp-project/word2vec-pytorch/
%ls
%pwd

[0m[01;34mdrive[0m/  [01;34msample_data[0m/
/content/drive/MyDrive/nlp-project/word2vec-pytorch
config.yaml  [0m[01;34mdata[0m/  [01;34mdocs[0m/  [01;34mnotebooks[0m/  README.md  requirements.txt  train.py  [01;34mutils[0m/  [01;34mweights[0m/


'/content/drive/MyDrive/nlp-project/word2vec-pytorch'

In [None]:
import numpy as np
import pandas as pd
import torch
import sys

from sklearn.manifold import TSNE
import plotly.graph_objects as go

# sys.path.append("../")

## Loading Model and Vocabulary

In [None]:
# Directory (relative to the current working directory) holding the trained artefacts.
folder = "weights/skipgram_WikiText2"

# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# The model is mapped onto `device`; the vocab is loaded with torch.load defaults.
# NOTE(review): torch.load unpickles these files — only load checkpoints you trust.
model_path = f"{folder}/model.pt"
vocab_path = f"{folder}/vocab.pt"
model = torch.load(model_path, map_location=device)
vocab = torch.load(vocab_path)

## Getting Embeddings

In [None]:
# The model's first parameter tensor is used as the embedding matrix;
# move it to the CPU and convert it to a NumPy array.
first_param = list(model.parameters())[0]
embeddings = first_param.cpu().detach().numpy()
print(embeddings.shape)

# Divide every row by its Euclidean length. keepdims=True keeps the norms as a
# column so the division broadcasts across each row.
norms = np.sum(embeddings * embeddings, axis=1, keepdims=True) ** (1 / 2)
embeddings_norm = embeddings / norms
print(embeddings_norm.shape)

(4099, 300)
(4099, 300)


## Inspecting the Vocabulary

In [None]:
# Quick inspection of the vocabulary object.
PREVIEW_SIZE = 20  # number of entries shown instead of dumping the whole vocabulary
MIN_TOKEN_ID = 4090  # the loop below only lists tokens whose id is at least this

itos = vocab.get_itos()
stoi_dict = vocab.get_stoi()

print("vocab的长度:", len(vocab))
# Only a preview is printed: the full token list / mapping floods the cell output.
print("vocab的token:", itos[:PREVIEW_SIZE])
print("(token, index):", dict(list(stoi_dict.items())[:PREVIEW_SIZE]))
print(vocab.lookup_token(1))

# `token_id` (not `id`) so the builtin id() is not shadowed for the rest of the kernel session.
for word, token_id in stoi_dict.items():
    if token_id >= MIN_TOKEN_ID:
        print(word, token_id)

vocab的长度: 4099
the
wagner 4096
underneath 4094
toured 4093
sheffield 4090
violent 4095
weapon 4098
signs 4091
walking 4097
swiss 4092


# Visualization with t-SNE

In [None]:
# Wrap the raw (un-normalised) embedding matrix in a DataFrame.
embeddings_df = pd.DataFrame(embeddings)

# Project to 2-D with t-SNE. The algorithm is stochastic, so random_state is
# pinned to make the layout reproducible across Restart & Run All.
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d = tsne.fit_transform(embeddings_df)

# Label each projected row with its token; get_itos() supplies the labels in
# index order, so row i is labelled with token i.
embeddings_df_trans = pd.DataFrame(embeddings_2d, index=vocab.get_itos())

# Boolean mask: True where the token string is numeric (str.isnumeric).
is_numeric = embeddings_df_trans.index.str.isnumeric()

<bound method Vocab.get_itos of Vocab()>


In [None]:
import plotly.io as pio

# Numeric tokens are drawn in green, every other token in black.
color = np.where(is_numeric, "green", "black")

# Text-only scatter: each token is written at its 2-D t-SNE coordinates.
token_scatter = go.Scatter(
    x=embeddings_df_trans[0],
    y=embeddings_df_trans[1],
    mode="text",
    text=embeddings_df_trans.index,
    textposition="middle center",
    textfont=dict(color=color),
)

fig = go.Figure()
fig.add_trace(token_scatter)

pio.show(fig)

# Find Similar Words

In [None]:
def get_top_similar(word: str, topN: int = 10):
    """Return the words most similar to `word`.

    Similarity is the dot product between the query's row of the global
    `embeddings_norm` matrix and every other row, i.e. cosine similarity when
    those rows are unit-length.

    Args:
        word: Query token, looked up in the global `vocab`.
        topN: Maximum number of neighbours to return.

    Returns:
        A dict mapping neighbour token -> similarity score, ordered from most
        to least similar. The query word itself is never included. An empty
        dict is returned (after printing a notice) when `word` maps to index 0,
        which this function treats as "out of vocabulary".
    """
    word_id = vocab[word]
    if word_id == 0:
        print("Out of vocabulary word")
        # Return an empty dict rather than None so callers that iterate
        # `.items()` do not crash on unknown words.
        return {}

    # Row `word_id` of embeddings_norm is the vector of the query word.
    word_vec = embeddings_norm[word_id]
    # Matrix-vector product: one similarity score per vocabulary entry.
    dists = np.matmul(embeddings_norm, word_vec)

    # argsort is ascending, so sort the negated scores to get descending order.
    ranked_ids = np.argsort(-dists)
    # Remove the query by id instead of assuming it sorts to rank 0; with tied
    # scores (e.g. duplicate vectors) another word can take the first slot.
    ranked_ids = ranked_ids[ranked_ids != word_id][:topN]

    return {vocab.lookup_token(int(idx)): dists[idx] for idx in ranked_ids}

In [None]:
# Print each neighbour of "man" with its similarity rounded to three decimals.
similar_to_man = get_top_similar("man")
for neighbour, score in similar_to_man.items():
    print(f"{neighbour}: {score:.3f}")

240
[ 0.13296239 -0.02198889  0.07728398 ... -0.01044335  0.38554946
  0.12280472]
[-0.13296239  0.02198889 -0.07728398 ...  0.01044335 -0.38554946
 -0.12280472]
mega: 0.555
woman: 0.553
bass: 0.520
person: 0.403
walking: 0.386
breaking: 0.353
shot: 0.349
young: 0.347
&: 0.346
someone: 0.346


# Vector Equations

In [None]:
# Analogy query built from raw (un-normalised) vectors: king - man + woman.
emb1 = embeddings[vocab["king"]]
emb2 = embeddings[vocab["man"]]
emb3 = embeddings[vocab["woman"]]

# Combine the vectors, scale the result to unit length, and shape it as a column.
emb4 = emb1 - emb2 + emb3
emb4_norm = np.sum(emb4 * emb4) ** (1 / 2)
emb4 = (emb4 / emb4_norm).reshape(-1, 1)

# One similarity score per vocabulary entry against the normalised embeddings.
dists = (embeddings_norm @ emb4).flatten()

# Five highest-scoring ids. The query words are not filtered out, so they can
# appear in the result.
top5 = np.argsort(-dists)[:5]
for word_id in top5:
    print(f"{vocab.lookup_token(word_id)}: {dists[word_id]:.3f}")

king: 0.690
reign: 0.469
son: 0.453
woman: 0.436
daughter: 0.435
