# Representing Words as vectors

# 01. Word2Vec

### Words become numbers

In [1]:
import gensim.downloader as api

word2vec = api.load("glove-wiki-gigaword-100")

In [2]:
vector = word2vec["king"]

print("Vector shape:", vector.shape)
print(vector)  

Vector shape: (100,)
[-0.32307  -0.87616   0.21977   0.25268   0.22976   0.7388   -0.37954
 -0.35307  -0.84369  -1.1113   -0.30266   0.33178  -0.25113   0.30448
 -0.077491 -0.89815   0.092496 -1.1407   -0.58324   0.66869  -0.23122
 -0.95855   0.28262  -0.078848  0.75315   0.26584   0.3422   -0.33949
  0.95608   0.065641  0.45747   0.39835   0.57965   0.39267  -0.21851
  0.58795  -0.55999   0.63368  -0.043983 -0.68731  -0.37841   0.38026
  0.61641  -0.88269  -0.12346  -0.37928  -0.38318   0.23868   0.6685
 -0.43321  -0.11065   0.081723  1.1569    0.78958  -0.21223  -2.3211
 -0.67806   0.44561   0.65707   0.1045    0.46217   0.19912   0.25802
  0.057194  0.53443  -0.43133  -0.34311   0.59789  -0.58417   0.068995
  0.23944  -0.85181   0.30379  -0.34177  -0.25746  -0.031101 -0.16285
  0.45169  -0.91627   0.64521   0.73281  -0.22752   0.30226   0.044801
 -0.83741   0.55006  -0.52506  -1.7357    0.4751   -0.70487   0.056939
 -0.7132    0.089623  0.41394  -1.3363   -0.61915  -0.33089  -0.5288

### Let's check simantics

In [3]:
word2vec.similarity("king", "queen")

np.float32(0.7507691)

In [4]:
word2vec.similarity("king", "car")

np.float32(0.28304237)

In [5]:
word2vec.most_similar("king", topn=5)

[('prince', 0.7682328820228577),
 ('queen', 0.7507690787315369),
 ('son', 0.7020888328552246),
 ('brother', 0.6985775232315063),
 ('monarch', 0.6977890729904175)]

In [8]:
vec = word2vec["king"] - word2vec["boy"] + word2vec["girl"]
word2vec.similar_by_vector(vec, topn=5)

[('king', 0.9287101626396179),
 ('queen', 0.8306989669799805),
 ('prince', 0.7634375095367432),
 ('princess', 0.7259793281555176),
 ('crown', 0.6941155195236206)]

## 02. Sentence embeddings

In [12]:
from sentence_transformers import SentenceTransformer
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Available device: {device}")

embedding_model = SentenceTransformer(
    "all-MiniLM-L6-v2",
    device=device
)

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

Available device: cpu


In [10]:
embedding_model.encode("I Like computer science")

array([-5.98189272e-02, -4.16225903e-02, -2.75666844e-02,  7.24908104e-03,
       -2.23820228e-02, -1.28726169e-01,  6.82869405e-02, -5.10968966e-03,
        4.95481007e-02,  9.82555002e-02, -6.52575418e-02, -6.24126531e-02,
       -1.26319304e-02,  1.01306196e-02, -1.58078279e-02, -6.97240140e-03,
       -8.73232260e-02, -6.21334575e-02,  2.80504245e-02, -7.75084794e-02,
       -7.22782165e-02,  7.36584421e-03, -9.07739624e-03, -3.84757407e-02,
        1.60988402e-02,  9.01823267e-02,  3.06600947e-02, -2.62009315e-02,
       -5.85225597e-02, -6.81563765e-02, -1.11934446e-01,  8.49199370e-02,
        8.81168805e-03,  3.16391774e-02, -3.20692919e-02,  1.49402330e-02,
        6.68244883e-02, -2.69344784e-02,  4.37476560e-02,  2.81169526e-02,
       -4.52860072e-02,  5.46173751e-02,  9.76005718e-02, -1.42523311e-02,
       -3.02179903e-03, -2.56810095e-02, -2.35912632e-02, -6.86878413e-02,
        5.67548126e-02,  2.49384325e-02,  7.98478577e-05, -3.87980305e-02,
       -7.40376860e-02,  