In [1]:
# Uncomment to install the notebook's dependencies into the kernel's environment.
# %pip install gensim
# %pip install python-Levenshtein

# Embeddings Classic Examples
This short notebook shows the classic examples used to demonstrate the math of embeddings:

- vector[Queen] =  vector[King]  - vector[Man] + vector[Woman]

- vector[Madrid] = vector[Spain] - vector[Italy] + vector[Rome]

We first import the gensim library and load the word2vec embeddings. Then we compute:

- vector[King]  - vector[Man] + vector[Woman]
- vector[Spain] - vector[Italy] + vector[Rome]

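and look for the words with the most similar vectors. Note that when we query with the raw computed vector, the most similar word is one of the input words (king in the first example, Rome in the second), while the expected result (queen and Madrid) appears as the second word. When the words are instead passed by name through `positive`/`negative`, the input words do not appear in the results shown below and the expected word comes first.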
 
Similarity is computed using Cosine Similarity.

In [2]:
import gensim
from gensim.models import Word2Vec
from gensim.models import fasttext
import gensim.downloader
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's
# downloader API; every word2vec cell below queries this object.
word2vec300 = gensim.downloader.load(
    "word2vec-google-news-300"
)

In [4]:
# Analogy query for king - man + woman, passing the words by name.
word2vec300.most_similar(
    positive=["king", "woman"],
    negative=["man"],
    topn=10,
)

[('queen', 0.7118193507194519),
 ('monarch', 0.6189674735069275),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321243286133),
 ('kings', 0.5236844420433044),
 ('Queen_Consort', 0.5235945582389832),
 ('queens', 0.518113374710083),
 ('sultan', 0.5098593831062317),
 ('monarchy', 0.5087411999702454)]

In [5]:
# Build the king - man + woman vector by hand, then query with the raw vector
# instead of word names.
e = (
    np.array(word2vec300["king"])
    - np.array(word2vec300["man"])
    + np.array(word2vec300["woman"])
)
word2vec300.most_similar(
    positive=[e],
    topn=10,
)

[('king', 0.8449392318725586),
 ('queen', 0.7300518155097961),
 ('monarch', 0.645466148853302),
 ('princess', 0.6156251430511475),
 ('crown_prince', 0.5818676948547363),
 ('prince', 0.5777117609977722),
 ('kings', 0.5613664388656616),
 ('sultan', 0.5376776456832886),
 ('Queen_Consort', 0.5344247221946716),
 ('queens', 0.5289887189865112)]

In [6]:
# Analogy query for Spain - Italy + Rome, passing the words by name.
word2vec300.most_similar(
    positive=["Spain", "Rome"],
    negative=["Italy"],
    topn=10,
)

[('Madrid', 0.6931530237197876),
 ('Seville', 0.5892879366874695),
 ('Barcelona', 0.5854125618934631),
 ('Athens', 0.51669842004776),
 ('Seville_Spain', 0.5157975554466248),
 ('Giles_Tremlett', 0.5132015347480774),
 ('Santiago_de_Compostela', 0.5131174325942993),
 ('Valencia', 0.5016350150108337),
 ('Spaniards', 0.4990643262863159),
 ('Paris', 0.48526036739349365)]

In [7]:
# Build the Spain - Italy + Rome vector by hand, then query with the raw vector
# instead of word names.
e = (
    np.array(word2vec300["Spain"])
    - np.array(word2vec300["Italy"])
    + np.array(word2vec300["Rome"])
)
word2vec300.most_similar(
    positive=[e],
    topn=10,
)

[('Rome', 0.7016042470932007),
 ('Madrid', 0.6859280467033386),
 ('Spain', 0.599949836730957),
 ('Seville', 0.5847654938697815),
 ('Barcelona', 0.576607882976532),
 ('Athens', 0.5255098938941956),
 ('Seville_Spain', 0.5133261680603027),
 ('Santiago_de_Compostela', 0.5105167627334595),
 ('Giles_Tremlett', 0.5102741718292236),
 ('Paris', 0.4927102327346802)]

In [8]:
# Analogy query for doctor - man + woman, passing the words by name.
word2vec300.most_similar(
    positive=["doctor", "woman"],
    negative=["man"],
    topn=10,
)

[('gynecologist', 0.7093892097473145),
 ('nurse', 0.647728681564331),
 ('doctors', 0.6471461653709412),
 ('physician', 0.6438996195793152),
 ('pediatrician', 0.6249487996101379),
 ('nurse_practitioner', 0.6218312978744507),
 ('obstetrician', 0.6072015166282654),
 ('ob_gyn', 0.5986713171005249),
 ('midwife', 0.5927063226699829),
 ('dermatologist', 0.5739566683769226)]

In [9]:
# Build the doctor - man + woman vector by hand, then query with the raw vector
# instead of word names.
e = (
    np.array(word2vec300["doctor"])
    - np.array(word2vec300["man"])
    + np.array(word2vec300["woman"])
)
word2vec300.most_similar(
    positive=[e],
    topn=10,
)

[('doctor', 0.883492112159729),
 ('gynecologist', 0.7276508212089539),
 ('nurse', 0.6698511838912964),
 ('physician', 0.6674119830131531),
 ('doctors', 0.664949357509613),
 ('pediatrician', 0.6398378014564514),
 ('nurse_practitioner', 0.6237460374832153),
 ('obstetrician', 0.6188927292823792),
 ('midwife', 0.6041982769966125),
 ('dentist', 0.5999662280082703)]

# Same example using fasttext
To run this example, you must first download the English word embeddings into the `fasttext` directory from

https://fasttext.cc/docs/en/crawl-vectors.html

or using the command below.

In [10]:
# Uncomment to download the archive into ./fasttext, the directory the
# load cell below reads from.
# !wget -P ./fasttext https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz

In [11]:
# Load the fastText model from the local archive; the cells below reach its
# word vectors through `ft.wv`.
ft = fasttext.load_facebook_model(
    "./fasttext/cc.en.300.bin.gz"
)

In [12]:
# Same king - man + woman query as above, this time against the fastText
# vectors and passing the words by name.
ft.wv.most_similar(
    positive=["king", "woman"],
    negative=["man"],
    topn=10,
)

[('queen', 0.7554813623428345),
 ('queen-mother', 0.6141631603240967),
 ('princess', 0.5755329728126526),
 ('monarch', 0.5741075277328491),
 ('kings', 0.5688967704772949),
 ('queenship', 0.5649926066398621),
 ('Queen', 0.5638619661331177),
 ('empress', 0.5544731020927429),
 ('consort', 0.5524798035621643),
 ('queen.The', 0.5497488379478455)]

In [13]:
# Build the king - man + woman vector by hand from the fastText vectors, then
# query with the raw vector instead of word names.
e = (
    np.array(ft.wv["king"])
    - np.array(ft.wv["man"])
    + np.array(ft.wv["woman"])
)
ft.wv.most_similar(
    positive=[e],
    topn=10,
)

[('king', 0.7286674380302429),
 ('queen', 0.6542678475379944),
 ('kings', 0.5410280823707581),
 ('queen-mother', 0.5250692367553711),
 ('Queen', 0.5074419975280762),
 ('royal', 0.500452995300293),
 ('king-', 0.4945007264614105),
 ('queens', 0.49149513244628906),
 ('monarch', 0.4913707971572876),
 ('queenship', 0.48369985818862915)]