# Install necessary packages
- !pip install gensim
- !pip install wikipedia2vec
- !pip install pyemd # Necessary for one of Gensim's functions

**Update**: these packages are now installed via Poetry, so the `pip install` commands above are not needed. If you set up your environment per the root README and use that environment as your Jupyter Notebook kernel, the packages will be available to you.

# Experimenting with Gensim and Word2Vec

Following complete API documentation here:
https://radimrehurek.com/gensim/models/keyedvectors.html

In [2]:
#Import test texts and Word2Vec pre-loaded model
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

In [3]:
# Train a Word2Vec model on gensim's bundled `common_texts` corpus.
# Parameter explanations:
# https://github.com/kavgan/nlp-in-practice/blob/master/word2vec/Word2Vec.ipynb
# NOTE(review): `size` looks like the pre-4.0 gensim keyword (later releases
# call it `vector_size`) - confirm against the gensim version pinned by Poetry.
model = Word2Vec(
    common_texts,
    size=100,
    window=5,
    min_count=1,
    workers=4,
)
# Keep a handle on the trained vectors only
word_vectors = model.wv

In [4]:
# Disabled example: save the word vectors to disk and load them back.
# # Save word_vectors to disk
# from gensim.test.utils import get_tmpfile
# from gensim.models import KeyedVectors

# fname = get_tmpfile('vectors.kv')
# word_vectors.save(fname)
# word_vectors = KeyedVectors.load(fname, mmap='r')

# # Load word_vectors from disk
# # Possible for Wikipedia2Vec
# from gensim.test.utils import datapath
# # `binary=True` is an argument of load_word2vec_format, not of datapath
# wv_from_bin = KeyedVectors.load_word2vec_format(datapath("euclidean_vectors.bin"), binary=True)

In [5]:
# Fetch the GloVe wiki word embeddings through gensim's downloader API.
# This rebinds `word_vectors`, replacing the vectors assigned earlier.
import gensim.downloader as api

GLOVE_MODEL_NAME = "glove-wiki-gigaword-100"
word_vectors = api.load(GLOVE_MODEL_NAME)



In [6]:
# Analogy-style query: "woman" and "king" are positive inputs, "man" is negative
result = word_vectors.most_similar(
    positive=["woman", "king"],
    negative=["man"],
)
result

[('queen', 0.7698541283607483),
 ('monarch', 0.6843380928039551),
 ('throne', 0.6755735874176025),
 ('daughter', 0.6594556570053101),
 ('princess', 0.6520534753799438),
 ('prince', 0.6517034769058228),
 ('elizabeth', 0.6464517712593079),
 ('mother', 0.6311717629432678),
 ('emperor', 0.6106470823287964),
 ('wife', 0.6098655462265015)]

In [7]:
# Same query, scored with the "multiplicative combination objective",
# which is less susceptible to one large distance dominating the calculation
positive_words = ['woman', 'king']
negative_words = ['man']
result = word_vectors.most_similar_cosmul(positive=positive_words, negative=negative_words)
result

[('queen', 0.8964555859565735),
 ('monarch', 0.8495979309082031),
 ('throne', 0.8447030782699585),
 ('princess', 0.8371668457984924),
 ('elizabeth', 0.835679292678833),
 ('daughter', 0.8348594307899475),
 ('prince', 0.8230059742927551),
 ('mother', 0.8154449462890625),
 ('margaret', 0.8147734999656677),
 ('father', 0.8100855350494385)]

In [8]:
# Picks the word that does not match the rest of the input words.
# Caution: if you input "breakfast lunch dinner", it will say "lunch"
# is least similar, so there is no way for it to answer "all similar".
candidate_words = "breakfast cereal dinner lunch".split()
odd_one_out = word_vectors.doesnt_match(candidate_words)
print(odd_one_out)

cereal


  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


In [9]:
# Similarity score for a single pair of words
word_pair = ('woman', 'man')
similarity = word_vectors.similarity(*word_pair)
similarity

0.8323495

In [10]:
# List the words most similar to a single query word
query_word = "cat"
result = word_vectors.similar_by_word(query_word)
result

[('dog', 0.8798074722290039),
 ('rabbit', 0.7424426674842834),
 ('cats', 0.7323004007339478),
 ('monkey', 0.7288709878921509),
 ('pet', 0.7190139889717102),
 ('dogs', 0.7163872718811035),
 ('mouse', 0.6915250420570374),
 ('puppy', 0.6800068020820618),
 ('rat', 0.6641027331352234),
 ('spider', 0.6501135230064392)]

In [11]:
# "Word Mover's Distance" between two lower-cased, whitespace-tokenised documents.
# Note: despite its name, `similarity` holds a distance here.
sentence_obama, sentence_president = (
    text.lower().split()
    for text in (
        'Obama speaks to the media in Illinois',
        'The president greets the press in Chicago',
    )
)
similarity = word_vectors.wmdistance(sentence_obama, sentence_president)
similarity

3.4892687395218687

In [12]:
# Cosine distance between two words; here a word is compared with itself
word_pair = ('media', 'media')
distance = word_vectors.distance(*word_pair)
distance

5.960464477539063e-08

In [13]:
# Cosine distance between two different words
word_pair = ('media', 'press')
distance = word_vectors.distance(*word_pair)
distance

0.2534049153327942

In [14]:
# Cosine distance for another word pair, for comparison
word_pair = ('media', 'mob')
distance = word_vectors.distance(*word_pair)
distance

0.6921039819717407

In [15]:
# Cosine similarity between two sets of words
first_set = ['sushi', 'shop']
second_set = ['japanese', 'restaurant']
sim = word_vectors.n_similarity(first_set, second_set)
sim

0.7066633

In [16]:
# Each word is represented by a vector of 100 numbers
computer_vector = word_vectors['computer']
print(computer_vector.shape)
print("First 10")
print(computer_vector[:10])

(100,)
First 10
[-0.16298   0.30141   0.57978   0.066548  0.45835  -0.15329   0.43258
 -0.89215   0.57747   0.36375 ]


# Experimenting with Wikipedia2Vec

Following API usage here:
https://wikipedia2vec.github.io/wikipedia2vec/usage/

### Download word embeddings
You must first download the word embeddings binary file (3.3GB)

https://wikipedia2vec.github.io/wikipedia2vec/pretrained/

Use *enwiki_20180420* - 100d(bin)

In [1]:
from wikipedia2vec import Wikipedia2Vec

In [2]:
# Load the unzipped pkl model file from the embeddings directory
WIKI2VEC_MODEL_PATH = "../../embeddings/enwiki_20180420_100d.pkl"
wiki2vec = Wikipedia2Vec.load(WIKI2VEC_MODEL_PATH)

In [3]:
# Look up the vector for a word, then show its length and values
wv = wiki2vec.get_word_vector("royalty")
print(len(wv), wv, sep="\n")

100
[-0.20645621  0.3541138  -0.0294481  -0.33744335 -0.08076815  0.31424952
  0.19943002 -0.45241427  0.1767314   0.08421883  0.18743327 -0.15441522
  0.7555824  -0.3163147  -0.1272997  -0.16718598  0.26394138 -0.12993027
  0.26369524 -0.15656409  0.31780142 -0.21851811 -0.246363    0.09481585
 -0.824227   -0.32929772 -0.08541542 -0.42382434  0.25865236  0.48900202
  0.2663292   0.30553964 -0.24619317  0.4986544  -0.15859959 -0.05226322
  0.15892026  0.21789268  0.1005047  -0.04396052 -0.13266805  0.14376648
 -0.07025098  0.07240359  0.26175997  0.2744427  -0.05514829  0.3889919
 -0.15309823  0.14552292  0.16997342  0.44799    -0.19526581 -0.10786294
 -0.32927316  0.3289106   0.02902091  0.08773877 -0.41468772  0.26678678
 -0.1374677  -0.33667436  0.05344973  0.27092975  0.38892925 -0.43767536
  0.28642192 -0.09463934  0.02864233 -0.04812264 -0.5708012   0.48663247
 -0.07153141  0.5161     -0.29984742  0.12222655  0.06087735  0.09465893
 -0.32330158  0.00459747 -0.29446656 -0.80294424

In [4]:
# Look up the vector for an entity, then show its length and values
wv = wiki2vec.get_entity_vector("Queen Elizabeth II")
vector_length = len(wv)
print(vector_length)
print(wv)

100
[-0.20736943 -0.1413487   0.44806942 -0.15780485  0.03426249 -1.496996
  0.6636046   1.0935112  -0.28068435 -1.7422425   0.28946415 -0.1224269
  1.359793   -2.1443315  -1.2891554   0.5883908  -1.4934471   0.28643036
 -0.5680678   0.06121244  0.14584215 -0.5493008  -0.5274965  -0.21286193
 -2.531544   -1.0664226   0.6852317  -0.6560998   1.0964793   1.7911379
 -0.88538927 -0.6312805  -0.579102    0.2503235   0.581396   -1.0452244
  0.99863523 -1.1343025   1.5228372  -0.34984437 -0.30327943 -0.69956243
 -0.27598628  0.8382855   1.158086   -0.19607222 -0.23079026  0.7163011
 -0.02645348 -0.53512233  0.8909849   0.33066076 -0.09964236 -0.59745216
 -0.70949936  0.24142893  0.28429118  0.13206743 -1.3893279  -0.00942595
 -0.16754963 -0.6323121  -0.41165853  0.28699186  0.63819945 -1.1207265
  0.57338834 -0.12862757 -0.57689637  0.32655638 -0.5370033   0.31795335
 -0.24858339  0.17761594 -0.53856456 -0.717328    0.29654366 -0.18517432
 -0.38363308  0.10567519 -0.41610453 -1.4170125  -0.06

In [5]:
# Retrieve a word object and show both the object and its type
royalty_word = wiki2vec.get_word('royalty')
print(royalty_word)
print(type(royalty_word))

<Word royalty>
<class 'wikipedia2vec.dictionary.Word'>


In [6]:
# Retrieve an entity object and show both the object and its type
museum_entity = wiki2vec.get_entity('Metropolitan Museum')
print(museum_entity)
print(type(museum_entity))

<Entity Metropolitan Museum of Art>
<class 'wikipedia2vec.dictionary.Entity'>


In [42]:
# Retrieve an entity.
# Look it up once so that both prints describe the same object; the second
# print previously queried 'Metropolitan Museum' (copy-paste leftover).
harvard_entity = wiki2vec.get_entity('Harvard University')
print(harvard_entity)
print(type(harvard_entity))

<Entity Harvard University>
<class 'wikipedia2vec.dictionary.Entity'>


In [34]:
%%time

# Get the 100 items most similar to the word "yoda"
# (the recorded output contains both words and entities)
yoda_word = wiki2vec.get_word('yoda')
similar_yoda = wiki2vec.most_similar(yoda_word, 100)
similar_yoda

CPU times: user 3.92 s, sys: 4.1 s, total: 8.02 s
Wall time: 15.4 s


[(<Word yoda>, 0.99999994),
 (<Word kenobi>, 0.7954376),
 (<Word darth>, 0.7514157),
 (<Entity Yoda>, 0.7504513),
 (<Word yularen>, 0.74208754),
 (<Word saesee>, 0.7390899),
 (<Word jedi>, 0.7362622),
 (<Word sidious>, 0.73391825),
 (<Word anakin>, 0.7304383),
 (<Word zarbon>, 0.7281125),
 (<Word ackbar>, 0.7240917),
 (<Word watto>, 0.7238953),
 (<Word unduli>, 0.7219176),
 (<Word numberman>, 0.7216795),
 (<Word yarael>, 0.7216082),
 (<Word mystel>, 0.7211916),
 (<Word kaito>, 0.7205416),
 (<Word vilmarh>, 0.7194575),
 (<Word threepio>, 0.71926695),
 (<Word solusar>, 0.7167008),
 (<Word echuu>, 0.71605474),
 (<Word yoshi>, 0.71546626),
 (<Word jorus>, 0.715434),
 (<Entity X'Ting>, 0.71265334),
 (<Word joruus>, 0.71257347),
 (<Entity Qui-Gon Jinn>, 0.71158594),
 (<Word 3po>, 0.71024007),
 (<Word jabba>, 0.70992076),
 (<Word pandabubba>, 0.70945776),
 (<Word ghaleon>, 0.7088256),
 (<Entity C-3PO>, 0.70879513),
 (<Word yaddle>, 0.70696616),
 (<Word benjirou>, 0.7062665),
 (<Word dagobah>,

In [10]:
%%time

# Get the 10 items most similar to the "Harvard University" entity
harvard_query = wiki2vec.get_entity('Harvard University')
similar_harvard = wiki2vec.most_similar(harvard_query, 10)
similar_harvard

CPU times: user 3.1 s, sys: 1.49 s, total: 4.59 s
Wall time: 4.52 s


[(<Entity Harvard University>, 1.0000001),
 (<Entity John F. Kennedy School of Government>, 0.81384987),
 (<Word harvard>, 0.80841565),
 (<Entity Harvard College>, 0.7746782),
 (<Entity Signet society>, 0.75375855),
 (<Entity List of Harvard College freshman dormitories>, 0.7442793),
 (<Entity Harvard Faculty of Arts and Sciences>, 0.7426085),
 (<Entity Radcliffe Institute for Advanced Study>, 0.7415956),
 (<Entity Harvard Society of Fellows>, 0.7386711),
 (<Entity Cambridge, Massachusetts>, 0.73181486)]

In [28]:
from wikipedia2vec.dictionary import Entity

In [41]:
# Keep only the Entity results from the "yoda" neighbours, stopping after three
yoda_entities = []
for pair in similar_yoda:
    # Each item is an (object, score) pair; skip anything that is not an Entity
    if not isinstance(pair[0], Entity):
        continue
    yoda_entities.append(pair)
    if len(yoda_entities) == 3:
        break
yoda_entities

[(<Entity Yoda>, 0.7504513),
 (<Entity X'Ting>, 0.71265334),
 (<Entity Qui-Gon Jinn>, 0.71158594)]

# Using Gensim & Wikipedia2Vec Together

You can use gensim's `load_word2vec_format` to work with wikipedia2vec directly. However, you have to use the `(txt)` file from wikipedia2vec to do this, not the `(bin)` file. Given the greater number of modules provided by gensim, this is likely the preferred path.

In [25]:
# Import models type
from gensim.models import KeyedVectors

In [28]:
%%time

# Load the wikipedia2vec txt file as gensim KeyedVectors.
# The recorded run of this cell took roughly nine minutes of wall time.
WIKI2VEC_TXT_PATH = "../../embeddings/enwiki_20180420_100d.txt"
w2v = KeyedVectors.load_word2vec_format(WIKI2VEC_TXT_PATH)

CPU times: user 8min 15s, sys: 16.7 s, total: 8min 32s
Wall time: 9min 9s


In recent gensim versions, you can load only a subset of the vectors, starting from the front of the file, by passing the optional `limit` parameter to `load_word2vec_format()`. For example, use `limit=500000` to get the vectors of the 500,000 most frequent words.

In [29]:
# Items most similar to a single word, using the wikipedia2vec vectors loaded into gensim
query_word = "save"
result = w2v.most_similar(query_word)
result

[('saving', 0.7900023460388184),
 ('saved', 0.7763535976409912),
 ('reclaim', 0.7728390097618103),
 ('recover', 0.7695224285125732),
 ('help', 0.7515880465507507),
 ('redeem', 0.7505150437355042),
 ('restore', 0.7452740669250488),
 ('bring', 0.7452179789543152),
 ('destroy', 0.7413591742515564),
 ('keep', 0.740227997303009)]

In [30]:
# Same analogy-style query as before, now against the wikipedia2vec vectors
result = w2v.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
)
result

[('queen', 0.8306491374969482),
 ('monarch', 0.7416261434555054),
 ('ENTITY/Queen_consort', 0.7348717451095581),
 ('laungshe', 0.7347309589385986),
 ('regnant', 0.7243735194206238),
 ('chelna', 0.7236213684082031),
 ('consort', 0.720160722732544),
 ('indlovukati', 0.7181541919708252),
 ('kamamalu', 0.7178552150726318),
 ('indlovukazi', 0.714848518371582)]

In [31]:
# Calculates word similarity score
similarity = w2v.similarity('woman', 'man')
similarity

0.7687104

In [32]:
# Calculate word distance
distance = w2v.distance('media', 'press')
distance

0.5174522697925568