In this we wll understand basics of *word-embeddings* using **word2Vec** and **GloVe** models using **gensim** module.

In [2]:
#!pip install gensim

In [3]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors # Gensim contains word2vec models and processing tools

In [12]:
# this is 6 billion wikipedia dataset with 50 dimensions
glove_file = datapath(r'glove.6B.50d.txt') # This is a GloVe model
tmp_file = get_tmpfile(r'word2vec.glove.6B.50d.txt')

In [13]:
glove_file

'C:\\Users\\syeda\\anaconda3\\lib\\site-packages\\gensim\\test\\test_data\\glove.6B.50d.txt'

In [14]:
tmp_file

'C:\\Users\\syeda\\AppData\\Local\\Temp\\tmph3wji6g9\\word2vec.glove.6B.50d.txt'

In [15]:
from gensim.scripts.glove2word2vec import glove2word2vec # convert GloVe to word2Vec

glove2word2vec(glove_file, tmp_file)  # Converting the GloVe file into a Word2Vec file
model = KeyedVectors.load_word2vec_format(tmp_file) # load word2vec model

  glove2word2vec(glove_file, tmp_file)  # Converting the GloVe file into a Word2Vec file


In [16]:
model

<gensim.models.keyedvectors.KeyedVectors at 0x2d582917310>

In [17]:
# Check out what the embedding looks like
wordEmbed = model['cat']
print(wordEmbed.shape)
print(wordEmbed)

(50,)
[ 0.45281  -0.50108  -0.53714  -0.015697  0.22191   0.54602  -0.67301
 -0.6891    0.63493  -0.19726   0.33685   0.7735    0.90094   0.38488
  0.38367   0.2657   -0.08057   0.61089  -1.2894   -0.22313  -0.61578
  0.21697   0.35614   0.44499   0.60885  -1.1633   -1.1579    0.36118
  0.10466  -0.78325   1.4352    0.18629  -0.26112   0.83275  -0.23123
  0.32481   0.14485  -0.44552   0.33497  -0.95946  -0.097479  0.48138
 -0.43352   0.69455   0.91043  -0.28173   0.41637  -1.2609    0.71278
  0.23782 ]


In [18]:
# What happens if a word it out of the dictionary?

word = 'Aanif' # I am not that famous yet
if word in model:
    print(f'{word} is in the model')
else:
    print(f'{word} is NOT in the model')

Aanif is NOT in the model


In [19]:
# Most like/similar(positive)

model.most_similar(positive=['boy'])

[('girl', 0.932719886302948),
 ('woman', 0.859611988067627),
 ('man', 0.8564430475234985),
 ('kid', 0.819257915019989),
 ('mother', 0.8179756999015808),
 ('teenage', 0.8029857277870178),
 ('baby', 0.8001460433006287),
 ('dad', 0.7850530743598938),
 ('her', 0.7836802005767822),
 ('old', 0.7815377712249756)]

In [20]:
# Most like X(positive) but unlike Y(negative)

model.most_similar(positive=['boy', 'girl'], negative=['man'])

[('toddler', 0.7926537990570068),
 ('teenage', 0.7791370749473572),
 ('girls', 0.7590368986129761),
 ('12-year-old', 0.7517068386077881),
 ('girlfriend', 0.7495954632759094),
 ('baby', 0.7431076765060425),
 ('teen', 0.7420293688774109),
 ('9-year', 0.7407599687576294),
 ('14-year-old', 0.7330296039581299),
 ('orphan', 0.726045548915863)]

In [21]:
# a word that is not similar to other words(od-one-out)
model.doesnt_match("boy girl car man".split())

'car'

In [22]:
# check similarity score between words
model.similarity('woman', 'man')

0.8860338

In [23]:
# check X:y::A:b
# king is to queen, what is 'A' to woman
model.similar_by_vector(model['king'] - model['queen'] + model['woman'])

[('man', 0.8706067204475403),
 ('father', 0.8266595602035522),
 ('who', 0.8227341771125793),
 ('woman', 0.8118484020233154),
 ('death', 0.7955310344696045),
 ('another', 0.7886118292808533),
 ('whose', 0.7844258546829224),
 ('old', 0.7815861105918884),
 ('victim', 0.7748768329620361),
 ('him', 0.7725713849067688)]