# 1) Import the libraries

In [1]:
import tensorflow as tf
print(tf.__version__)

2.4.1


In [2]:
import gensim
from gensim.models import Word2Vec, KeyedVectors

# 2) Data Preprocessing

In [3]:
# google pretrained model
# https://www.kaggle.com/umbertogriffo/googles-trained-word2vec-model-in-python

In [4]:
# install kaggle API
! pip install -q kaggle

In [5]:
# Create a directory as kaggle
! mkdir -p ~/.kaggle

In [7]:
# Copy API key to kaggle directory
! cp "/content/drive/MyDrive/Colab Notebooks/NLP/Word Embeddings/kaggle.json" ~/.kaggle

In [8]:
# Disable the API Key
! chmod 600 /root/.kaggle/kaggle.json

In [9]:
# Import the dataset
! kaggle datasets download -d umbertogriffo/googles-trained-word2vec-model-in-python

Downloading googles-trained-word2vec-model-in-python.zip to /content
100% 3.17G/3.17G [00:55<00:00, 52.3MB/s]
100% 3.17G/3.17G [00:55<00:00, 61.8MB/s]


In [10]:
# unzip the dataset
! unzip /content/googles-trained-word2vec-model-in-python

Archive:  /content/googles-trained-word2vec-model-in-python.zip
  inflating: GoogleNews-vectors-negative300.bin  
  inflating: GoogleNews-vectors-negative300.bin.gz  


# 3) Build the model

In [12]:
# Load the already trained model
model = KeyedVectors.load_word2vec_format('/content/GoogleNews-vectors-negative300.bin', binary=True, limit=100000)

# 4) Predict the output

In [16]:
model['man']
# An array showing how a man is represneted in a vector of size 300

array([ 0.32617188,  0.13085938,  0.03466797, -0.08300781,  0.08984375,
       -0.04125977, -0.19824219,  0.00689697,  0.14355469,  0.0019455 ,
        0.02880859, -0.25      , -0.08398438, -0.15136719, -0.10205078,
        0.04077148, -0.09765625,  0.05932617,  0.02978516, -0.10058594,
       -0.13085938,  0.001297  ,  0.02612305, -0.27148438,  0.06396484,
       -0.19140625, -0.078125  ,  0.25976562,  0.375     , -0.04541016,
        0.16210938,  0.13671875, -0.06396484, -0.02062988, -0.09667969,
        0.25390625,  0.24804688, -0.12695312,  0.07177734,  0.3203125 ,
        0.03149414, -0.03857422,  0.21191406, -0.00811768,  0.22265625,
       -0.13476562, -0.07617188,  0.01049805, -0.05175781,  0.03808594,
       -0.13378906,  0.125     ,  0.0559082 , -0.18261719,  0.08154297,
       -0.08447266, -0.07763672, -0.04345703,  0.08105469, -0.01092529,
        0.17480469,  0.30664062, -0.04321289, -0.01416016,  0.09082031,
       -0.00927734, -0.03442383, -0.11523438,  0.12451172, -0.02

In [17]:
# Find 10 closest words in the vector space similars to 'man'
# Not in characters equality, but in semantic
model.most_similar('man')

[('woman', 0.7664012908935547),
 ('boy', 0.6824870109558105),
 ('teenager', 0.6586930155754089),
 ('teenage_girl', 0.6147903800010681),
 ('girl', 0.5921714305877686),
 ('robber', 0.5585119128227234),
 ('teen_ager', 0.5549196600914001),
 ('men', 0.5489763021469116),
 ('guy', 0.5420035123825073),
 ('person', 0.5342026352882385)]

In [18]:
# Expected Queen
# A men is a king
# A women is a ?????
vec = model.wv['king'] - model.wv['man'] + model.wv['women']
model.wv.most_similar([vec])

  
  This is separate from the ipykernel package so we can avoid doing imports until


[('king', 0.6478992104530334),
 ('queen', 0.5354937314987183),
 ('women', 0.5233659148216248),
 ('kings', 0.5162314176559448),
 ('queens', 0.499536395072937),
 ('princes', 0.46233272552490234),
 ('monarch', 0.4528029263019562),
 ('monarchy', 0.4293174147605896),
 ('crown_prince', 0.42302513122558594),
 ('womens', 0.41756653785705566)]

In [19]:
# Capital da Germania = Berlim
# Capital da ????     = Paris
# ????? = France
vec = model.wv['Germany'] - model.wv['Berlin'] + model.wv['Paris']
model.wv.most_similar([vec])

  after removing the cwd from sys.path.
  """


[('France', 0.7724405527114868),
 ('Paris', 0.6798242926597595),
 ('Belgium', 0.598486065864563),
 ('Germany', 0.5652832984924316),
 ('Spain', 0.550815761089325),
 ('Italy', 0.5462925434112549),
 ('Marseille', 0.5372347831726074),
 ('Switzerland', 0.5364958047866821),
 ('French', 0.5346114039421082),
 ('Morocco', 0.5051252841949463)]

In [20]:
# Messi = Football king
# ???? = Cricket king
vec = model.wv['Messi'] - model.wv['Football'] + model.wv['Cricket']
model.wv.most_similar([vec])

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


[('Messi', 0.7382575869560242),
 ('Sehwag', 0.6777455806732178),
 ('Tendulkar', 0.6748222708702087),
 ('Xavi', 0.6610832810401917),
 ('Dravid', 0.6569646596908569),
 ('Dhoni', 0.6550688743591309),
 ('Lionel_Messi', 0.6407608985900879),
 ('Forlan', 0.640610933303833),
 ('Yuvraj', 0.6390379071235657),
 ('Ponting', 0.6390198469161987)]