# 1) Importing the libraries


In [3]:
import tensorflow as tf
print(tf.__version__)

2.4.1


In [6]:
import nltk
nltk.download('punkt')  # Tokenization

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
import pandas as pd
import gensim
from gensim.models import Word2Vec, KeyedVectors

# 2) Data Preprocessing

In [8]:
# google pretrained model

# https://www.kaggle.com/rootuser/worldnews-on-reddit

In [9]:
#  Kagggle API
! pip install -q kaggle

In [11]:
# Create a directory as kaggle
! mkdir -p ~/.kaggle

In [15]:
# Copy API key to kaggle directory
! cp "/content/drive/MyDrive/Colab Notebooks/NLP/Word Embeddings/kaggle.json" ~/.kaggle

In [16]:
# Disable the API Key
! chmod 600 /root/.kaggle/kaggle.json

In [17]:
# Import the dataset
! kaggle datasets download -d rootuser/worldnews-on-reddit

Downloading worldnews-on-reddit.zip to /content
 64% 17.0M/26.6M [00:00<00:00, 33.0MB/s]
100% 26.6M/26.6M [00:00<00:00, 41.9MB/s]


In [18]:
# unzip the dataset
! unzip /content/worldnews-on-reddit.zip

Archive:  /content/worldnews-on-reddit.zip
  inflating: reddit_worldnews_start_to_2016-11-22.csv  


In [19]:
df = pd.read_csv('/content/reddit_worldnews_start_to_2016-11-22.csv')

In [20]:
df.head()

Unnamed: 0,time_created,date_created,up_votes,down_votes,title,over_18,author,subreddit
0,1201232046,2008-01-25,3,0,Scores killed in Pakistan clashes,False,polar,worldnews
1,1201232075,2008-01-25,2,0,Japan resumes refuelling mission,False,polar,worldnews
2,1201232523,2008-01-25,3,0,US presses Egypt on Gaza border,False,polar,worldnews
3,1201233290,2008-01-25,1,0,Jump-start economy: Give health care to all,False,fadi420,worldnews
4,1201274720,2008-01-25,4,0,Council of Europe bashes EU&UN terror blacklist,False,mhermans,worldnews


In [22]:
df.shape

(509236, 8)

In [23]:
# Grab just the title column
news_titles = df['title'].values

In [28]:
news_titles[0:5]

array(['Scores killed in Pakistan clashes',
       'Japan resumes refuelling mission',
       'US presses Egypt on Gaza border',
       'Jump-start economy: Give health care to all ',
       'Council of Europe bashes EU&UN terror blacklist'], dtype=object)

Tokenization

In [25]:
new_vec = [nltk.word_tokenize(title) for title in news_titles]

In [27]:
new_vec[3]

['Jump-start', 'economy', ':', 'Give', 'health', 'care', 'to', 'all']

# 3) Build the model

In [29]:
# Text tokenized 
# Minimum word count // if 1. all the words are considered
# Size of vector // if 32. return a vector of size 32
model = Word2Vec(new_vec, min_count=1, size=32)

In [30]:
model

<gensim.models.word2vec.Word2Vec at 0x7f46b9ae1780>

# 4) Predict the output

In [31]:
model.wv['man']
# An array showing how a man is represneted in a vector of size 32

array([-1.4726138 , -0.04686417,  0.41637388,  4.5729785 ,  0.8859756 ,
        0.9762413 ,  2.5887802 , -0.39307952,  3.8581197 ,  1.3446149 ,
       -0.19675647,  3.1809907 , -0.5411156 , -3.5843108 , -3.2981126 ,
        3.831629  ,  0.64928657,  3.0749938 , -2.222473  , -0.9591494 ,
        6.483887  , -1.8517106 ,  3.7513947 ,  2.704052  ,  3.845648  ,
       -3.8637164 ,  1.49986   , -0.6097532 ,  0.35982645,  0.47674403,
        1.2878771 ,  1.7770767 ], dtype=float32)

In [32]:
# Find 10 closest words in the vector space similars to 'man'
# Not in characters equality, but in semantic
model.wv.most_similar('man')

[('woman', 0.9788249731063843),
 ('couple', 0.9128268361091614),
 ('teenager', 0.9018714427947998),
 ('girl', 0.9000893831253052),
 ('boy', 0.889074444770813),
 ('teacher', 0.8879342675209045),
 ('mother', 0.8673346042633057),
 ('doctor', 0.8621090054512024),
 ('father', 0.8523733019828796),
 ('policeman', 0.8340638875961304)]

In [35]:
# Expected Queen
vec = model.wv['king'] - model.wv['man'] + model.wv['women']
model.wv.most_similar([vec])

[('women', 0.7007871866226196),
 ('unions', 0.6991894245147705),
 ('pro-family', 0.6927920579910278),
 ('equality', 0.6857810616493225),
 ('office—and', 0.6822486519813538),
 ('gay', 0.671711802482605),
 ('LGBT', 0.6635644435882568),
 ('equal', 0.6555752158164978),
 ('couples', 0.652648389339447),
 ('Milder', 0.6512923240661621)]

In [37]:
# Capital da Germania = Berlim
# Capital da ????     = Paris
# ????? = France
vec = model.wv['Germany'] - model.wv['Berlin'] + model.wv['Paris']
model.wv.most_similar([vec])

[('Belgium', 0.8572447896003723),
 ('France', 0.8472557663917542),
 ('Paris', 0.813854992389679),
 ('Germany', 0.8106518387794495),
 ('Sweden', 0.7864277958869934),
 ('Brussels', 0.784581184387207),
 ('Britain', 0.7358848452568054),
 ('UK', 0.732637345790863),
 ('Turkey', 0.7200224995613098),
 ('Europe', 0.6917171478271484)]

In [43]:
# Messi = Football king
# ???? = Cricket king
vec = model.wv['Messi'] - model.wv['Football'] + model.wv['Cricket']
model.wv.most_similar([vec])

[('British-Iranian', 0.7513188123703003),
 ('kidnapping/child', 0.7178007960319519),
 ('£80k', 0.7127825021743774),
 ('one-year-old', 0.7124025821685791),
 ('plum', 0.712323009967804),
 ('dear', 0.7053304314613342),
 ('75-page', 0.6966793537139893),
 ('undergoes', 0.6927151679992676),
 ('eight-week', 0.6865567564964294),
 ('Vano', 0.6860729455947876)]