In [1]:
# Kaggle API
! pip install kaggle



In [2]:
! mkdir -p ~/.kaggle

In [4]:
#To get the dataset from kaggle you need the following
# 1. An active kaggle account
# 2. API key

! cp kaggle.json ~/.kaggle

In [5]:
#Set the permission of key to user only
! chmod 600 /root/.kaggle/kaggle.json

In [6]:
#Import data from Kaggle
! kaggle datasets download -d rootuser/worldnews-on-reddit

Downloading worldnews-on-reddit.zip to /content
 79% 21.0M/26.6M [00:01<00:01, 5.82MB/s]
100% 26.6M/26.6M [00:01<00:00, 19.7MB/s]


In [7]:
#UNzip 
! unzip /content/worldnews-on-reddit.zip

Archive:  /content/worldnews-on-reddit.zip
  inflating: reddit_worldnews_start_to_2016-11-22.csv  


In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [8]:
# Data

import pandas as pd
import numpy as np
import gensim

In [9]:
data = pd.read_csv('/content/reddit_worldnews_start_to_2016-11-22.csv')
data.head()

Unnamed: 0,time_created,date_created,up_votes,down_votes,title,over_18,author,subreddit
0,1201232046,2008-01-25,3,0,Scores killed in Pakistan clashes,False,polar,worldnews
1,1201232075,2008-01-25,2,0,Japan resumes refuelling mission,False,polar,worldnews
2,1201232523,2008-01-25,3,0,US presses Egypt on Gaza border,False,polar,worldnews
3,1201233290,2008-01-25,1,0,Jump-start economy: Give health care to all,False,fadi420,worldnews
4,1201274720,2008-01-25,4,0,Council of Europe bashes EU&UN terror blacklist,False,mhermans,worldnews


In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 509236 entries, 0 to 509235
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   time_created  509236 non-null  int64 
 1   date_created  509236 non-null  object
 2   up_votes      509236 non-null  int64 
 3   down_votes    509236 non-null  int64 
 4   title         509236 non-null  object
 5   over_18       509236 non-null  bool  
 6   author        509236 non-null  object
 7   subreddit     509236 non-null  object
dtypes: bool(1), int64(3), object(4)
memory usage: 27.7+ MB


In [11]:
#Goal : To extract the semantic relationship of each words on the basis of this dataset (title)
news_titles = data.title.values

In [12]:
news_titles

array(['Scores killed in Pakistan clashes',
       'Japan resumes refuelling mission',
       'US presses Egypt on Gaza border', ...,
       'Professor receives Arab Researchers Award',
       'Nigel Farage attacks response to Trump ambassador tweet',
       'Palestinian wielding knife shot dead in West Bank: Israel police'],
      dtype=object)

In [13]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [14]:
#Tokenize the words

newList = [nltk.word_tokenize(title) for title in news_titles]

In [15]:
#Build my Model
from gensim.models.word2vec import Word2Vec
model = Word2Vec(newList,min_count=1,size=32)

#you 
# min_count
# size  
# documentation

In [16]:
#PREDICTIONS --- CHECKING IF THE MODEL PRESERVED SEMANTIC RELATIONSHIP SUCCESSFULLY OR NOT
#Closest related words
model.wv.most_similar('man')

[('woman', 0.9673755168914795),
 ('couple', 0.9005117416381836),
 ('boy', 0.8987310528755188),
 ('girl', 0.8978184461593628),
 ('doctor', 0.8938748836517334),
 ('teenager', 0.8781819343566895),
 ('mother', 0.8704747557640076),
 ('teacher', 0.8567318916320801),
 ('father', 0.8322591781616211),
 ('teen', 0.8204310536384583)]

In [17]:
len(model.wv['man'])

32

In [18]:
# Relationship Example 1

vec = model.wv['king'] - model.wv['man'] + model.wv['queen']
model.wv.most_similar([vec])

[('Weidmann', 0.7087693214416504),
 ('Christine', 0.6910353899002075),
 ('Ideologue', 0.6416141390800476),
 ('Czarina', 0.6396905779838562),
 ('Ollie', 0.6385036706924438),
 ('200-Year', 0.6358978152275085),
 ('Tusk', 0.6344927549362183),
 ('surveying', 0.6211035251617432),
 ('G.O.P', 0.620972216129303),
 ('FOS', 0.6185770630836487)]

In [19]:
# Relationship Example 2

vec = model.wv['Messi'] - model.wv['Football'] + model.wv['Cricket']
model.wv.most_similar([vec])

[('whence', 0.6971856355667114),
 ('75-year', 0.6829763650894165),
 ('21ft', 0.6778444051742554),
 ('Siachen', 0.6693453192710876),
 ('snorkelers', 0.6659572124481201),
 ('malaysian', 0.66435706615448),
 ('government-industry', 0.6630187034606934),
 ('467', 0.661349356174469),
 ('1-minute', 0.6607033610343933),
 ('Disfigured', 0.658394455909729)]

In [20]:
# Relationship Example 3

vec = model.wv['Germany'] - model.wv['Berlin'] + model.wv['Paris']
model.wv.most_similar([vec])

[('Belgium', 0.8751973509788513),
 ('France', 0.8459389209747314),
 ('Sweden', 0.834692120552063),
 ('Paris', 0.8163720965385437),
 ('Germany', 0.8088282346725464),
 ('Brussels', 0.7947176098823547),
 ('UK', 0.7563801407814026),
 ('Britain', 0.7550041079521179),
 ('Turkey', 0.725766122341156),
 ('Norway', 0.7137984037399292)]

# Pre-trained WordVector by Google

In [21]:
! kaggle datasets download -d umbertogriffo/googles-trained-word2vec-model-in-python

Downloading googles-trained-word2vec-model-in-python.zip to /content
100% 3.17G/3.17G [01:22<00:00, 36.1MB/s]
100% 3.17G/3.17G [01:22<00:00, 41.4MB/s]


In [22]:
! unzip /content/googles-trained-word2vec-model-in-python.zip

Archive:  /content/googles-trained-word2vec-model-in-python.zip
  inflating: GoogleNews-vectors-negative300.bin  
  inflating: GoogleNews-vectors-negative300.bin.gz  


In [None]:
#load the model
#from gensim.models import KeyedVectors
#model = KeyedVectors.load_word2vec_format("/content/GoogleNews-vectors-negative300.bin", binary=True)

In [23]:
#load the model ___ Partial
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format("/content/GoogleNews-vectors-negative300.bin", binary=True , limit=100000)

In [24]:
#PREDICTIONS --- CHECKING IF THE MODEL PRESERVED SEMANTIC RELATIONSHIP SUCCESSFULLY OR NOT
#Closest related words
model.wv.most_similar('man')

  This is separate from the ipykernel package so we can avoid doing imports until


[('woman', 0.7664012908935547),
 ('boy', 0.6824870109558105),
 ('teenager', 0.6586930155754089),
 ('teenage_girl', 0.6147903800010681),
 ('girl', 0.5921714305877686),
 ('robber', 0.5585119128227234),
 ('teen_ager', 0.5549196600914001),
 ('men', 0.5489763021469116),
 ('guy', 0.5420035123825073),
 ('person', 0.5342026352882385)]

In [25]:
# Relationship Example 1

vec = model.wv['king'] - model.wv['man'] + model.wv['queen']
model.wv.most_similar([vec])

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


[('queen', 0.8422121405601501),
 ('king', 0.8070623278617859),
 ('queens', 0.6293326616287231),
 ('monarch', 0.6233264207839966),
 ('kings', 0.5990484356880188),
 ('princess', 0.5685814619064331),
 ('royal', 0.55137038230896),
 ('princes', 0.5398682951927185),
 ('prince', 0.522888720035553),
 ('Queen', 0.5119318962097168)]

In [26]:
# Relationship Example 2

vec = model.wv['Messi'] - model.wv['Football'] + model.wv['Cricket']
model.wv.most_similar([vec])

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


[('Messi', 0.7382575869560242),
 ('Sehwag', 0.6777455806732178),
 ('Tendulkar', 0.6748222708702087),
 ('Xavi', 0.6610832810401917),
 ('Dravid', 0.6569646596908569),
 ('Dhoni', 0.6550688743591309),
 ('Lionel_Messi', 0.6407608985900879),
 ('Forlan', 0.640610933303833),
 ('Yuvraj', 0.6390379071235657),
 ('Ponting', 0.6390198469161987)]

In [27]:

vec = model.wv['India'] - model.wv['Delhi'] + model.wv['Paris']
model.wv.most_similar([vec])

  
  This is separate from the ipykernel package so we can avoid doing imports until


[('Paris', 0.6724128127098083),
 ('France', 0.6392810344696045),
 ('French', 0.47859641909599304),
 ('Morocco', 0.47215068340301514),
 ('Europe', 0.4615107476711273),
 ('Belgium', 0.4492771625518799),
 ('Marseilles', 0.44075489044189453),
 ('Italy', 0.4313548505306244),
 ('Olivier', 0.4299730658531189),
 ('Christophe', 0.4268326759338379)]