In [1]:
import gensim
import pandas as pd
import tensorflow_datasets as tfds

In [2]:
# Load the AG News subset from TensorFlow Datasets.
# as_supervised=True yields each example as a (text, label) pair;
# with_info=True also returns the dataset metadata, bound to `info`.
agnews, info = tfds.load(
    name='ag_news_subset',
    with_info=True,
    as_supervised=True,
)

In [3]:
# Gather the raw text of every example into one list, train split first and
# test split second. Labels are ignored; each text tensor is decoded from
# UTF-8 bytes into a Python str.
corpus_train = agnews['train']
corpus_test = agnews['test']
conv = [
    text.numpy().decode('utf8')
    for split in (corpus_train, corpus_test)
    for text, _label in split
]
print(len(conv))

127600


In [4]:
# Wrap the list of documents in a pandas Series so the tokeniser can be
# applied element-wise in the next step.
data_s = pd.Series(data=conv)

In [5]:
# Tokenise each document with gensim's simple_preprocess: text is lowercased
# and split into word tokens, dropping punctuation, digits and very short or
# very long tokens. Stop-words are NOT removed (see "is", "of", "the" below).
data = data_s.map(gensim.utils.simple_preprocess)
data

0         [amd, new, dual, core, opteron, chip, is, desi...
1         [reuters, major, league, baseball, monday, ann...
2         [president, bush, quot, revenue, neutral, quot...
3         [britain, will, run, out, of, leading, scienti...
4         [london, england, sports, network, england, mi...
                                ...                        
127595    [it, took, days, for, russia, security, servic...
127596    [montreal, air, canada, said, it, sealed, deal...
127597    [the, administration, issued, report, indicati...
127598    [los, gatos, calf, shares, of, mail, order, dv...
127599    [former, philadelphia, phillies, catcher, john...
Length: 127600, dtype: object

In [6]:
# Create an untrained Word2Vec model; no corpus is passed here, so the
# vocabulary is built and the model trained explicitly in the following cells.
model = gensim.models.Word2Vec(
    window=5,      # context words considered on each side of the target
    min_count=2,   # ignore words that occur fewer than 2 times
    workers=4,     # number of training threads
)

In [7]:
# Scan the tokenised corpus to build the vocabulary before training;
# progress_per sets how many sentences are processed between progress logs.
model.build_vocab(data, progress_per=1000)

In [8]:
# Train the embeddings, reusing the sentence count recorded by build_vocab
# and the number of epochs the model was configured with.
model.train(
    data,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(15789203, 19053875)

In [9]:
# Sanity check: the ten nearest neighbours of "baseball" in the embedding space.
model.wv.most_similar(positive=['baseball'], topn=10)

[('football', 0.648295521736145),
 ('nhl', 0.645673394203186),
 ('franchise', 0.6093348264694214),
 ('basketball', 0.5842990279197693),
 ('nfl', 0.5825951099395752),
 ('lockout', 0.5817408561706543),
 ('players', 0.5756290555000305),
 ('playoffs', 0.5612039566040039),
 ('hockey', 0.5574430823326111),
 ('expos', 0.5549972653388977)]

In [10]:
# Sanity check: the ten nearest neighbours of "apple" in the embedding space.
model.wv.most_similar(positive=['apple'], topn=10)

[('ipod', 0.712005078792572),
 ('itunes', 0.6999995112419128),
 ('dell', 0.6775985360145569),
 ('sony', 0.636110246181488),
 ('yahoo', 0.59694904088974),
 ('google', 0.5946887731552124),
 ('realnetworks', 0.5925344824790955),
 ('macintosh', 0.5876981616020203),
 ('music', 0.585974395275116),
 ('hp', 0.5840809345245361)]

In [11]:
# Sanity check: the ten nearest neighbours of "ronaldo" in the embedding space.
model.wv.most_similar(positive=['ronaldo'], topn=10)

[('adriano', 0.8861850500106812),
 ('gudjohnsen', 0.8594332933425903),
 ('kezman', 0.8511380553245544),
 ('drogba', 0.8472457528114319),
 ('didier', 0.8449971079826355),
 ('pavel', 0.8346201777458191),
 ('deco', 0.8320674300193787),
 ('damien', 0.8315231800079346),
 ('mateja', 0.8312647342681885),
 ('eidur', 0.8287298083305359)]