In [53]:
import gensim
from gensim.models import word2vec
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

In [54]:
import nltk
from nltk.corpus import stopwords

stopw = set(stopwords.words('english'))

## 1. Read Data

In [55]:
def read_file(file):
    f = open(file,'r',encoding='utf8')
    text = f.read()
    
    # Tokenize- sentences and words
    sentences = nltk.sent_tokenize(text)
    print(len(sentences))
    
    data = []
    for sent in sentences:
        words = nltk.word_tokenize(sent)
        words = [w.lower() for w in words if len(w)>2 and w not in stopw]
        data.append(words)
    
    
    return data

In [56]:
text = read_file("bollywood.txt")
print(text)

18
[['deepika', 'padukone', 'ranveer', 'singh', 'wedding', 'one', 'biggest', 'bollywood', 'events', 'happened', '2018'], ['the', 'deepika', 'ranveer', 'celebrations', 'hooked', 'phones', 'waiting', 'come', 'also', 'gave', 'enough', 'reason', 'believe', 'stylish', 'two', 'couple'], ['from', 'airport', 'looks', 'reception', 'parties', 'everything', 'entire', 'timeline', 'deepika', 'ranveer', 'wedding', 'style', 'file'], ['not', 'ambanis', 'deepika', 'ranveer', 'priyanka', 'nick'], ['man', 'proves', 'wedding', 'the', 'year', 'this', 'year', 'year', 'big', 'fat', 'lavish', 'extravagant', 'weddings'], ['from', 'isha', 'ambani', 'anand', 'piramal', 'deepika', 'padukone', 'ranveer', 'singh', 'priyanka', 'chopra', 'nick', 'jonas', 'kapil', 'sharma', 'ginni', 'chatrath', '2018', 'saw', 'many', 'grand', 'weddings'], ['but', 'nothing', 'beats', 'man', 'wedding', 'the', 'year', 'award', 'social', 'media'], ['priyanka', 'also', 'shared', 'video', 'featuring', 'nick', 'jonaswas', 'also', 'celebratin

## 2. Create Model

In [57]:
from gensim.models import Word2Vec

model = Word2Vec(text,size=300,window=10,min_count=1)

print(model)



Word2Vec(vocab=116, size=300, alpha=0.025)


In [58]:
words = list(model.wv.vocab)
print(words)

['deepika', 'padukone', 'ranveer', 'singh', 'wedding', 'one', 'biggest', 'bollywood', 'events', 'happened', '2018', 'the', 'celebrations', 'hooked', 'phones', 'waiting', 'come', 'also', 'gave', 'enough', 'reason', 'believe', 'stylish', 'two', 'couple', 'from', 'airport', 'looks', 'reception', 'parties', 'everything', 'entire', 'timeline', 'style', 'file', 'not', 'ambanis', 'priyanka', 'nick', 'man', 'proves', 'year', 'this', 'big', 'fat', 'lavish', 'extravagant', 'weddings', 'isha', 'ambani', 'anand', 'piramal', 'chopra', 'jonas', 'kapil', 'sharma', 'ginni', 'chatrath', 'saw', 'many', 'grand', 'but', 'nothing', 'beats', 'award', 'social', 'media', 'shared', 'video', 'featuring', 'jonaswas', 'celebrating', 'family', 'first', 'celebrated', 'christmas', 'london', 'pictures', 'new', 'outstanding', 'glimpses', 'celebration', 'verbier', 'switzerland', 'married', 'december', 'three', 'receptions', 'delhi', 'mumbai', 'jaggo', 'night', 'made', 'even', 'special', 'industry', 'friends', 'long', '

In [59]:
print(model['deepika'][:10])
print(model['deepika'].shape)

[-6.0725358e-04 -9.8790537e-05  1.6006879e-03 -5.7181343e-04
 -1.4793633e-03 -1.1211601e-03 -1.6153050e-03  1.4487529e-03
  6.6224733e-05 -1.2393841e-03]
(300,)


  print(model['deepika'][:10])
  print(model['deepika'].shape)


## 3. Create Analogies

In [72]:
def predict(a,b,c,word_vectors):
    
    a,b,c = a.lower(),b.lower(),c.lower()
    
    # Similarity |b-a| = |d-c| should be max
    max_similarity = -100
    
    d = None
    
    options = ["ranveer","deepika","padukone","singh","nick","jonas","chopra","priyanka","virat","anushka","ginni"]
    
    wa,wb,wc = word_vectors[a],word_vectors[b],word_vectors[c]
    
    # To find s.t. similarity (|b-a|,|d-c|) should be max
    
    for w in options:
        if w in [a,b,c]:
            continue
            
        wv = word_vectors[w] 
        sim = cosine_similarity([wb-wa],[wv-wc])
        
        if sim > max_similarity:
            max_similarity = sim
            d = w
            
    return d

In [73]:
triad = ("nick","priyanka","virat")
predict(*triad,model.wv)

'chopra'

In [74]:
triad = ("ranveer","deepika","priyanka")
predict(*triad,model.wv)

'padukone'

In [75]:
triad = ("ranveer","singh","deepika")
predict(*triad,model.wv)

'padukone'

In [76]:
triad = ("deepika","padukone","priyanka")
predict(*triad,model.wv)

'singh'

In [77]:
triad = ("priyanka","jonas","nick")
predict(*triad,model.wv)

'padukone'

![Image](./images/p2.gif)