In [6]:
import gensim
from gensim.models import word2vec
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
# Load the pre-trained word2vec embeddings from the binary file in the working directory.
word_vectors = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [8]:
# Look up the embedding of each sample word in one pass.
v_apple, v_mango, v_india = (
    word_vectors[token] for token in ('apple', 'mango', 'india')
)

In [10]:
# Show the dimensionality of each looked-up embedding.
for embedding in (v_apple, v_mango, v_india):
    print(embedding.shape)

(300,)
(300,)
(300,)


In [11]:
# cosine_similarity expects 2-D inputs, so present each vector as a single-row matrix.
cosine_similarity(v_apple.reshape(1, -1), v_mango.reshape(1, -1))

array([[0.57518554]], dtype=float32)

In [12]:
# cosine_similarity expects 2-D inputs, so present each vector as a single-row matrix.
cosine_similarity(v_apple.reshape(1, -1), v_india.reshape(1, -1))

array([[0.17158596]], dtype=float32)

## 1. Finding Odd One Out

In [19]:
import numpy as np

In [20]:
# Sample word list for the odd-one-out demo.
input_1 = [
    "apple",
    "mango",
    "juice",
    "party",
    "orange",
]

In [27]:
def odd_one_out(words, vectors=None):
    """Return the word whose embedding is least similar to the group's mean embedding.

    Every word is looked up in ``vectors``, the element-wise mean of the
    embeddings is computed, and each word is scored by its cosine similarity
    to that mean. The word with the lowest score is returned; on ties the
    earliest word in ``words`` wins.

    Diagnostic lines are printed along the way: the number of words, the
    embedding dimensionality, the shape of the mean vector and one
    similarity line per word.

    Args:
        words: Iterable of words to compare.
        vectors: Mapping-like object returning an embedding for ``vectors[word]``.
            Defaults to the notebook-level ``word_vectors``.

    Returns:
        The least similar word, or ``None`` when ``words`` is empty.

    Raises:
        KeyError: If a word is missing from ``vectors`` (raised by the lookup).
    """
    if vectors is None:
        # Fall back to the embeddings loaded at the top of the notebook.
        vectors = word_vectors

    words = list(words)
    if not words:
        # Nothing to compare; avoid indexing into an empty embedding list.
        return None

    # Generate all word embeddings for the given list, one row per word
    embeddings = np.asarray([vectors[w] for w in words])
    print(embeddings.shape[0])
    print(embeddings.shape[1])

    avg_vector = embeddings.mean(axis=0)
    print(avg_vector.shape)

    # Cosine similarity of every row against the mean in one vectorised step.
    # Rows (or a mean) with zero norm get similarity 0.0 instead of dividing by zero.
    dots = embeddings @ avg_vector
    norms = np.linalg.norm(embeddings, axis=1) * np.linalg.norm(avg_vector)
    similarities = np.divide(dots, norms, out=np.zeros(len(words)), where=norms > 0)

    for w, sim in zip(words, similarities):
        print('Similarity between word %s and avg vector is %.2f'%(w,sim))

    # argmin returns the first minimum, so no sentinel value is needed and a
    # result is always produced, even when every similarity equals 1.0.
    return words[int(np.argmin(similarities))]

In [28]:
# Run the odd-one-out search on the sample list; the returned word is this cell's output.
odd_one_out(input_1)

5
300
(300,)
Similarity between word apple and avg vector is 0.78
Similarity between word mango and avg vector is 0.76
Similarity between word juice and avg vector is 0.71
Similarity between word party and avg vector is 0.36
Similarity between word orange and avg vector is 0.65


'party'

## 2. Word Analogies

In [36]:
# Gensim 4 replaced `KeyedVectors.vocab` with `key_to_index`; fall back to
# `vocab` so the cell also runs on Gensim 3.x.
vocab = getattr(word_vectors, 'key_to_index', None)
if vocab is None:
    vocab = word_vectors.vocab

# Look the sample embedding up once and reuse it for every print below.
india_vector = word_vectors['india']

print(len(vocab))
print(type(word_vectors))
print(type(india_vector))
print(india_vector.shape)
print(india_vector[:10])

3000000
<class 'gensim.models.keyedvectors.Word2VecKeyedVectors'>
<class 'numpy.ndarray'>
(300,)
[-0.234375   -0.07177734  0.01055908  0.32617188 -0.06298828 -0.17871094
  0.03173828 -0.39648438 -0.16992188 -0.03540039]


In [39]:
def predict_word(a,b,c,word_vectors,batch_size=100000):
    """Accepts a triad of words - a,b,c and returns d such that a is to b : c is to d

    The three words are lower-cased, then every other vocabulary word ``w`` is
    scored by the cosine similarity between ``(b - a)`` and ``(w - c)``. The
    word with the highest score is returned; on ties the earliest word in
    vocabulary iteration order wins.

    Scoring is done in NumPy batches rather than one library call per word,
    so a full pass over a multi-million-word vocabulary stays practical and
    only ``batch_size`` candidate vectors are held in a temporary matrix at
    a time.

    Args:
        a, b, c: The analogy words (compared case-insensitively via ``lower()``).
        word_vectors: Embedding lookup supporting ``word_vectors[word]``. The
            vocabulary is read from ``key_to_index`` (Gensim 4), else ``vocab``
            (Gensim 3), else by iterating the object itself (e.g. a dict).
        batch_size: Number of candidate words scored per NumPy batch.

    Returns:
        The best matching word, or ``None`` if the vocabulary holds no word
        other than a, b and c.

    Raises:
        KeyError: If a, b or c is missing from ``word_vectors`` (raised by the lookup).
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    a,b,c = a.lower(),b.lower(),c.lower()

    wa,wb,wc = (np.asarray(word_vectors[w]) for w in (a,b,c))

    # Locate the vocabulary in a way that works across Gensim versions and plain mappings.
    vocab = getattr(word_vectors, "key_to_index", None)
    if vocab is None:
        vocab = getattr(word_vectors, "vocab", word_vectors)

    # The query words themselves are never valid answers.
    excluded = {a, b, c}
    candidates = [w for w in vocab if w not in excluded]

    # Direction we want (d - c) to align with.
    target = wb - wa
    target_norm = np.linalg.norm(target)

    d = None
    max_similarity = -np.inf

    for start in range(0, len(candidates), batch_size):
        batch = candidates[start:start + batch_size]
        diffs = np.asarray([word_vectors[w] for w in batch]) - wc

        # Cosine similarity per row; zero-norm rows score 0.0 instead of dividing by zero.
        norms = np.linalg.norm(diffs, axis=1) * target_norm
        sims = np.divide(diffs @ target, norms, out=np.zeros(len(batch)), where=norms > 0)

        # argmax keeps the first maximum inside the batch; the strict '>' keeps
        # the earliest batch on cross-batch ties.
        best = int(np.argmax(sims))
        if sims[best] > max_similarity:
            max_similarity = sims[best]
            d = batch[best]

    return d

In [None]:
# Analogy query: man is to woman as prince is to ?
triad_1 = "man", "woman", "prince"
predict_word(*triad_1, word_vectors=word_vectors)

In [None]:
# Second analogy triad.
# NOTE(review): triad_2 is never passed to predict_word in this part of the
# notebook — TODO confirm whether a call is missing from this cell.
triad_2 = ("man","coder","woman")


#### Using the built-in `most_similar` method

In [None]:
# Ask gensim for the single best match (topn=1) with 'woman' and 'king' as
# positive terms and 'man' as the negative term.
word_vectors.most_similar(positive=['woman','king'], negative=['man'], topn=1)