In [1]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk

# Install gensim first if you don't already have it:
#!pip install gensim
from gensim.models import KeyedVectors

In [2]:
def get_vectors(embeddings,words):
    """Stack the embedding vectors of ``words`` into a 2-D matrix.

    Args:
        embeddings: Object indexable by word (``embeddings[word]``) that
            returns a 1-D vector for each word.
        words: Iterable of words; the output rows follow this order.

    Returns:
        ``numpy.ndarray`` of shape ``(number_of_words, dim)`` where ``dim`` is
        the length of the looked-up vectors. The result is promoted to at
        least float64. For an empty ``words`` the shape is ``(0, dim)`` with
        ``dim`` taken from ``embeddings.vector_size`` when that attribute
        exists and 300 otherwise.

    Raises:
        Whatever ``embeddings[word]`` raises for an unknown word
        (``KeyError`` for a plain dict).
    """
    rows = [np.asarray(embeddings[word]) for word in words]

    # Derive the width from the data instead of assuming 300 dimensions; only
    # fall back to a default when there is nothing to measure.
    if rows:
        dim = rows[0].shape[-1]
    else:
        dim = getattr(embeddings, "vector_size", 300)

    # The zero-row float64 seed keeps the dtype promotion of the previous
    # implementation and makes the empty case return a (0, dim) matrix, while
    # a single vstack avoids re-copying the matrix for every word.
    return np.vstack([np.zeros((0, dim))] + rows)

In [3]:
def get_word_embeddings(embeddings,set_words,complete=False):
    """Extract a plain ``{word: vector}`` dict from an embeddings object.

    Args:
        embeddings: Object indexable by word. Its vocabulary is taken from
            ``key_to_index`` (gensim >= 4), else from ``vocab`` (gensim 3.x),
            else by iterating ``embeddings`` itself (e.g. a plain dict).
        set_words: Container of words to keep (membership is tested with
            ``in``); ignored when ``complete`` is true.
        complete: When true, every word of the vocabulary is kept.

    Returns:
        dict mapping each selected word to ``embeddings[word]``, in
        vocabulary iteration order.
    """
    # gensim 4 removed ``KeyedVectors.vocab`` (accessing it raises
    # AttributeError), so prefer ``key_to_index`` and only then fall back.
    vocabulary = getattr(embeddings, "key_to_index", None)
    if vocabulary is None:
        vocabulary = getattr(embeddings, "vocab", None)
    if vocabulary is None:
        vocabulary = embeddings

    return {
        word: embeddings[word]
        for word in vocabulary
        if complete or word in set_words
    }


In [4]:
def cos_similarity(W1,W2):
    """Return the cosine similarity of ``W1`` and ``W2``.

    Computed as ``dot(W1, W2) / (norm(W1) * norm(W2))`` using NumPy; no
    special handling is applied when either norm is zero.
    """
    norm_first = np.linalg.norm(W1)
    norm_second = np.linalg.norm(W2)
    return np.dot(W1, W2) / (norm_first * norm_second)

In [5]:
def euclidean(W1,W2):
    """Return the norm (``np.linalg.norm``) of the element-wise difference ``W1 - W2``."""
    difference = np.subtract(W1, W2)
    return np.linalg.norm(difference)

In [6]:
# We can use this to find the 4th word, which relates to the 3rd word the same way the 2nd relates to the 1st.
# Since my word subset mostly has countries and cities, I have named this function predict_country for now.
def predict_country(city1, country1, city2, embeddings):
    """Complete the analogy ``city1 : country1 :: city2 : ?``.

    The target vector ``city2 - city1 + country1`` is compared, by cosine
    similarity, against every word in ``embeddings`` other than the three
    input words.

    Args:
        city1, country1, city2: Words that must be keys of ``embeddings``.
        embeddings: Dict-like mapping of word to vector (must provide
            ``keys()``).

    Returns:
        ``(word, similarity)`` for the most similar candidate. If no
        candidate scores above -1 (e.g. there are no other words), the
        result is ``('', -1)``.
    """
    excluded = {city1, country1, city2}

    first_vec = embeddings[city1]
    relation_vec = embeddings[country1]
    query_vec = embeddings[city2]
    target = query_vec - first_vec + relation_vec

    # -1 is the lowest possible cosine similarity, so any strictly better
    # candidate replaces the empty placeholder.
    best_word, best_score = '', -1

    for candidate in embeddings.keys():
        if candidate in excluded:
            continue
        score = cos_similarity(target, embeddings[candidate])
        # Strict comparison: on ties the earliest candidate is kept.
        if score > best_score:
            best_word, best_score = candidate, score

    return best_word, best_score

In [7]:
# Install gensim first if you don't already have it:
#!pip install gensim
from gensim.models import KeyedVectors

In [8]:
# Load the word2vec-format binary vectors; this takes around 2-3 minutes.
embeddings = KeyedVectors.load_word2vec_format(
    'D:/GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [9]:
# Read the whole capitals file and collect the distinct tokens it contains.
# The context manager closes the file handle as soon as the text is read,
# instead of leaving it open until garbage collection.
with open('D:/capitals.txt', 'r') as capitals_file:
    f = capitals_file.read()
set_words = set(nltk.word_tokenize(f))

In [10]:
# Keep only the vectors of vocabulary words that appear in set_words.
word_embeddings = get_word_embeddings(
    embeddings,
    set_words,
)

In [11]:
# Analogy 1: Moscow is to Russia as NewDelhi is to ...?
city1, country1, city2 = 'Moscow', 'Russia', 'NewDelhi'
country2, similarity = predict_country(city1, country1, city2, word_embeddings)
print("Country is {} with cosine similarity being {}".format(country2, similarity))

# Analogy 2 (reverse direction, predicting a city): Russia is to Moscow as England is to ...?
country3 = 'England'
city3, similarity = predict_country(country1, city1, country3, word_embeddings)
print("City is {} with cosine similarity being {}".format(city3, similarity))

Country is India with cosine similarity being 0.5113393664360046
City is London with cosine similarity being 0.6056334972381592


In [12]:
def get_accuracy(word_embeddings, data):
    """Measure how often ``predict_country`` recovers the 4th word of each row.

    Args:
        word_embeddings: Dict-like mapping of word to vector, passed through
            to ``predict_country``.
        data: ``pandas.DataFrame`` whose first four columns hold, in order,
            city1, country1, city2 and the expected country2.

    Returns:
        Fraction of rows whose predicted word equals the expected one, as a
        float in ``[0, 1]``. An empty ``data`` yields ``0.0`` instead of
        raising ``ZeroDivisionError``.
    """
    m = len(data)
    if m == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        return 0.0

    num_correct = 0
    for _, row in data.iterrows():
        # Use explicit positional access: ``row[0]`` on a Series with string
        # labels relies on the deprecated positional fallback of
        # ``Series.__getitem__``.
        city1, country1, city2, country2 = row.iloc[:4]
        predicted_country2, _ = predict_country(city1, country1, city2, word_embeddings)

        if predicted_country2 == country2:
            num_correct += 1

    return num_correct / m


In [13]:
# header=None keeps the first line of the file as a data row. Without it,
# pandas uses that line as column names, which the assignment below then
# overwrites, so the first analogy would silently drop out of the evaluation.
# NOTE(review): assumes capitals.txt has no header line — confirm.
data = pd.read_csv('D://capitals.txt', delimiter=' ', header=None)
data.columns = ['city1', 'country1', 'city2', 'country2']
acc=get_accuracy(word_embeddings,data)

In [14]:
# Report the analogy accuracy rounded to two decimal places.
print(
    "Accuracy is {:.2f}".format(acc)
)

Accuracy is 0.92
