In [68]:
import numpy as np
import gensim
import codecs
import os
import pandas as pd

In [69]:
from gensim.test.utils import datapath

## Cosine Similarity

In [70]:
def cosine_similarity(a, b):
    """Return the cosine similarity ``dot(a, b) / (||a|| * ||b||)``.

    Parameters
    ----------
    a, b : array-like
        Vectors to compare (assumed 1-D and of equal length, as passed by
        the callers in this notebook).

    Returns
    -------
    float
        The cosine similarity. If either vector has zero norm the cosine is
        undefined; ``0.0`` is returned instead of dividing by zero.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # Avoid 0/0 -> nan (and a RuntimeWarning) for an all-zero vector.
        return 0.0
    return np.dot(a, b) / norm_product

In [71]:
# Create the per-dimension output folders that test_similarities writes its CSVs into.
# exist_ok=True makes the cell safe to re-run and avoids a check-then-create race.
os.makedirs(os.path.join("Q1output", "100"), exist_ok=True)
os.makedirs(os.path.join("Q1output", "50"), exist_ok=True)

## Similarity helper function

In [72]:
def test_similarities(model,dimension,model_name,direct = False,thresholds=(4, 5, 6, 7, 8)):
    """Score a model on the word-similarity pairs and write one CSV per threshold.

    Reads ``Word similarity/hindi.txt`` (comma separated, no header row; columns
    are first word, second word, ground-truth similarity). For every pair the
    cosine similarity of the two word vectors is computed and multiplied by 10.
    For each threshold a CSV named
    ``Q1output/<dimension>/Q1_<model_name>_similarity_<threshold>.csv`` is written
    containing the pairs, both scores, a label, and a final row holding the accuracy.

    Parameters
    ----------
    model : object
        Word-vector lookup. Indexed as ``model[word]`` when ``direct`` is true,
        otherwise as ``model.wv[word]``.
    dimension : int or str
        Only used to choose the output sub-folder.
    model_name : str
        Only used in the output file names.
    direct : bool, default False
        Selects how ``model`` is indexed (see ``model``).
    thresholds : iterable, default (4, 5, 6, 7, 8)
        Cut-off values; one accuracy and one CSV are produced per value.

    Returns
    -------
    list of float
        Accuracy for each threshold, in the order given.

    Notes
    -----
    A word missing from ``model`` is not handled here; the lookup error propagates.
    """
    pairs = pd.read_csv('Word similarity'+os.sep+'hindi.txt', sep=',', header=None)
    pairs.columns = ["first_word", "second_word","similarity"]

    # Resolve the lookup object once instead of branching on `direct` per row.
    vectors = model if direct else model.wv

    word_1 = []
    word_2 = []
    gts = []    # ground-truth similarity scores
    ss = []     # cosine similarities scaled by 10
    for first, second, truth in zip(pairs['first_word'], pairs['second_word'], pairs['similarity']):
        word_1.append(first)
        word_2.append(second)
        gts.append(truth)
        # Computed once per pair and reused for both the accuracy and the CSV column.
        ss.append(cosine_similarity(vectors[first], vectors[second]) * 10)

    # Make sure the destination exists even for a dimension with no pre-created folder.
    out_dir = 'Q1output'+os.sep+str(dimension)
    os.makedirs(out_dir, exist_ok=True)

    accuracies_list = []    # accuracy for each threshold
    for threshold in thresholds:
        count = 0
        label = []
        for score, truth in zip(ss, gts):
            both_high = score >= threshold and truth >= threshold
            both_low = score < threshold and truth < threshold
            # A pair counts as correct when model and ground truth fall on the same side.
            if both_high or both_low:
                count += 1
            # NOTE(review): Label is 1 only when BOTH scores reach the threshold; it is
            # neither the bare prediction nor the correctness flag — confirm this is intended.
            label.append(1 if both_high else 0)
        accuracy = count / len(ss)
        accuracies_list.append(accuracy)
        dfo = pd.DataFrame({'Word1':word_1,'Word2':word_2,'Similarity Score':ss,'Ground Truth similarity score':gts,'Label':label})
        # Trailing summary row: blank cells with the accuracy stored in the Label column.
        dfo.loc[len(dfo.index)] = ['', '', '','',accuracy]
        dfo.to_csv(os.path.join(out_dir,'Q1_'+ model_name + '_similarity_'+ str(threshold) +'.csv'))
    return accuracies_list

In [73]:
# Base folders of the 50- and 100-dimensional model files loaded in later cells.
# The empty last component keeps the trailing separator.
path_50 = os.path.join('hi', '50', '')
path_100 = os.path.join('hi', '100', '')

## CBOW

In [74]:
# Load the saved CBOW Word2Vec model from the 100-dimension folder.
model = gensim.models.Word2Vec.load(
    os.path.join(path_100, 'cbow', "hi-d100-m2-cbow.model")
)

In [75]:
# Snapshot the loaded model's vocabulary as a plain list, in the model's index order.
# Later cells index this list by position when pairing words with FastText vectors.
words = [*model.wv.index_to_key]

In [76]:
# Evaluate the CBOW model from the 100-dimension folder; the cell output is the accuracy per threshold.
test_similarities(model, 100, 'cbow', direct=False)

[0.6153846153846154,
 0.49230769230769234,
 0.4461538461538462,
 0.5230769230769231,
 0.8307692307692308]

In [77]:
# Load the saved CBOW Word2Vec model from the 50-dimension folder.
model = gensim.models.Word2Vec.load(
    os.path.join(path_50, 'cbow', "hi-d50-m2-cbow.model")
)

In [78]:
# Evaluate the CBOW model from the 50-dimension folder; the cell output is the accuracy per threshold.
test_similarities(model, 50, 'cbow', direct=False)

[0.7076923076923077,
 0.5692307692307692,
 0.5076923076923077,
 0.5538461538461539,
 0.7846153846153846]

## Skipgram 

In [79]:
# Load the saved skip-gram Word2Vec model from the 100-dimension folder.
model = gensim.models.Word2Vec.load(
    os.path.join(path_100, 'sg', "hi-d100-m2-sg.model")
)

In [80]:
# Evaluate the skip-gram model from the 100-dimension folder; the cell output is the accuracy per threshold.
test_similarities(model, 100, 'skipgram', direct=False)

[0.8923076923076924,
 0.6153846153846154,
 0.5076923076923077,
 0.5384615384615384,
 0.8153846153846154]

In [81]:
# Load the saved skip-gram Word2Vec model from the 50-dimension folder.
model = gensim.models.Word2Vec.load(
    os.path.join(path_50, 'sg', "hi-d50-m2-sg.model")
)

In [82]:
# Evaluate the skip-gram model from the 50-dimension folder; the cell output is the accuracy per threshold.
test_similarities(model, 50, 'skipgram', direct=False)

[0.9076923076923077,
 0.8307692307692308,
 0.6461538461538462,
 0.5384615384615384,
 0.7692307692307693]

## FastText

In [83]:
# Load the saved FastText vector array from the 100-dimension folder (a bare .npy file).
ft_vectors = np.load(
    os.path.join(path_100, 'fasttext', 'hi-d100-m2-fasttext.model.wv.vectors.npy')
)

In [84]:
# Map each vocabulary word to the FastText row at the same position.
# NOTE(review): `words` was taken from a Word2Vec model in an earlier cell; this assumes the
# FastText array uses the same vocabulary in the same order — confirm.
if len(words) != len(ft_vectors):
    # Fail loudly rather than build a truncated or misaligned mapping.
    raise ValueError(
        "vocabulary size (%d) does not match number of FastText vectors (%d)"
        % (len(words), len(ft_vectors))
    )
ft_model = dict(zip(words, ft_vectors))

In [85]:
# Evaluate the FastText word->vector dict built above; direct=True because it is indexed by word itself.
test_similarities(ft_model, 100, 'FastText', direct=True)

[0.8923076923076924,
 0.6615384615384615,
 0.47692307692307695,
 0.5384615384615384,
 0.8]

In [86]:
# Load the saved FastText vector array from the 50-dimension folder (a bare .npy file).
ft_vectors = np.load(
    os.path.join(path_50, 'fasttext', 'hi-d50-m2-fasttext.model.wv.vectors.npy')
)

In [87]:
# Map each vocabulary word to the FastText row at the same position.
# NOTE(review): `words` was taken from a Word2Vec model in an earlier cell; this assumes the
# FastText array uses the same vocabulary in the same order — confirm.
if len(words) != len(ft_vectors):
    # Fail loudly rather than build a truncated or misaligned mapping.
    raise ValueError(
        "vocabulary size (%d) does not match number of FastText vectors (%d)"
        % (len(words), len(ft_vectors))
    )
ft_model = dict(zip(words, ft_vectors))

In [88]:
# Evaluate the FastText word->vector dict built above; direct=True because it is indexed by word itself.
test_similarities(ft_model, 50, 'FastText', direct=True)

[0.8307692307692308,
 0.8307692307692308,
 0.5692307692307692,
 0.5538461538461539,
 0.7846153846153846]

## GloVe

In [89]:
# Load the GloVe vectors from the text file in the 100-dimension folder.
# The absolute path is built directly; gensim's `datapath` helper targets gensim's own
# test-data folder and is not needed for a project file.
glove_file = os.path.join(os.getcwd(), path_100, 'glove', 'hi-d100-glove.txt')
# no_header=True: the loader is told not to expect a leading header line.
glove_vectors = gensim.models.KeyedVectors.load_word2vec_format(
    glove_file, binary=False, no_header=True, encoding='utf8', unicode_errors='ignore'
)

In [90]:
# Evaluate the GloVe vectors loaded above; direct=True because they are indexed by word itself.
test_similarities(glove_vectors, 100, 'glove', direct=True)

[0.8,
 0.6307692307692307,
 0.47692307692307695,
 0.5538461538461539,
 0.8307692307692308]

In [91]:
# Load the GloVe vectors from the text file in the 50-dimension folder.
# The absolute path is built directly; gensim's `datapath` helper targets gensim's own
# test-data folder and is not needed for a project file.
glove_file = os.path.join(os.getcwd(), path_50, 'glove', 'hi-d50-glove.txt')
# no_header=True: the loader is told not to expect a leading header line.
glove_vectors = gensim.models.KeyedVectors.load_word2vec_format(
    glove_file, binary=False, no_header=True, encoding='utf8', unicode_errors='ignore'
)

In [92]:
# Evaluate the GloVe vectors loaded above; direct=True because they are indexed by word itself.
test_similarities(glove_vectors, 50, 'glove', direct=True)

[0.8461538461538461,
 0.7230769230769231,
 0.6153846153846154,
 0.5846153846153846,
 0.8153846153846154]