# Vector Similarity

### Importing the required libraries

In [9]:
import io
import numpy as np
from google_trans_new import google_translator 
from scipy.spatial.distance import cosine
import json 
import string
import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk import word_tokenize

### Loading data from json file

In [3]:
# Opening JSON file 
f = open('E:\Jupyter\SocialComputing\myrepo\\actors.json',) 
  
# returns JSON object as  
# a dictionary 
data = json.load(f) 
  
# Iterating through the json 
# list 
d = {}
d = data
  
# Closing file 
f.close()

### Storing sentences and triples from one article
Here we store all the sentences and the triples from one article. The article is about the actress 'Kalpana'. We display 5 sentences and 5 triples corresponding to her here.

In [5]:
sentences = d['Q6354355']['sentences']
triples = d['Q6354355']['triples']
print("5 sentences : \n")
print(sentences[:5],"\n")
print("5 triples : \n")
print(triples[:5],"\n")

5 sentences : 

['कल्पना प्रियदर्शनी एक भारतीय फ़िल्म अभिनेत्री थी, जो दक्षिण भारतीय फिल्मों में मुख्य रूप से मलयालम और तमिल फिल्मों में अपने काम के लिए प्रसिद्ध थी', 'उन्होंने दक्षिण भारतीय भाषाओं में ३०० से भी अधिक फिल्मों में अभिनय किया है', 'उन्होंने एक बाल कलाकार के रूप में अपना करियर शुरू किया १९७० के दशक से', '६०वें राष्ट्रीय फिल्म अवॉर्ड्स में, उन्होंने थानीचला नंजन (2012) में उनके प्रदर्शन के लिए सर्वश्रेष्ठ सहायक अभिनेत्री का पुरस्कार जीता', 'कल्पना का जन्म मंच कलाकार चवरा वी. पी. नायर और विजयलक्ष्मी के हुआ था'] 

5 triples : 

[['Kalpana', 'place of birth', 'Kerala'], ['Kalpana', 'place of death', 'Hyderabad'], ['Kalpana', 'sex or gender', 'female'], ['Kalpana', 'spouse', 'Anil Kumar'], ['Kalpana', 'country of citizenship', 'India']] 



In [10]:
def load_vec(emb_path, nmax=50000):
    vectors = []
    word2id = {}
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        next(f)
        for i, line in enumerate(f):
            word, vect = line.rstrip().split(' ', 1)
            vect = np.fromstring(vect, sep=' ')
            assert word not in word2id, 'word found twice'
            vectors.append(vect)
            word2id[word] = len(word2id)
            if len(word2id) == nmax:
                break
    id2word = {v: k for k, v in word2id.items()}
    embeddings = np.vstack(vectors)
    return embeddings, id2word, word2id

### Getting source and target embeddings from the aligned multilingual vector space

In [11]:
src_path = "E:\Jupyter\SocialComputing\myrepo\MUSE\dumped\debug\ge4rutdeo7\\vectors-hi.txt"
tgt_path = "E:\Jupyter\SocialComputing\myrepo\MUSE\dumped\debug\ge4rutdeo7\\vectors-en.txt"
nmax = 50000  # maximum number of word embeddings to load

src_embeddings, src_id2word, src_word2id = load_vec(src_path, nmax)
tgt_embeddings, tgt_id2word, tgt_word2id = load_vec(tgt_path, nmax)

In [12]:
def invocab(word,src_id2word):
    word2id = {v: k for k, v in src_id2word.items()}
    if word in word2id:
        return True
    else:
        return False

def get_srcembeddings(word, src_emb, src_id2word):
    word2id = {v: k for k, v in src_id2word.items()}
    word_emb = src_emb[word2id[word]]
    return word_emb

def get_trgembeddings(word,tgt_emb, tgt_id2word):
    word2id = {v: k for k, v in tgt_id2word.items()}
    word_emb = tgt_emb[word2id[word]]
    return word_emb


### Obtaining sentence and triple embeddings
We obtain the sentence embeddings by taking each sentence, obtaining word embeddings for each word in the sentence, and averaging the word embeddings. For triple embeddings , we average the embeddings for each word in the triple.

In [13]:
sentence_embeddings = {}
for srcsent in sentences:
    avgsent = 0
    sumemb = 0
    for src_word in srcsent.split(' '):
        src_word = src_word.translate(str.maketrans('', '', string.punctuation))
        if invocab(src_word,src_id2word):
            src_emb = get_srcembeddings(src_word,src_embeddings, src_id2word)
        sumemb = sumemb + src_emb
    avgsent = sumemb/len(srcsent)
    sentence_embeddings[srcsent] = avgsent
entity_embeddings = {}
for entity in triples:
    sument = 0
    avgent = 0
    pred = entity[1]
    obj = entity[2]
    total = pred+" "+obj
    for word in word_tokenize(total):
        if invocab(word,tgt_id2word) and word not in stop_words and len(word) > 2:
            pred_emb = get_trgembeddings(word,tgt_embeddings, tgt_id2word)
            sument = sument + pred_emb
    avgent = sument/len(total)
    entity_embeddings[tuple(entity)] = avgent

### Obtaining similarity between sentence and triples and returning the top matches 

In [14]:
matches = {}
for sent in sentence_embeddings:
    ent_matchlist = []
    sent_embed = sentence_embeddings[sent]
    for ent in entity_embeddings:
        ent_embed = entity_embeddings[ent]
        similarity = 1 - cosine(sent_embed,ent_embed)
        ent = list(ent)
        if similarity > 0.5:
            ent_matchlist.append([ent,similarity])
    if len(ent_matchlist)>0:
        ent_matchlist = np.asarray(ent_matchlist)
        ent_matchlist = ent_matchlist[ent_matchlist[:,1].argsort()][::-1]

        matches[sent] = ent_matchlist

  dist = 1.0 - uv / np.sqrt(uu * vv)


### Below we have returned the top triple matches for each hindi sentence in the article

In [17]:
for i in matches:
    if len(matches[i])>0:
        print(i," ================================\n\n",matches[i])
        print("\n")        


 [[list(['Kalpana', 'work period (start)', '1977-01-01T00:00:00Z'])
  0.5562646976526775]
 [list(['Kalpana', 'occupation', 'actor']) 0.5489648255664478]
 [list(['Kalpana', 'number of children', '1']) 0.5398085957969948]
 [list(['Kalpana', 'manner of death', 'natural causes'])
  0.5249625103999355]
 [list(['Kalpana', 'languages spoken, written or signed', 'Malayalam'])
  0.5081523213012479]]



 [[list(['Kalpana', 'work period (start)', '1977-01-01T00:00:00Z'])
  0.5728460129889632]
 [list(['Kalpana', 'languages spoken, written or signed', 'Malayalam'])
  0.5587660164292069]
 [list(['Kalpana', 'manner of death', 'natural causes'])
  0.5385730372651234]
 [list(['Kalpana', 'number of children', '1']) 0.5337991490152064]]



 [[list(['Kalpana', 'work period (start)', '1977-01-01T00:00:00Z'])
  0.611821704312822]
 [list(['Kalpana', 'number of children', '1']) 0.5276704540197561]
 [list(['Kalpana', 'manner of death', 'natural causes'])
  0.5191525447008568]]



 [[list(['Kalpana', 'work per

#### We do find quite a few of the matching triples to be relevant to the sentence. But, there are a few irrelevant matches as well.
Upon analysis, we think the word overlap is working better than the vector similarity approach. A possible reason can be that when we simply average out the words in a sentence, and when we average out the words in the triples and then find the similarity between these two averages, some semantic information is lost. So, triples that should have been irrelevant are also found as similar. As the word overlap method is a strictly string overlap, the relevance is much stronger.