In [None]:
# Mount Google Drive (Colab only) so the data files under /content/drive can be read.
from google.colab import drive

drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
!pip install google_trans_new

Collecting google_trans_new
  Downloading https://files.pythonhosted.org/packages/f9/7b/9f136106dc5824dc98185c97991d3cd9b53e70a197154dd49f7b899128f6/google_trans_new-1.1.9-py3-none-any.whl
Installing collected packages: google-trans-new
Successfully installed google-trans-new-1.1.9


In [None]:
import io
import numpy as np
from google_trans_new import google_translator
import nltk
nltk.download('stopwords') 
import string
from itertools import groupby 
from tqdm import tqdm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Importing data from json file

In [None]:
import json

# Load the annotated test data.  The context manager closes the file even if
# json.load raises, which the manual open()/close() pair did not guarantee.
with open('/content/drive/My Drive/test_data_v2.json', encoding='utf-8') as f:
    data = json.load(f)

# `d` is the name later cells read the test data from
# (d['actors'], d['cricketers'], d['politicians']).
d = data

In [None]:
### Getting a_c_p for getting all the triples
# The context manager closes the file even if json.load raises.
with open('/content/drive/My Drive/a_c_p.json', encoding='utf-8') as f:
    data = json.load(f)

# `acp` holds the per-category entity records (keys 'a', 'c', 'p' are read later).
acp = data

### Storing sentences and triples from one article
Here we store all the sentences and the triples from one article. The article is about the actress 'Kalpana'. We display 5 sentences and 5 triples corresponding to her here.

In [None]:

def load_vec(emb_path, nmax=50000):
    """Read word vectors from a text embedding file.

    The first line of the file is skipped; every following line is expected to
    be ``<word> <v1> <v2> ...`` separated by single spaces.  Reading stops once
    ``nmax`` words have been collected.

    Args:
        emb_path: Path of the embedding text file.
        nmax: Maximum number of words to load.

    Returns:
        A tuple ``(embeddings, id2word, word2id)`` where ``embeddings`` is the
        row-stacked array of vectors, ``id2word`` maps row index to word and
        ``word2id`` maps word to row index.

    Raises:
        AssertionError: If the same word appears twice in the file.
    """
    word2id = {}
    rows = []
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as handle:
        next(handle)  # skip the first (header) line
        for raw_line in handle:
            token, values = raw_line.rstrip().split(' ', 1)
            row = np.fromstring(values, sep=' ')
            assert token not in word2id, 'word found twice'
            rows.append(row)
            word2id[token] = len(word2id)  # next free row index
            if len(word2id) == nmax:
                break
    embeddings = np.vstack(rows)
    id2word = dict((index, token) for token, index in word2id.items())
    return embeddings, id2word, word2id

### Getting source and target embeddings from the aligned multilingual vector space

In [None]:
# NOTE(review): `dir` shadows the Python builtin and is not referenced by any other visible cell.
dir = '/content/drive/My Drive/Algo Name detection implementation/Text/'
# Vector files for the source (wiki.hi) and target (wiki.en) vocabularies.
src_path = '/content/drive/My Drive/wiki.hi.align.vec'
tgt_path = '/content/drive/My Drive/wiki.en.align.vec'
nmax = 50000  # maximum number of word embeddings to load

# load_vec returns (embedding matrix, id -> word map, word -> id map) for at most nmax words.
src_embeddings, src_id2word, src_word2id = load_vec(src_path, nmax)
tgt_embeddings, tgt_id2word, tgt_word2id = load_vec(tgt_path, nmax)

**Get Nearest Neighbours**


In [None]:
import nltk
from nltk.corpus import stopwords
# English stop words; get_nn filters its candidate words against this set.
stop_words = set(stopwords.words('english'))
def get_nn(word, src_emb, src_id2word, tgt_emb, tgt_id2word, K=5):
    """Return lower-cased English candidate words for a source-language word.

    If ``word`` is in the source vocabulary, the ``K`` target words whose
    embeddings have the highest cosine similarity with the word's embedding
    are returned, best first, with English stop words removed.  Otherwise the
    word is sent to ``google_translator`` and the words of the translation
    (stop words removed) are returned.

    Args:
        word: Source-language word to look up.
        src_emb: Source embedding matrix, one row per source word id.
        src_id2word: Mapping from source row index to word.
        tgt_emb: Target embedding matrix, one row per target word id.
        tgt_id2word: Mapping from target row index to word.
        K: Number of nearest neighbours to consider.

    Returns:
        A list of lower-cased strings.  The list may be empty but is never
        ``None``, so callers can always take ``len()`` of it or iterate it.
    """
    if not word:
        # Nothing to look up, and no point in a translator round trip.
        return []

    # Inverse of src_id2word.  It is rebuilt on every call because only the
    # id -> word direction is part of this function's signature.
    word2id = {v: k for k, v in src_id2word.items()}

    if word in word2id:
        if K <= 0:
            # scores.argsort()[-0:] would select *every* target word.
            return []
        word_emb = src_emb[word2id[word]]
        # Cosine similarity between the query vector and every target vector.
        scores = (tgt_emb / np.linalg.norm(tgt_emb, 2, 1)[:, None]).dot(word_emb / np.linalg.norm(word_emb))
        k_best = scores.argsort()[-K:][::-1]  # indices of the K best, best first
        targetwordlist = []
        for idx in k_best:
            # Lower-case before the stop-word test: the stop-word set is lower-case.
            candidate = tgt_id2word[idx].lower()
            if candidate not in stop_words:
                targetwordlist.append(candidate)
        return targetwordlist

    # Out-of-vocabulary word: fall back to machine translation.
    translator = google_translator()
    translated = translator.translate(word, lang_tgt='en')
    # NOTE(review): google_trans_new is believed to return text with trailing
    # whitespace, and sometimes a list of alternatives -- confirm against the
    # library.  Both shapes are normalised here.
    if isinstance(translated, str):
        translated = [translated]
    candidates = []
    for text in translated or []:
        if not isinstance(text, str):
            continue
        # Split multi-word translations: downstream matching is word-level,
        # so a phrase kept as one list element could never match anything.
        for token in text.lower().split():
            if token not in stop_words:
                candidates.append(token)
    return candidates
      

These are the 5 nearest neighbours in English for the sample word - अभिनेता. We obtain these nearest neighbours from the aligned multilingual vector space. 

In [None]:
# printing nearest neighbors in the target space
src_word = 'अभिनेता'
# English candidate words for src_word, taken from the K=5 nearest target neighbours.
l = get_nn(src_word, src_embeddings, src_id2word, tgt_embeddings, tgt_id2word, K=5)
print("Top 5 nearest neighbours for the word : ",src_word)
# Print the candidates as a 1-based numbered list.
for i,e in enumerate(l):
  print("\n",i+1,". ",e)

Top 5 nearest neighbours for the word :  अभिनेता

 1 .  actor

 2 .  actors

 3 .  actress

 4 .  actresses

 5 .  film


**Function for word overlap**
We simply return the number of words in common between two phrases

In [None]:
def wordoverlap(entity_text, sentence_text_list):
  """Count the words of ``entity_text`` that occur among the sentence words.

  The comparison is case-insensitive and word-level.

  Args:
    entity_text: Phrase (e.g. a predicate or object label) whose
      whitespace-separated words are looked up.
    sentence_text_list: Iterable of sentence words.  A plain string is also
      accepted and is split on whitespace first.

  Returns:
    The number of words of ``entity_text`` found among the sentence words.
    A word repeated in ``entity_text`` is counted once per occurrence.
  """
  if isinstance(sentence_text_list, str):
    # With a raw string, `in` would be a substring test ("of" in "office").
    sentence_words = sentence_text_list.split()
  else:
    sentence_words = sentence_text_list
  # A set gives O(1) membership tests; lower-casing makes both sides comparable.
  vocabulary = {w.lower() for w in sentence_words}
  return sum(1 for w in entity_text.split() if w.lower() in vocabulary)
print("Word overlap score for the phrases : 'cause of death' and 'actress died due to a bad cause of death' = ",wordoverlap("cause of death","actress died due to a bad cause of death"))

Word overlap score for the phrases : 'cause of death' and 'actress died due to a bad cause of death' =  3


### Matching triples with sentences
We find the 5 nearest English neighbours for each word in the Hindi sentence. We then put all of these English words in a list and compute the total word overlap with each triple. A triple is matched with the sentence only if its word-overlap score reaches a minimum threshold.

In [None]:
# Split the loaded test data into its three categories.
actors_test,cricketers_test,politicians_test = d['actors'], d['cricketers'], d['politicians']
# Aliases for the same three lists under shorter names.
actors, cricketers,politicians = actors_test,cricketers_test,politicians_test

In [None]:
# Creating the dictionary for the test data where key = sentence and value = list of matching triples
def _build_test_dict(records):
  """Map each record's sentence to its set of (subject, predicate, object) triples.

  Each record is read through its 'sentence' and 'triples' keys directly, so a
  record that lacks one of them raises KeyError instead of silently reusing
  the value left over from the previous record.  If the same sentence occurs
  in several records, the last record wins.
  """
  test_dict = {}
  for record in records:
    test_dict[record['sentence']] = {
        (t['subject'], t['predicate'], t['object']) for t in record['triples']
    }
  return test_dict

actors_test_dict = _build_test_dict(actors_test)
cricketers_test_dict = _build_test_dict(cricketers_test)
politicians_test_dict = _build_test_dict(politicians_test)

In [None]:
# Number of distinct annotated sentences per category (the dicts are keyed by sentence).
len(actors_test_dict),len(cricketers_test_dict),len(politicians_test_dict)

(60, 13, 30)

In [None]:
### We had got a_c_p.json. We keep only the relevant triples by filtering by entity id in test annotated data
def _group_by_entity(test_records, knowledge_base):
  """Group triples and annotated sentences by entity id.

  Only entity ids that occur both in ``test_records`` and as a key of
  ``knowledge_base`` are kept, in order of first appearance in
  ``test_records``.

  Args:
    test_records: Annotated records, each with 'entity_id' and 'sentence'.
    knowledge_base: Mapping entity id -> record with 'personLabel' and
      'triples' (each triple having 'propertyLabel' and 'objectLabel').

  Returns:
    ``(triple_lists, sentence_lists)``: two parallel lists with one entry per
    kept entity.  ``triple_lists[i]`` is the list of
    (subject, predicate, object) tuples of entity i and ``sentence_lists[i]``
    the list of its annotated sentences in record order.
  """
  # dict preserves insertion order, so this also records first-appearance order.
  sentences_by_entity = {}
  for record in test_records:
    eid = record['entity_id']
    if eid in knowledge_base:
      sentences_by_entity.setdefault(eid, []).append(record['sentence'])

  triple_lists = []
  for eid in sentences_by_entity:
    entry = knowledge_base[eid]
    subject = entry['personLabel']
    triple_lists.append(
        [(subject, trip['propertyLabel'], trip['objectLabel']) for trip in entry['triples']]
    )
  return triple_lists, list(sentences_by_entity.values())

# Putting actors,cricketers and politicians from a_c_p
act,cric,pol = acp['a'],acp['c'],acp['p']

actors_trip, actors_sent = _group_by_entity(actors_test, act)
cricketers_trip, cricketers_sent = _group_by_entity(cricketers_test, cric)
politician_trip, politician_sent = _group_by_entity(politicians_test, pol)

In [None]:
# Number of entity groups per category (one sentence list per entity id).
len(actors_sent), len(cricketers_sent),len(politician_sent)

(34, 13, 27)

In [None]:
# Number of triple lists per category (one per entity id); compare with the sentence group counts.
len(actors_trip), len(cricketers_trip),len(politician_trip)

(34, 13, 27)

So, the sentence list is a list of lists, each inner list containing the sentences for one article / entity id. Similarly, the triple list is a list of lists, each inner list containing the triples for one article / entity id.
Now we run our algorithm on these to find the matching sentences and triples within each article. We keep the result in a dictionary: key = sentence and value = set of matching triples.

In [None]:
def matches(sentences, triples):
  """Match each sentence of an article with the triples it overlaps with.

  Every word of a sentence is mapped to English candidate words via
  ``get_nn``; a triple matches the sentence when its predicate or its object
  shares at least one word with those candidates.

  Args:
    sentences: Iterable of source-language sentences (strings).
    triples: Iterable of (subject, predicate, object) tuples.

  Returns:
    Dict mapping each sentence that has at least one match to the set of
    matching triples.  Sentences without a match are omitted.
  """
  # ASCII punctuation plus the Devanagari danda / double danda, which end
  # Hindi sentences and would otherwise stay glued to the last word.
  # Built once here rather than once per word.
  strip_table = str.maketrans('', '', string.punctuation + '\u0964\u0965')
  # Per-call cache so a repeated word costs one lookup/translation only.
  nn_cache = {}

  tgworddict = {}
  for sent in sentences:
    tgwordlists = []
    for raw_word in sent.split(' '):
      src_word = raw_word.translate(strip_table)  # Removing punct
      if not src_word:
        # Token was empty or pure punctuation.
        continue
      if src_word not in nn_cache:
        nn_cache[src_word] = get_nn(src_word, src_embeddings, src_id2word, tgt_embeddings, tgt_id2word, K=5)
      tgtwords = nn_cache[src_word]
      # Truthiness covers both None and []; a one-word result (which is what
      # the translation fallback produces) is kept rather than discarded.
      if tgtwords:
        tgwordlists.extend(tgtwords)
    tgworddict[sent] = tgwordlists

  #Matches
  matches_with_sent = {}
  for sent, words in tgworddict.items():
    matched = {
        entity for entity in triples
        # entity[1] is the predicate, entity[2] the object
        if wordoverlap(entity[1], words) > 0 or wordoverlap(entity[2], words) > 0
    }
    if matched:
      matches_with_sent[sent] = matched
  return matches_with_sent

In [None]:
# Run the matcher article by article and merge the per-article results into
# one sentence -> triples dict per category.
matches_act = {}
for sent_list, triple_list in zip(actors_sent, actors_trip):
  matches_act.update(matches(sent_list, triple_list))

matches_cric = {}
for sent_list, triple_list in zip(cricketers_sent, cricketers_trip):
  matches_cric.update(matches(sent_list, triple_list))

matches_pol = {}
for sent_list, triple_list in zip(politician_sent, politician_trip):
  matches_pol.update(matches(sent_list, triple_list))

### Evaluation : Precision and Recall

In [None]:
def evaluate(test_dict, matches_dict):
  """Compute sentence-averaged recall and precision of predicted triples.

  Precision is averaged over the sentences of ``matches_dict``; recall is
  averaged over the sentences of ``test_dict``.  Triples are compared with
  ``==``, so they may be tuples or lists.

  Args:
    test_dict: Gold standard, sentence -> collection of triples.
    matches_dict: Predictions, sentence -> collection of triples.

  Returns:
    ``(avg_rec, avg_prec)`` -- note the order: recall first.  A sentence
    contributes 0 precision when it has no gold entry or no predictions, and
    0 recall when it has no predictions or an empty gold collection.  Either
    average is 0.0 when its dict is empty.
  """
  def _true_positives(gold, predicted):
    # Number of (gold, predicted) pairs that are equal.
    return sum(1 for ent in gold for trip in predicted if ent == trip)

  # Precision: averaged over the predicted sentences.
  sum_prec = 0.0
  for sent, predicted in matches_dict.items():
    tp = fp = 0
    if sent in test_dict:
      gold = test_dict[sent]
      tp = _true_positives(gold, predicted)
      fp = sum(1 for trip in predicted if not any(ent == trip for ent in gold))
    prec = tp / (tp + fp) if (tp + fp) != 0 else 0.0
    sum_prec = prec + sum_prec

  # Recall: averaged over the gold sentences.
  sum_rec = 0.0
  for sent, gold in test_dict.items():
    tp = 0
    if sent in matches_dict:
      tp = _true_positives(gold, matches_dict[sent])
    # An empty gold collection would otherwise divide by zero.
    rec = tp / len(gold) if len(gold) != 0 else 0.0
    sum_rec = rec + sum_rec

  avg_rec = sum_rec / len(test_dict) if len(test_dict) != 0 else 0.0
  avg_prec = sum_prec / len(matches_dict) if len(matches_dict) != 0 else 0.0
  return avg_rec, avg_prec


# evaluate returns (average recall, average precision), in that order.
avg_rec_act, avg_prec_act = evaluate(actors_test_dict,matches_act)
avg_rec_cric, avg_prec_cric = evaluate(cricketers_test_dict,matches_cric)
avg_rec_pol, avg_prec_pol = evaluate(politicians_test_dict,matches_pol)      

In [None]:
# (recall, precision) pairs for actors, cricketers and politicians.
(avg_rec_act,avg_prec_act), (avg_rec_cric, avg_prec_cric) , (avg_rec_pol, avg_prec_pol)

((0.8252777777777779, 0.431624966370729),
 (0.8205128205128205, 0.3855921855921856),
 (0.6055555555555556, 0.37839506172839504))

In [None]:
# Unweighted mean of the three per-category scores (each category counts equally,
# regardless of how many sentences it has).
AverageRecall = (avg_rec_act + avg_rec_cric + avg_rec_pol)/3
AveragePrecision = (avg_prec_act + avg_prec_cric + avg_prec_pol)/3

AverageRecall, AveragePrecision

(0.7504487179487179, 0.3985374045637699)

In [None]:
# json cannot serialise sets, so every sentence's triple collection is
# replaced in place by a list, first in the gold dicts and then in the
# predicted ones.
for triple_map in (actors_test_dict, cricketers_test_dict, politicians_test_dict,
                   matches_act, matches_cric, matches_pol):
  for sent in triple_map:
    triple_map[sent] = list(triple_map[sent])

In [None]:
# Destination path -> object to dump, written in the order listed:
# first the input test files, then the output (matches) files.
_json_outputs = [
    # saving the input test files
    ("/content//drive//MyDrive//Word Overlap Output//actors_test.json", actors_test_dict),
    ("/content/drive/MyDrive/Word Overlap Output/cricketers_test.json", cricketers_test_dict),
    ("/content/drive/MyDrive/Word Overlap Output/politicians_test.json", politicians_test_dict),
    #saving the output files
    ("/content/drive/MyDrive/Word Overlap Output/actors_matches.json", matches_act),
    ("/content/drive/MyDrive/Word Overlap Output/cricketers_matches.json", matches_cric),
    ("/content/drive/MyDrive/Word Overlap Output/politicians_matches.json", matches_pol),
]
for _out_path, _payload in _json_outputs:
    with open(_out_path, "w") as outfile:
        json.dump(_payload, outfile)

In [None]:
# Display the sentence -> matched triples mapping for the cricketers.
matches_cric

{'अमय खुरासिया एक पूर्व भारतीय क्रिकेटर हैं।': {('Amay Khurasiya',
   'country for sport',
   'India'),
  ('Amay Khurasiya', 'country of citizenship', 'India'),
  ('Amay Khurasiya', 'member of sports team', 'India national cricket team'),
  ('Amay Khurasiya', 'member of sports team', 'Madhya Pradesh cricket team'),
  ('Amay Khurasiya', 'occupation', 'cricketer'),
  ('Amay Khurasiya', 'sport', 'cricket')},
 'कुमार श्री इंद्रजीत सिंहजी माधवसिंहजी (pronunciation सहायता·सूचना) (१५ जून १९३७ - १२ मार्च २०११) एक भारतीय क्रिकेट टीम के टेस्ट क्रिकेट खिलाड़ी थे ': {('Kumar Indrajitsinhji',
   'CricketArchive player ID',
   '1234'),
  ('Kumar Indrajitsinhji', 'ESPNcricinfo.com player ID', '29533'),
  ('Kumar Indrajitsinhji', 'country for sport', 'India'),
  ('Kumar Indrajitsinhji', 'country of citizenship', 'British India'),
  ('Kumar Indrajitsinhji', 'country of citizenship', 'Dominion of India'),
  ('Kumar Indrajitsinhji', 'country of citizenship', 'India'),
  ('Kumar Indrajitsinhji',
   'membe

In [None]:
### Running a unit test on evaluate
# The triples are written as literal tuples on purpose: unpacking into
# a, b, c, d would overwrite `d` (the loaded test data), and no `e` is
# defined in this cell for the (3, 4, 5) triple.
test_dict = {
    'ABCD': {(1, 2, 3), (1, 2, 4), (2, 3, 4)},
    'ABDC': {(1, 2, 3), (3, 4, 5)},
    'AABC': {(1, 2, 3)},
}
matches_dict = {
    'ABCD': {(1, 2, 3), (1, 2, 4)},
    'ABDC': {(1, 2, 3), (1, 2, 4)},
}

In [None]:
rec,prec = evaluate(test_dict,matches_dict)
# Hand-computed expectations for the toy example:
#   recall    -> per-sentence 2/3, 1/2 and 0 (no prediction), averaged over 3 gold sentences
#   precision -> per-sentence 2/2 and 1/2, averaged over 2 predicted sentences
assert abs(rec - (2/3 + 1/2 + 0)/3) < 1e-9, rec
assert abs(prec - (1 + 1/2)/2) < 1e-9, prec
rec,prec

In [None]:
# Hand-computed expected recall for the toy example: per-sentence recalls 2/3, 1/2 and 0, averaged.
(2/3 + 1/2 + 0)/3 

#### Displaying the matching sentences with the triples for the article about the actress - 'Kalpana'

In [None]:
# `matches_with_sent` is only a local variable inside matches(), so it is
# rebuilt here from the actor results instead of relying on leftover kernel state.
# NOTE(review): picking the article by 'Kalpana' appearing in the triple
# subject is an assumption about the entity's label -- confirm.
matches_with_sent = {
    sent: trips
    for sent, trips in matches_act.items()
    if any('Kalpana' in str(trip[0]) for trip in trips)
}
for k,v in matches_with_sent.items():
  print("\n")
  print("*******************************************************************************")
  print(k)
  print("===============================")
  print("\t\t\t\t\t\n",v)

### Conclusion :
The results we get are fairly good. For many sentences, we get triples that are relevant. Because we use Google Translate to obtain English translations for the out-of-vocabulary Hindi words, the results are even better.