In [1]:
from google.colab import drive

drive.mount('/content/drive')


Mounted at /content/drive


In [13]:
!pip install google_trans_new

Collecting google_trans_new
  Downloading https://files.pythonhosted.org/packages/f9/7b/9f136106dc5824dc98185c97991d3cd9b53e70a197154dd49f7b899128f6/google_trans_new-1.1.9-py3-none-any.whl
Installing collected packages: google-trans-new
Successfully installed google-trans-new-1.1.9


In [26]:
import io
import numpy as np
from google_trans_new import google_translator
import nltk
nltk.download('stopwords') 
import string

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Importing data from json file

In [9]:
import json 
  
# Opening JSON file 
f = open('/content/drive/My Drive/actors.json',) 
  
# returns JSON object as  
# a dictionary 
data = json.load(f) 
  
# Iterating through the json 
# list 
d = {}
d = data
  
# Closing file 
f.close() 

### Storing sentences and triples from one article
Here we store all the sentences and the triples from one article. The article is about the actress 'Kalpana'. We display 5 sentences and 5 triples corresponding to her here.

In [34]:
sentences = d['Q6354355']['sentences']
triples = d['Q6354355']['triples']
print("5 sentences : \n")
for s in sentences[:5]:
  print(s,"\n")
print("5 triples : \n")
for t in triples[:5]:
  print(t,"\n")

5 sentences : 

कल्पना प्रियदर्शनी एक भारतीय फ़िल्म अभिनेत्री थी, जो दक्षिण भारतीय फिल्मों में मुख्य रूप से मलयालम और तमिल फिल्मों में अपने काम के लिए प्रसिद्ध थी 

उन्होंने दक्षिण भारतीय भाषाओं में ३०० से भी अधिक फिल्मों में अभिनय किया है 

उन्होंने एक बाल कलाकार के रूप में अपना करियर शुरू किया १९७० के दशक से 

६०वें राष्ट्रीय फिल्म अवॉर्ड्स में, उन्होंने थानीचला नंजन (2012) में उनके प्रदर्शन के लिए सर्वश्रेष्ठ सहायक अभिनेत्री का पुरस्कार जीता 

कल्पना का जन्म मंच कलाकार चवरा वी. पी. नायर और विजयलक्ष्मी के हुआ था 

5 triples : 

['Kalpana', 'place of birth', 'Kerala'] 

['Kalpana', 'place of death', 'Hyderabad'] 

['Kalpana', 'sex or gender', 'female'] 

['Kalpana', 'spouse', 'Anil Kumar'] 

['Kalpana', 'country of citizenship', 'India'] 



In [18]:

def load_vec(emb_path, nmax=50000):
    vectors = []
    word2id = {}
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        next(f)
        for i, line in enumerate(f):
            word, vect = line.rstrip().split(' ', 1)
            vect = np.fromstring(vect, sep=' ')
            assert word not in word2id, 'word found twice'
            vectors.append(vect)
            word2id[word] = len(word2id)
            if len(word2id) == nmax:
                break
    id2word = {v: k for k, v in word2id.items()}
    embeddings = np.vstack(vectors)
    return embeddings, id2word, word2id

### Getting source and target embeddings from the aligned multilingual vector space

In [19]:
dir = '/content/drive/My Drive/Algo Name detection implementation/Text/'
src_path = '/content/drive/My Drive/wiki.hi.align.vec'
tgt_path = '/content/drive/My Drive/wiki.en.align.vec'
nmax = 50000  # maximum number of word embeddings to load

src_embeddings, src_id2word, src_word2id = load_vec(src_path, nmax)
tgt_embeddings, tgt_id2word, tgt_word2id = load_vec(tgt_path, nmax)

**Get Nearest Neighbours**


In [20]:
import nltk
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
def get_nn(word, src_emb, src_id2word, tgt_emb, tgt_id2word, K=5):
    # print("Nearest neighbors of \"%s\":" % word)
    word2id = {v: k for k, v in src_id2word.items()}
    targetwordlist = []                               # List of target words for the source word 
    if word in word2id:                               #Check if word is in vocab
      word_emb = src_emb[word2id[word]]
      scores = (tgt_emb / np.linalg.norm(tgt_emb, 2, 1)[:, None]).dot(word_emb / np.linalg.norm(word_emb))
      k_best = scores.argsort()[-K:][::-1]
      for i, idx in enumerate(k_best):
          # print((scores[idx], tgt_id2word[idx]))      #To give both distance and word
          if tgt_id2word[idx] not in stop_words:
            targetwordlist.append(tgt_id2word[idx].lower())
      return targetwordlist
    else:
      translator = google_translator()
      translate_text = translator.translate(word,lang_tgt='en') 
      transw = translate_text
      if transw not in stop_words:
      # print(word," - Translated - ",transw)
        return [transw.lower()]
      

In [None]:
# nearest neighbors in the source space
src_word = 'cat'
get_nn(src_word, src_embeddings, src_id2word, src_embeddings, src_id2word, K=5)

['cat', 'cats', 'catch', 'nocat', 'categorise']

These are the 5 nearest neighbours in English for the sample word - अभिनेता. We obtain these nearest neighbours from the aligned multilingual vector space. 

In [25]:
# printing nearest neighbors in the target space
src_word = 'अभिनेता'
l = get_nn(src_word, src_embeddings, src_id2word, tgt_embeddings, tgt_id2word, K=5)
print("Top 5 nearest neighbours for the word : ",src_word)
for i,e in enumerate(l):
  print("\n",i+1,". ",e)

Top 5 nearest neighbours for the word :  अभिनेता

 1 .  actor

 2 .  actors

 3 .  actress

 4 .  actresses

 5 .  film


**Function for word overlap**
We simply return the number of words in common between two phrases

In [29]:
def wordoverlap(entity_text,sentence_text_list):
  words = entity_text.split(' ')    #splitting the predicate's or object's words
  score = 0                         # calculate no of word matches
  for w in words:
    if w.lower() in sentence_text_list:
      score = score+1
  return score
print("Word overlap score for the phrases : 'cause of death' and 'actress died due to a bad cause of death' = ",wordoverlap("cause of death","actress died due to a bad cause of death"))

Word overlap score for the phrases : 'cause of death' and 'actress died due to a bad cause of death' =  3


### Matching triples with sentences
We find 5 English nearest neighbours for each word in the Hindi sentence. Then , we put all these english words in a list and find the total word overlap with the triples. We keep a threshold for a minimum word overlap score required to match a triple with the sentence

In [30]:

# srcwordlist = ['विश्वविद्यालय', 'प्यार']
# sentence = 'पढ़ाई अंगद बेदी ने अपनी शुरुआती पढ़ाई ज्ञान भारती स्कूल, दिल्ली से की, और स्नातक की पढाई सैंट स्टीफेंस कॉलेज, दिल्ली से पूरी की है'
tgworddict = {}
for sent in sentences:
  srcwordlist = sent.split(' ')
  tgwordlists = []
  for src_word in srcwordlist:
    # print(src_word)
    src_word = src_word.translate(str.maketrans('', '', string.punctuation))  # Removing punct
    tgtwords = get_nn(src_word, src_embeddings, src_id2word, tgt_embeddings, tgt_id2word, K=5)
    if len(tgtwords)>1 and tgtwords!=None:
      tgwordlists.extend(tgtwords)
  tgworddict[sent] = tgwordlists

In [31]:
matches_with_sent = {}
for sent in tgworddict:
  words = tgworddict[sent]
  matches = []
  for entity in triples:
    predicate = entity[1]
    obj = entity[2]
    score_predicate = wordoverlap(predicate,words)
    score_obj = wordoverlap(obj,words)
    if score_predicate >0 or score_obj >0:
      # print(predicate,"-",obj,"-",words)
      matches.append([entity,score_predicate + score_obj])
  if len(matches) >0:
      # Sorting 
    matches = np.asarray(matches)
    matches = matches[matches[:,1].argsort()][::-1]
    matches_with_sent[sent] = matches[:,0]
 

  return array(a, dtype, copy=False, order=order)


#### DIsplaying the matching sentences with the triples for the article about the actress - 'Kalpana'

In [None]:
for k,v in matches_with_sent.items():
  print("\n")
  print("*******************************************************************************")
  print(k)
  print("===============================")
  print("\t\t\t\t\t\n",v)



*******************************************************************************
कल्पना प्रियदर्शनी एक भारतीय फ़िल्म अभिनेत्री थी, जो दक्षिण भारतीय फिल्मों में मुख्य रूप से मलयालम और तमिल फिल्मों में अपने काम के लिए प्रसिद्ध थी
					
 [list(['Kalpana', 'work period (start)', '1977-01-01T00:00:00Z'])
 list(['Kalpana', 'languages spoken, written or signed', 'Malayalam'])
 list(['Kalpana', 'occupation', 'actor'])
 list(['Kalpana', 'country of citizenship', 'India'])]


*******************************************************************************
उन्होंने दक्षिण भारतीय भाषाओं में ३०० से भी अधिक फिल्मों में अभिनय किया है
					
 [list(['Kalpana', 'languages spoken, written or signed', 'Malayalam'])
 list(['Kalpana', 'country of citizenship', 'India'])]


*******************************************************************************
उन्होंने एक बाल कलाकार के रूप में अपना करियर शुरू किया १९७० के दशक से
					
 [list(['Kalpana', 'occupation', 'actor'])]


************************************

### Conclusion :
The result we get is fairly good. For a lot of sentences , we get triples that are relevant. Because we are using google translate to get english transliteration for the out of vocab hindi words, the result is even better. 