In [None]:
import os
import json
import torch.nn.functional as F
import torch

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# report which device was selected for the model and input tensors
print('using device:', device)

using device: cpu


In [None]:
# Root folder of the Hindi sample annotations. The original absolute path is
# kept as the default; set COPERNICUS_ANNOTATION_DIR to point the notebook at
# a copy of the data on another machine.
root_dir = os.environ.get(
    'COPERNICUS_ANNOTATION_DIR',
    '/home/tushar.abhishek/ire/research/project_copernicus/sample_annotations/hi',
)

# query/ holds the sampled sentences, response/ the recorded annotations
sentences_file = os.path.join(root_dir, 'query', 'hindi_sample_random_sentences.json')
response_file = os.path.join(root_dir, 'response', 'annotations_406_v1')

In [None]:
# The sentences are Devanagari text; read the JSON as UTF-8 explicitly so the
# result does not depend on the platform's default locale encoding.
with open(sentences_file, encoding='utf-8') as dfile:
    sent_data = json.load(dfile)

In [None]:
# Annotation records embed the Hindi sentence text; read the JSON as UTF-8
# explicitly so the result does not depend on the platform's default encoding.
with open(response_file, encoding='utf-8') as dfile:
    ann_data = json.load(dfile)

In [None]:
# size of both loaded collections
sentence_total = len(sent_data)
response_total = len(ann_data)
print('total count of sentences %d' % sentence_total)
print('total response recorded %d' % response_total)

total count of sentences 27246
total response recorded 406


In [None]:
# show the first annotation record (if any) to see its schema
for item in ann_data[:1]:
    print(item)

{'_id': {'$oid': '60ac87e2230a5f0015dc74ee'}, 'facts': [['occupation', 'cricketer', []], ['member of sports team', 'India national cricket team', [['start time', '1981'], ['end time', '1992']]], ['country for sport', 'India', []]], 'factIndex': [4, 10, 14], 'email': 'shivprasad.sagare@research.iiit.ac.in', 'sentence': 'इन्होंने बतौर क्रिकेट खिलाड़ी भारतीय क्रिकेट टीम के लिए १९८१ से १९९२ तक टेस्ट क्रिकेट और एक दिवसीय अंतर्राष्ट्रीय क्रिकेट खेला है।', 'sentenceOffset': 0, 'qid': 'Q3518502', 'covers': 'Y', 'date': {'$date': '2021-05-25T05:15:14.852Z'}}


In [None]:
# qid -> {'facts': ..., 'sentences': [...]}, sentences stored at their offset
qid_sentence_data = {}

for item in sent_data:
    qid = item['qid']
    entry = qid_sentence_data.get(qid)
    if entry is None:
        # first record seen for this entity: keep its facts and reserve one
        # empty placeholder per expected sentence
        entry = {
            'facts': item['facts'],
            'sentences': [[] for _ in range(item['sentence-count'])],
        }
        qid_sentence_data[qid] = entry
    entry['sentences'][item['sentence-offset']] = item['sentence']

In [None]:
print('total number of entities found', len(qid_sentence_data))

from collections import defaultdict

# Look for sentence slots that are still empty (length 0) after grouping:
# qid -> list of empty sentence offsets
empty_sentence = defaultdict(list)
empty_count = 0
for qid, entity in qid_sentence_data.items():
    for position, sentence in enumerate(entity['sentences']):
        if len(sentence) == 0:
            empty_sentence[qid].append(position)
            empty_count += 1

print('total number of empty sentences', empty_count)

total number of entities found 9922
total number of empty sentences 0


In [None]:
import random

# spot-check one randomly chosen entity: its qid, then its facts and sentences
random_key = random.choice(list(qid_sentence_data))
print(random_key)
print(qid_sentence_data[random_key])

Q62604398
{'facts': [['date of birth', '21 January 1967', []], ['instance of', 'human', []], ['sex or gender', 'male', []], ['position held', 'Member of the 17th Lok Sabha', [['parliamentary group', 'Bharatiya Janata Party'], ['electoral district', 'Karnal Lok Sabha constituency'], ['parliamentary term', '17th Lok Sabha']]], ['member of political party', 'Bharatiya Janata Party', []], ['country of citizenship', 'India', []], ['occupation', 'politician', []], ['educated at', 'Kurukshetra University', []], ['place of birth', 'Panipat', []]], 'sentences': ['संजय भाटिया हरियाणा से संसद के निर्वाचित सदस्य हैं।']}


In [None]:
from transformers import AutoTokenizer, AutoModel

# Checkpoint name handed to both the tokenizer and the model loader.
config = "google/muril-base-cased"
# Alternative checkpoints (assign one to `config` instead):
#   "xlm-roberta-large"
#   "ai4bharat/indic-bert"
#   "sentence-transformers/LaBSE"
#   "bert-base-multilingual-uncased"
#   "facebook/mbart-large-cc25"  (original note: "also need to change")

tokenizer = AutoTokenizer.from_pretrained(config)
model = AutoModel.from_pretrained(config)
model = model.to(device)

In [None]:
# pick one annotation that has at least one selected fact
annotations_with_facts = [ann for ann in ann_data if len(ann['factIndex'])]
random_data = random.choice(annotations_with_facts)
random_qid = random_data['qid']

# facts rendered as "<property> is <value>" strings, plus the entity's sentences
entity_record = qid_sentence_data[random_qid]
fact_list = ["%s is %s" % (fact[0], fact[1]) for fact in entity_record['facts']]
sentence_list = entity_record['sentences']

target_sentence = sentence_list[random_data['sentenceOffset']]
fact_indexes = random_data['factIndex']

In [None]:
import torch
import torch.nn as nn
# switch the model to evaluation mode before computing embeddings
model.eval()

def pooled_rep(model_output, attention_mask, reduce='cls'):
    """Collapse per-token embeddings into one vector per sequence.

    Args:
        model_output: tensor of token embeddings, indexed below as
            ``[batch, token, hidden]``.
        attention_mask: per-token mask (``[batch, token]``); it is expanded
            over the hidden axis and multiplied into the embeddings, so
            positions with mask 0 do not contribute to 'mean' / 'sum'.
        reduce: pooling strategy.
            'cls'  -- embedding of the first token of every sequence,
            'mean' -- mask-weighted average over tokens,
            'sum'  -- mask-weighted sum over tokens.

    Returns:
        Tensor with one pooled vector per sequence.

    Raises:
        ValueError: if ``reduce`` is not one of the strategies above.
    """
    if reduce == 'cls':
        return model_output[:, 0, :]
    if reduce not in ('mean', 'sum'):
        raise ValueError('reduce function not present !!!')

    # broadcast the mask over the hidden axis so masked tokens are zeroed out
    mask = attention_mask.unsqueeze(-1).expand(model_output.size()).float()
    summed = torch.sum(model_output * mask, 1)
    if reduce == 'sum':
        return summed
    # clamp keeps the division finite for a sequence whose mask is all zeros
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    return summed / token_counts

def _embed(texts, reduce='mean'):
    """Tokenize `texts`, run the model and pool the token embeddings.

    Returns the tokenizer batch, the model's first output and the pooled
    representation (mean pooling by default).
    """
    batch = tokenizer.batch_encode_plus(texts, padding='longest', return_attention_mask=True, return_tensors='pt')
    mask = batch["attention_mask"].to(device)
    hidden = model(input_ids=batch["input_ids"].to(device), attention_mask=mask)[0]
    return batch, hidden, pooled_rep(hidden, mask, reduce=reduce)

with torch.no_grad():
    # one mean-pooled vector per sentence of the sampled entity ...
    enc, s_out, sentence_encoding = _embed(sentence_list)
    # ... and one per "<property> is <value>" fact string
    fenc, f_out, facts_encoding = _embed(fact_list)

In [None]:
# plain cosine similarity between every fact and the target sentence

print("qid", random_qid)
print(target_sentence, end='\n\n')
with torch.no_grad():
    target_vec = sentence_encoding[random_data['sentenceOffset']].unsqueeze(0)
    scores = F.cosine_similarity(facts_encoding, target_vec, 1, 1e-6).cpu().tolist()
    score_map = dict(enumerate(scores))
    ranked = sorted(score_map.items(), key=lambda pair: pair[1], reverse=True)
    for fact_pos, similarity in ranked:
        line = "%0.3f %s" % (similarity, fact_list[fact_pos])
        # a leading "*" marks facts the annotator selected for this sentence
        print(("* %s" % line) if fact_pos in fact_indexes else line)

qid Q317969
अब्दुस्सत्तार बांग्लादेश के राष्ट्रपति थे।

* 0.992 position held is President of Bangladesh
0.989 member of political party is Bangladesh Nationalist Party
0.989 occupation is politician
0.989 religion is Islam
0.989 country of citizenship is Bangladesh
0.989 place of birth is Birbhum district
0.989 place of death is Dhaka
0.988 country of citizenship is British Raj
0.988 instance of is human
0.988 country of citizenship is Pakistan
0.988 native language is Bengali
0.988 educated at is University of Calcutta
0.988 date of birth is 1906
0.987 date of death is 05 October 1985
0.986 sex or gender is male


In [None]:
# cosine similarity score using neighbour information
# refer to Unsupervised parallel data mining section of 
# this paper: https://arxiv.org/pdf/2006.09526.pdf for more details
#
# Each fact/sentence cosine is divided by the mean similarity of the fact to
# its `neighbour` most similar other facts and of the sentence to its
# `neighbour` most similar other sentences (half weight each).

import numpy as np
neighbour=5


print("qid", random_qid)
print(target_sentence, end='\n\n')
with torch.no_grad():
    target_idx = random_data['sentenceOffset']
    target_vec = sentence_encoding[target_idx].unsqueeze(0)
    g_scores = F.cosine_similarity(facts_encoding, target_vec, 1, 1e-6).cpu().tolist()
    score_map = {}

    # similarity of the target sentence to every other sentence; sort so the
    # slice keeps the *nearest* neighbours rather than the first ones in order
    sentence_neighbours_enc = F.cosine_similarity(sentence_encoding, target_vec)
    sentence_scores = sorted(
        (x.item() for i, x in enumerate(sentence_neighbours_enc) if i != target_idx),
        reverse=True)[:neighbour]
    sent_k_score = max(np.sum(sentence_scores), 0)
    sent_k = max(len(sentence_scores), 1)

    for i, _ in enumerate(fact_list):
        temp_fact_cosine = F.cosine_similarity(facts_encoding, facts_encoding[i].unsqueeze(0))
        facts_scores = sorted(
            (x.item() for j, x in enumerate(temp_fact_cosine) if i != j),
            reverse=True)[:neighbour]
        fact_k_score = max(np.sum(facts_scores), 0)
        fact_k = max(len(facts_scores), 1)
        denom = (fact_k_score/(2*fact_k)) + (sent_k_score/(2*sent_k))
        # no usable neighbours on either side: fall back to the raw cosine
        score_map[i] = g_scores[i]/denom if denom > 0 else g_scores[i]

    for j, u in sorted(score_map.items(), key=lambda x: x[1], reverse=True):
        temp_string = "%0.3f %s" % (u, fact_list[j])
        # a leading "*" marks facts the annotator selected for this sentence
        if j in fact_indexes:
            print("* %s" % temp_string)
        else:
            print(temp_string)

qid Q317969
अब्दुस्सत्तार बांग्लादेश के राष्ट्रपति थे।

* 1.000 position held is President of Bangladesh
0.998 member of political party is Bangladesh Nationalist Party
0.997 educated at is University of Calcutta
0.997 occupation is politician
0.997 religion is Islam
0.996 country of citizenship is Bangladesh
0.996 place of birth is Birbhum district
0.996 instance of is human
0.996 native language is Bengali
0.996 place of death is Dhaka
0.996 country of citizenship is British Raj
0.995 country of citizenship is Pakistan
0.995 sex or gender is male
0.995 date of death is 05 October 1985
0.995 date of birth is 1906


In [None]:
def get_score(true_y, pred_y):
    """Set-based precision and recall of predicted indices.

    Both inputs are de-duplicated before scoring, so a repeated index in
    either list cannot distort the denominators.

    Args:
        true_y: iterable of gold (annotated) indices.
        pred_y: iterable of predicted indices.

    Returns:
        ``(precision, recall)`` as floats; ``(0.0, 0.0)`` when either side
        is empty.
    """
    gold = set(true_y)
    predicted = set(pred_y)
    if not gold or not predicted:
        return 0.0, 0.0
    hits = len(gold & predicted)
    # denominators use the set sizes, matching the set-based intersection
    precision = hits / float(len(predicted))
    recall = hits / float(len(gold))
    return precision, recall

In [None]:
import numpy as np

def fact_str(fact, enable_qualifiers=False):
    """Return the textual parts of a fact as a new list.

    Args:
        fact: sequence whose first two items are the property and the value;
            an optional third item is a list of qualifier sequences of
            strings.
        enable_qualifiers: when True, append each qualifier joined with a
            single space.

    Returns:
        ``[property, value]``, followed by the qualifier strings when
        requested. The input is never modified.
    """
    parts = list(fact[0:2])
    if enable_qualifiers:
        # qualifiers are only read when requested, so a fact without a
        # qualifier slot is still accepted
        qualifiers = fact[2] if len(fact) > 2 else []
        parts.extend(' '.join(q) for q in qualifiers)
    return parts

def pooled_rep(model_output, attention_mask, reduce='cls'):
    """Collapse per-token embeddings into one vector per sequence.

    Args:
        model_output: tensor of token embeddings, indexed below as
            ``[batch, token, hidden]``.
        attention_mask: per-token mask (``[batch, token]``); it is expanded
            over the hidden axis and multiplied into the embeddings, so
            positions with mask 0 do not contribute to 'mean' / 'sum'.
        reduce: pooling strategy.
            'cls'  -- embedding of the first token of every sequence,
            'mean' -- mask-weighted average over tokens,
            'sum'  -- mask-weighted sum over tokens.

    Returns:
        Tensor with one pooled vector per sequence.

    Raises:
        ValueError: if ``reduce`` is not one of the strategies above.
    """
    if reduce == 'cls':
        return model_output[:, 0, :]
    if reduce not in ('mean', 'sum'):
        raise ValueError('reduce function not present !!!')

    # broadcast the mask over the hidden axis so masked tokens are zeroed out
    mask = attention_mask.unsqueeze(-1).expand(model_output.size()).float()
    summed = torch.sum(model_output * mask, 1)
    if reduce == 'sum':
        return summed
    # clamp keeps the division finite for a sequence whose mask is all zeros
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    return summed / token_counts

def get_sentence_specific_fact_alignment(tsentences, tfacts, index, threshold=0.5, score_type='cosine', reduce='cls', neighbour=5):
    """Rank an entity's facts against one of its sentences; return the top share.

    Sentences and facts (rendered as "<property> is <value>") are embedded
    with the notebook-level `tokenizer` / `model` and pooled with
    `pooled_rep`; every fact is then scored against sentence `index`.

    Args:
        tsentences: list of sentence strings of the entity.
        tfacts: list of facts in the format accepted by `fact_str`.
        index: position in `tsentences` of the sentence to align.
        threshold: fraction of the facts to return
            (``int(threshold * len(tfacts))`` items).
        score_type: 'cosine' for the raw cosine similarity; any other value
            divides the cosine by the mean similarity of the fact and of the
            sentence to their nearest neighbours.
        reduce: pooling strategy forwarded to `pooled_rep`.
        neighbour: number of nearest neighbours used by the normalised score.

    Returns:
        List of fact indices ordered from highest to lowest score.
    """
    if len(tfacts) == 0:
        # nothing to rank (and nothing to feed to the tokenizer)
        return []

    with torch.no_grad():
        enc = tokenizer.batch_encode_plus(tsentences, padding='longest', return_attention_mask=True, return_tensors='pt')
        sent_mask = enc["attention_mask"].to(device)
        s_out = model(input_ids=enc["input_ids"].to(device), attention_mask=sent_mask)[0]
        sentence_encoding = pooled_rep(s_out, sent_mask, reduce=reduce)

        processed_facts = [' is '.join(fact_str(x)) for x in tfacts]
        fenc = tokenizer.batch_encode_plus(processed_facts, padding='longest', return_attention_mask=True, return_tensors='pt')
        fact_mask = fenc["attention_mask"].to(device)
        f_out = model(input_ids=fenc["input_ids"].to(device), attention_mask=fact_mask)[0]
        facts_encoding = pooled_rep(f_out, fact_mask, reduce=reduce)

        target_vec = sentence_encoding[index].unsqueeze(0)
        scores = F.cosine_similarity(facts_encoding, target_vec, 1, 1e-6).cpu().tolist()
        if score_type == 'cosine':
            score_map = dict(enumerate(scores))
        else:
            # similarity of the target sentence to the other sentences; sort
            # so the slice keeps the nearest ones, not the first in order
            sentence_neighbours_enc = F.cosine_similarity(sentence_encoding, target_vec)
            sentence_scores = sorted(
                (x.item() for i, x in enumerate(sentence_neighbours_enc) if i != index),
                reverse=True)[:neighbour]
            sent_k_score = max(np.sum(sentence_scores), 0)
            sent_k = max(len(sentence_scores), 1)

            score_map = {}
            for i, _ in enumerate(tfacts):
                temp_fact_cosine = F.cosine_similarity(facts_encoding, facts_encoding[i].unsqueeze(0))
                facts_scores = sorted(
                    (x.item() for j, x in enumerate(temp_fact_cosine) if i != j),
                    reverse=True)[:neighbour]
                fact_k_score = max(np.sum(facts_scores), 0)
                fact_k = max(len(facts_scores), 1)
                denom = (fact_k_score/(2*fact_k)) + (sent_k_score/(2*sent_k))
                # no usable neighbours on either side: keep the raw cosine
                score_map[i] = scores[i]/denom if denom > 0 else scores[i]

    # keep the best-scoring `threshold` share of the facts (possibly none)
    max_facts = max(int(threshold * len(score_map)), 0)
    ranked = sorted(score_map.items(), key=lambda x: x[1], reverse=True)
    return [j for j, _ in ranked[:max_facts]]

In [None]:
# evaluating on the annotated dataset

from tqdm import tqdm
import numpy as np
from collections import defaultdict

partial_coverage_scores = []
full_coverage_scores = []
combined_scores = []

# results[qid][sentence offset] -> {'annotated': [...], 'predicted': [...]}
results = defaultdict(lambda: defaultdict())


def _mean_at(score_pairs, position):
    """Average the `position`-th component of (precision, recall) pairs."""
    return np.mean([pair[position] for pair in score_pairs])


model.eval()
for item in tqdm(ann_data):
    qid, sent_index, true_fact_index = item['qid'], item['sentenceOffset'], item['factIndex']
    # annotations without any selected fact are not evaluated
    if len(true_fact_index) == 0:
        continue
    t_qid_data = qid_sentence_data[qid]

    # predict the top half of the facts by cosine score on mean-pooled embeddings
    pred_fact_index = get_sentence_specific_fact_alignment(
        t_qid_data['sentences'],
        t_qid_data['facts'],
        sent_index,
        threshold=0.5,
        score_type='cosine',
        reduce='mean',
    )

    results[qid][sent_index] = {'annotated': true_fact_index, 'predicted': pred_fact_index}
    score = get_score(true_fact_index, pred_fact_index)
    # bucket by whether the annotator marked the sentence as covered ('Y')
    bucket = full_coverage_scores if item['covers'] == 'Y' else partial_coverage_scores
    bucket.append(score)
    combined_scores.append(score)

print('combined [%d] | avg_precision : %f, avg_recall : %f' % (len(combined_scores), _mean_at(combined_scores, 0), _mean_at(combined_scores, 1)))
if len(full_coverage_scores):
    print('full coverage [%d] | avg_precision : %f, avg_recall : %f' % (len(full_coverage_scores), _mean_at(full_coverage_scores, 0), _mean_at(full_coverage_scores, 1)))
if len(partial_coverage_scores):
    print('partial coverage [%d] | avg_precision : %f, avg_recall : %f' % (len(partial_coverage_scores), _mean_at(partial_coverage_scores, 0), _mean_at(partial_coverage_scores, 1)))

100%|██████████| 406/406 [07:19<00:00,  1.08s/it]

combined [250] | avg_precision : 0.299082, avg_recall : 0.785467
full coverage [111] | avg_precision : 0.288798, avg_recall : 0.846085
partial coverage [139] | avg_precision : 0.307294, avg_recall : 0.737059





In [None]:
import random

# Inspect one fully covered annotation next to its prediction. Only
# annotations that actually have an entry in `results` are eligible: the
# evaluation loop skips records with an empty factIndex, and picking one of
# those would fail on the `results[...]` lookups below.
evaluated_full_coverage = [
    x for x in ann_data
    if x['covers']=='Y' and x['sentenceOffset'] in results.get(x['qid'], {})
]
random_annotation = random.choice(evaluated_full_coverage)
random_qid = random_annotation['qid']
random_sent_idx = random_annotation['sentenceOffset']

sample_result = results[random_qid][random_sent_idx]
entity_facts = qid_sentence_data[random_qid]['facts']

print('QID', random_qid)
print('annotator', random_annotation['email'])
print('--'*30)
print('prompt sentence [%d] : %s' % (random_sent_idx, qid_sentence_data[random_qid]['sentences'][random_sent_idx]))
print('actual sentence : %s' % random_annotation['sentence'])
print('--'*30)
print('actual index', sample_result['annotated'])
print('predicted index', sample_result['predicted'])
print('--'*30)
print('all candidate facts')
for fi, _fact in enumerate(entity_facts):
    print("%d> %s" % (fi, _fact))
print('=='*30)
random_sample_score = get_score(sample_result['annotated'], sample_result['predicted'])
print('precision: %0.2f, recall: %0.2f' % (random_sample_score[0], random_sample_score[1]))
print('=='*30)
print('annotated facts > ')
for fi in sample_result['annotated']:
    print("%d> %s" % (fi, entity_facts[fi]))
print('~~'*30)
print('predicted facts > ')
for fi in sample_result['predicted']:
    print("%d> %s" % (fi, entity_facts[fi]))

QID Q6893010
annotator tushar.abhishek@ymail.com
------------------------------------------------------------
prompt sentence [2] : 27 नवम्बर 2018 को उनका मुम्बई में निधन हो गया।
actual sentence : 27 नवम्बर 2018 को उनका मुम्बई में निधन हो गया।
------------------------------------------------------------
actual index [1, 11]
predicted index [1, 11, 3, 0, 2, 4, 9, 12, 7]
------------------------------------------------------------
all candidate facts
0> ['date of birth', '02 July 1954', []]
1> ['date of death', '27 November 2018', []]
2> ['work period (start)', '1982', []]
3> ['work period (end)', '2018', []]
4> ['instance of', 'human', []]
5> ['sex or gender', 'male', []]
6> ['religion', 'Islam', []]
7> ['country of citizenship', 'India', []]
8> ['given name', 'Mohammed', []]
9> ['place of birth', 'Kolkata', []]
10> ['occupation', 'playback singer', []]
11> ['place of death', 'Mumbai', []]
12> ['family name', 'Aziz', []]
13> ['languages spoken, written or signed', 'Bengali', []]
14> ['l