In [1]:
# case study
import json

# Index of the document inspected in the case-study cells below.
analyze_row_index = 650
file_path = 'report/anaphors-2.1/docred/report.json'
# Decode explicitly as UTF-8 (the JSON interchange encoding): the documents
# contain non-ASCII tokens, and the locale default used by a bare open() is
# platform dependent.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)
# Last expression: display the tokenized sentences of the selected document.
data[analyze_row_index]['sents']

[['Adolfo',
  'Nicolás',
  'Pachón',
  '(',
  'born',
  '29',
  'April',
  '1936',
  ')',
  ',',
  'is',
  'a',
  'Spanish',
  'priest',
  'of',
  'the',
  'Roman',
  'Catholic',
  'Church',
  '.'],
 ['He',
  'was',
  'the',
  'thirtieth',
  'Superior',
  'General',
  'of',
  'the',
  'Society',
  'of',
  'Jesus',
  ',',
  'the',
  'largest',
  'religious',
  'order',
  'in',
  'the',
  'Roman',
  'Catholic',
  'Church',
  '.'],
 ['Nicolás',
  ',',
  'after',
  'consulting',
  'with',
  'Pope',
  'Francis',
  ',',
  'determined',
  'to',
  'resign',
  'after',
  'his',
  '80th',
  'birthday',
  ',',
  'and',
  'initiated',
  'the',
  'process',
  'of',
  'calling',
  'a',
  'Jesuit',
  'General',
  'Congregation',
  'to',
  'elect',
  'his',
  'successor',
  '.'],
 ['Until',
  'the',
  'resignation',
  'of',
  'his',
  'predecessor',
  ',',
  'Peter',
  'Hans',
  'Kolvenbach',
  ',',
  'it',
  'was',
  'not',
  'the',
  'norm',
  'for',
  'a',
  'Jesuit',
  'Superior',
  'General',
  '

In [2]:
def span_to_words(sents, span) -> str:
    """Return the space-joined tokens covered by ``span``.

    ``span`` is read as ``[[sent_idx, start], [_, end]]``: the sentence index
    and start offset come from the first pair, the exclusive end offset from
    the second pair. The sentence index of the second pair is never read.
    """
    opening, closing = span[0], span[1]
    sentence_tokens = sents[opening[0]]
    return ' '.join(sentence_tokens[opening[1]:closing[1]])

span_to_words(data[analyze_row_index]['sents'], data[analyze_row_index]['raw_spans'][0])

'Adolfo Nicolás Pachón'

In [3]:
def join_sents_words(sents):
    """Turn each tokenized sentence into one space-separated string."""
    return list(map(' '.join, sents))
join_sents_words(data[analyze_row_index]['sents'])

['Adolfo Nicolás Pachón ( born 29 April 1936 ) , is a Spanish priest of the Roman Catholic Church .',
 'He was the thirtieth Superior General of the Society of Jesus , the largest religious order in the Roman Catholic Church .',
 'Nicolás , after consulting with Pope Francis , determined to resign after his 80th birthday , and initiated the process of calling a Jesuit General Congregation to elect his successor .',
 'Until the resignation of his predecessor , Peter Hans Kolvenbach , it was not the norm for a Jesuit Superior General to resign ; they , like the great majority of the Popes up until Benedict XVI , generally served until death .',
 'However , the Jesuit constitutions include provision for a resignation .',
 'In October 2016 the thirty - sixth General Congregation of the Society of Jesus appointed his successor , Arturo Sosa from Venezuela .']

In [4]:
def cluster_to_words(sents, span, cluster):
    """Resolve a cluster of span indices into mention texts and raw spans.

    ``cluster`` holds indices into ``span``. An index of -1 marks a missing
    span and is rendered as the placeholder string 'None' in both outputs.

    Returns:
        A ``(words, spans)`` tuple of two lists aligned with ``cluster``.
    """
    placeholder = 'None'
    texts = [
        placeholder if idx == -1 else span_to_words(sents, span[idx])
        for idx in cluster
    ]
    picked_spans = [placeholder if idx == -1 else span[idx] for idx in cluster]
    return texts, picked_spans

cluster_to_words(data[analyze_row_index]['sents'], data[analyze_row_index]['raw_spans'], data[analyze_row_index]['clusters_pred'][0])

(['Jesuit', 'Jesuit'], [[[4, 3], [4, 4]], [[3, 18], [3, 19]]])

In [5]:
import json
rel_info_path = 'data/docred/rel_info.json'
rel2id_path = 'data/docred/rel2id.json'
with open(rel_info_path, 'r') as file:
    rel_info = json.load(file)
with open(rel2id_path, 'r') as file:
    rel2id = json.load(file)

# Invert rel2id in a single pass, building both lookup tables:
#   id2rel_info: relation id -> the rel_info entry for that relation key
#   id2rel:      relation id -> relation key
# Id 0 is skipped (presumably the "no relation" class -- TODO confirm).
id2rel_info = {}
id2rel = {}
for rel_key, rel_id in rel2id.items():
    if rel_id == 0:
        continue
    id2rel_info[rel_id] = rel_info[rel_key]
    id2rel[rel_id] = rel_key

def visualize_rel(sents, span, rel, clusters, id2rel_info: dict, evidence: list = None):
    """Render one relation triple in a human-readable form.

    Args:
        sents: tokenized sentences, forwarded to ``cluster_to_words``.
        span: span table, forwarded to ``cluster_to_words``.
        rel: mapping with keys 'h' and 't' (indices into ``clusters``, or -1)
            and 'r' (a key of ``id2rel_info``).
        clusters: clusters indexed by ``rel['h']`` / ``rel['t']``.
        id2rel_info: maps a relation id to the value shown under 'r'.
        evidence: optional evidence list; attached under 'evidence' whenever
            it is not None, so an empty list is preserved.

    Returns:
        A dict with keys 'h', 't', 'r' and, if given, 'evidence'. 'h' and 't'
        hold the ``(words, spans)`` tuple from ``cluster_to_words``, or the
        string 'None' when the corresponding index is -1.

    Raises:
        KeyError: if ``rel['r']`` is not present in ``id2rel_info``.
    """
    head = 'None' if rel['h'] == -1 else cluster_to_words(sents, span, clusters[rel['h']])
    tail = 'None' if rel['t'] == -1 else cluster_to_words(sents, span, clusters[rel['t']])
    v_rel = {'h': head, 't': tail, 'r': id2rel_info[rel['r']]}
    # Identity test (PEP 8): `!= None` would invoke a custom __ne__ if present.
    if evidence is not None:
        v_rel['evidence'] = evidence
    return v_rel

visualize_rel(data[analyze_row_index]['sents'], data[analyze_row_index]['raw_spans'], data[analyze_row_index]['rels_gold'][0], data[analyze_row_index]['clusters_gold'], id2rel_info)

{'h': (['Adolfo Nicolás Pachón', 'Nicolás'],
  [[[0, 0], [0, 3]], [[2, 0], [2, 1]]]),
 't': (['29 April 1936'], [[[0, 5], [0, 8]]]),
 'r': 'date of birth'}

In [6]:
def find_evidence(rel, id2rel, labels):
    """Return the 'evidence' of the first label matching ``rel``, else None.

    A label matches when its 'h' and 't' equal those of ``rel`` and its 'r'
    equals ``id2rel[rel['r']]`` (the relation id translated through id2rel).
    """
    head = rel['h']
    tail = rel['t']
    rel_name = id2rel[rel['r']]
    hits = (
        label['evidence']
        for label in labels
        if head == label['h'] and tail == label['t'] and rel_name == label['r']
    )
    return next(hits, None)
        

In [7]:
def transform_row(row, id2rel_info, id2rel):
    """Summarise one report row into a flat, display-friendly dict.

    The result contains, in this insertion order:
      * 'sents', 'num_sents', 'words': joined sentences and size statistics;
      * counters copied verbatim from ``row`` ('re_tp', 're_fp', 're_fn',
        'reverse_cluster_map', 'cnt_rel_could_not_be_predicted_me',
        'cnt_rel_could_not_be_predicted_coref');
      * 'predict_*' lists: every predicted relation, bucketed by whether its
        cluster-mapped triple is in ``row['rels_gold']`` and, if not, which of
        its ends maps to -1 in ``row['cluster_map']``;
      * 'golden_rel_*' lists: every gold relation, bucketed by whether it was
        matched by a prediction and, if not, why;
      * '<group>_cnt_evidence_<k>' for k in 0..6: histograms of evidence-list
        lengths, where bucket 6 collects lengths of six or more. The
        'golden_rel_missing' group pools the four 'golden_rel_missing_*' lists.

    Args:
        row: one report entry (a dict). ``cluster_map`` and
            ``reverse_cluster_map`` are indexed with stringified cluster
            indices and hold -1 for "no counterpart" (presumably predicted to
            gold and gold to predicted respectively -- TODO confirm).
        id2rel_info: relation id to display value, used by ``visualize_rel``.
        id2rel: relation id to relation key, used by ``find_evidence``.

    Returns:
        The summary dict described above.
    """
    result_dict = {}
    result_dict['sents'] = join_sents_words(row['sents'])
    result_dict['num_sents'] = len(result_dict['sents'])
    result_dict['words'] = sum(len(sent) for sent in row['sents'])
    for key in (
        're_tp',
        're_fp',
        're_fn',
        'reverse_cluster_map',
        'cnt_rel_could_not_be_predicted_me',
        'cnt_rel_could_not_be_predicted_coref',
    ):
        result_dict[key] = row[key]

    rels_pred = row['rels_pred']
    rels_gold = row['rels_gold']
    clusters_gold = row['clusters_gold']
    cluster_map = row['cluster_map']

    # ---- predicted relations -------------------------------------------
    predict_right_rel = []
    predict_wrong_rel_by_missing_head_tail = []
    predict_wrong_rel_by_missing_head = []
    predict_wrong_rel_by_missing_tail = []
    predict_wrong_rel_left = []
    # Positions in rels_gold hit by at least one prediction.
    matched_gold_indexes = set()
    for rel in rels_pred:
        # Re-express the prediction through cluster_map so it can be compared
        # with the gold triples.
        mapped_rel = {
            'h': cluster_map[str(rel['h'])],
            't': cluster_map[str(rel['t'])],
            'r': rel['r'],
        }
        v_rel = visualize_rel(row['sents'], row['raw_spans'], rel, row['clusters_pred'], id2rel_info)
        if mapped_rel in rels_gold:
            predict_right_rel.append(v_rel)
            matched_gold_indexes.add(rels_gold.index(mapped_rel))
        elif mapped_rel['h'] == -1 and mapped_rel['t'] == -1:
            predict_wrong_rel_by_missing_head_tail.append(v_rel)
        elif mapped_rel['h'] == -1:
            predict_wrong_rel_by_missing_head.append(v_rel)
        elif mapped_rel['t'] == -1:
            predict_wrong_rel_by_missing_tail.append(v_rel)
        else:
            predict_wrong_rel_left.append(v_rel)
    result_dict['predict_right_rel'] = predict_right_rel
    result_dict['predict_wrong_rel_by_missing_head_tail'] = predict_wrong_rel_by_missing_head_tail
    result_dict['predict_wrong_rel_by_missing_head'] = predict_wrong_rel_by_missing_head
    result_dict['predict_wrong_rel_by_missing_tail'] = predict_wrong_rel_by_missing_tail
    result_dict['predict_wrong_rel_left'] = predict_wrong_rel_left

    # ---- gold relations --------------------------------------------------
    golden_rel_predicted = []
    golden_rel_missing_span = []
    golden_rel_missing_head_and_tail = []
    golden_rel_missing_head = []
    golden_rel_missing_tail = []
    golden_rel_not_predicted = []
    labels = row.get('labels', [])
    reverse_cluster_map = result_dict['reverse_cluster_map']
    for index, rel in enumerate(rels_gold):
        evidence = find_evidence(rel, id2rel, labels)
        v_rel = visualize_rel(row['sents'], row['raw_spans'], rel, row['clusters_gold'], id2rel_info, evidence)
        head_key, tail_key = str(rel['h']), str(rel['t'])
        if index in matched_gold_indexes:
            golden_rel_predicted.append(v_rel)
        elif -1 in clusters_gold[rel['h']] or -1 in clusters_gold[rel['t']]:
            # One of the gold clusters contains a span index of -1.
            golden_rel_missing_span.append(v_rel)
        elif reverse_cluster_map[head_key] == -1 and reverse_cluster_map[tail_key] == -1:
            golden_rel_missing_head_and_tail.append(v_rel)
        elif reverse_cluster_map[head_key] == -1:
            golden_rel_missing_head.append(v_rel)
        elif reverse_cluster_map[tail_key] == -1:
            golden_rel_missing_tail.append(v_rel)
        else:
            golden_rel_not_predicted.append(v_rel)
    result_dict['golden_rel_predicted'] = golden_rel_predicted
    result_dict['golden_rel_missing_span'] = golden_rel_missing_span
    result_dict['golden_rel_missing_head_and_tail'] = golden_rel_missing_head_and_tail
    result_dict['golden_rel_missing_head'] = golden_rel_missing_head
    result_dict['golden_rel_missing_tail'] = golden_rel_missing_tail
    result_dict['golden_rel_not_predicted'] = golden_rel_not_predicted

    # ---- evidence-length histograms --------------------------------------
    evidence_groups = (
        ('golden_rel_predicted', golden_rel_predicted),
        (
            'golden_rel_missing',
            golden_rel_missing_span
            + golden_rel_missing_head_and_tail
            + golden_rel_missing_head
            + golden_rel_missing_tail,
        ),
        ('golden_rel_not_predicted', golden_rel_not_predicted),
    )
    for prefix, rels in evidence_groups:
        for size, count in enumerate(_count_evidence_sizes(rels)):
            result_dict[f'{prefix}_cnt_evidence_{size}'] = count

    return result_dict


def _count_evidence_sizes(rels, max_bucket=6):
    """Histogram of evidence-list lengths; the last bucket holds len >= max_bucket.

    Relations without an 'evidence' key are not counted.
    """
    counts = [0] * (max_bucket + 1)
    for rel in rels:
        if 'evidence' in rel:
            counts[min(len(rel['evidence']), max_bucket)] += 1
    return counts


In [8]:
# Case study: transform the selected document and print its headline numbers.
row = data[analyze_row_index]

transform_result = transform_row(row, id2rel_info, id2rel)
# Counters that transform_row copies unchanged from the report row.
print(f"cnt_rel_could_not_be_predicted_me: {transform_result['cnt_rel_could_not_be_predicted_me']}")
print(f"cnt_rel_could_not_be_predicted_coref: {transform_result['cnt_rel_could_not_be_predicted_coref']}")

# Document size: number of sentences and total number of tokens.
num_sents = transform_result['num_sents']
words = transform_result['words']
print(f'there are {num_sents} sentences, {words} words in doc.')
print(f"sents:")
# Last expression: the notebook displays the space-joined sentences.
transform_result['sents']

cnt_rel_could_not_be_predicted_me: 1
cnt_rel_could_not_be_predicted_coref: 0
there are 6 sentences, 150 words in doc.
sents:


['Adolfo Nicolás Pachón ( born 29 April 1936 ) , is a Spanish priest of the Roman Catholic Church .',
 'He was the thirtieth Superior General of the Society of Jesus , the largest religious order in the Roman Catholic Church .',
 'Nicolás , after consulting with Pope Francis , determined to resign after his 80th birthday , and initiated the process of calling a Jesuit General Congregation to elect his successor .',
 'Until the resignation of his predecessor , Peter Hans Kolvenbach , it was not the norm for a Jesuit Superior General to resign ; they , like the great majority of the Popes up until Benedict XVI , generally served until death .',
 'However , the Jesuit constitutions include provision for a resignation .',
 'In October 2016 the thirty - sixth General Congregation of the Society of Jesus appointed his successor , Arturo Sosa from Venezuela .']

In [9]:
# Predicted clusters of the case-study row.
# Each printed line is the (mention texts, raw spans) tuple returned by
# cluster_to_words for one cluster.
clusters_pred = row['clusters_pred']
for cluster in clusters_pred:
    print(cluster_to_words(row['sents'], row['raw_spans'], cluster))

(['Jesuit', 'Jesuit'], [[[4, 3], [4, 4]], [[3, 18], [3, 19]]])
(['Adolfo Nicolás Pachón', 'Nicolás'], [[[0, 0], [0, 3]], [[2, 0], [2, 1]]])
(['Roman Catholic Church', 'Roman Catholic Church'], [[[0, 16], [0, 19]], [[1, 18], [1, 21]]])
(['October 2016'], [[[5, 1], [5, 3]]])
(['Benedict XVI'], [[[3, 35], [3, 37]]])
(['General Congregation of the Society of Jesus'], [[[5, 7], [5, 14]]])
(['Arturo Sosa'], [[[5, 18], [5, 20]]])
(['29 April 1936'], [[[0, 5], [0, 8]]])
(['Francis'], [[[2, 6], [2, 7]]])
(['Spanish'], [[[0, 12], [0, 13]]])
(['Peter Hans Kolvenbach'], [[[3, 7], [3, 10]]])
(['Society of Jesus'], [[[1, 8], [1, 11]]])
(['Venezuela'], [[[5, 21], [5, 22]]])
(['Jesuit General Congregation'], [[[2, 23], [2, 26]]])


In [10]:
# Gold clusters of the case-study row.
# Each printed line is the (mention texts, raw spans) tuple returned by
# cluster_to_words; a span index of -1 inside a cluster is shown as 'None'.
clusters_gold = row['clusters_gold']
for cluster in clusters_gold:
    print(cluster_to_words(row['sents'], row['raw_spans'], cluster))

(['Adolfo Nicolás Pachón', 'Nicolás'], [[[0, 0], [0, 3]], [[2, 0], [2, 1]]])
(['29 April 1936'], [[[0, 5], [0, 8]]])
(['Spanish'], [[[0, 12], [0, 13]]])
(['Roman Catholic Church', 'Roman Catholic Church'], [[[0, 16], [0, 19]], [[1, 18], [1, 21]]])
(['None', 'Society of Jesus'], ['None', [[1, 8], [1, 11]]])
(['Francis'], [[[2, 6], [2, 7]]])
(['Jesuit General Congregation'], [[[2, 23], [2, 26]]])
(['Peter Hans Kolvenbach'], [[[3, 7], [3, 10]]])
(['Benedict XVI'], [[[3, 35], [3, 37]]])
(['Jesuit'], [[[4, 3], [4, 4]]])
(['October 2016'], [[[5, 1], [5, 3]]])
(['Arturo Sosa'], [[[5, 18], [5, 20]]])
(['Venezuela'], [[[5, 21], [5, 22]]])


In [11]:
# Predicted relations whose cluster-mapped (h, t, r) triple is in rels_gold.
print(f"predict_right_rel: {len(transform_result['predict_right_rel'])}")
transform_result['predict_right_rel']

predict_right_rel: 4


[{'h': (['Adolfo Nicolás Pachón', 'Nicolás'],
   [[[0, 0], [0, 3]], [[2, 0], [2, 1]]]),
  't': (['Roman Catholic Church', 'Roman Catholic Church'],
   [[[0, 16], [0, 19]], [[1, 18], [1, 21]]]),
  'r': 'religion'},
 {'h': (['Adolfo Nicolás Pachón', 'Nicolás'],
   [[[0, 0], [0, 3]], [[2, 0], [2, 1]]]),
  't': (['29 April 1936'], [[[0, 5], [0, 8]]]),
  'r': 'date of birth'},
 {'h': (['Adolfo Nicolás Pachón', 'Nicolás'],
   [[[0, 0], [0, 3]], [[2, 0], [2, 1]]]),
  't': (['Spanish'], [[[0, 12], [0, 13]]]),
  'r': 'country of citizenship'},
 {'h': (['Arturo Sosa'], [[[5, 18], [5, 20]]]),
  't': (['Venezuela'], [[[5, 21], [5, 22]]]),
  'r': 'country of citizenship'}]

In [12]:
# Wrong predictions whose head and tail both map to -1 in cluster_map.
print(f"predict_wrong_rel_by_missing_head_tail: {len(transform_result['predict_wrong_rel_by_missing_head_tail'])}")
transform_result['predict_wrong_rel_by_missing_head_tail']

predict_wrong_rel_by_missing_head_tail: 0


[]

In [13]:

# Wrong predictions where only the head maps to -1 in cluster_map.
print(f"predict_wrong_rel_by_missing_head: {len(transform_result['predict_wrong_rel_by_missing_head'])}")
transform_result['predict_wrong_rel_by_missing_head']

predict_wrong_rel_by_missing_head: 0


[]

In [14]:
# Wrong predictions where only the tail maps to -1 in cluster_map.
print(f"predict_wrong_rel_by_missing_tail: {len(transform_result['predict_wrong_rel_by_missing_tail'])}")
transform_result['predict_wrong_rel_by_missing_tail']

predict_wrong_rel_by_missing_tail: 0


[]

In [15]:
# Remaining wrong predictions: both ends map to a cluster, but the mapped
# triple is not in rels_gold.
print(f"predict_wrong_rel_left: {len(transform_result['predict_wrong_rel_left'])}")
transform_result['predict_wrong_rel_left']

predict_wrong_rel_left: 0


[]

In [16]:
# Gold relations matched by at least one prediction (with their evidence).
print(f"golden_rel_predicted: {len(transform_result['golden_rel_predicted'])}")
transform_result['golden_rel_predicted']

golden_rel_predicted: 4


[{'h': (['Adolfo Nicolás Pachón', 'Nicolás'],
   [[[0, 0], [0, 3]], [[2, 0], [2, 1]]]),
  't': (['29 April 1936'], [[[0, 5], [0, 8]]]),
  'r': 'date of birth',
  'evidence': [0]},
 {'h': (['Adolfo Nicolás Pachón', 'Nicolás'],
   [[[0, 0], [0, 3]], [[2, 0], [2, 1]]]),
  't': (['Roman Catholic Church', 'Roman Catholic Church'],
   [[[0, 16], [0, 19]], [[1, 18], [1, 21]]]),
  'r': 'religion',
  'evidence': [0, 1]},
 {'h': (['Adolfo Nicolás Pachón', 'Nicolás'],
   [[[0, 0], [0, 3]], [[2, 0], [2, 1]]]),
  't': (['Spanish'], [[[0, 12], [0, 13]]]),
  'r': 'country of citizenship',
  'evidence': [0]},
 {'h': (['Arturo Sosa'], [[[5, 18], [5, 20]]]),
  't': (['Venezuela'], [[[5, 21], [5, 22]]]),
  'r': 'country of citizenship',
  'evidence': [5]}]

In [17]:
# Unmatched gold relations whose head or tail gold cluster contains a span
# index of -1 (rendered as 'None').
print(f"golden_rel_missing_span: {len(transform_result['golden_rel_missing_span'])}")
transform_result['golden_rel_missing_span']

golden_rel_missing_span: 1


[{'h': (['None', 'Society of Jesus'], ['None', [[1, 8], [1, 11]]]),
  't': (['Roman Catholic Church', 'Roman Catholic Church'],
   [[[0, 16], [0, 19]], [[1, 18], [1, 21]]]),
  'r': 'subclass of',
  'evidence': [0, 1]}]

In [18]:
# Unmatched gold relations whose head and tail both map to -1 in
# reverse_cluster_map.
print(f"golden_rel_missing_head_and_tail: {len(transform_result['golden_rel_missing_head_and_tail'])}")
transform_result['golden_rel_missing_head_and_tail']

golden_rel_missing_head_and_tail: 0


[]

In [19]:
# Unmatched gold relations where only the head maps to -1 in
# reverse_cluster_map.
print(f"golden_rel_missing_head: {len(transform_result['golden_rel_missing_head'])}")
transform_result['golden_rel_missing_head']

golden_rel_missing_head: 0


[]

In [20]:
# Unmatched gold relations where only the tail maps to -1 in
# reverse_cluster_map.
print(f"golden_rel_missing_tail: {len(transform_result['golden_rel_missing_tail'])}")
transform_result['golden_rel_missing_tail']

golden_rel_missing_tail: 0


[]

In [21]:
# Remaining unmatched gold relations: neither end is -1 in the gold clusters
# or in reverse_cluster_map, yet no prediction matched the triple.
print(f"golden_rel_not_predicted: {len(transform_result['golden_rel_not_predicted'])}")
transform_result['golden_rel_not_predicted']

golden_rel_not_predicted: 7


[{'h': (['Roman Catholic Church', 'Roman Catholic Church'],
   [[[0, 16], [0, 19]], [[1, 18], [1, 21]]]),
  't': (['Benedict XVI'], [[[3, 35], [3, 37]]]),
  'r': 'chairperson',
  'evidence': [0, 1, 3]},
 {'h': (['Roman Catholic Church', 'Roman Catholic Church'],
   [[[0, 16], [0, 19]], [[1, 18], [1, 21]]]),
  't': (['Francis'], [[[2, 6], [2, 7]]]),
  'r': 'chairperson',
  'evidence': [0, 1, 2]},
 {'h': (['Peter Hans Kolvenbach'], [[[3, 7], [3, 10]]]),
  't': (['Roman Catholic Church', 'Roman Catholic Church'],
   [[[0, 16], [0, 19]], [[1, 18], [1, 21]]]),
  'r': 'religion',
  'evidence': [0, 1, 3]},
 {'h': (['Benedict XVI'], [[[3, 35], [3, 37]]]),
  't': (['Roman Catholic Church', 'Roman Catholic Church'],
   [[[0, 16], [0, 19]], [[1, 18], [1, 21]]]),
  'r': 'religion',
  'evidence': [0, 1, 3]},
 {'h': (['Venezuela'], [[[5, 21], [5, 22]]]),
  't': (['Arturo Sosa'], [[[5, 18], [5, 20]]]),
  'r': 'head of state',
  'evidence': [5]},
 {'h': (['Arturo Sosa'], [[[5, 18], [5, 20]]]),
  't': 

In [22]:
# Transform every document of the report.
# A comprehension keeps its loop variable local, so the notebook-global `row`
# used by the case-study cells is not overwritten with the last document.
dict_list_to_be_save = [transform_row(doc, id2rel_info, id2rel) for doc in data]

In [23]:
# # save as csv
# save_path = 'report/csv_report.csv'

# import csv
# fieldnames = set().union(*(d.keys() for d in dict_list_to_be_save))
# # Use the dictionary keys as the CSV column headers
# with open(save_path, 'w', newline='') as csvfile:
#     writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
#     writer.writeheader()
#     writer.writerows(dict_list_to_be_save)

# # save as json
# json_file_path = "report/json_report.json"
# with open(json_file_path, "w") as f:
#     json.dump(dict_list_to_be_save, f)

In [24]:
def caculate_mentions_rels(transformed_data):
    """Count gold relations by the total number of head plus tail mentions.

    For every row, relations under 'golden_rel_predicted' feed the
    'gold_rel_predicted_right_*' counters and relations under
    'golden_rel_not_predicted' feed the 'gold_rel_predicted_wrong_left_*'
    counters. The mention total is ``len(rel['h'][0]) + len(rel['t'][0])``;
    totals of 2, 3 and 4 get their own bucket, 5 or more share bucket 5, and
    totals below 2 are not counted.

    Returns:
        A dict of eight counters keyed '<prefix>_cnt_mention_<2..5>'.
    """
    sources = (
        ('gold_rel_predicted_right', 'golden_rel_predicted'),
        ('gold_rel_predicted_wrong_left', 'golden_rel_not_predicted'),
    )
    tallies = {
        f'{prefix}_cnt_mention_{size}': 0
        for prefix, _ in sources
        for size in range(2, 6)
    }
    for doc in transformed_data:
        for prefix, field in sources:
            for rel in doc[field]:
                mentions = len(rel['h'][0]) + len(rel['t'][0])
                if mentions >= 2:
                    tallies[f'{prefix}_cnt_mention_{min(mentions, 5)}'] += 1
    return tallies

result = caculate_mentions_rels(dict_list_to_be_save)
result

{'gold_rel_predicted_right_cnt_mention_2': 2167,
 'gold_rel_predicted_right_cnt_mention_3': 1041,
 'gold_rel_predicted_right_cnt_mention_4': 690,
 'gold_rel_predicted_right_cnt_mention_5': 943,
 'gold_rel_predicted_wrong_left_cnt_mention_2': 1559,
 'gold_rel_predicted_wrong_left_cnt_mention_3': 743,
 'gold_rel_predicted_wrong_left_cnt_mention_4': 425,
 'gold_rel_predicted_wrong_left_cnt_mention_5': 536}

In [25]:
def caculate_rels(transformed_data):
    """Count relations per relation label across all transformed rows.

    Returns:
        A ``(predicted, not_predicted)`` tuple of dicts mapping ``rel['r']``
        to its number of occurrences under 'golden_rel_predicted' and
        'golden_rel_not_predicted' respectively.
    """
    hit_counts = {}
    miss_counts = {}
    fields = (
        ('golden_rel_predicted', hit_counts),
        ('golden_rel_not_predicted', miss_counts),
    )
    for doc in transformed_data:
        for field, counts in fields:
            for rel in doc[field]:
                label = rel['r']
                counts[label] = counts.get(label, 0) + 1
    return hit_counts, miss_counts

golden_rel_predicted_each_rel_cnt, golden_rel_not_predicted_each_rel_cnt = caculate_rels(dict_list_to_be_save)

# import csv
# each_rel_cnt_csv_report_path = "report/each_rel_cnt.csv"
# headers = sorted(set(golden_rel_predicted_each_rel_cnt.keys()) | set(golden_rel_not_predicted_each_rel_cnt.keys()))
# with open(each_rel_cnt_csv_report_path, 'w', newline='') as csvfile:
#     writer = csv.writer(csvfile)

#     writer.writerow(['', 'golden_rel_predicted', 'golden_rel_not_predicted'])
#     for key in headers:
#         writer.writerow([key, golden_rel_predicted_each_rel_cnt.get(key, '0'), golden_rel_not_predicted_each_rel_cnt.get(key, '0')])
