# Analyze LS output - Dataset Chiara

In [1]:
import json
import csv
from collections import defaultdict
import spacy
import pandas as pd
import os

In [28]:
# Italian spaCy pipeline; the conversion loop further down calls `nlp(text)`
# to tokenize each document before aligning annotations with tokens.
nlp = spacy.load("it_core_news_sm")

In [29]:
# labels = [
#     "Target-category-reference",
#     "Behavior-predicate",
#     "Common sup ingroup",
#     "Prosocial behavior",
#     "Other",
#     "Istitutional actor",
#     "Singular pronoun",
#     "Plural pronoun",
#     "Proper noun",
#     "Activity",
#     "Empty",
#     # LEVELS
#     'high-TC-TG-levels', 
#     'middle-TC-TG-levels', 
#     'individual-TC-TG-levels',
#     #LCM
#     'OBS-BH-LCM-behavior', 
#      "CAT-BH-LCM-behavior", 
#      "MENT-ST-LCM-behavior",
#      "TRAIT-LCM-behavior",
#     # Valence ref
#     'Positive-TG-levels', 
#    'Neutral-TG-levels', 
#    'Negative-TG-levels',
#     # Valence behav
#     'Positive-LCM-behavior',
#    'Neutral-LCM-behavior', 
#    'Negative-LCM-behavior',
#     # IN-OUT-REF
#     'Outgroup-TG-levels', 
#     'Ingroup-TG-levels',
#     # In-OUT-BEHAV
#     'Ingroup-LCM-behavior', 
#     "Outgroup-LCM-behavior",
#     # Roles
#     'Perpetrator-TG-levels',
#     "Victim-TG-levels",
#     "Prosocial behavior-TG-levels",
#     "Other-TG-levels",
#     "Istitutional actor-TG-levels",
#     # DEM
#     'Age-TG-levels', 
#    'Residency-TG-levels', 
#    'Gender-TG-levels', 
#    'Origin - TC-TG-levels',
#    'Education-TG-levels', 
#    'Family role-TG-levels',
#    'Profession-TG-levels', 
#    'Origin - TC-TG-levels',
#    'Other category-TG-levels',
#     # DEM behav
#     'Age-LCM-behavior', 
#    'Residency-LCM-behavior', 
#    'Gender-LCM-behavior', 
#    'Origin - TC-LCM-behavior',
#    'Education-LCM-behavior', 
#    'Family role-LCM-behavior',
#    'Profession-LCM-behavior', 
#    'Origin - TC-LCM-behavior',
#    'Other category-LCM-behavior',
# ]

# Read the label inventory: one label per line, with surrounding whitespace and
# single/double quotes removed. The encoding is stated explicitly so the result
# does not depend on the platform's default locale encoding.
with open('Chiara_full_dataset/labels-chiara.txt', encoding='utf-8') as infile:
    labels = [line.strip().strip("'").strip('"') for line in infile.read().strip().split('\n')]
# labels

In [50]:
def get_labels(annotations):
    """Index the results of a task's annotations in four lookup tables.

    Every entry of each annotation's ``'result'`` list is inspected:

    * entries with a ``'value'`` are span results; they are indexed by their
      ``'id'`` and by every character offset in ``range(start, end)``;
    * entries with a ``'from_id'`` (and no ``'value'``) are relations and are
      indexed by their ``(from_id, to_id)`` pair;
    * anything else is printed and otherwise ignored.

    Returns:
        A 4-tuple ``(offset -> [results], (from_id, to_id) -> [relations],
        id -> [results], id -> first label name)``. The first three are
        ``defaultdict(list)`` instances; the last is a plain dict that only
        contains ids having a ``'labels'`` value.
    """
    results_by_offset = defaultdict(list)
    relations_by_pair = defaultdict(list)
    results_by_id = defaultdict(list)
    label_name_by_id = {}

    for annotation in annotations:
        for result in annotation['result']:
            if 'value' not in result:
                if 'from_id' in result:
                    pair = (result['from_id'], result['to_id'])
                    relations_by_pair[pair].append(result)
                else:
                    # Neither a span nor a relation: surface it for inspection.
                    print(result)
                continue

            value = result['value']
            span = range(value['start'], value['end'])
            result_id = result['id']
            results_by_id[result_id].append(result)
            for offset in span:
                results_by_offset[offset].append(result)
            if 'labels' in value:
                label_name_by_id[result_id] = value['labels'][0]

    return results_by_offset, relations_by_pair, results_by_id, label_name_by_id




def get_chains(id_relations_dict):
    """Group annotation ids into chains of transitively related ids.

    Args:
        id_relations_dict: mapping whose keys are ``(id1, id2)`` pairs; the
            values are not used here.

    Returns:
        ``(chains, coref_id)`` where ``chains`` is a ``defaultdict(set)``
        mapping a chain number to the set of ids in that chain, and
        ``coref_id`` maps every id to the number of its chain. Chains are
        numbered from 0 in the order in which they were first created.

    A relation that links two already existing chains merges them into the
    older one. Previously both ids were merely added to both chains, which
    left two overlapping chains and made an id's chain number depend on
    iteration order.
    """
    groups = []  # disjoint sets of ids, in creation order

    for first_id, second_id in id_relations_dict:
        touching = [g for g in groups if first_id in g or second_id in g]
        if not touching:
            groups.append({first_id, second_id})
            continue

        target = touching[0]
        target.update((first_id, second_id))
        absorbed = touching[1:]
        if absorbed:
            # The pair bridges several chains: fold them into the oldest one.
            for other in absorbed:
                target |= other
            groups = [g for g in groups
                      if not any(g is other for other in absorbed)]

    chains = defaultdict(set)
    coref_id = dict()
    for chain_id, members in enumerate(groups):
        chains[chain_id] = members
        for label_id in members:
            coref_id[label_id] = chain_id

    return chains, coref_id

#separate_chains(token_data)

def _label_name_of(results):
    """Return the label name found in an annotation's results, or None.

    The first entry of the ``'labels'`` list is used; if several results carry
    labels, the last such result wins. A notice is printed when a result has
    more than one label.
    """
    name = None
    for result in results:
        names = result['value'].get('labels')
        if names:
            if len(names) > 1:
                print('found more labels:', len(names))
            name = names[0]
    return name


def filter_relations(id_relation_dict, id_ann_dict, target_label1, target_label2=None):
    """Keep only the relations whose two endpoints carry the requested labels.

    Args:
        id_relation_dict: mapping ``(id1, id2) -> list of relation results``.
        id_ann_dict: mapping ``annotation id -> list of results`` for that id.
        target_label1: label name required on the endpoints.
        target_label2: optional second label name. When omitted, both
            endpoints must be labelled ``target_label1``; when given, one
            endpoint must be ``target_label1`` and the other ``target_label2``
            (in either order).

    Returns:
        A new dict with the selected ``(id1, id2) -> relations`` entries.

    Relations with an endpoint that has no label are skipped. Label names are
    resolved afresh for every pair, so a name can no longer carry over from
    the previous pair. ``id_ann_dict`` is only read, never extended.
    """
    if target_label2 is None:
        wanted = ((target_label1, target_label1),)
    else:
        wanted = ((target_label1, target_label2),
                  (target_label2, target_label1))

    id_rel_dict_target = dict()
    for (lid1, lid2), rel in id_relation_dict.items():
        names = (_label_name_of(id_ann_dict.get(lid1, [])),
                 _label_name_of(id_ann_dict.get(lid2, [])))
        if None in names:
            continue
        if names in wanted:
            id_rel_dict_target[(lid1, lid2)] = rel

    return id_rel_dict_target



        
    
def add_singletons(token_data):
    """Give every reference annotation without a chain its own entity id.

    ``token_data`` is a list of per-token dicts. The keys read here are
    ``'Target-category-reference'`` (annotation ids on the token),
    ``'ref-relations'`` (entity ids) and ``'ref-id-relations'`` (strings of
    the form ``'<entity id>-_-<annotation id>'``). The dicts are modified in
    place; nothing is returned.

    New ids start one above the largest id already present in
    ``'ref-relations'`` (or at 0 if there is none). All tokens of the same
    annotation receive the same new id, and each annotation is handled on its
    own. Previously an inner loop reused the outer loop variable and tagged
    every uncovered annotation on a token with the current id, so tokens of
    one mention could end up with different ids.
    """
    used_ids = [entity_id for d in token_data for entity_id in d['ref-relations']]
    single_ent_id = max(used_ids) + 1 if used_ids else 0

    # Group tokens by the reference annotation(s) they belong to.
    tokens_by_annotation = defaultdict(list)
    for d in token_data:
        for ann_id in d['Target-category-reference']:
            tokens_by_annotation[ann_id].append(d)

    for ann_id, tokens in tokens_by_annotation.items():
        added_single_ent = False
        for d in tokens:
            # Annotation ids on this token that already have an entity id.
            covered = {entry.split('-_-', 1)[1] for entry in d['ref-id-relations']}
            if ann_id in covered:
                continue
            d['ref-relations'].append(single_ent_id)
            d['ref-id-relations'].append(f'{single_ent_id}-_-{ann_id}')
            added_single_ent = True

        if added_single_ent:
            single_ent_id += 1
            
            
def make_relation_mappings(id_relation_dict, id_name_dict):
    """Split relations into reference-reference and reference-behavior links.

    Args:
        id_relation_dict: mapping keyed by ``(id1, id2)`` pairs; only the keys
            are used.
        id_name_dict: mapping ``annotation id -> label name``; both ids of
            every pair must be present.

    Returns:
        ``(coref_mappings, behav_mappings)``. ``coref_mappings`` is a dict
        linking each of two 'Target-category-reference' ids to the other (a
        later pair overwrites an earlier entry for the same id).
        ``behav_mappings`` is a ``defaultdict(list)`` from a reference id to
        the 'Behavior-predicate' ids related to it. Other label combinations
        are ignored.
    """
    reference = 'Target-category-reference'
    behavior = 'Behavior-predicate'

    coref_mappings = {}
    behav_mappings = defaultdict(list)

    for first_id, second_id in id_relation_dict:
        label_pair = (id_name_dict[first_id], id_name_dict[second_id])

        if label_pair == (reference, reference):
            coref_mappings[first_id] = second_id
            coref_mappings[second_id] = first_id
        elif label_pair == (behavior, reference):
            behav_mappings[second_id].append(first_id)
        elif label_pair == (reference, behavior):
            behav_mappings[first_id].append(second_id)

    return coref_mappings, behav_mappings

In [51]:
### TEST COREF CHAIN CODE

name = 'chiara'
output = 'Chiara_full_dataset'
#path = f'output-comparison-v2/{name}.json'
path = f'{output}/{name}.json'

# Folder that receives one CSV per task; creating it is a no-op if it exists.
dir_out = f'{output}/{name}'
os.makedirs(dir_out, exist_ok=True)

# Decode the export explicitly as UTF-8 instead of relying on the platform
# default: the annotation offsets index into the decoded text, so a wrong
# decoding of accented characters would misalign every span.
with open(path, encoding='utf-8') as infile:
    annotation_data = json.load(infile)
    
# Convert every task of the export into one token-level CSV:
# one row per spaCy token, one column per label / choice / relation field.
for n, data in enumerate(annotation_data):
    ls_id = data['id']
    annotations = data['annotations']
    text = data['data']['text']
    # Document id = whatever follows the last ': ' on the first line of the text.
    text_id = text.split('\n', 1)[0].split(': ')[-1]
    print(text_id, ls_id)
    path_out = f'{dir_out}/output-{text_id}_LS-id{ls_id}.csv'
    print(path_out)
    
    # Lookup tables for this task: offset -> results, (from, to) -> relations,
    # id -> results, id -> label name; plus the pairwise relation mappings.
    offset_label_dict, id_relation_dict, id_ann_dict, id_name_dict = get_labels(annotations)
    coref_mappings, behav_mappings = make_relation_mappings(id_relation_dict, id_name_dict)
    

    # Relations requested for reference-reference pairs ...
    target_label = 'Target-category-reference'
    id_relations_ref = filter_relations(id_relation_dict, id_ann_dict, target_label)
    
    # ... and for reference-behavior pairs.
    target_label1 = 'Target-category-reference'
    target_label2 =  "Behavior-predicate"
    id_relations_behav = filter_relations(id_relation_dict, id_ann_dict, target_label1, target_label2)
    
    # NOTE(review): debugging leftover for one specific annotation id.
    if 'JUuhN06Sm_' in id_ann_dict:
        print(id_ann_dict['JUuhN06Sm_'])
    
    
        
    # Chain number per annotation id, separately for both relation kinds.
    coref_chains_ref, coref_dict_ref = get_chains(id_relations_ref)
    coref_chains_behav, coref_dict_behav = get_chains(id_relations_behav)
    
    
    doc = nlp(text)

    
    # token index -> ids of all annotations overlapping that token
    tok_id_ann_id_dict = defaultdict(set)
    
    # NOTE(review): debugging leftover for character offset 322.
    if 322 in offset_label_dict:
        print('found 322')
        print(offset_label_dict[322])
    
    for tok in doc:
        tok_start = tok.idx  # NOTE(review): assigned but not used below

        # A token is linked to an annotation as soon as one of its character
        # positions lies inside the annotated span.
        # NOTE(review): presumes the annotation offsets and tok.idx count
        # characters of `text` the same way - confirm against the export.
        for i in range(tok.idx, tok.idx+len(tok)):
            if i in offset_label_dict:
                anns = offset_label_dict[i]
                for an in anns:
                    an_id = an['id']
                    tok_id_ann_id_dict[tok.i].add(an_id)
                    
    
                




    # Build one row (dict) per token.
    token_data = []
    for tok in doc:
        
        tok_id = tok.i
        # defaultdict so that label/choice names missing from `labels` still
        # get a (list-valued) column when first appended to.
        tok_dict = defaultdict(list)
        tok_dict['tok-id'] = tok_id
        tok_dict['token'] = tok.text
        tok_dict['ref-relations'] = []
        tok_dict['ref-behav-relations'] = []
        tok_dict['ref-id-relations'] = []
        tok_dict['coref-id-mapping'] = []
        tok_dict['behav-id-mapping'] = []
                 
        # tok_dict['ref-behav-relations-new'] = ''

        #tok_dict['TC-COREF'] = ''
        # Pre-create every known label column so all rows share these keys.
        for l in labels:
            tok_dict[l] = []
            
        
        # NOTE(review): an_ids is a set of strings, so the order in which ids
        # are appended to the columns below is not guaranteed to be the same
        # from one run to the next.
        an_ids = tok_id_ann_id_dict[tok_id]
        # print(tok_id, an_ids)

        for an_id in an_ids:        
            anns = id_ann_dict[an_id]
     
            #rels = an_rel_dict[an_id]
            for an in anns:
                add_label = ''

                if 'from_name' in an:
                    add_label = an['from_name']
                    
                # Span label: the annotation id goes into the column named
                # after the first label.
                if 'labels' in an['value']:
                    label = an['value']['labels'][0]
                    tok_dict[f'{label}'].append(an_id)
                # Choices: column name is '<choice>-<from_name>' when the
                # result has a from_name, otherwise just the choice.
                if 'choices' in an['value']:
                    for choice in an['value']['choices']:
                        if add_label != '':
                            tok_dict[f'{choice}-{add_label}'].append(an_id)
                        else:
                            tok_dict[choice].append(an_id)

            # Chain number of this annotation in the reference-behavior chains.
            if an_id in coref_dict_behav:
                # if tok_dict['relations'] == '':
                #tok_dict['ref-behav-relations'] = coref_dict[an_id]
                tok_dict['ref-behav-relations'].append(coref_dict_behav[an_id])
                #tok_dict['ref-relations'] = coref_dict_ref[an_id]

            # Chain number in the reference chains, also stored together with
            # the annotation id as '<chain>-_-<annotation id>'.
            if an_id in coref_dict_ref:
                # if tok_dict['relations'] == '':
                #tok_dict['ref-behav-relations'] = coref_dict[an_id]
               
                tok_dict['ref-relations'].append(coref_dict_ref[an_id])
                tok_dict['ref-id-relations'].append(f'{coref_dict_ref[an_id]}-_-{an_id}')
            
            # Pairwise links, written as '<this id>-_-<related id>'.
            if an_id in coref_mappings:
                tok_dict['coref-id-mapping'].append(f'{an_id}-_-{coref_mappings[an_id]}')
            if an_id in behav_mappings:
                for behav_id in behav_mappings[an_id]:
                    tok_dict['behav-id-mapping'].append(f'{an_id}-_-{behav_id}')
          
  
        token_data.append(tok_dict)

    #interpret_relations(token_data)
    # full_coref_ids(token_data)
    #separate_chains(token_data)
    # map_relations(token_data)
    # Must run while the relation columns are still lists (before flattening).
    add_singletons(token_data)
    # Flatten every list-valued cell into a space-separated string for the CSV.
    # Only existing keys are reassigned, so the dict size does not change
    # while it is being iterated.
    for tok_dict in token_data:

        for k, v in tok_dict.items():
            if type(v) == list:
                v = [str(i) for i in v]
                tok_dict[k] = ' '.join(v)
            else:
                tok_dict[k] = v
        
    df = pd.DataFrame(token_data)
    df.to_csv(path_out, index=False)
    

    
    



RdC_01feb2023_Fe 13
Chiara_full_dataset/chiara/output-RdC_01feb2023_Fe_LS-id13.csv
found 322
[{'value': {'start': 322, 'end': 339, 'text': 'diciotto indagati', 'labels': ['Target-category-reference']}, 'id': 'ZCHA_Pxqf8', 'from_name': 'label', 'to_name': 'text', 'type': 'labels', 'origin': 'manual'}, {'value': {'start': 322, 'end': 339, 'text': 'diciotto indagati', 'choices': ['middle-TC', 'Neutral', 'Outgroup', 'Perpetrator', 'Activity']}, 'id': 'ZCHA_Pxqf8', 'from_name': 'TG-levels', 'to_name': 'text', 'type': 'choices', 'origin': 'manual'}]
RdC_01feb2023_Mo 14
Chiara_full_dataset/chiara/output-RdC_01feb2023_Mo_LS-id14.csv
RdC_01mar2023_Bo 15
Chiara_full_dataset/chiara/output-RdC_01mar2023_Bo_LS-id15.csv
RdC_02feb2023_Fe 16
Chiara_full_dataset/chiara/output-RdC_02feb2023_Fe_LS-id16.csv
found 322
[{'value': {'start': 308, 'end': 343, 'text': 'ex moglie, una trentenne pachistana', 'labels': ['Target-category-reference']}, 'id': 'uMy8w-Nhp0', 'from_name': 'label', 'to_name': 'text', 'ty

In [251]:
# tok_starts