# Analyze LS output - Dataset Chiara

In [2]:
import json
import csv
from collections import defaultdict
import spacy
import pandas as pd
import os

In [3]:
# Load the spaCy pipeline "nl_core_news_sm"; `nlp` is called on every text
# further down to split it into tokens with character offsets.
nlp = spacy.load("nl_core_news_sm")

In [7]:
# labels

# Every entry becomes a key (column) of each per-token record built below.
# Entries that also occur in the text-level choices of an annotation are set
# to True for all tokens of that text; the rest start out as empty lists.
# The strings must match the Label Studio config character for character
# (including the "punctution" spelling used there).
labels = [
    "TC-reference",
    "Behav-predicate",
    "individual-TG-levels",
    "middle-TG-levels",
    "high-TG-levels",
    "supra-TG-levels",
    "Exclude",
    "ChatGPT: punctution",
    # Present in the labelling config but previously missing from this list,
    # so a text-level "ChatGPT: structure" choice was never flagged.
    "ChatGPT: structure",
    "ChatGPT: emoji",
    "Misunderstood task",
    "No consent",
    "Less than 20 words",
    "Did not finish the study",
    "WPM",

    # negations
    # opposite target category references
]

#    <Header value="Is there a reason to exclude the text?"/>

#     <Choices name="Exclude" toName="text" choice="multiple" showInLine="true">
#     <Choice value="ChatGPT: punctution"/>

#      <Choice value="ChatGPT: structure"/>

#      <Choice value="ChatGPT: emoji"/>

#      <Choice value="Misunderstood task"/>

#      <Choice value="Less than 20 words"/>

#      <Choice value="No consent"/>

#      <Choice value="Did not finish the study"/>

#      <Choice value="WPM"/>


#   </Choices>

In [29]:


# with open('Chiara_full_dataset/labels-chiara.txt') as infile:
#     labels = [l.strip().strip("'").strip('"') for l in infile.read().strip().split('\n')]
# # labels

In [11]:
def get_labels(annotations):
    """Sort the result entries of a list of annotations into lookup tables.

    Each element of ``annotations`` must have a ``'result'`` list. Its entries
    are handled according to their keys:

    * entries with a ``'value'`` that has ``'start'``/``'end'`` are treated as
      spans: stored by their ``'id'`` and under every offset in
      ``range(start, end)``; the first of their ``'labels'`` (if any) is
      remembered as the name of that id;
    * entries with a ``'value'`` that has ``'choices'`` but no ``'start'``
      replace the text-level choice list (the last such entry wins);
    * entries with ``'from_id'`` are relations, stored under the
      ``(from_id, to_id)`` pair; relation ``'labels'`` are stored separately;
    * anything else is printed.

    Returns:
        A 6-tuple ``(offset_label_dict, id_relation_dict, id_ann_dict,
        id_name_dict, span_relations_dict, text_level)``.
    """
    offset_label_dict = defaultdict(list)
    id_relation_dict = defaultdict(list)
    id_ann_dict = defaultdict(list)
    id_name_dict = {}
    span_relations_dict = {}
    text_level = []

    all_results = (res for annotation in annotations for res in annotation['result'])
    for res in all_results:
        if 'value' in res:
            value = res['value']
            if "start" in value:
                first, stop = value['start'], value['end']
                res_id = res['id']
                id_ann_dict[res_id].append(res)
                # Register the span under every offset it covers.
                for offset in range(first, stop):
                    offset_label_dict[offset].append(res)
                if 'labels' in value:
                    id_name_dict[res_id] = value['labels'][0]
            elif "choices" in value:
                print("no start key but choices")
                print(value)
                text_level = value['choices']
            continue

        if 'from_id' not in res:
            # Neither a value nor a relation: just show it.
            print(res)
            continue

        pair = (res['from_id'], res['to_id'])
        id_relation_dict[pair].append(res)
        if 'labels' in res:
            print('relation has labels')
            print(res['labels'])
            print()
            span_relations_dict[pair] = res['labels']

    return (
        offset_label_dict,
        id_relation_dict,
        id_ann_dict,
        id_name_dict,
        span_relations_dict,
        text_level,
    )




def get_chains(id_relations_dict):
    """Group pairwise relations into chains of transitively linked ids.

    Args:
        id_relations_dict: mapping whose keys are ``(id1, id2)`` pairs; the
            values are not used here.

    Returns:
        ``(chains, coref_id)`` where ``chains`` maps an integer chain id to
        the set of ids in that chain and ``coref_id`` maps every id to the
        id of the single chain that contains it.

    Chain ids are handed out as 0, 1, 2, ... in the order new chains are
    opened. When a relation connects two chains that were opened separately,
    they are merged into the one with the lowest id and the other id is
    retired, so ids may have gaps but every id belongs to exactly one chain.
    (Previously such a relation only copied one endpoint into each chain,
    leaving overlapping chains and an order-dependent ``coref_id``.)
    """
    chains = defaultdict(set)
    coref_id = dict()
    next_chain_id = 0

    for rel1, rel2 in id_relations_dict:
        # Chains that already contain either endpoint, lowest id first.
        touched = sorted({coref_id[r] for r in (rel1, rel2) if r in coref_id})

        if not touched:
            target = next_chain_id
            next_chain_id += 1
        else:
            target = touched[0]
            # This relation bridges several chains: fold them into the target.
            for absorbed in touched[1:]:
                for member in chains.pop(absorbed):
                    chains[target].add(member)
                    coref_id[member] = target

        for r in (rel1, rel2):
            chains[target].add(r)
            coref_id[r] = target

    return chains, coref_id

#separate_chains(token_data)

def _last_span_label(id_ann_dict, span_id):
    """Return the first label of the last labelled entry stored for span_id, or None."""
    name = None
    for entry in id_ann_dict.get(span_id, ()):
        span_labels = entry['value'].get('labels')
        if span_labels is None:
            continue
        if len(span_labels) > 1:
            print('found more labels:', len(span_labels))
        name = span_labels[0]
    return name


def filter_relations(id_relation_dict, id_ann_dict, target_label1, target_label2=None):
    """Keep only the relations whose two endpoints carry the requested labels.

    Args:
        id_relation_dict: mapping ``(from_id, to_id) -> relation entries``.
        id_ann_dict: mapping ``span id -> list of entries``; an entry's label
            is read from ``entry['value']['labels'][0]``.
        target_label1: label required on the endpoints.
        target_label2: optional second label. When omitted, both endpoints
            must be labelled ``target_label1``; when given, one endpoint must
            be ``target_label1`` and the other ``target_label2`` (either
            direction).

    Returns:
        A new dict with the matching subset of ``id_relation_dict``.

    The label of each endpoint is looked up freshly for every pair; a pair
    with an unlabelled endpoint is skipped. (Previously the label variables
    leaked from one pair to the next, or were unbound on the first pair, and
    ``target_label1`` was ignored when ``target_label2`` was omitted.)
    """
    if target_label2 is None:
        wanted = [(target_label1, target_label1)]
    else:
        wanted = [(target_label1, target_label2), (target_label2, target_label1)]

    id_rel_dict_target = dict()
    for (lid1, lid2), rel in id_relation_dict.items():
        l1_name = _last_span_label(id_ann_dict, lid1)
        l2_name = _last_span_label(id_ann_dict, lid2)
        if l1_name is None or l2_name is None:
            continue
        if (l1_name, l2_name) in wanted:
            id_rel_dict_target[(lid1, lid2)] = rel

    return id_rel_dict_target

        
    
def add_singletons(token_data):
    """Give every 'TC-reference' annotation without a chain its own entity id.

    Works in place on ``token_data``, a list of per-token dicts with the keys
    ``'TC-reference'`` (annotation ids on the token), ``'ref-relations'``
    (entity ids) and ``'ref-id-relations'`` (strings of the form
    ``'<entity id>-_-<annotation id>'``). Returns None.

    New entity ids start one above the highest id found in any
    ``'ref-relations'`` list (or at 0 when there is none). Annotations are
    visited in order of first appearance; all tokens of one annotation that
    lack an entry for it receive the same new id, and the counter advances
    only when an id was actually used.

    Each annotation is handled on its own. (Previously an inner loop reused
    the outer loop variable, so following tokens could be keyed to the wrong
    annotation id, and several unrelated annotations on one token could end
    up sharing a single new id.)
    """
    coref_ids = [cid for d in token_data for cid in d['ref-relations']]
    single_ent_id = max(coref_ids) + 1 if coref_ids else 0

    # annotation id -> tokens it covers, in token order
    token_data_by_cat = defaultdict(list)
    for d in token_data:
        for ann_id in d['TC-reference']:
            token_data_by_cat[ann_id].append(d)

    for ann_id, tokens in token_data_by_cat.items():
        added_single_ent = False
        for d in tokens:
            # Annotation ids that already have an entity on this token.
            linked = {r.split('-_-', 1)[1] for r in d['ref-id-relations']}
            if str(ann_id) in linked:
                continue
            d['ref-relations'].append(single_ent_id)
            d['ref-id-relations'].append(f'{single_ent_id}-_-{ann_id}')
            added_single_ent = True

        if added_single_ent:
            single_ent_id += 1
            
            
def make_relation_mappings(id_relation_dict, id_name_dict):
    """Build id-to-id lookups for coreference and behaviour relations.

    Args:
        id_relation_dict: mapping keyed by ``(id1, id2)`` pairs (values unused).
        id_name_dict: mapping ``id -> label name``; must contain both ids of
            every pair.

    Returns:
        ``(coref_mappings, behav_mappings)``:
        * ``coref_mappings`` links two 'TC-reference' ids to each other in
          both directions (a later pair overwrites an earlier entry);
        * ``behav_mappings`` maps a 'TC-reference' id to the list of
          'Behav-predicate' ids related to it, whichever side of the pair
          the reference is on.
        Pairs with any other label combination are ignored.
    """
    reference, behaviour = 'TC-reference', 'Behav-predicate'

    coref_mappings = {}
    behav_mappings = defaultdict(list)

    for first, second in id_relation_dict:
        kinds = (id_name_dict[first], id_name_dict[second])

        if kinds == (reference, reference):
            coref_mappings[first] = second
            coref_mappings[second] = first
        elif kinds == (behaviour, reference):
            behav_mappings[second].append(first)
        elif kinds == (reference, behaviour):
            behav_mappings[first].append(second)

    return coref_mappings, behav_mappings

In [251]:
# tok_starts

In [12]:
### TEST COREF CHAIN CODE

# Input:  ../<output>/<name>.json
# Output: one CSV per text, written into ../results/<name>/
name = 'kim-full-data-update1'
output = 'data'
path = f'../{output}/{name}.json'

dir_out = f'../results/{name}'
# makedirs (unlike mkdir) also creates a missing '../results' parent, and
# exist_ok=True makes this a no-op when the directory already exists.
os.makedirs(dir_out, exist_ok=True)

# Read as UTF-8 explicitly instead of relying on the platform default
# encoding, which would garble non-ASCII text on some systems.
with open(path, encoding='utf-8') as infile:
    annotation_data = json.load(infile)
    
# For every annotated text: collect its annotations, tokenize the text, attach
# to each token the ids of the annotations overlapping it, and write the
# resulting per-token table to a CSV file.
for data in annotation_data:
    ls_id = data['id']
    annotations = data['annotations']

    text = data['data']['text']
    # The text id is whatever follows the last ': ' on the first line.
    text_id = text.split('\n', 1)[0].split(': ')[-1]
    print('text id', text_id)
    print('ls id', ls_id)
    path_out = f'{dir_out}/output-{text_id}_LS-id{ls_id}.csv'
    print('path out', path_out)

    (offset_label_dict, id_relation_dict, id_ann_dict, id_name_dict,
     span_relations_dict, text_level) = get_labels(annotations)
    coref_mappings, behav_mappings = make_relation_mappings(id_relation_dict, id_name_dict)

    target_label = 'TC-reference'
    id_relations_ref = filter_relations(id_relation_dict, id_ann_dict, target_label)

    target_label1 = 'TC-reference'
    # Must be spelled like the label used in the data ("Behav-predicate");
    # the former "Behavior-predicate" never matched, so this filter was
    # always empty.
    target_label2 = 'Behav-predicate'
    id_relations_behav = filter_relations(id_relation_dict, id_ann_dict, target_label1, target_label2)

    # Labelled relations join the parts of one span: map the id at the
    # receiving end of such a relation back to the id it comes from, so both
    # parts are reported under the same id.
    span_dict_end = {to_id: from_id for (from_id, to_id) in span_relations_dict}

    coref_chains_ref, coref_dict_ref = get_chains(id_relations_ref)
    # NOTE(review): the behaviour chains are computed but not used below.
    coref_chains_behav, coref_dict_behav = get_chains(id_relations_behav)

    doc = nlp(text)

    # token index -> ids of all annotations covering at least one of its characters
    tok_id_ann_id_dict = defaultdict(set)
    for tok in doc:
        for i in range(tok.idx, tok.idx + len(tok)):
            # Test membership first so the lookup does not add empty entries.
            if i in offset_label_dict:
                for an in offset_label_dict[i]:
                    tok_id_ann_id_dict[tok.i].add(an['id'])

    token_data = []
    for tok in doc:
        tok_id = tok.i
        tok_dict = defaultdict(list)
        tok_dict['tok-id'] = tok_id
        tok_dict['token'] = tok.text
        tok_dict['ref-relations'] = []
        tok_dict['ref-behav-relations'] = []
        tok_dict['ref-id-relations'] = []
        tok_dict['coref-id-mapping'] = []
        tok_dict['behav-id-mapping'] = []
        tok_dict['ref-span'] = ''

        # One key per known label; text-level choices are flagged on every token.
        for l in labels:
            tok_dict[l] = True if l in text_level else []

        an_ids = tok_id_ann_id_dict[tok_id]
        for an_id in an_ids:
            # Id this annotation is reported under (see span_dict_end above).
            span_id = span_dict_end.get(an_id, an_id)

            for an in id_ann_dict[an_id]:
                add_label = an.get('from_name', '')
                value = an['value']

                if 'labels' in value:
                    tok_dict[value['labels'][0]].append(span_id)

                if 'choices' in value:
                    for choice in value['choices']:
                        # Choices are keyed as '<choice>-<from_name>' when a
                        # from_name is available, else by the bare choice.
                        key = f'{choice}-{add_label}' if add_label != '' else choice
                        tok_dict[key].append(span_id)

            if an_id in coref_dict_ref:
                chain_id = coref_dict_ref[an_id]
                tok_dict['ref-relations'].append(chain_id)
                tok_dict['ref-id-relations'].append(f'{chain_id}-_-{span_id}')

            if an_id in coref_mappings:
                coref_id = coref_mappings[an_id]
                coref_id = span_dict_end.get(coref_id, coref_id)
                tok_dict['coref-id-mapping'].append(f'{span_id}-_-{coref_id}')

            if an_id in behav_mappings:
                # NOTE(review): the partners are looked up under the resolved
                # span id, not under an_id itself - kept as it was; confirm
                # this is intended.
                for behav_id in behav_mappings[span_id]:
                    tok_dict['behav-id-mapping'].append(f'{span_id}-_-{behav_id}')

        token_data.append(tok_dict)

    add_singletons(token_data)

    # Flatten list-valued cells to space-separated strings for the CSV.
    for tok_dict in token_data:
        for k, v in tok_dict.items():
            if isinstance(v, list):
                tok_dict[k] = ' '.join(str(i) for i in v)

    # apply span information - same id
    df = pd.DataFrame(token_data)
    df.to_csv(path_out, index=False)

text id pinl2004022643db-13712588e47d Huisfeest oud txt.1
ls id 1057
path out ../results/kim-full-data-update1/output-pinl2004022643db-13712588e47d Huisfeest oud txt.1_LS-id1057.csv
relation has labels
['behav-span']

text id pinl2004022643db-13712588e47d Ex oud txt.1
ls id 1058
path out ../results/kim-full-data-update1/output-pinl2004022643db-13712588e47d Ex oud txt.1_LS-id1058.csv
relation has labels
['behav-span']

text id pinl2004022643db-13712588e47d Klassiek oud txt.1
ls id 1059
path out ../results/kim-full-data-update1/output-pinl2004022643db-13712588e47d Klassiek oud txt.1_LS-id1059.csv
text id pinl2004022643db-13712588e47d Bridge oud txt.1
ls id 1060
path out ../results/kim-full-data-update1/output-pinl2004022643db-13712588e47d Bridge oud txt.1_LS-id1060.csv
text id 63fd069cb47d538f11dc7ed2 Bingo oud txt.1
ls id 1061
path out ../results/kim-full-data-update1/output-63fd069cb47d538f11dc7ed2 Bingo oud txt.1_LS-id1061.csv
no start key but choices
{'choices': ['ChatGPT: punctution