# FrameNet data and tasks


Overview

* show incident types
* show incidents per type
* show number of documents in English and Dutch


Samples

* Show one text with its annotations
* Make a list of possible NLP tasks
* Provide ideas for prompts for automatic labeling
* Provide ideas for prompts for automatic data generation

In [1]:
import json
import pandas as pd
import os
from lxml import etree as et

from utils_structured import generate_overview
from utils_docs import show_docs_by_time
import utils_naf

In [2]:
# Locations of the structured index files and of the unstructured documents.
path_structured = '../DFNDataReleases/structured/'
path_data = '../dfn-data-cleaning/data-headlines/unstructured'

In [3]:
# List the files available in the structured data directory.
os.listdir(path_structured)

['labels.json',
 'proj2inc_index.json',
 'annotation_status.json',
 'type2label.json',
 'inc2str_index.json',
 'inc2lang2doc_index.json',
 'data_release_inc_coll_obj.p',
 'type2inc_index.json',
 'inc2label.json']

In [4]:
def _read_json(path):
    """Return the parsed content of the JSON file at `path`."""
    with open(path) as infile:
        return json.load(infile)


# Paths of the index files inside the structured data directory.
type2inc_path = f'{path_structured}/type2inc_index.json'
inc2str_path = f'{path_structured}/inc2str_index.json'
type2label_path = f'{path_structured}/type2label.json'
inc2label_path = f'{path_structured}/inc2label.json'
inc2lang2doc_path = f'{path_structured}/inc2lang2doc_index.json'
annotation_status_path = f'{path_structured}/annotation_status.json'

# Note: inc2str_path is only defined here; its file is not read in this cell.
type2inc = _read_json(type2inc_path)
type2label = _read_json(type2label_path)
inc2label = _read_json(inc2label_path)
inc2lang2doc = _read_json(inc2lang2doc_path)
annotation_status_dict = _read_json(annotation_status_path)

In [5]:
# Overview tables, one per language.
# table: event type, inc, annotated docs, not annotated docs, not found
overview_inputs = (type2inc, type2label, inc2label, inc2lang2doc, annotation_status_dict)

lang = 'en'
df_en = generate_overview(lang, *overview_inputs)
lang = 'nl'
df_nl = generate_overview(lang, *overview_inputs)

table written to: overview_en.csv
table written to: overview_nl.csv


In [6]:
#df_en

In [7]:
# Incidents with more than MIN_MANUAL_DOCS manually annotated English documents.
# NOTE: the name `df_en_higher10` is historical; the threshold applied is 4, not 10.
MIN_MANUAL_DOCS = 4

df_en_higher10 = df_en[df_en['annotated-manual'] > MIN_MANUAL_DOCS]
df_en_higher10
    

Unnamed: 0,type,inc,total,file not found,annotated-manual,file exists,annotated-system,annotated-deprecated
77,mass shooting,2016 Hesston shooting,15,,14.0,1.0,,
96,mass shooting,2010 University of Alabama in Huntsville shooting,10,3.0,6.0,,1.0,
100,mass shooting,2011 Seal Beach shooting,12,,12.0,,,
108,mass shooting,2014 Fort Hood shooting,12,,12.0,,,
118,mass shooting,2012 Azana Spa shootings,6,1.0,5.0,,,
123,mass shooting,2019 Utrecht shooting,22,5.0,17.0,,,
127,mass shooting,2015 Bamako hotel attack,14,2.0,12.0,,,
135,mass shooting,2011 Tucson shooting,10,2.0,8.0,,,
136,mass shooting,2015 San Bernardino shooting,15,,14.0,,1.0,
139,mass shooting,2016 Citronelle homicides,14,1.0,12.0,,1.0,


In [8]:
# Look up the documents of one incident, starting from the incident label.
target_inc = '2016 Kalamazoo shootings'
lang = 'en'

with open(f'{path_structured}/inc2str_index.json') as infile:
    inc2str = json.load(infile)

# Invert inc2label so the incident id can be found from its label.
label2incid = {label: inc_id for inc_id, label in inc2label.items()}

inc_id = label2incid[target_inc]
print(inc_id)
docs = inc2lang2doc[inc_id][lang]

print('Incident info')
print(inc2str[inc_id])
print()

# Annotation status followed by the title, one line per document.
for doc in docs:
    print(annotation_status_dict[doc], '\t', doc)

Q22910769
Incident info
{'sem:hasPlace': ['http://www.wikidata.org/entity/Q30 | United States of America'], 'sem:hasTimeStamp': ['2016-02-20T00:00:00UTC | 2016-02-20T00:00:00UTC']}

annotated-manual 	 Michigan Teen Injured in Uber-Driver Shooting Says 'I'm a Warrior Princess'
annotated-system 	 'He kept ... shooting, shooting, shooting,' Kalamazoo victim recalls
file not found 	 Rick Snyder on Twitter
file not found 	 Eusebio Solis
annotated-manual 	 High school senior from Mattawan killed in Kalamazoo mass shootings
annotated-manual 	 Suspect in Michigan killing spree captured; Authorities say Jason Dalton acted alone
annotated-manual 	 4 things we know about Kalamazoo mass shooting suspect's gun
annotated-manual 	 Trial date set in Kalamazoo Uber shootings
annotated-manual 	 What we know so far: Kalamazoo shooting victims
annotated-manual 	 Police dashcam video from Kalamazoo mass shooting scenes released
annotated-manual 	 Service in Battle Creek set for Kalamazoo shooting victim Ba

In [9]:
# Parse the NAF file of one document and print its timestamp.
lang = 'en'
path_data = '../dfn-data-cleaning/data-headlines/unstructured'
path_texts = f'{path_data}/{lang}'

# Alternative document:
# 'Suspect in Michigan killing spree captured; Authorities say Jason Dalton acted alone'
target_text = 'High school senior from Mattawan killed in Kalamazoo mass shootings'
path_naf = f'{path_texts}/{target_text}.naf'

tree = et.parse(path_naf)
root = tree.getroot()

doc_time = utils_naf.get_timestamp(root)
print(doc_time)

2016-02-21T18:27:27UTC


In [10]:
# Document titles of the selected incident in the selected language.
docs


["Michigan Teen Injured in Uber-Driver Shooting Says 'I'm a Warrior Princess'",
 "'He kept ... shooting, shooting, shooting,' Kalamazoo victim recalls",
 'Rick Snyder on Twitter',
 'Eusebio Solis',
 'High school senior from Mattawan killed in Kalamazoo mass shootings',
 'Suspect in Michigan killing spree captured; Authorities say Jason Dalton acted alone',
 "4 things we know about Kalamazoo mass shooting suspect's gun",
 'Trial date set in Kalamazoo Uber shootings',
 'What we know so far: Kalamazoo shooting victims',
 'Police dashcam video from Kalamazoo mass shooting scenes released',
 'Service in Battle Creek set for Kalamazoo shooting victim Barbara Hawthorne, 68',
 'Public threat alert bill inspired by mass shooting signed by Gov. Snyder',
 'Police: Jason Dalton said Uber app controlled him ‘like A.I.’',
 'June 13 trial date set for Jason Dalton in Uber shooting case',
 'Judge orders Dalton to stand trial in Kalamazoo murders',
 'Kalamazoo Shooting Update',
 'Kalamazoo driver blame

In [11]:
# Helper from utils_docs. Judging by the output below, it prints the documents
# ordered by timestamp and returns a timestamp -> title mapping (confirm in utils_docs).
show_docs_by_time(path_texts, docs, annotation_status_dict)

1-01-01T00:00:00UTC Kalamazoo Shooting Update
2016-02-21T13:31:22UTC Suspect in Michigan killing spree captured; Authorities say Jason Dalton acted alone
2016-02-21T16:48:39UTC What we know so far: Kalamazoo shooting victims
2016-02-21T18:27:27UTC High school senior from Mattawan killed in Kalamazoo mass shootings
2016-02-24T17:59:05UTC 4 things we know about Kalamazoo mass shooting suspect's gun
2016-02-25T00:00:00UTC The Latest: Police have found no motive for shooting rampage - Northern Michigan's News Leader
2016-02-25T20:40:40UTC Service in Battle Creek set for Kalamazoo shooting victim Barbara Hawthorne, 68
2016-03-14T20:38:43UTC Kalamazoo driver blamed Uber app for shooting rampage
2016-03-14T21:23:43UTC Police: Jason Dalton said Uber app controlled him ‘like A.I.’
2016-04-25T20:35:00UTC Michigan Teen Injured in Uber-Driver Shooting Says 'I'm a Warrior Princess'
2016-05-02T21:31:49UTC Police dashcam video from Kalamazoo mass shooting scenes released
2016-05-20T14:34:41UTC Judge 

{'2016-04-25T20:35:00UTC': "Michigan Teen Injured in Uber-Driver Shooting Says 'I'm a Warrior Princess'",
 '2016-02-21T18:27:27UTC': 'High school senior from Mattawan killed in Kalamazoo mass shootings',
 '2016-02-21T13:31:22UTC': 'Suspect in Michigan killing spree captured; Authorities say Jason Dalton acted alone',
 '2016-02-24T17:59:05UTC': "4 things we know about Kalamazoo mass shooting suspect's gun",
 '2018-08-10T18:34:30UTC': 'Trial date set in Kalamazoo Uber shootings',
 '2016-02-21T16:48:39UTC': 'What we know so far: Kalamazoo shooting victims',
 '2016-05-02T21:31:49UTC': 'Police dashcam video from Kalamazoo mass shooting scenes released',
 '2016-02-25T20:40:40UTC': 'Service in Battle Creek set for Kalamazoo shooting victim Barbara Hawthorne, 68',
 '2016-06-24T16:34:34UTC': 'Public threat alert bill inspired by mass shooting signed by Gov. Snyder',
 '2016-03-14T21:23:43UTC': 'Police: Jason Dalton said Uber app controlled him ‘like A.I.’',
 '2017-03-07T18:55:06UTC': 'June 13 tr

In [12]:
# Sub-directories of the unstructured data directory (one per language in the output below).
# Uses the configured `path_data` instead of repeating the hard-coded path.
os.listdir(path_data)

['nl', 'en']

In [14]:
# Parse the NAF file of one document of the incident and print its raw text.
target_text = "Suspect in Michigan killing spree captured; Authorities say Jason Dalton acted alone"
path_naf = f'{path_texts}/{target_text}.naf'
tree = et.parse(path_naf)
root = tree.getroot()
# get_text returns an object whose `.text` attribute holds the document text.
text = utils_naf.get_text(root)
print(text.text)

Suspect in Michigan killing spree captured; Authorities say Jason Dalton acted alone.

KALAMAZOO, Mich. - The 45-year-old man authorities said went on a killing spree Saturday night in Kalamazoo, Michigan was calm when he was arrested by police roughly seven hours after the shootings first began, according to Michigan officials.
The suspect in the shootings has been identified as Jason Brian Dalton. The Kalamazoo Department of Public Safety said on Facebook that Dalton has no known criminal record.
Dalton was even-tempered during his arrest Sunday morning, according to the department. He was arrested sometime around 12:40 a.m. Authorities said Dalton had a semi-automatic handgun in his vehicle.
"He was cooperative," the Kalamazoo Department of Public Safety wrote. "Acts committed were intentional."
Kalamazoo County Prosecutor Jeff Getting held a press conference Sunday after the three shooting incidents that began at 6 p.m. Saturday. He said a total of six people were fatally shot and 

In [17]:

        
        

# Incident id printed earlier for '2016 Kalamazoo shootings'.
event_q = 'Q22910769'

# Disabled step-by-step inspection of a single predicate and its first role:
# predicate = predicates[3]
# pred_dict = get_predicate_info(predicate, tok_coref_dict, term_dict, event_q)
# #pred_dict
# roles = predicate.findall('role')
# role = roles[0]
# get_role_info(role, tok_coref_dict, term_dict, event_q)
# Predicate/role information for the currently loaded document (`root`).
# NOTE(review): anchor_filter=True presumably restricts the result to items
# linked to event_q -- confirm in utils_naf.
predicate_role_info = utils_naf.get_predicate_role_info(root, event_q, anchor_filter=True)

#predicate_role_info

In [18]:
# multiword issue



In [50]:
# Parse the NAF file of another document; print its path and timestamp.
lang = 'en'
path_data = '../dfn-data-cleaning/data-headlines/unstructured'
path_texts = f'{path_data}/{lang}'

target_text = 'Kansas Shooting Victim Says Coworker Hesitated Before Firing'
path_naf = f'{path_texts}/{target_text}.naf'
print(path_naf)

tree = et.parse(path_naf)
root = tree.getroot()

doc_time = utils_naf.get_timestamp(root)
print(doc_time)

../dfn-data-cleaning/data-headlines/unstructured/en/Kansas Shooting Victim Says Coworker Hesitated Before Firing.naf
2016-02-28T14:30:00UTC


In [51]:

def get_term_mw_dict(root):
    """Map term ids to the multiword they are a component of.

    Only `terms/term` elements that carry a `component_of` attribute
    (for example `component_of="mw1"`) are included.

    :param root: root element of a parsed NAF document
    :return: dict of term id -> value of the term's `component_of` attribute
    """
    return {
        term.get('id'): term.get('component_of')
        for term in root.findall('terms/term')
        if 'component_of' in term.attrib
    }
    
# Multiword membership of the terms in the currently loaded document (`root`).
term_mw_dict = get_term_mw_dict(root)

In [52]:
def get_predicate_info(predicate, tok_coref_dict, term_dict, term_mw_dict, event_q):
    """Collect frame, token and coreference information for one predicate.

    :param predicate: `predicate` element; its first
        `externalReferences/externalRef` child supplies the frame
        (`reference` attribute) and the `source` attribute, its
        `span/target` children supply the target ids
    :param tok_coref_dict: dict of term id -> coreference id
    :param term_dict: dict of term id -> token string
    :param term_mw_dict: dict of term id -> multiword id (see get_term_mw_dict)
    :param event_q: identifier compared against the coreference ids of the
        predicate's tokens to decide the 'anchor' flag
    :return: dict with the keys
        'frame' / 'source': attributes of the external reference, or None
            when the predicate has no external reference;
        'toks' / 'tok_ids': stripped token strings and term ids of the span
            (a multiword target is expanded to its component terms);
        'coref_chain' / 'coref_ids': stripped token strings and term ids of
            all terms sharing a coreference id with a span token;
        'anchor': True if event_q is one of those coreference ids
    :raises KeyError: if a term id is missing from term_dict
    """
    ref = predicate.find('externalReferences/externalRef')
    span = predicate.find('span')

    # Reverse indexes, built once instead of rescanning the dicts per token.
    mw_to_toks = dict()
    for tok_id, mw_id in term_mw_dict.items():
        mw_to_toks.setdefault(mw_id, []).append(tok_id)
    coref_to_toks = dict()
    for tok_id, coref in tok_coref_dict.items():
        coref_to_toks.setdefault(coref, []).append(tok_id)

    predicate_dict = dict()
    # A predicate without an external reference simply has no frame/source.
    predicate_dict['frame'] = ref.get('reference') if ref is not None else None
    predicate_dict['source'] = ref.get('source') if ref is not None else None
    predicate_dict['toks'] = []
    predicate_dict['tok_ids'] = []
    predicate_dict['coref_chain'] = set()
    predicate_dict['coref_ids'] = set()
    coref_ids = set()

    # A predicate without a span contributes no tokens.
    targets = span.findall('target') if span is not None else []
    for target in targets:
        target_span = target.get('id')
        # A target naming a multiword expands to its component terms;
        # any other target is a term id itself.
        tok_ids = mw_to_toks.get(target_span, [target_span])
        for tok_id in tok_ids:
            predicate_dict['toks'].append(term_dict[tok_id].strip())
            predicate_dict['tok_ids'].append(tok_id)

            if tok_id in tok_coref_dict:
                coref = tok_coref_dict[tok_id]
                coref_ids.add(coref)
                chain = coref_to_toks[coref]
                predicate_dict['coref_chain'].update(term_dict[member].strip() for member in chain)
                predicate_dict['coref_ids'].update(chain)

    predicate_dict['anchor'] = event_q in coref_ids
    return predicate_dict

# Extract the predicate information for every predicate of the loaded document.
event_q = 'Q23012840'
srl = utils_naf.get_srl(root)
predicates = srl.findall('predicate')
tok_coref_dict = utils_naf.get_tok_coref_dict(root)
term_dict = utils_naf.get_term_dict(root)
term_mw_dict = get_term_mw_dict(root)

# Keep the result for every predicate in `pred_infos`; `pred_info` still ends
# up bound to the result of the last predicate.
pred_infos = []
for predicate in predicates:
    pred_info = get_predicate_info(predicate, tok_coref_dict, term_dict, term_mw_dict, event_q)
    pred_infos.append(pred_info)

In [53]:
# Information extracted for the last predicate processed by the loop in the previous cell.
pred_info


{'frame': 'http://premon.fbk.eu/resource/fn17-becoming_aware',
 'source': '2UkwTd53TetC5ximnQn8gaRYj6Lyc12o',
 'toks': ['recognized'],
 'tok_ids': ['t123'],
 'coref_chain': set(),
 'coref_ids': set(),
 'anchor': False}