In [1]:
import json
from nltk.tokenize import TreebankWordTokenizer as tbwt

In [2]:
# Load every example from the train/test/dev split files into one flat list.
# Each file is parsed as a JSON object and only its values are kept (the keys
# are discarded).
all_examples = []

for file in ['train', 'test', 'dev']:
    # Be explicit about UTF-8 so decoding does not depend on the platform's
    # default locale encoding (JSON interchange text is UTF-8).
    with open(f'{file}.json', 'r', encoding='utf-8') as f:
        all_examples.extend(json.load(f).values())

In [3]:
# incident type -> {normalized token -> occurrence count}
words_by_type = dict()

In [4]:
# Count, per incident type, how often each normalized (stripped, lower-cased)
# Treebank token occurs in the example documents. Only the first template of
# an example decides which incident type its tokens are counted under.

# Rebuild the table from scratch so that re-running this cell does not add the
# same counts a second time.
words_by_type = {}

# One tokenizer instance is enough; it is reused for every document.
tokenizer = tbwt()

for ex in all_examples:
    template = ex['templates'][0]
    incident_type = template['incident_type']
    doctext = ex['doctext']
    type_counts = words_by_type.setdefault(incident_type, {})

    # span_tokenize yields (start, end) character offsets into doctext.
    for start, end in tokenizer.span_tokenize(doctext):
        normalized_span = doctext[start:end].strip().lower()
        type_counts[normalized_span] = type_counts.get(normalized_span, 0) + 1

In [5]:
# Show every incident type for which token counts were collected.
words_by_type.keys()

dict_keys(['life.die.deathcausedbyviolentevents', 'movement.transportartifact.hide', 'conflict.attack.selfdirectedbattle', 'life.injure.illnessdegradationphysical', 'contact.commitmentpromiseexpressintent.n/a', 'justice.arrestjaildetain.arrestjaildetain', 'contact.discussion.meet', 'life.injure.injurycausedbyviolentevents', 'personnel.endposition.n/a', 'transaction.transferownership.n/a', 'justice.investigate.investigatecrime', 'contact.collaborate.n/a', 'government.agreements.violateagreement', 'movement.transportperson.prevententry', 'contact.commandorder.broadcast', 'transaction.transfermoney.n/a', 'justice.initiatejudicialprocess.n/a', 'contact.prevarication.broadcast', 'conflict.attack.stealrobhijack', 'life.injure.illnessdegradationhungerthirst', 'contact.negotiate.meet', 'contact.threatencoerce.n/a', 'contact.commitmentpromiseexpressintent.broadcast', 'personnel.elect.n/a', 'transaction.transfermoney.purchase', 'contact.mediastatement.broadcast', 'contact.requestadvise.correspon

In [478]:
# Dump the token counts of a single incident type as indented JSON, ordered
# from the most frequent token to the least frequent one.
event_type = "movement.transportperson.n/a"
counts_desc = dict(
    sorted(
        words_by_type[event_type].items(),
        key=lambda pair: pair[1],
        reverse=True,  # stable: tokens with equal counts keep their insertion order
    )
)
print(json.dumps(counts_desc, indent=4))

{
    "the": 1140,
    ",": 950,
    ".": 711,
    "of": 544,
    "to": 514,
    "and": 483,
    "in": 470,
    "a": 394,
    "that": 199,
    "-": 187,
    "on": 174,
    "is": 154,
    "for": 142,
    "\u201d": 138,
    "\u201c": 136,
    "it": 127,
    "\"": 125,
    "as": 121,
    "\u2019s": 117,
    "was": 116,
    "said": 110,
    "from": 109,
    "by": 108,
    "he": 105,
    "has": 99,
    "at": 98,
    "with": 95,
    "an": 92,
    "have": 87,
    "his": 79,
    "they": 76,
    "'s": 69,
    "are": 69,
    "not": 68,
    "who": 68,
    "were": 66,
    "trump": 66,
    "be": 63,
    "but": 61,
    "after": 55,
    "their": 55,
    "which": 52,
    "this": 52,
    "been": 52,
    ":": 51,
    "one": 50,
    "russia": 48,
    "i": 48,
    "\u2014": 47,
    "she": 47,
    "people": 46,
    "her": 46,
    "or": 45,
    ")": 45,
    "(": 44,
    "more": 42,
    "clinton": 42,
    "government": 42,
    "russian": 41,
    "we": 41,
    "had": 40,
    "would": 39,
    "also": 38,
    "

In [752]:
# Hand-curated trigger keywords per incident type.
#
# Every keyword is a lower-case stem or phrase that is matched as a plain
# substring of the lower-cased document text, so entries must not carry
# leading/trailing whitespace and a stem such as 'negotiat' is meant to cover
# 'negotiate', 'negotiation', 'negotiating', ...
#
# Each incident type must appear exactly once: a repeated key in a dict
# literal is not an error, the later entry silently replaces the earlier one.
keywords = {
    'movement.transportartifact.nonviolentthrowlaunch': ['heav', 'throw', 'threw', 'flung', 'hurl', 'fling'],
    'life.die.deathcausedbyviolentevents': ['death', 'war', 'kill', 'genocide', 'attack', 'shoot', 'massacre', 'die', 'shot', 'homicide', 'execut', 'slay', 'slain', 'violence', 'murder', 'assassinat', 'slaughter', 'dead', 'bombing', 'assault', 'decapitat', 'drown', 'fatal', 'exterminat', 'holocaust'],
    'movement.transportartifact.hide': ['smuggl', 'conceal'],
    'conflict.attack.selfdirectedbattle': ['war', 'fight', 'kill', 'coup', 'attack', 'conflict', 'clash', 'bomb', 'combat', 'strike', 'massacr', 'assault', 'battle', 'annihilat', 'slaughter', 'skirmish', 'assail', 'aggress'],
    'life.injure.illnessdegradationphysical': ['disease', 'illness', 'sick', 'surger', 'condition', 'fever'],
    'contact.commitmentpromiseexpressintent.n/a': ['pledg', 'promise', 'vow', 'oath'],
    'justice.arrestjaildetain.arrestjaildetain': ['detain', 'arrest', 'prison', 'capture', 'incarcerat', 'crime', 'jail'],
    'contact.discussion.meet': ['debate', 'interview', 'meet', 'met', 'talk', 'question', 'convene', 'discuss', 'dialogue', 'negotiat', 'confer'],
    'life.injure.injurycausedbyviolentevents': ['attack', 'kill', 'assault', 'injur', 'wound', 'shoot', 'shot', 'tortur', 'violen', 'rape', 'raping', 'strik', 'death', 'crime', 'punch', 'bruis', 'harm', 'hurt', 'kick', 'brutal', 'shed blood', 'mutilate'],
    'personnel.endposition.n/a': ['retir', 'resign', 'oust', 'fir', 'quit', 'dismiss', 'discharg', 'layoff', 'sack'],
    'transaction.transferownership.n/a': ['donat', 'auction', 'purchas', 'acquire', 'stolen', 'buy', 'loan', 'lend', 'lent', 'provid', 'rent', 'borrow', 'grant', 'obtain', 'bought', 'fund', 'embargo', 'sanction'],
    'justice.investigate.investigatecrime': ['investigat'],
    'contact.collaborate.n/a': ['email', 'told', 'telephone', 'meet', 'congregate', 'fax', 'met'],
    'government.agreements.violateagreement': ['ceasefire', 'agreement', 'violat', 'treaty', 'met'],
    'movement.transportperson.prevententry': ['denied', 'obstruct', 'block'],
    'contact.commandorder.broadcast': ['demand', 'command', 'urg'],
    'transaction.transfermoney.n/a': ['pay', 'purchas', 'buy', 'rent', 'lend', 'loan', 'lent', 'reimburs', 'sanction', 'fund', 'donate', 'borrow', 'bought', 'provid', 'acquir', 'compensat', 'auction', 'paid', 'stipend', 'income', 'donat', 'embargo', 'boycott'],
    'justice.initiatejudicialprocess.n/a': ['charg', 'trial', 'proceed', 'accus', 'testif', 'indict'],
    # 'deceiv' was previously misspelt 'deciev', which never matches correctly spelt text
    'contact.prevarication.broadcast': ['lie', 'deceiv', 'mislead', 'misled', 'deceive'],
    'conflict.attack.stealrobhijack': ['rob', 'hijack', 'commandeer'],
    'life.injure.illnessdegradationhungerthirst': ['starv', 'dehydrat'],
    'contact.negotiate.meet': ['debate', 'negotiat'],
    'contact.threatencoerce.n/a': ['threaten', 'intimidat', 'blackmail'],
    'contact.commitmentpromiseexpressintent.broadcast': ['vow', 'pledg', 'oath', 'promis'],
    'personnel.elect.n/a': ['elect'],
    'transaction.transfermoney.purchase': ['purchas', 'buy', 'bought', 'auction', 'obtain', 'acquir'],
    'contact.mediastatement.broadcast': ['preach', 'broadcast', 'address'],
    'contact.requestadvise.correspondence': ['urg', 'recommend', 'advocat', 'suggest', 'advis'],
    'movement.transportartifact.disperseseparate': ['shipment', 'transport', 'freight'],
    'government.legislate.legislate': ['abolish', 'sign', 'enact', 'vot', 'legislat'],
    'movement.transportperson.preventexit': ['imprison', 'detain', 'captur', 'arrest', 'incarcerat', 'jail'],
    'contact.negotiate.n/a': ['negotiat', 'debat'],
    'government.agreements.n/a': ['agreement', 'treat', 'ceasefire'],
    'life.injure.n/a': ['assault', 'harm', 'kick', 'shot', 'shoot', 'starv', 'wound', 'rape', 'dehydrat', 'punch', 'bruis', 'attack', 'violen', 'injur', 'strik', 'brutaliz', 'hurt', 'sick', 'ill', 'tortur', 'whip', 'fever', 'surgery'],
    'justice.judicialconsequences.extradite': ['extradit'],
    'personnel.endposition.firinglayoff': ['oust', 'fir', 'discharg', 'sack', 'layoff', 'dismiss'],
    'justice.investigate.n/a': ['investigat'],
    'movement.transportartifact.sendsupplyexport': ['suppl', 'export'],
    'government.agreements.acceptagreementcontractceasefire': ['agreement', 'treat', 'ceasefire'],
    'disaster.fireexplosion.fireexplosion': ['fire', 'explosion', 'blaz', 'explod'],
    'contact.collaborate.correspondence': ['fax', 'email', 'telephon'],
    'transaction.transaction.transfercontrol': ['grab', 'seiz', 'confiscat', 'transfer'],
    'transaction.transfermoney.giftgrantprovideaid': ['donat', 'provid', 'fund', 'grant'],
    'contact.commitmentpromiseexpressintent.correspondence': ['vow', 'promis', 'pledg', 'oath'],
    'conflict.attack.airstrikemissilestrike': ['airstrike', 'bombard'],
    'government.formation.n/a': ['take over', 'annex', 'establish', 'took over', 'inaugurat'],
    'movement.transportperson.hide': ['conceal', 'smuggl'],
    'justice.judicialconsequences.execute': ['death penalt', 'execut'],
    'transaction.transaction.embargosanction': ['boycot', 'sanction', 'embargo', 'injunction'],
    'conflict.attack.stabbing': ['stab', 'gouge'],
    'conflict.yield.retreat': ['withdraw', 'retreat', 'withdrew'],
    'transaction.transfermoney.embargosanction': ['sanction', 'boycott', 'embargo', 'injunction'],
    'manufacture.artifact.build': ['construct', 'buil', 'assembl'],
    'inspection.sensoryobserve.n/a': ['inspect', 'immigrat', 'elect', 'vot'],
    'justice.initiatejudicialprocess.trialhearing': ['proceed', 'testif', 'trial'],
    'movement.transportartifact.smuggleextract': ['smuggl', 'traffick'],
    'contact.requestadvise.broadcast': ['advocat', 'recommend', 'advis', 'urg', 'suggest'],
    'contact.commitmentpromiseexpressintent.meet': ['promis', 'vow', 'pledg', 'oath'],
    'government.spy.spy': ['spy', 'spie', 'surveillanc'],
    'government.vote.violationspreventvote': ['voter intimidation', 'vot'],
    'contact.discussion.n/a': ['interview', 'dialogue', 'meet', 'convene', 'debat', 'met', 'discuss', 'negotiat', 'confer'],
    'contact.commandorder.correspondence': ['command', 'urg', 'demand'],
    'justice.judicialconsequences.n/a': ['extradit', 'convict', 'execut', 'death penalty', 'guilty'],
    'conflict.attack.firearmattack': ['shoot', 'shot', 'firefight'],
    'contact.prevarication.correspondence': ['deceiv', 'lying', 'lie', 'mislead', 'misled'],
    'movement.transportartifact.bringcarryunload': ['shipment', 'transport', 'freight'],
    'conflict.attack.strangling': ['strangl'],
    'contact.requestadvise.n/a': ['advis', 'advocat', 'recommend', 'urg', 'suggest'],
    'artifactexistence.damagedestroy.destroy': ['destroy', 'eradicat', 'demolish', 'ransack'],
    'life.die.n/a': ['shot', 'deceas', 'slain', 'fatal', 'kill', 'perish', 'massacre', 'die', 'murder', 'shoot', 'execut', 'assassinat', 'genocid', 'death', 'homicide', 'slaughter', 'decapitat', 'drown', 'exterminate', 'passed away', 'holocaust'],
    'contact.threatencoerce.meet': ['intimidat', 'blackmail', 'threaten'],
    'personnel.startposition.hiring': ['hir', 'employ', 'appoint', 'recruit'],
    'conflict.attack.n/a': ['attack', 'strik', 'assault', 'hijack', 'rob', 'stab', 'coup', 'fight', 'invad', 'war', 'massacr', 'hang', 'combat', 'bombard', 'slaughter', 'assail', 'kill', 'shoot', 'violence', 'clash', 'explo', 'blaz', 'skirmish', 'battl', 'strangl', 'bomb', 'annihilat', 'blast', 'poison', 'encroach', 'aggress', 'arson', 'gouge', 'hung', 'inferno'],
    'personnel.endposition.quitretire': ['quit', 'retir', 'resign'],
    'justice.initiatejudicialprocess.chargeindict': ['indict', 'accus', 'charg'],
    'contact.requestadvise.meet': ['urg', 'advis', 'recommend', 'suggest', 'advocat'],
    'government.formation.startgpe': ['inaugurat', 'establish'],
    'transaction.transfermoney.payforservice': ['reimburs', 'compensat', 'pay', 'incom', 'paid', 'wage', 'stipend'],
    'personnel.elect.winelection': ['win', 'won', 'elect'],
    'movement.transportperson.grantentryasylum': ['immigrat', 'asylum', 'migrat'],
    'movement.transportartifact.n/a': ['suppli', 'supply', 'flung', 'import', 'ship', 'hurl', 'traffick', 'smuggl', 'freight', 'plung', 'bring', 'brought', 'transport', 'export', 'fling', 'obstruct', 'throw', 'detain', 'incarcerate', 'plummet', 'conceal', 'imprison'],
    'contact.publicstatementinperson.broadcast': ['iterat', 'proclaim', 'announc', 'propos', 'profess'],
    'contact.discussion.correspondence': ['conven', 'interview', 'discus', 'confer', 'meet', 'debat', 'negotiat', 'dialogue'],
    'movement.transportperson.disperseseparate': ['shipment', 'transport', 'freight'],
    'transaction.transferownership.purchase': ['acquir', 'purchas', 'obtain', 'buy', 'auction', 'bought'],
    'movement.transportperson.n/a': ['fled', 'escape', 'flee', 'migrat', 'plummet', 'transport', 'traffick', 'evacuat', 'asylum', 'smuggl', 'walk', 'roam', 'incarcerate', 'stumbl', 'unload', 'capture', 'imprison', 'march', 'detain', 'jail', 'plung', 'shipment', 'arrest', 'conceal', 'entry'],
    'conflict.demonstrate.n/a': ['protest'],
    'conflict.demonstrate.marchprotestpoliticalgathering': ['protest'],
    'movement.transportperson.smuggleextract': ['traffick', 'smuggl'],
    'inspection.sensoryobserve.physicalinvestigateinspect': ['inspect', 'immigrat'],
    'contact.publicstatementinperson.n/a': ['reiterat', 'announc', 'profess', 'proclaim', 'propos'],
    'justice.judicialconsequences.convict': ['convict', 'guilty'],
    'contact.funeralvigil.meet': ['funeral', 'burial'],
    'manufacture.artifact.createmanufacture': ['produc', 'manufactur', 'assembl', 'made', 'create'],
    'conflict.yield.n/a': ['withdraw', 'capitulat', 'retreate', 'surrender', 'retreat', 'pullback'],
    'government.formation.mergegpe': ['annex', 'take over', 'took over', 'taken over', 'taking over'],
    'transaction.transfermoney.borrowlend': ['loan', 'lend', 'borrow', 'rent', 'lent'],
    'transaction.transaction.n/a': ['fund', 'grab', 'donat', 'provid', 'grant', 'seiz', 'confiscat', 'embargo', 'transfer', 'injunction'],
    'transaction.transferownership.embargosanction': ['injunction', 'embargo', 'boycott', 'sanction'],
    'transaction.transaction.giftgrantprovideaid': ['donat', 'grant', 'fund', 'provid'],
    'artifactexistence.damagedestroy.damage': ['damag', 'tor', 'wreck', 'batter', 'vandaliz', 'tear', 'punctur', 'pierc'],
    'contact.prevarication.n/a': ['mislead', 'lie', 'deceiv', 'lying'],
    'government.vote.n/a': ['vot'],
    'conflict.attack.invade': ['invad', 'encroach'],
    'contact.threatencoerce.correspondence': ['intimidat', 'blackmail', 'threaten'],
    'personnel.startposition.n/a': ['hir', 'employ', 'recruit', 'appoint'],
    'contact.funeralvigil.n/a': ['burial', 'funeral'],
    'contact.threatencoerce.broadcast': ['intimidat', 'blackmail', 'threaten'],
    'conflict.attack.biologicalchemicalpoisonattack': ['poison'],
    'conflict.attack.bombing': ['bomb', 'strik', 'struck', 'detonate', 'blast', 'explod'],
    'life.die.nonviolentdeath': ['die', 'decease', 'pass away', 'passed away', 'drown', 'perish', 'passes away'],
    'contact.collaborate.meet': ['met', 'meet', 'congregat'],
    'contact.negotiate.correspondence': ['negotiat', 'debat'],
    'government.agreements.rejectnullifyagreementcontractceasefire': ['agreement', 'treat', 'ceasefire'],
    'disaster.accidentcrash.accidentcrash': ['collision', 'crash', 'collid', 'accident'],
    'transaction.transferownership.borrowlend': ['loan', 'borrow', 'rent', 'lent', 'lend'],
    'movement.transportartifact.preventexit': ['incarcerat', 'arrest', 'jail', 'detain', 'captur', 'imprison'],
    'movement.transportperson.evacuationrescue': ['escap', 'flee', 'fled', 'evacuat'],
    'movement.transportartifact.receiveimport': ['import'],
    'artifactexistence.damagedestroy.n/a': ['vandaliz', 'wreck', 'damag', 'punctur', 'demolish', 'batter', 'torn', 'destroy', 'tear', 'eradicat', 'ransack', 'pierc'],
    'movement.transportartifact.grantentry': ['import', 'bring', 'brought'],
    'government.vote.castvote': ['vot'],
    'contact.commandorder.meet': ['demand', 'urg', 'command'],
    'conflict.yield.surrender': ['capitulat', 'surrender'],
    'contact.commandorder.n/a': ['demand', 'command', 'urg'],
    'contact.prevarication.meet': ['deceiv', 'mislead', 'misled', 'lie', 'lying'],
    'transaction.transferownership.giftgrantprovideaid': ['fund', 'grant', 'provid', 'donat'],
    'manufacture.artifact.n/a': ['produc', 'compos', 'construct', 'manufactur', 'buil', 'creat', 'make', 'made'],
    'inspection.sensoryobserve.inspectpeopleorganization': ['immigrat', 'inspect'],
    'movement.transportperson.bringcarryunload': ['transport', 'unload'],
    'movement.transportartifact.fall': ['plummet', 'transport', 'shipment', 'plung', 'freight'],
    'inspection.sensoryobserve.monitorelection': ['vot', 'election'],
    'movement.transportperson.fall': ['stumbl', 'plummet'],
    'manufacture.artifact.createintellectualproperty': ['compos', 'produc'],
    'contact.mediastatement.n/a': ['broadcast', 'address', 'preach'],
    'conflict.attack.hanging': ['hang', 'hung'],
    'movement.transportartifact.prevententry': ['obstruct', 'denied entry', 'roadblock'],
    'conflict.attack.setfire': ['arson', 'blaze', 'inferno'],
    'movement.transportperson.selfmotion': ['walk', 'roam', 'march']
}

In [753]:
# Print the first template of every example whose document text contains none
# of the keywords listed for that template's incident type. Examples whose
# incident type has no entry in `keywords` are skipped.
for ex in all_examples:
    template = ex['templates'][0]
    type_keywords = keywords.get(template['incident_type'])
    if type_keywords is None:
        continue

    lowered_doctext = ex['doctext'].lower()
    if not any(keyword in lowered_doctext for keyword in type_keywords):
        print(template)