# Exploration of Narrative Information

## Base functionality from spaCy

* Entities and their types - PERSON, ORG, LOC, GPE, DATE, PRODUCT, NORP (nationality, religion or political group)
  * These can be updated to condense, expand and provide alternate names
* Dependency parses
  * These can be used to find the verbs and their subjects/objects


In [1]:
# Imports and set up
import csv
import json
import subprocess
from urllib.parse import quote

import requests

from SPARQLWrapper import JSON, SPARQLWrapper

import spacy   # SpaCy v2 must be used if neuralcoref is used ()
from spacy import displacy
from spacy.symbols import nsubj, nsubjpass, VERB

# Report which spaCy release is installed in this kernel
print(f'Spacy version: {spacy.__version__}')

# Load the 'en_core_web_trf' English pipeline; every later cell parses text through `nlp`
nlp = spacy.load('en_core_web_trf')

# SPARQL client bound to the Wikidata query endpoint (used for entity type look-ups)
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")


Spacy version: 3.0.6


In [2]:
# String constants shared by the cells below

# Entity labels of interest (named things first, then dates/numbers)
ner_types = [
    'PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW',
    'LANGUAGE', 'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL',
]
# The subset of labels that are looked up on Wikipedia/Wikidata
noun_ner_types = [
    'PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW',
]

# Wikipedia API URL template; the placeholder 'sub_title' is replaced by a page title
get_id_url = 'https://en.wikipedia.org/w/api.php?action=query&prop=pageprops&titles=sub_title&format=json'
# SPARQL template; the placeholder 'wikidata_id' is replaced by an item id
wikidata_query = (
    'SELECT ?type ?subclass WHERE { wd:wikidata_id wdt:P31 ?type . ?type wdt:P279 ?subclass . '
    'SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . } }'
)

## Text processing

* To prepare text from PDFs for analysis (by paragraph)
* To change third person text to first person

In [3]:
def clean_text(text: str, meta: dict) -> str:
    """
    Combine the lines of an extracted narrative into paragraphs.

    * Drops the leading title/name lines: 1 line when meta['Person'] is '3'
      (third-person summary), otherwise 4 lines.
    * Drops header/footer lines (those containing 'MEMORIAL ' and ' MUSEUM', or 'ECHOES').
    * Strips each remaining line and joins the lines of a paragraph with single spaces
      (each line is followed by one space, including the last line of a paragraph).
    * Emits exactly 2 CR/LFs between paragraphs: blank or whitespace-only lines are
      paragraph breaks, consecutive breaks are collapsed into one, and breaks before
      the first line of content are dropped.

    :param text: The raw text of one narrative
    :param meta: The narrative's metadata row; only the 'Person' key is read
    :return: The cleaned text, with paragraphs separated by '\\n\\n'
    """
    paragraph_break = '\n\n'
    # Third-person summaries start with the person's name (in 'Echoes' PDFs); the other
    # narratives start with 4 lines holding the title of the narrative and the person's name
    lines_to_skip = 1 if meta['Person'] == '3' else 4

    pieces = []
    for line in text.split('\n')[lines_to_skip:]:
        stripped = line.strip()
        if not stripped:
            # Only one break between paragraphs, and none before the first content, so
            # that splitting the result on '\n\n' never yields empty paragraphs
            if pieces and pieces[-1] != paragraph_break:
                pieces.append(paragraph_break)
        elif ('MEMORIAL ' in line and ' MUSEUM' in line) or 'ECHOES' in line:   # Remove headers/footers
            continue
        else:
            # Remove white space around the line, and have 1 white space between words
            pieces.append(stripped + ' ')
    return ''.join(pieces)
        
    
def simplify_text(text: str, meta: dict) -> str:
    """
    Rewrite references to the narrator so that later parsing sees first-person text.

    For third-person narratives (meta['Person'] == '3'):
    * "<name>'s" becomes 'my' and "<name>" becomes 'I', where <name> is the full name
      (with the maiden name in parentheses or not), the given name + surname, or the
      given name alone
    * ' she '/' he ' become ' I ' and ' her '/' his ' become ' my ', chosen by
      meta['Gender'] ('F' selects the feminine forms, anything else the masculine ones)

    For all narratives, 'the {maiden_name}s' and 'the {surname}s' become 'my family'.

    This addresses problems in coref resolution from neuralcoref where the cluster,
    [Erika Neuman Eckstut, Her, Her father, his], is resolved to Erika Neuman Eckstut.
    But Erika != Her father or his

    Empty name fields are skipped: replacing an empty string would insert text between
    every character, and an empty family name would turn 'the s...' into 'my family...'.

    :param text: The (cleaned) narrative text
    :param meta: The narrative's metadata row; the keys 'Given', 'Maiden', 'Surname',
                 'Person' and (for third-person text) 'Gender' are read
    :return: The updated text
    """
    given_name = meta['Given']
    maiden_name = meta['Maiden']
    surname = meta['Surname']

    new_text = text
    # If third person, update name, 'she/he' to be 'I' and 'her/his' to be 'my'
    if meta['Person'] == '3':
        # Longest forms first, so that a full name is consumed before its given-name prefix
        name_forms = []
        if given_name and maiden_name and surname:
            name_forms.append(f"{given_name} ({maiden_name}) {surname}")
            name_forms.append(f"{given_name} {maiden_name} {surname}")
        if given_name and surname:
            name_forms.append(f"{given_name} {surname}")
        if given_name:
            name_forms.append(given_name)

        # Possessives must be handled before the bare names, or "Erika's" would become "I's"
        for name in name_forms:
            new_text = new_text.replace(f"{name}'s", 'my')
        for name in name_forms:
            new_text = new_text.replace(name, 'I')

        if meta['Gender'] == 'F':
            pronouns = ((' she ', ' I '), (' She ', ' I '), (' her ', ' my '), (' Her ', ' My '))
        else:
            # The replacements keep their trailing space so the following word is not glued on
            pronouns = ((' he ', ' I '), (' He ', ' I '), (' his ', ' my '), (' His ', ' My '))
        for pronoun, replacement in pronouns:
            new_text = new_text.replace(pronoun, replacement)

    # Update occurrences of maiden name or surname to be 'my family'
    for article, replacement in (('the', 'my family'), ('The', 'My family')):
        for family_name in (maiden_name, surname):
            if family_name:
                new_text = new_text.replace(f"{article} {family_name}s", replacement)

    # TODO: Family roles ('her father', 'mother', ...) are not yet mapped to a 'FamMember' token

    return new_text


In [4]:
# Process meta-data of narratives: extract each narrative's pages from its PDF, clean and
# simplify the text, and parse it with spaCy
narr_metadata = list()    # One metadata dict per narrative, in CSV order
docs = list()             # The parsed spaCy doc of each narrative, in the same order
with open('Narratives-Meta.csv', newline='') as csvfile:
    narr_dict = csv.DictReader(csvfile)

    # Process each narrative (Source,Title,Start,End,Person,Given,Surname,Maiden,Gender)
    for narr_meta in narr_dict:
        print(narr_meta)
        narr_metadata.append(narr_meta)
        title = narr_meta['Title']
        # Capture each narrative text from the metadata-details in the CSV (pages Start-End of
        # Source are written to a file named by Title); check=True raises CalledProcessError
        # if the extraction fails, instead of going on to read a stale or missing text file
        subprocess.run(['../tools/pdftotext', '-f', narr_meta['Start'], '-l', narr_meta['End'],
                        '-simple', narr_meta['Source'], title], check=True)
        with open(title, 'r', encoding='utf8', errors='ignore') as narr_in:
            text = clean_text(narr_in.read(), narr_meta)
        narr = simplify_text(text, narr_meta)
        docs.append(nlp(narr))
        # TODO: Use neuralcoref to remove anaphora, cataphora (not applied at present)


{'Source': '200090122-echoes-vol_1.pdf', 'Title': 'Erika Eckstut.txt', 'Start': '12', 'End': '12', 'Person': '3', 'Given': 'Erika', 'Surname': 'Eckstut', 'Maiden': 'Neuman', 'Gender': 'F'}
{'Source': '200090122-echoes-vol_1.pdf', 'Title': 'Teach Love.txt', 'Start': '13', 'End': '14', 'Person': '1', 'Given': 'Erika', 'Surname': 'Eckstut', 'Maiden': 'Neuman', 'Gender': 'F'}
{'Source': '200090122-echoes-vol_1.pdf', 'Title': 'Lasting Memory.txt', 'Start': '15', 'End': '15', 'Person': '1', 'Given': 'Erika', 'Surname': 'Eckstut', 'Maiden': 'Neuman', 'Gender': 'F'}


In [8]:
# Experiment with the first document and first paragraph: render the dependency parse of
# each sentence
doc = docs[0]
paragraphs = doc.text.split('\n\n')
# NOTE: Naive sentence split on '.', which will also split at abbreviations such as 'Mrs.'
sentences = paragraphs[0].split('.')
for sentence in sentences:
    sentence_text = sentence.strip()
    if not sentence_text:
        # The fragment after the final '.' is empty; do not parse and render a lone '.'
        continue
    nlp_sentence = nlp(f'{sentence_text}.')
    displacy.render(nlp_sentence, style="dep")

In [12]:
# Use dependency tree to get main verb, subject(s) and object(s):
# print each ROOT token, then its direct dependents (punctuation and passive auxiliaries
# are skipped); for a preposition, also print the tokens directly below it
for root in (tok for tok in doc if tok.dep_ == 'ROOT'):
    print(root)
    for child in root.children:
        if child.dep_ in ('punct', 'auxpass'):
            continue
        print(child.text, child.dep_)
        if child.dep_ != 'prep':
            continue
        for grandchild in child.children:
            print(grandchild.text, grandchild.dep_, grandchild.head.text)
    print()

born
I nsubjpass
on prep
June pobj on
in prep
Znojmo pobj in

was
father nsubj
attorney attr

moved
In prep
1931 pobj In
family nsubj
to prep
Stanesti pobj to

attended
In prep
Stanesti pobj In
I nsubj
school dobj

loved
I nsubj
play xcomp

filled
childhood nsubjpass
with prep
hopes pobj with

tried
In prep
1937 pobj In
however advmod
members nsubj
remove xcomp

cleared
Eventually advmod
court nsubj
him dobj
of prep
charges pobj of
and cc
restored conj

occupied
In prep
1940 pobj In
Union nsubj
Bukovina dobj

driven
later advmod
joined advcl
Soviets nsubjpass
from prep
Stanesti pobj from

carried
Mobs nsubj
then advmod
out prt
attacks dobj

fled
During prep
violence pobj During
I nsubj
to prep
Czernowitz pobj to
with prep
aid pobj with

forced
In prep
fall pobj In
family nsubjpass
settle xcomp

escaped
In prep
1943 pobj In
I nsubj
from prep
ghetto pobj from
using advcl

returned
After prep
escaping pcomp After
I nsubj
to prep
Czechoslovakia pobj to
after prep
II pobj after

married
I n

In [15]:
# for doc in docs:

# Get entities: for each distinct "noun" entity of the first narrative, look up the Wikidata
# id(s) of the Wikipedia page with that title, and print each id's types (P31, "instance of")
# and their superclasses (P279, "subclass of")
entities_dict = dict()    # Entity name (as a Wikipedia title) -> list of Wikidata ids
for ent in docs[0].ents:

    # Get wikidata ids for noun entities and get their data types
    ner_type = ent.label_
    if ner_type not in noun_ner_types:
        continue
    name = ent.text.replace('the ', '').replace('The ', '').replace(' ', '_')
    if name in entities_dict:
        continue    # Already looked up - do not repeat the HTTP request
    # Escape the title so that characters such as '&' or '?' cannot alter the query string
    id_url = get_id_url.replace('sub_title', quote(name, safe=''))
    resp = requests.get(id_url, timeout=30)
    resp.raise_for_status()
    resp_dict = resp.json()
    ids = list()
    for page_id, page in resp_dict.get('query', dict()).get('pages', dict()).items():
        # A page id of '-1' means that there is no page with the title; a page may also
        # lack 'pageprops' or the 'wikibase_item' property
        wikidata_id = None
        if page_id != '-1':
            wikidata_id = page.get('pageprops', dict()).get('wikibase_item')
        if wikidata_id is None:
            print(f'{name}, {ner_type}')
            continue
        print(f'{name}, {ner_type}, {wikidata_id}')
        ids.append(wikidata_id)
        new_query = wikidata_query.replace('wikidata_id', wikidata_id)
        sparql.setQuery(new_query)
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()
        for result in results["results"]["bindings"]:
            print(f'{result["type"]["value"]} , {result["subclass"]["value"]}')
    entities_dict[name] = ids
            
    

Znojmo, GPE, Q214956
http://www.wikidata.org/entity/Q5153359 , http://www.wikidata.org/entity/Q15284
http://www.wikidata.org/entity/Q5153359 , http://www.wikidata.org/entity/Q2183520
http://www.wikidata.org/entity/Q5153359 , http://www.wikidata.org/entity/Q3507889
http://www.wikidata.org/entity/Q5153359 , http://www.wikidata.org/entity/Q14757767
http://www.wikidata.org/entity/Q7819319 , http://www.wikidata.org/entity/Q515
http://www.wikidata.org/entity/Q7819319 , http://www.wikidata.org/entity/Q5153359
http://www.wikidata.org/entity/Q7841907 , http://www.wikidata.org/entity/Q5153359
http://www.wikidata.org/entity/Q8452914 , http://www.wikidata.org/entity/Q515
http://www.wikidata.org/entity/Q15978299 , http://www.wikidata.org/entity/Q5153359
http://www.wikidata.org/entity/Q15978299 , http://www.wikidata.org/entity/Q7930989
Moravian, NORP, Q2368517
http://www.wikidata.org/entity/Q4167410 , http://www.wikidata.org/entity/Q12139612
http://www.wikidata.org/entity/Q4167410 , http://www.wikid

In [44]:
# For each sentence ROOT, walk the tokens attached to its left and dump each one's subtree
# (text, dependency label, number of left/right children, and chain of ancestors)
roots = [tok for tok in doc if tok.dep_ == 'ROOT']
for root in roots:
    print(root)
    for subject in root.lefts:
        print(f'  {subject}')
        for descendant in subject.subtree:
            # The subtree holds the token itself plus everything below it
            assert subject is descendant or subject.is_ancestor(descendant)
            ancestor_texts = [ancestor.text for ancestor in descendant.ancestors]
            print(descendant.text, descendant.dep_, descendant.n_lefts, descendant.n_rights,
                  ancestor_texts)

born
  Eckstut
Erika compound 0 0 ['Neuman', 'Eckstut', 'born']
Neuman compound 1 0 ['Eckstut', 'born']
Eckstut nsubjpass 1 0 ['born']
  was
was auxpass 0 0 ['born']
was
  father
Her poss 0 0 ['father', 'was']
father nsubj 1 0 ['was']
moved
  In
In prep 0 1 ['moved']
1931 pobj 0 0 ['In', 'moved']
  ,
, punct 0 0 ['moved']
  Neumans
the det 0 0 ['Neumans', 'moved']
Neumans nsubj 1 0 ['moved']
attended
  In
In prep 0 1 ['attended']
Stanesti pobj 0 0 ['In', 'attended']
  ,
, punct 0 0 ['attended']
  Erika
Erika nsubj 0 0 ['attended']
loved
  She
She nsubj 0 0 ['loved']
filled
  childhood
Her poss 0 0 ['childhood', 'filled']
childhood nsubjpass 1 0 ['filled']
  was
was auxpass 0 0 ['filled']
tried
  In
In prep 0 1 ['tried']
1937 pobj 0 0 ['In', 'tried']
  ,
, punct 0 0 ['tried']
  however
however advmod 0 0 ['tried']
  ,
, punct 0 0 ['tried']
  members
members nsubj 0 1 ['tried']
of prep 0 1 ['members', 'tried']
the det 0 0 ['fascist', 'of', 'members', 'tried']
fascist pobj 1 1 ['of', 'mem

In [36]:
# Family roles (singular forms)
family_members = [
    'father', 'mother', 'parent',
    'brother', 'sister', 'sibling',
    'aunt', 'uncle', 'cousin',
    'grandmother', 'grandfather', 'grandparent',
]

# Possessive words that may precede a family role, in lower and capitalized forms
possessives = [
    'his', 'her', 'His', 'Her',
    'their', 'Their',
    'our', 'Our',
]

In [23]:
# Example CoNLL output for the first paragraph
# NOTE(review): `new_paragraphs` is not defined in any visible cell - presumably left over
# from an earlier kernel session; confirm before re-running
paragraph1 = new_paragraphs[0]
doc_paragraph1 = nlp(paragraph1)
# `_.conll_str` is a custom Doc extension - presumably registered by a CoNLL formatter
# component that is not added to `nlp` in the visible cells; TODO confirm
print(doc_paragraph1._.conll_str)

1	Erika	Erika	PROPN	NNP	NounType=prop|Number=sing	2	compound	_	_
2	Neuman	Neuman	PROPN	NNP	NounType=prop|Number=sing	3	compound	_	_
3	Eckstut	Eckstut	PROPN	NNP	NounType=prop|Number=sing	5	nsubjpass	_	_
4	was	be	AUX	VBD	VerbForm=fin|Tense=past	5	auxpass	_	_
5	born	bear	VERB	VBN	VerbForm=part|Tense=past|Aspect=perf	0	ROOT	_	_
6	on	on	ADP	IN	_	5	prep	_	_
7	June	June	PROPN	NNP	NounType=prop|Number=sing	6	pobj	_	_
8	12	12	NUM	CD	NumType=card	7	nummod	_	SpaceAfter=No
9	,	,	PUNCT	,	PunctType=comm	7	punct	_	_
10	1928	1928	NUM	CD	NumType=card	7	nummod	_	SpaceAfter=No
11	,	,	PUNCT	,	PunctType=comm	7	punct	_	_
12	in	in	ADP	IN	_	5	prep	_	_
13	Znojmo	Znojmo	PROPN	NNP	NounType=prop|Number=sing	12	pobj	_	SpaceAfter=No
14	,	,	PUNCT	,	PunctType=comm	13	punct	_	_
15	a	a	DET	DT	_	16	det	_	_
16	town	town	NOUN	NN	Number=sing	13	appos	_	_
17	in	in	ADP	IN	_	16	prep	_	_
18	the	the	DET	DT	_	20	det	_	_
19	Moravian	moravian	ADJ	JJ	Degree=pos	20	amod	_	_
20	region	region	NOUN	NN	Number=sing	17	pobj	_	_
21	of	of	A

In [31]:
# Example dependency parse: one line per token with its dependency label, its head (text and
# part of speech) and its direct children
for token in doc_paragraph1:
    head = token.head
    print(token.text, token.dep_, head.text, head.pos_, list(token.children))

Erika compound Neuman PROPN []
Neuman compound Eckstut PROPN [Erika]
Eckstut nsubjpass born VERB [Neuman]
was auxpass born VERB []
born ROOT born VERB [Eckstut, was, on, in, .]
on prep born VERB [June]
June pobj on ADP [12, ,, 1928, ,]
12 nummod June PROPN []
, punct June PROPN []
1928 nummod June PROPN []
, punct June PROPN []
in prep born VERB [Znojmo]
Znojmo pobj in ADP [,, town]
, punct Znojmo PROPN []
a det town NOUN []
town appos Znojmo PROPN [a, in, with]
in prep town NOUN [region]
the det region NOUN []
Moravian amod region NOUN []
region pobj in ADP [the, Moravian, of]
of prep region NOUN [Czechoslovakia]
Czechoslovakia pobj of ADP []
with prep town NOUN [community]
a det community NOUN []
Jewish amod community NOUN []
community pobj with ADP [a, Jewish, dating]
dating acl community NOUN [back]
back advmod dating VERB [to]
to prep back ADV [century]
the det century NOUN []
thirteenth amod century NOUN []
century pobj to ADP [the, thirteenth]
. punct born VERB [ ]
   . PUNCT []

In [40]:
# Dependency parsing - get all verbs and their subjects
# A token qualifies when it is an (active or passive) nominal subject whose head is a verb;
# each hit is recorded as '<subject> <verb>'
verbs = []
for possible_subject in doc:
    if possible_subject.dep not in (nsubj, nsubjpass):
        continue
    if possible_subject.head.pos != VERB:
        continue
    verbs.append(f'{possible_subject.text} {possible_subject.head}')
print(verbs)

['Eckstut born', 'who hoped', 'Neumans moved', 'grandparents lived', 'Erika attended', 'father found', 'She loved', 'childhood filled', 'members tried', 'court cleared', 'he restored', 'Union occupied', 'Romania joined', 'Soviets driven', 'Mobs carried', 'Erika fled', 'Neumans forced', 'Erika escaped', 'father obtained', 'Erika returned', 'they reunited', 'Erika married', 'she permitted', 'Erika became']


In [5]:
# Get entities: print each one's text, character span (start, end) and label
for ent in doc.ents:
    print(f'{ent.text} {ent.start_char} {ent.end_char} {ent.label_}')

Erika Neuman Eckstut 0 20 PERSON
June 12, 1928 33 46 DATE
Znojmo 51 57 GPE
Moravian 73 81 NORP
Czechoslovakia 92 106 GPE
Jewish 114 120 NORP
the thirteenth century 146 168 DATE
Zionist 220 227 NORP
Palestine 269 278 GPE
1931 283 287 DATE
Neumans 293 300 ORG
Stanesti 310 318 ORG
Romanian 334 342 NORP
Bukovina 355 363 GPE
Erika 371 376 PERSON
Stanesti 412 420 PERSON
Erika 422 427 PERSON
Hebrew 470 476 NORP
Beatrice 554 562 PERSON
1937 704 708 DATE
Iron Guard 742 752 ORG
Erika 769 774 PERSON
Stanesti 833 841 ORG
1940 938 942 DATE
the Soviet Union 944 960 GPE
Bukovina 970 978 ORG
A year later 980 992 DATE
Romania 999 1006 GPE
Nazi 1014 1018 NORP
Germany 1019 1026 GPE
the Soviet Union 1046 1062 GPE
Soviets 1068 1075 NORP
Stanesti 1093 1101 ORG
Jews 1154 1158 NORP
Erika 1181 1186 PERSON
Czernowitz 1210 1220 PRODUCT
1941 1272 1276 DATE
Neumans 1282 1289 NORP
Czernowitz 1319 1329 PRODUCT
Transnistria 1412 1424 GPE
1943 1429 1433 DATE
Erika 1435 1440 PERSON
Beatrice 1445 1453 PERSON
the Soviet 

In [6]:
# Determine all PERSONS, DATES, ... in the narrative
persons = set()        # Distinct PERSON entity texts
dates_dict = dict()    # '<date text>-<start char>' -> (start char, end char)
locations = set()      # Distinct GPE entity texts
for ent in doc.ents:
    if ent.label_ == 'PERSON':
        persons.add(ent.text)
    elif ent.label_ == 'DATE':
        # The start offset keeps repeated date texts (e.g., the same year twice) distinct
        dates_dict[f'{ent.text}-{ent.start_char}'] = (ent.start_char, ent.end_char)
    elif ent.label_ == 'GPE':
        locations.add(ent.text)

print(persons)
print(dates_dict)
print(locations)

{'Beatrice', 'Erika', 'Erika Neuman Eckstut', 'Stanesti', 'Nikita Khrushchev'}
{'June 12, 1928-33': (33, 46), 'the thirteenth century-146': (146, 168), '1931-283': (283, 287), '1937-704': (704, 708), '1940-938': (938, 942), 'A year later-980': (980, 992), '1941-1272': (1272, 1276), '1943-1429': (1429, 1433), 'many years-1762': (1762, 1772), '1960-1934': (1934, 1938), 'three years-1940': (1940, 1951)}
{'Germany', 'Transnistria', 'Znojmo', 'the Soviet Union', 'the United States', 'Bukovina', 'Romania', 'Czechoslovakia', 'Palestine'}


In [None]:
# Get DATE and then following verbs
# NOTE: Work in progress - only the DATE offsets and the subject/verb pairs are collected so
# far; `starting_index` and `dict_date_verb` are not yet used
starting_index = 0
dict_date_indices = dict()    # '<date text><start char>' -> (start char, end char)
dict_date_verb = dict()

for ent in doc.ents:
    if ent.label_ == 'DATE':
        dict_date_indices[ent.text + str(ent.start_char)] = (ent.start_char, ent.end_char)

for token in doc:
    # Test the token of this iteration (not a variable left over from another cell)
    if (token.dep == nsubj or token.dep == nsubjpass) and token.head.pos == VERB:
        # NOTE: `verbs` is the list created by the "get all verbs and their subjects" cell
        verbs.append(f'{token.text} {token.head}')

In [42]:
# Process by paragraph: replace each coreference mention with its cluster's main mention
# Requires the neuralcoref pipeline component, which provides `doc._.coref_clusters`
# NOTE(review): `contents` is not defined in any visible cell - confirm its origin
paragraphs = contents.split('\n\n')
# for paragraph in paragraphs:
doc = nlp(paragraphs[0])
text = doc.text
print(f'Original: {text}')
for coref in doc._.coref_clusters:
    print(f'{coref.main}: {coref.mentions}')
    for mention in coref.mentions:
        if mention.text.lower() == 'his' or mention.text.lower() == 'her':
            # Possessive pronoun - keep the possessive, and keep a capitalized mention
            # ('Her', 'His') capitalized; str.isupper() would only be true for 'HER'/'HIS'
            if mention.text[0].isupper():
                text = text.replace(mention.text, f"{coref.main.text.title()}'s")
            else:
                text = text.replace(mention.text, f"{coref.main.text}'s")
        else:
            text = text.replace(mention.text, coref.main.text)
        # NOTE: str.replace() changes every occurrence of the mention text, including ones
        # inside other words (e.g., 'his' in 'this') and ones belonging to other clusters
print(f'New: {text}\n')

Original: Erika (Neuman) Eckstut was born on June 12, 1928, in Znojmo, a town in the Moravian region of Czechoslovakia with a Jewish community dating back to the thirteenth century. Her father was a respected attorney and an ardent Zionist who hoped to emigrate with his family to Palestine. In 1931, the Neumans moved to Stanesti, a town in the Romanian province of Bukovina, where Erika's paternal grandparents lived.
Eckstut: [Eckstut, Her]
Her father: [Her father, his]
New: Erika (Neuman) Eckstut was born on June 12, 1928, in Znojmo, a town in the Moravian region of Czechoslovakia with a Jewish community dating back to the thirteenth century. Eckstut's father was a respected attorney and an ardent Zionist who hoped to emigrate with Her father's family to Palestine. In 1931, the Neumans moved to Stanesti, a town in the Romanian province of Bukovina, where Erika's paternal grandparents lived.



In [6]:
# From https://gist.github.com/tomasonjo/24fbd20a5ffc6c03249147b09b168959#file-ie_coreference-py
# Substitute coref.main for coref.mentions
def coref_resolution(text):
    """
    Return the text with coreference mentions replaced by their cluster's main mention.

    The text is parsed with `nlp`, whose doc must expose `_.coref_clusters`. A mention
    is left unchanged when it is the main mention, has the same text as the main mention,
    or shares at least one space-separated word with it (this handles nested coreference
    scenarios). Otherwise its first token is replaced by the main mention's text followed
    by the white space of the mention's last token, and its remaining tokens are blanked.
    """
    doc = nlp(text)
    # Token texts including trailing white space, so that joining them rebuilds the text
    pieces = [tok.text_with_ws for tok in doc]
    for cluster in doc._.coref_clusters:
        main = cluster.main
        main_words = set(main.text.split(' '))
        for mention in cluster:
            if mention == main:
                continue
            if mention.text == main.text:
                continue
            if not main_words.isdisjoint(mention.text.split(' ')):
                continue
            pieces[mention.start] = main.text + doc[mention.end - 1].whitespace_
            for index in range(mention.start + 1, mention.end):
                pieces[index] = ""
    return "".join(pieces)

In [8]:
# Resolve coreferences in the first element of `contents` and print the result
# NOTE(review): `contents` is not defined in any visible cell; the output below is a whole
# paragraph, so it was presumably a list of paragraphs when this cell ran - confirm
print(coref_resolution(contents[0]))


Erika (Neuman) Eckstut was born on June 12, 1928, in Znojmo, a town in the Moravian region of Czechoslovakia with a Jewish community dating
back to the thirteenth century. Eckstut father was a respected attorney and an ardent Zionist who hoped to emigrate with Her father family to Palestine. In 1931, the Neumans moved to Stanesti, a town in the Romanian province of Bukovina, where Erika's paternal grandparents lived.


In [8]:
# Parse a short reflexive sentence and print, per token: text, part of speech, head (text and
# part of speech) and direct children
doc = nlp("I dedicated myself to the cause.")
for token in doc:
    head = token.head
    print(token.text, token.pos_, head.text, head.pos_, list(token.children))

I PRON dedicated VERB []
dedicated VERB dedicated VERB [I, myself, to, .]
myself PRON dedicated VERB []
to ADP dedicated VERB [cause]
the DET cause NOUN []
cause NOUN to ADP [the]
. PUNCT dedicated VERB []


In [19]:
import spacy   
from spacy.matcher import DependencyMatcher

# (Re)load the English pipeline and create a dependency matcher that shares its vocabulary;
# the patterns are registered in the next cell
nlp = spacy.load("en_core_web_trf")
matcher = DependencyMatcher(nlp.vocab)

In [29]:
# "was born ... on ... in ..."
# Each pattern is a list of token specifications. The first one is the anchor; every other one
# is attached to an earlier one (LEFT_ID) by a relation operator (REL_OP).
born_pattern = [
    # Anchor token: the word "born"
    {'RIGHT_ID': 'born', 'RIGHT_ATTRS': {'ORTH': 'born'}},
    # "on" is an immediate dependent of "born" ('>')
    {'LEFT_ID': "born", 'REL_OP': '>', 'RIGHT_ID': "born_on", 'RIGHT_ATTRS': {'ORTH': 'on'}},
    # A token of a DATE entity anywhere below "on" ('>>')
    {'LEFT_ID': "born_on", 'REL_OP': '>>', 'RIGHT_ID': "born_date", 'RIGHT_ATTRS': {'ENT_TYPE': 'DATE'}},
]

family_pattern = [
    # Anchor token: a family relation
    {
        'RIGHT_ID': 'family_member',
        'RIGHT_ATTRS': {
            'ORTH': {
                'IN': [
                    'sister', 'brother', 'mother', 'father', 'cousin',
                    'grandmother', 'grandfather', 'aunt', 'uncle',
                ],
            },
        },
    },
    # A proper noun in apposition to the family relation (e.g., "sister Beatrice")
    {
        'LEFT_ID': 'family_member',
        'REL_OP': '>',
        'RIGHT_ID': 'proper_name',
        'RIGHT_ATTRS': {'DEP': 'appos', 'POS': 'PROPN'},
    },
]

matcher.add("born", [born_pattern])
matcher.add("family", [family_pattern])

doc = nlp("I was born on June 12, 1928, in Znojmo, and my sister Beatrice was born in 1929.")
matches = matcher(doc)
print(matches)
# Each match is (pattern id, token indices); the indices are in the order of the pattern's
# token specifications
for match_id, token_ids in matches:
    # Get the string representation
    string_id = nlp.vocab.strings[match_id]
    for token_index in token_ids:
        print(string_id, doc[token_index].text)

[(14611628419808709137, [2, 3, 4]), (14611628419808709137, [2, 3, 5]), (14611628419808709137, [2, 3, 6]), (14611628419808709137, [2, 3, 7]), (14611628419808709137, [2, 3, 4]), (14611628419808709137, [2, 3, 5]), (14611628419808709137, [2, 3, 6]), (14611628419808709137, [2, 3, 7]), (14611628419808709137, [2, 3, 4]), (14611628419808709137, [2, 3, 5]), (14611628419808709137, [2, 3, 6]), (14611628419808709137, [2, 3, 7]), (14611628419808709137, [2, 3, 4]), (14611628419808709137, [2, 3, 5]), (14611628419808709137, [2, 3, 6]), (14611628419808709137, [2, 3, 7]), (14611628419808709137, [2, 3, 4]), (14611628419808709137, [2, 3, 5]), (14611628419808709137, [2, 3, 6]), (14611628419808709137, [2, 3, 7]), (14611628419808709137, [2, 3, 4]), (14611628419808709137, [2, 3, 5]), (14611628419808709137, [2, 3, 6]), (14611628419808709137, [2, 3, 7]), (14611628419808709137, [2, 3, 4]), (14611628419808709137, [2, 3, 5]), (14611628419808709137, [2, 3, 6]), (14611628419808709137, [2, 3, 7]), (146116284198087091

In [4]:
doc = nlp("I was born on June 12, 1928, in Znojmo, a town in the Moravian region of Czechoslovakia with a Jewish community dating back to the thirteenth century. My father was a respected attorney and an ardent Zionist who hoped to emigrate with his family to Palestine. In 1931, my family moved to Stanesti, a town in the Romanian province of Bukovina, where my paternal grandparents lived. In Stanesti, I attended the public school as well as the Hebrew school, which my father had helped found. I loved to play with my sister Beatrice and the other children in the town and enjoyed being with my grandfather. My childhood was filled with hopes and dreams for the future. In 1937, however, members of the fascist Iron Guard tried to remove my father from his position as the chief civil official in Stanesti. Eventually, a court cleared him of the fabricated charges and he was restored to his post. In 1940, the Soviet Union occupied Bukovina. A year later, when Romania joined Nazi Germany in the war against the Soviet Union, the Soviets were driven from Stanesti. Mobs then carried out bloody attacks on the town's Jews. During the violence, I and my family fled to Czernowitz with the aid of the local police chief. In fall of 1941, my family were forced to settle in the Czernowitz ghetto, where living conditions were poor and they were subject to deportation to Transnistria. In 1943, I and Beatrice escaped from the ghetto using false papers that their father had obtained. After escaping to the Soviet Union, I and Beatrice returned to Czechoslovakia after World War II, where they were eventually reunited with their parents. I married an officer in the Czech army and raised two children. After many years of hard effort and my mother and sister's appeals to Soviet leader Nikita Khrushchev, I was permitted to emigrate from Czechoslovakia to the United States in 1960, three years after the death of my husband. Once in the United States, I became a supervisor of a pathology lab.")
matches = matcher(doc)

print(matches)

# Collect the tokens matched by the 'born_date' and 'born_place' nodes of born_pattern
born_on_date = set()
born_in_place = set()
for match_id, token_ids in matches:
    # Only matches of the "born" pattern line up with born_pattern; the token ids of any
    # other registered pattern (e.g., "family") must not be read against it
    if nlp.vocab.strings[match_id] != 'born':
        continue
    # Each token_id corresponds to one pattern dict; zip() also guards against a match that
    # is longer than the pattern currently defined
    for pattern_node, token_id in zip(born_pattern, token_ids):
        right_id = pattern_node['RIGHT_ID']
        if right_id == 'born_date':
            born_on_date.add(doc[token_id].text)
        elif right_id == 'born_place':
            born_in_place.add(doc[token_id].text)

print(born_on_date)
print(born_in_place)

[(14611628419808709137, [2, 3, 4, 9, 10]), (14611628419808709137, [2, 3, 4, 9, 19]), (14611628419808709137, [2, 3, 5, 9, 10]), (14611628419808709137, [2, 3, 5, 9, 19]), (14611628419808709137, [2, 3, 6, 9, 10]), (14611628419808709137, [2, 3, 6, 9, 19]), (14611628419808709137, [2, 3, 7, 9, 10]), (14611628419808709137, [2, 3, 7, 9, 19])]
{'12', '1928', 'June', ','}
{'Znojmo', 'Czechoslovakia'}


In [38]:
doc = nlp("I remember the time we left Russia and we fled to Poland. We had to leave Kiev in a hurry in 1944. My friend Monika told me that the NKVD secret police were coming to get my sister and the lady we werewith, Mrs. Dirnfeld. Monika didn't know that Beatrice was my sister. I never talked about my sister and who she was, or the lady, Mrs. Dirnfeld.")
matches = matcher(doc)

print(matches)

# Collect the tokens matched by the 'born_date' and 'born_place' nodes of born_pattern
born_on_date = set()
born_in_place = set()
for match_id, token_ids in matches:
    # Only matches of the "born" pattern line up with born_pattern; the token ids of any
    # other registered pattern (e.g., "family") must not be read against it
    if nlp.vocab.strings[match_id] != 'born':
        continue
    # Each token_id corresponds to one pattern dict; zip() also guards against a match that
    # is longer than the pattern currently defined
    for pattern_node, token_id in zip(born_pattern, token_ids):
        right_id = pattern_node['RIGHT_ID']
        if right_id == 'born_date':
            born_on_date.add(doc[token_id].text)
        elif right_id == 'born_place':
            born_in_place.add(doc[token_id].text)

print(born_on_date)
print(born_in_place)

[]
set()
set()


In [40]:
# Show what is registered in the matcher under the key 'was_born'
# NOTE(review): the visible cells only register 'born' and 'family'; the output below
# presumably comes from an earlier version of the patterns - confirm
print(matcher.get('was_born'))


(None, [[{'RIGHT_ID': 'was_born', 'RIGHT_ATTRS': {'ORTH': 'was born'}}, {'LEFT_ID': 'was_born', 'REL_OP': '<', 'RIGHT_ID': 'born_on', 'RIGHT_ATTRS': {'ORTH': ' on '}}, {'LEFT_ID': 'was_born', 'REL_OP': '<', 'RIGHT_ID': 'born_in', 'RIGHT_ATTRS': {'ORTH': ' in '}}], [{'RIGHT_ID': 'was_born', 'RIGHT_ATTRS': {'ORTH': 'was born'}}, {'LEFT_ID': 'was_born', 'REL_OP': '>', 'RIGHT_ID': 'born_on', 'RIGHT_ATTRS': {'ORTH': ' on '}}, {'LEFT_ID': 'was_born', 'REL_OP': '>', 'RIGHT_ID': 'born_in', 'RIGHT_ATTRS': {'ORTH': ' in '}}], [{'RIGHT_ID': 'was_born', 'RIGHT_ATTRS': {'ORTH': 'was born'}}, {'LEFT_ID': 'was_born', 'REL_OP': ';', 'RIGHT_ID': 'born_on', 'RIGHT_ATTRS': {'ORTH': ' on '}}, {'LEFT_ID': 'was_born', 'REL_OP': ';', 'RIGHT_ID': 'born_in', 'RIGHT_ATTRS': {'ORTH': ' in '}}], [{'RIGHT_ID': 'was_born', 'RIGHT_ATTRS': {'ORTH': 'was born'}}, {'LEFT_ID': 'was_born', 'REL_OP': '$+', 'RIGHT_ID': 'born_on', 'RIGHT_ATTRS': {'ORTH': ' on '}}, {'LEFT_ID': 'was_born', 'REL_OP': '$+', 'RIGHT_ID': 'born_in

In [3]:
# Visualize the dependency parse of a sample sentence (a date, a move and a 'where' clause)
text1 = "In 1931, my family moved to Stanesti, a town in the Romanian province of Bukovina, where my paternal grandparents lived."
nlp_sentence = nlp(text1)
displacy.render(nlp_sentence, style="dep")

In [3]:
# Visualize the dependency parse of a sample sentence ('tried to remove ... from ...')
text2 = "In 1937, however, members of the fascist Iron Guard tried to remove my father from his position as the chief civil official in Stanesti."
nlp_sentence = nlp(text2)
displacy.render(nlp_sentence, style="dep")

In [2]:
# Visualize the dependency parse of a sample sentence with a compound subject ('I and Beatrice')
text5 = "After escaping to the Soviet Union, I and Beatrice returned to Czechoslovakia after World War II, where they were eventually reunited with their parents."

nlp_sentence = nlp(text5)
displacy.render(nlp_sentence, style="dep")

In [2]:
# Visualize the dependency parse of a short sentence with an embedded clause ('on which I slept')
text6 = "The bed on which I slept broke."

nlp_sentence = nlp(text6)
displacy.render(nlp_sentence, style="dep")

In [3]:
# Visualize the dependency parse of a sentence with several nested clauses
text7 = "John mopped the floor with the dress Mary bought while studying and traveling in Thailand."

nlp_sentence = nlp(text7)
displacy.render(nlp_sentence, style="dep")