In [1]:
import spacy
from spacy.displacy import render
from spacy.matcher import Matcher
import pandas as pd
from collections import OrderedDict

In [2]:
# Load the 'en_core_web_md' pipeline once; later cells reuse `en_model`.
en_model = spacy.load('en_core_web_md')
# Example sentence, written as adjacent string literals that Python joins
# into a single string.
sentence = ("In 1541 Desoto wrote in his journal that the Pascagoula "
            "people ranged as far north as the confluence "
            "of the Leaf and Chickasawhay rivers at 30.4, -88.5.")

# Run the pipeline on the sentence and display the entity spans it found.
# In the recorded output, "Leaf" and "-88.5" are not among the entities.
parsed_sent = en_model(sentence)
parsed_sent.ents

(1541, Desoto, Pascagoula, Chickasawhay, 30.4)

In [3]:
# Show every token joined to its fine-grained tag as "token_TAG".
# spaCy uses the OntoNotes 5 POS tag set for `tok.tag_`.
' '.join('{}_{}'.format(tok, tok.tag_) for tok in parsed_sent)

'In_IN 1541_CD Desoto_NN wrote_VBD in_IN his_PRP$ journal_NN that_IN the_DT Pascagoula_NNP people_NNS ranged_VBD as_RB far_RB north_RB as_IN the_DT confluence_NN of_IN the_DT Leaf_NNP and_CC Chickasawhay_NNP rivers_NNS at_IN 30.4_CD ,_, -88.5_NN ._.'

In [4]:
# Rebind `sentence` / `parsed_sent` to a shorter example and parse it.
sentence = "In 1541 Desoto wrote in his journal about the Pascagoula."
parsed_sent = en_model(sentence)

# Render the parsed sentence with displaCy as a full HTML page, passing the
# `compact` layout option.
render(docs=parsed_sent, page=True, options={'compact': True})

In [5]:
def token_dict(token):
    """Collect selected string attributes of a token into an ordered mapping.

    Args:
        token: object exposing `orth_`, `lemma_`, `pos_`, `tag_` and `dep_`.

    Returns:
        OrderedDict with keys ORTH, LEMMA, POS, TAG, DEP, in that order.
    """
    # Explicit (key, value) pairs make the column order visible at a glance.
    fields = (
        ('ORTH', token.orth_),
        ('LEMMA', token.lemma_),
        ('POS', token.pos_),
        ('TAG', token.tag_),
        ('DEP', token.dep_),
    )
    return OrderedDict(fields)


def doc_dataframe(doc):
    """Build a DataFrame with one row per token in `doc`.

    Args:
        doc: iterable of tokens accepted by `token_dict`.

    Returns:
        pandas.DataFrame built from the `token_dict` record of each token.
    """
    rows = list(map(token_dict, doc))
    return pd.DataFrame(rows)

In [6]:
# Parse a short example sentence and display its per-token attribute table.
doc_dataframe(en_model("In 1541 Desoto met the Pascagoula."))

Unnamed: 0,ORTH,LEMMA,POS,TAG,DEP
0,In,in,ADP,IN,prep
1,1541,1541,NUM,CD,pobj
2,Desoto,desoto,NOUN,NN,nsubj
3,met,meet,VERB,VBD,ROOT
4,the,the,DET,DT,det
5,Pascagoula,Pascagoula,PROPN,NNP,dobj
6,.,.,PUNCT,.,punct


In [19]:
# Patterns to register with the Matcher: a list of patterns, each of which is
# a list of per-token attribute specs.
pattern = [
    # Disabled alternatives, kept for reference:
    # [{'TAG': 'NNP', 'OP': '+'}, {'IS_ALPHA': True, 'OP': '*'}],
    # [{'LEMMA': 'meet'}],
    # Active pattern: zero or more alphabetic tokens ('*') followed by one or
    # more tokens tagged NNP ('+'). It does not constrain the verb.
    # NOTE(review): the disabled specs look like pieces of an
    # "NNP+ ... meet ... NNP+" pattern -- confirm whether they should be
    # recombined into one pattern.
    [{'IS_ALPHA': True, 'OP': '*'}, {'TAG': 'NNP', 'OP': '+'}]
]

In [20]:
# Parse the example sentence and create a fresh Matcher on the model's vocab.
doc = en_model("In 1541 Desoto met the Pascagoula.")
matcher = Matcher(en_model.vocab)
# Register the current `pattern` list under the key 'met'.
matcher.add('met', pattern)
# Run the matcher; each result is a (match_id, start, end) tuple whose
# start/end are used to slice `doc` in the next cell.
m = matcher(doc)
m
# NOTE(review): the recorded output has four matches (all ending at token 6),
# not the single match given in the expectation below.
# expected out: [(12280034159272152371, 2, 6)]

[(14332210279624491740, 2, 6),
 (14332210279624491740, 3, 6),
 (14332210279624491740, 4, 6),
 (14332210279624491740, 5, 6)]

In [21]:
# Print each match's position in `m` next to the span of `doc` it covers.
for i, (match_id, start, end) in enumerate(m):
    print('{}'.format(i), doc[start:end])
# expected out: Desoto met the Pascagoula

0 Desoto met the Pascagoula
1 met the Pascagoula
2 the Pascagoula
3 Pascagoula


In [10]:
# Run the existing `matcher` on a second sentence and display the raw matches.
# NOTE(review): this cell's execution count (In [10]) is lower than that of
# the cells that define `pattern` and `matcher` (In [19]-[20]), so the recorded
# output may reflect an earlier matcher state -- re-run top to bottom to confirm.
doc = en_model("October 24: Lewis and Clark met their first Mandan Chief, Big White.")
m = matcher(doc)
m

[(14332210279624491740, 0, 1),
 (14332210279624491740, 3, 4),
 (14332210279624491740, 3, 5),
 (14332210279624491740, 3, 6),
 (14332210279624491740, 4, 6),
 (14332210279624491740, 5, 6),
 (14332210279624491740, 6, 7),
 (14332210279624491740, 3, 7),
 (14332210279624491740, 5, 7),
 (14332210279624491740, 3, 8),
 (14332210279624491740, 5, 8),
 (14332210279624491740, 3, 9),
 (14332210279624491740, 5, 9),
 (14332210279624491740, 3, 10),
 (14332210279624491740, 5, 10),
 (14332210279624491740, 4, 10),
 (14332210279624491740, 6, 10),
 (14332210279624491740, 7, 10),
 (14332210279624491740, 8, 10),
 (14332210279624491740, 9, 10),
 (14332210279624491740, 3, 11),
 (14332210279624491740, 5, 11),
 (14332210279624491740, 4, 11),
 (14332210279624491740, 6, 11),
 (14332210279624491740, 7, 11),
 (14332210279624491740, 8, 11),
 (14332210279624491740, 9, 11),
 (14332210279624491740, 10, 11),
 (14332210279624491740, 12, 13),
 (14332210279624491740, 12, 14),
 (14332210279624491740, 13, 14)]

In [11]:
# List every match as "<index> <matched span text>".
for i, (match_id, start, end) in enumerate(m):
    print('{}'.format(i), doc[start:end])

0 October
1 Lewis
2 Lewis and
3 Lewis and Clark
4 and Clark
5 Clark
6 met
7 Lewis and Clark met
8 Clark met
9 Lewis and Clark met their
10 Clark met their
11 Lewis and Clark met their first
12 Clark met their first
13 Lewis and Clark met their first Mandan
14 Clark met their first Mandan
15 and Clark met their first Mandan
16 met their first Mandan
17 their first Mandan
18 first Mandan
19 Mandan
20 Lewis and Clark met their first Mandan Chief
21 Clark met their first Mandan Chief
22 and Clark met their first Mandan Chief
23 met their first Mandan Chief
24 their first Mandan Chief
25 first Mandan Chief
26 Mandan Chief
27 Chief
28 Big
29 Big White
30 White


In [12]:
# Run the existing `matcher` on a sentence where both names come before the
# verb "met".
doc = en_model("On 11 October 1986, Gorbachev and Reagan met at a house")
m = matcher(doc)
# NOTE(review): the recorded output of the following cell lists 15 matches,
# not the empty list expected below.
# expected: []

In [13]:
# Walk the matches in order, printing the index and the matched slice of `doc`.
for i, (match_id, start, end) in enumerate(m):
    print('{}'.format(i), doc[start:end])

0 October
1 Gorbachev
2 Gorbachev and
3 Gorbachev and Reagan
4 and Reagan
5 Reagan
6 met
7 Gorbachev and Reagan met
8 Reagan met
9 Gorbachev and Reagan met at
10 Reagan met at
11 Gorbachev and Reagan met at a
12 Reagan met at a
13 Gorbachev and Reagan met at a house
14 Reagan met at a house


In [14]:
# Parse the sentence again, rebinding `doc`; the text is identical to the one
# parsed in the In [12] cell.
doc = en_model("On 11 October 1986, Gorbachev and Reagan met at a house")

In [15]:
# Rebind `pattern` to a single pattern for "<names> and <names> ... meet":
# one or more NNP tokens, a token with lemma 'and', one or more NNP tokens,
# zero or more alphabetic tokens, then a token with lemma 'meet'.
pattern = [
    [{'TAG': 'NNP', 'OP': '+'}, {'LEMMA': 'and'}, {'TAG': 'NNP', 'OP': '+'},
     {'IS_ALPHA': True, 'OP': '*'}, {'LEMMA': 'meet'}]
]

In [16]:
# Register the new `pattern` on the existing `matcher`, reusing the key 'met'.
# NOTE(review): per the spaCy docs, adding to an existing key extends its
# patterns rather than replacing them, so previously added patterns presumably
# still fire (the recorded output still contains spans such as "October") --
# confirm this accumulation is intended; otherwise build a new Matcher first.
matcher.add('met', pattern)
# Re-run the matcher on `doc` and display the raw (match_id, start, end) tuples.
m = matcher(doc)
m

[(14332210279624491740, 2, 3),
 (14332210279624491740, 5, 6),
 (14332210279624491740, 5, 7),
 (14332210279624491740, 5, 8),
 (14332210279624491740, 5, 9),
 (14332210279624491740, 6, 8),
 (14332210279624491740, 7, 8),
 (14332210279624491740, 8, 9),
 (14332210279624491740, 7, 9),
 (14332210279624491740, 5, 10),
 (14332210279624491740, 7, 10),
 (14332210279624491740, 5, 11),
 (14332210279624491740, 7, 11),
 (14332210279624491740, 5, 12),
 (14332210279624491740, 7, 12)]

In [17]:
# Print the index of each match followed by the text of the matched span.
for i, (match_id, start, end) in enumerate(m):
    print('{}'.format(i), doc[start:end])

0 October
1 Gorbachev
2 Gorbachev and
3 Gorbachev and Reagan
4 Gorbachev and Reagan met
5 and Reagan
6 Reagan
7 met
8 Reagan met
9 Gorbachev and Reagan met at
10 Reagan met at
11 Gorbachev and Reagan met at a
12 Reagan met at a
13 Gorbachev and Reagan met at a house
14 Reagan met at a house
