In [1]:
import spacy

In [2]:
# Load the English pipeline; `nlp` is used below to build every Doc and to supply the shared vocab.
# NOTE(review): 'en' is a shortcut name — newer spaCy releases may require the full
# package name (e.g. 'en_core_web_sm'); confirm against the installed version.
nlp = spacy.load('en')

In [3]:
from spacy.matcher import Matcher

In [4]:
# Create a token-based Matcher that shares the pipeline's vocabulary,
# so match IDs can later be resolved through nlp.vocab.strings.
matcher = Matcher(nlp.vocab)

## Create Rule-based Matcher patterns
pattern = [{'LOWER':'text'}]

In [5]:
# Single token whose lowercase form is "solarpower" (e.g. solarpower, SolarPower)
pattern1 = [
    {'LOWER': 'solarpower'},
]

In [6]:
# Three tokens: "solar", exactly one punctuation token, "power" (e.g. Solar-power)
pattern2 = [
    {'LOWER': 'solar'},
    {'IS_PUNCT': True},
    {'LOWER': 'power'},
]

In [7]:
# Two adjacent tokens: "solar" followed by "power" (e.g. Solar Power)
pattern3 = [
    {'LOWER': 'solar'},
    {'LOWER': 'power'},
]

In [8]:
# Register all three patterns under one rule name.
# Argument order: rule name, optional on-match callback (None here), then the patterns.
matcher.add('SolarPower', None, pattern1, pattern2, pattern3)

In [9]:
# Sample text containing all three surface forms the patterns target.
doc = nlp(
    "The Solar Power industry continues to grow as solarpower increases. Solar-power is amazing."
)

In [10]:
# Run the matcher over the Doc and keep the resulting list of matches.
found_matches = matcher(doc)

In [11]:
# Each entry is a (match_id, start, end) tuple, where start/end are token indices into doc.
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 9), (8656102463236116519, 11, 14)]


Calling `matcher` on a `Doc` returns a list of tuples. Each tuple contains the match ID followed by the start and end token indices, which map to the span `doc[start:end]`.

In [12]:
# Report every match: numeric ID, rule name, token offsets and the matched text.
for match_id, start, end in found_matches:
    span = doc[start:end]                    # tokens covered by this match
    string_id = nlp.vocab.strings[match_id]  # look up the rule name for the hash ID
    print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 8 9 solarpower
8656102463236116519 SolarPower 11 14 Solar-power


## Remove patterns from Matcher object

In [37]:
# Drop the 'SolarPower' rule so the same name can be re-registered with new patterns below.
matcher.remove('SolarPower')

## Create new Matcher patterns

In [39]:
# Single token, compared in lowercase: matches solarpower, SolarPower
pattern1 = [
    {'LOWER': 'solarpower'},
]

In [40]:
# "solar", then zero or more punctuation tokens ('OP': '*'), then "power":
# matches e.g. solar power, solar-power, Solar--power.
# NOTE(review): whether symbols such as '$' satisfy IS_PUNCT is not shown here — confirm.
pattern2 = [
    {'LOWER': 'solar'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'LOWER': 'power'},
]

In [41]:
# Re-register the rule name with the two revised patterns (no callback).
matcher.add('SolarPower', None, pattern1, pattern2)

In [42]:
# Second sample: a doubled hyphen between the words plus the fused spelling.
doc2 = nlp("Solar--power is solarpower yay!")

In [43]:
# Apply the revised rule to doc2; this rebinds found_matches from the earlier run.
found_matches = matcher(doc2)

In [44]:
# Show the (match_id, start, end) tuples found in doc2.
print(found_matches)

[(8656102463236116519, 0, 3), (8656102463236116519, 4, 5)]


## Create Phrase Matcher

In [46]:
from spacy.matcher import PhraseMatcher

In [47]:
# Rebinds `matcher`: from here on it is a PhraseMatcher, not the token Matcher used above.
matcher = PhraseMatcher(nlp.vocab)

In [48]:
# Read the whole article, then parse it once the file handle has been released.
# NOTE(review): no encoding is passed, so decoding depends on the platform default —
# confirm the file's encoding and consider making it explicit.
with open('../TextFiles/reaganomics.txt') as f:
    reaganomics_text = f.read()
doc3 = nlp(reaganomics_text)

In [50]:
# Phrases to look for in the article, one per line
phrase_list = [
    'voodoo economics',
    'supply-side economics',
    'trickle-down economics',
    'free-market economics',
]

In [51]:
# Run every phrase through the pipeline so each one becomes a Doc object
phrase_patterns = list(map(nlp, phrase_list))

In [52]:
# Display the converted phrases.
phrase_patterns

[voodoo economics,
 supply-side economics,
 trickle-down economics,
 free-market economics]

In [53]:
# Confirm that each pattern is a Doc object rather than a plain string.
type(phrase_patterns[0])

spacy.tokens.doc.Doc

In [54]:
# Register every phrase Doc under the single rule name 'EconMatcher' (no callback).
matcher.add('EconMatcher', None, *phrase_patterns)

In [55]:
# Run the phrase matcher over the article; rebinds found_matches to the new results.
found_matches = matcher(doc3)

In [56]:
# Display the (match_id, start, end) tuples found in doc3.
found_matches

[(3680293220734633682, 41, 45),
 (3680293220734633682, 49, 53),
 (3680293220734633682, 54, 56),
 (3680293220734633682, 61, 65),
 (3680293220734633682, 673, 677),
 (3680293220734633682, 2985, 2989)]

In [57]:
# Report every phrase match: numeric ID, rule name, token offsets and the matched text.
for match_id, start, end in found_matches:
    span = doc3[start:end]                   # tokens covered by this match
    string_id = nlp.vocab.strings[match_id]  # look up the rule name for the hash ID
    print(match_id, string_id, start, end, span.text)

3680293220734633682 EconMatcher 41 45 supply-side economics
3680293220734633682 EconMatcher 49 53 trickle-down economics
3680293220734633682 EconMatcher 54 56 voodoo economics
3680293220734633682 EconMatcher 61 65 free-market economics
3680293220734633682 EconMatcher 673 677 supply-side economics
3680293220734633682 EconMatcher 2985 2989 trickle-down economics


In [60]:
# Show each match together with up to three tokens of context on either side.
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # look up the rule name for the hash ID
    # Clamp the window to the document bounds. A negative slice start would be
    # read as an offset from the END of the Doc, so a match beginning within the
    # first three tokens would otherwise print the wrong (or an empty) context.
    context_start = max(start - 3, 0)
    context_end = min(end + 3, len(doc3))
    span = doc3[context_start:context_end]   # matched phrase plus surrounding tokens
    print(match_id, string_id, start, end, span.text)

3680293220734633682 EconMatcher 41 45 commonly associated with supply-side economics, referred to
3680293220734633682 EconMatcher 49 53 referred to as trickle-down economics or voodoo economics
3680293220734633682 EconMatcher 54 56 down economics or voodoo economics by political opponents
3680293220734633682 EconMatcher 61 65 opponents, and free-market economics by political advocates
3680293220734633682 EconMatcher 673 677 following from the supply-side economics movement, which
3680293220734633682 EconMatcher 2985 2989 known as "trickle-down economics", due
