# Phrase Matching & Vocabulary

In [1]:
import spacy
from spacy.matcher import Matcher

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
pattern1 = [{'LOWER':'solarpower'}] #solarpower

In [4]:
pattern2 = [{'LOWER':'solar'},{'IS_PUNCT':'True'},{'LOWER':'power'}] #Solar_Power

In [5]:
pattern3 = [{'LOWER':'solar'},{'LOWER':'power'}] #Solar Power

### Adding all three patterns to matcher

In [6]:
matcher = Matcher(nlp.vocab)
matcher.add('SolarPower', None, pattern1, pattern2, pattern3)

In [7]:
doc = nlp(u"The Solar Power industry continues to grow as solarpower increases. Solar-power is amazing.")

### Find the matches

In [8]:
found_matches = matcher(doc)

In [9]:
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 9)]


In [10]:
 for match_id, start, end in found_matches:
        string_id = nlp.vocab.strings[match_id] #SolarPower
        span = doc[start:end]
        print(match_id,string_id,start,end,span.text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 8 9 solarpower


### How to remove a particular pattern

In [11]:
matcher.remove('SolarPower')

### Making token rules

In [12]:
pattern1 = [{'LOWER':'solarpower'}] # used for solarpower and SolarPower
pattern2 =[{'LOWER':'solar'},{'IS_PUNCT':True,'OP':'*'},{'LOWER':'power'}] # solar power, solar-power.
#In between can be anything like solar_power, solar+power etc

In [14]:
#make sure you remove old solar power set
matcher.add('SolarPower', None, pattern1, pattern2)
doc2 = nlp(u"Solar--power is solarpower Yay!")

In [17]:
found_matches = matcher(doc2)
print(found_matches)

[(8656102463236116519, 0, 3), (8656102463236116519, 4, 5)]


## Phrase Matching

In [18]:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [24]:
with open('reaganomics.txt', encoding = 'cp1252' ) as f: #encoding='utf8' was written
    doc3 = nlp(f.read())

In [26]:
phrase_list = ['voodoo economics',
               'supply-side economics', 'trickle-down economics', 'free-market economics']

### Converting each phrase to a document object

In [28]:
phrase_patterns = [nlp(text) for text in phrase_list]

In [29]:
phrase_patterns

[voodoo economics,
 supply-side economics,
 trickle-down economics,
 free-market economics]

In [30]:
type(phrase_patterns)

list

In [32]:
matcher.add('EconMatcher',None, phrase_patterns)

TypeError: Cannot convert list to spacy.tokens.doc.Doc

In [33]:
# '*' - essentially graps each document and passes indiviually into this matter as patterns
matcher.add('EconMatcher',None,*phrase_patterns)

In [34]:
found_matches = matcher(doc3)

In [35]:
found_matches

[(3680293220734633682, 41, 45),
 (3680293220734633682, 49, 53),
 (3680293220734633682, 54, 56),
 (3680293220734633682, 61, 65),
 (3680293220734633682, 673, 677),
 (3680293220734633682, 2985, 2989)]

In [36]:
#If you actually wanna grab the context of the document 
#where you can do is subtract a liitle bit from your start and add a little bit to your end.

for matcher_id,start,end in found_matches:
    string_id = nlp.vocab.strings[matcher_id]
    span = doc3[start-5:end+5] #5 tokens back and 5 tokens forward
    print(matcher_id, string_id,start,end,span.text)

3680293220734633682 EconMatcher 41 45 policies are commonly associated with supply-side economics, referred to as trickle
3680293220734633682 EconMatcher 49 53 economics, referred to as trickle-down economics or voodoo economics by political
3680293220734633682 EconMatcher 54 56 trickle-down economics or voodoo economics by political opponents, and
3680293220734633682 EconMatcher 61 65 by political opponents, and free-market economics by political advocates.


3680293220734633682 EconMatcher 673 677 attracted a following from the supply-side economics movement, which formed in
3680293220734633682 EconMatcher 2985 2989 became widely known as "trickle-down economics", due to the


In [None]:
""" 5 tokens are there before supply-side economics and 5 tokens afterwards"""
