## 1) Rule-based matching

In [1]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [17]:
#import the Matcher library
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab) #create matcher object and pass nlp.vocab

#### Here matcher is an object that pairs to current Vocab object
#### We can add and remove specific named matcheers to matcher as needed

## Creating patterns

In [3]:
# Create a list, and inside that list add series of dictionaries
# Hello World caan appear in the following ways,
# 1) Hello World hello world Hellow WORLD
# 2) Hello-World

In [18]:
pattern_1 = [{'LOWER':'hello'},{'LOWER':'world'}]
pattern_2 = [{'LOWER':'hello'},{'IS_PUNCT':True},{'LOWER':'world'}]
pattern = [{'LOWER':"hello"},{"LOWER":"world"}]

In [23]:
# Add patterns to matcher object

#Add a match rule to matcher, A match rule consists of,
# 1) An ID Key
# 2) An on_match callback
# 3) one or more patterns

###matcher.add('Hello World',pattern_2)
matcher.add("Hello World",[pattern_1,pattern_2])


In [24]:
doc = nlp("'Hello World' are the first two printed words for most of the programmers, printing 'Hello-World' is most common ")

In [25]:
doc

'Hello World' are the first two printed words for most of the programmers, printing 'Hello-World' is most common 

### Finding the matches

In [27]:
#pass in doc to matcher object and store thiss in a variable
find_matches = matcher(doc)
print(find_matches)


#it returns output list of tuples
#string ID, index start and index end

[(15578876784678163569, 1, 3), (8585552006568828647, 1, 3), (8585552006568828647, 18, 21)]


In [28]:
# define a function to find the matches
for match_id,start,end in find_matches:
    string_id = nlp.vocab.strings[match_id] #get string representation
    span = doc[start:end]
    print(match_id,string_id,start,end,span.text)

15578876784678163569 HelloWorld 1 3 Hello World
8585552006568828647 Hello World 1 3 Hello World
8585552006568828647 Hello World 18 21 Hello-World


### Remov the matches

In [29]:
matcher.remove('Hello World')

### Setting pattern options and quantifiers

In [30]:
# Redefine the patterns:
pattern_3 = [{'LOWER':'hello'},{'LOWER':'world'}]
pattern_4 = [{'LOWER':'hellow'},{'IS_PUNCT':True},{'OP':'*'},{'LOWER':'world'}]

#'OP:'*'  ---->   This is going to allow this pattern to match zero or more times for any punctuation

#Add the new set of patterns to the 'Hello World' matcher
matcher.add('Hello World',[pattern_3,pattern_4])

In [31]:
doc2 = nlp('You can print Hello World or hello world or Hello-World')

In [34]:
find_matches = matcher(doc2)
print(find_matches)

[(15578876784678163569, 3, 5), (8585552006568828647, 3, 5), (15578876784678163569, 6, 8), (8585552006568828647, 6, 8)]


# 2) Phase Matching

In [35]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [36]:
# Import the PhraseMatcher library
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [37]:
phrase_list = ['Barack Obama','Angela Markel','Washington, D.C.']

In [38]:
# Convert each phrase to a document object
phrase_patterns = [nlp(text) for text in phrase_list] # to do that we are using list comprehension

In [39]:
#phrase objects are not strings
phrase_patterns 

[Barack Obama, Angela Markel, Washington, D.C.]

In [42]:
for i in phrase_patterns:
    print(i)
    print(type(i))

Barack Obama
<class 'spacy.tokens.doc.Doc'>
Angela Markel
<class 'spacy.tokens.doc.Doc'>
Washington, D.C.
<class 'spacy.tokens.doc.Doc'>


In [43]:
# pass each doc object into matcher
# thats we have to add asterisk mark before phrase_pattern
matcher.add('TerminologyList',None,*phrase_patterns)

In [44]:
doc3 = nlp('German Chancellor Angela Merkel and US President Barack Obama '
          'converse in the Oval Office inside the White House in Washington, D.C. ')

In [45]:
# pass in doc to another object and store this in a variable
find_matches = matcher(doc3)
print(find_matches)

[(3766102292120407359, 7, 9), (3766102292120407359, 19, 22)]


In [47]:
# define a function to find the matches
for match_id,start,end in find_matches:
    string_id = nlp.vocab.strings[match_id] # get string representation
    span = doc3[start:end]
    print(match_id,string_id,start,end,span.text)

3766102292120407359 TerminologyList 7 9 Barack Obama
3766102292120407359 TerminologyList 19 22 Washington, D.C.
