# Rule-based Matching

In [6]:
import spacy

# Load the small English pipeline; later cells use it to build documents and
# pass its vocabulary to the matcher.
nlp = spacy.load("en_core_web_sm")

In [7]:
# Matcher is spaCy's rule-based, token-level matcher.
from spacy.matcher import Matcher

# The matcher is constructed with the pipeline's vocabulary (nlp.vocab), so it
# is paired with the same Vocab object that nlp uses.
# Named match rules can be added to or removed from it as needed.
matcher = Matcher(nlp.vocab)



In [10]:
# Each pattern is a list of dictionaries, one dictionary per token to match.
# "Hello World" can show up in the text as:
#   1. Hello World, hello world, Hello WORLD  (any casing, two tokens)
#   2. Hello-World                            (a punctuation token in between)
# 'LOWER' and 'IS_PUNCT' are token attributes; use the upper-case spelling
# shown in the spaCy documentation.

pattern_1 = [
    {"LOWER": "hello"},
    {"LOWER": "world"},
]
pattern_2 = [
    {"LOWER": "hello"},
    {"IS_PUNCT": True},
    {"LOWER": "world"},
]


In [15]:
# Register the patterns with the matcher. A match rule consists of:
#   1. an ID key,
#   2. one or more patterns, passed as a list,
#   3. an optional on_match callback (not used here).

matcher.add("Hello World", [pattern_1, pattern_2])

In [28]:
# Build the document to search; it contains the phrase in three written forms.
doc = nlp(
    "'Hello World' is a very interesting word that Hello WORLD has been used "
    "in numerous instances, printing 'Hello-World' is like a basic "
    "pre-requisite for beginners ."
)

In [29]:
# Run the matcher over the document and keep the result for the next cell.
# The result is a list of (match_id, start, end) tuples:
#   - match_id is an integer ID for the rule, not the string key itself;
#   - start and end are token indices, with end exclusive.
# The quote characters around the phrases are not part of the matched spans.
find_matches = matcher(doc)
print(find_matches)

[(8585552006568828647, 1, 3), (8585552006568828647, 10, 12), (8585552006568828647, 21, 24)]


In [30]:
# Print every match: numeric ID, rule name, token range and the matched text.

for match_id, start, end in find_matches:
    string_id = nlp.vocab.strings[match_id]  # rule name looked up from the numeric ID
    span = doc[start:end]  # slice of the document covered by the match
    print(f"{match_id} {string_id} {start} {end} {span.text}")

8585552006568828647 Hello World 1 3 Hello World
8585552006568828647 Hello World 10 12 Hello WORLD
8585552006568828647 Hello World 21 24 Hello-World


In [32]:
# Redefine the patterns.
# pattern_3: "hello" immediately followed by "world", in any casing.
pattern_3 = [{'LOWER': 'hello'}, {'LOWER': 'world'}]
# pattern_4: IS_PUNCT requires the middle token to be punctuation, and
# 'OP': '*' lets that token occur zero or more times, so any number of
# punctuation tokens (including none) may sit between "hello" and "world".
pattern_4 = [{'LOWER': 'hello'}, {'IS_PUNCT': True, 'OP': '*'}, {'LOWER': 'world'}]

# Replace the rule in the matcher.
# Matcher.add() extends the pattern list of an existing key instead of
# replacing it, so remove the earlier 'Hello World' rule first. Without this
# the old patterns stay registered and every re-run of this cell appends
# another copy of the new ones. The membership check keeps the cell runnable
# when the key has not been added yet.
if 'Hello World' in matcher:
    matcher.remove('Hello World')
matcher.add('Hello World', [pattern_3, pattern_4])


In [33]:
# Second document: the same phrase written with three different casings.
doc_2 = nlp(" You can print hello World or Hello world or HELLO WORLD")

In [34]:
# Run the matcher on the second document.
# NOTE: this reuses the name find_matches, so the results from the first
# document are no longer available after this cell runs.
find_matches = matcher(doc_2)
print(find_matches)

[(8585552006568828647, 4, 6), (8585552006568828647, 7, 9), (8585552006568828647, 10, 12)]
