# 1) Rule-Based Matching

In [1]:
# spaCy and its small English pipeline; `nlp` is the object used to turn raw
# text into Doc objects in the cells below.
import spacy

nlp = spacy.load("en_core_web_sm")

In [2]:
# Rule-based Matcher, constructed from the pipeline's vocabulary.
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)

In [5]:
# Each pattern is a list of token descriptions (one dict per token).

# "hello" directly followed by "world", compared on the lowercase form.
pattern_1 = [
    {'LOWER': 'hello'},
    {'LOWER': 'world'},
]

# Same two words, with exactly one punctuation token between them.
pattern_2 = [
    {'LOWER': 'hello'},
    {'IS_PUNCT': True},
    {'LOWER': 'world'},
]


In [7]:
# Register both patterns under one rule name, so a hit on either pattern is
# reported with the same 'Hello World' match ID.
matcher.add(
    'Hello World',
    [pattern_1, pattern_2],
)

In [11]:
# Sample text containing the phrase three ways: quoted, hyphenated and plain.
# The adjacent string literals are concatenated into a single string; the
# leading space is part of the text.
doc = nlp(
    " 'Hello World' are the first two printed words for most of the programmers, "
    "printing 'Hello-World' is most common for beginners, "
    "Hello world dear"
)

In [12]:
# Run the matcher over the Doc and keep the result, which is a list of
# (match_id, start, end) tuples.
find_matches = matcher(doc)
print(find_matches)

[(8585552006568828647, 2, 4), (8585552006568828647, 19, 22), (8585552006568828647, 29, 31)]


In [13]:
# Report each match: numeric ID, rule name, token offsets and the matched text.
for match_id, start, end in find_matches:
    span = doc[start:end]                    # slice of the Doc covered by the match
    string_id = nlp.vocab.strings[match_id]  # map the numeric ID back to the rule name
    print(f"{match_id} {string_id} {start} {end} {span.text}")

8585552006568828647 Hello World 2 4 Hello World
8585552006568828647 Hello World 19 22 Hello-World
8585552006568828647 Hello World 29 31 Hello world


In [14]:
# Drop the 'Hello World' rule so it can be registered again further down with
# different patterns. The membership check keeps this cell safe to re-run:
# removing a rule name that is no longer registered raises an error.
if 'Hello World' in matcher:
    matcher.remove('Hello World')

## Setting pattern options and quantifiers

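> A token pattern can carry an `OP` key that acts as a quantifier: `!` (match zero times), `?` (zero or one time), `+` (one or more times), `*` (zero or more times).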





In [16]:
# Redefined patterns, this time using a quantifier.

# Plain "hello world", compared on the lowercase form.
pattern_3 = [
    {'LOWER': 'hello'},
    {'LOWER': 'world'},
]

# 'OP': '*' lets the punctuation token match zero or more times, so any
# number of punctuation tokens may sit between "hello" and "world".
pattern_4 = [
    {'LOWER': 'hello'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'LOWER': 'world'},
]

# Register the redefined patterns under the 'Hello World' rule name.
matcher.add(
    'Hello World',
    [pattern_3, pattern_4],
)

In [18]:
# Second sample: the phrase in title case, in lower case and hyphenated.
doc_2 = nlp(
    "You can print Hello World or hello world or Hello-World"
)

In [19]:
# Apply the matcher to the second sample; `find_matches` is overwritten with
# the new list of (match_id, start, end) tuples.
find_matches = matcher(doc_2)
print(find_matches)

[(8585552006568828647, 3, 5), (8585552006568828647, 6, 8), (8585552006568828647, 9, 12)]


# 2) Phrase Matching

In [20]:
# Import spaCy and load the small English pipeline for the phrase-matching part.
# NOTE(review): this repeats the import/load from the first cell of the
# notebook, presumably so this section can be run on its own. Confirm before
# removing.
import spacy

nlp = spacy.load("en_core_web_sm")

In [21]:
# PhraseMatcher, constructed from the pipeline's vocabulary.
# `matcher` is rebound here: from this cell on it refers to the PhraseMatcher,
# not the Matcher created earlier.
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)

In [23]:
# Multi-word terms to search for, as plain strings.
phrase_list = [
    "Barack Obama",
    "Angela Merkel",
    "Washington, D.C.",
]

In [24]:
# Turn every phrase into a Doc to be used as a PhraseMatcher pattern.
# nlp.make_doc runs only the tokenizer and skips the rest of the pipeline.
# That is sufficient here because the PhraseMatcher was created with its
# default settings, which compare the verbatim token text.
phrase_patterns = [nlp.make_doc(phrase) for phrase in phrase_list]

In [25]:
# Display the list of pattern Docs; each Doc is shown as its text.
phrase_patterns


[Barack Obama, Angela Merkel, Washington, D.C.]

In [27]:
# Register all phrase Docs under one rule name. The patterns are passed as a
# single list, the same call shape used for Matcher.add earlier in this
# notebook. `add(key, None, *docs)` is the legacy spaCy v2 signature.
matcher.add("TerminologyList", phrase_patterns)


In [29]:
# Sample sentence that mentions all three phrases from phrase_list.
# The adjacent string literals are concatenated into a single string.
doc_3 = nlp(
    "German Chancellor Angela Merkel and US President Barack Obama converse "
    "in the Oval Office inside the White House in Washington, D.C."
)

In [30]:
# Run the phrase matcher over the Doc and keep the result, which is a list of
# (match_id, start, end) tuples.
find_matches = matcher(doc_3)
print(find_matches)

[(3766102292120407359, 2, 4), (3766102292120407359, 7, 9), (3766102292120407359, 19, 22)]


In [31]:
# Report each phrase match: numeric ID, rule name, token offsets and matched text.
for match_id, start, end in find_matches:
    span = doc_3[start:end]                  # slice of the Doc covered by the match
    string_id = nlp.vocab.strings[match_id]  # map the numeric ID back to the rule name
    print(f"{match_id} {string_id} {start} {end} {span.text}")

3766102292120407359 TerminologyList 2 4 Angela Merkel
3766102292120407359 TerminologyList 7 9 Barack Obama
3766102292120407359 TerminologyList 19 22 Washington, D.C.
