In [2]:
import pandas as pd
import numpy as np
import spacy

In [4]:
# Load the small English pipeline by its full package name. The bare 'en'
# shortcut link only exists in spaCy v2 and was removed in v3, whereas the
# package name works in both.
nlp = spacy.load('en_core_web_sm')

In [5]:
# Run the pipeline on a short sample sentence; the result is inspected
# token by token in the next cell.
sample_sentence = "Tea is healthy and calming, don't you think?"
doc = nlp(sample_sentence)

In [7]:
# Print a small tab-separated table: one row per token with its lemma and
# whether it is flagged as a stop word.
# The header is a plain string: it has no placeholders, so neither an
# f-prefix nor a .format() call is needed.
print("Token \t\tLemma \t\tStopword")
print('-' * 40)
for token in doc:
    # token.text is the token's string form, so no explicit str() is needed.
    print(f'{token.text}\t\t{token.lemma_}\t\t{token.is_stop}')

Token 		Lemma 		Stopword
----------------------------------------
Tea		tea		False
is		be		True
healthy		healthy		False
and		and		True
calming		calm		False
,		,		False
do		do		True
n't		not		True
you		-PRON-		True
think		think		False
?		?		False


In [8]:
# Pattern matching
from spacy.matcher import PhraseMatcher

# The matcher is built on the vocabulary of the English model loaded earlier.
# Setting attr="LOWER" makes it compare the lowercased form of each token,
# which gives case-INSENSITIVE matching.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

In [9]:
# Terms to look for in the text.
terms = ['Galaxy Note', 'iPhone 11', 'iPhone XS', 'Google Pixel']
# nlp.make_doc only tokenizes the text instead of running the whole pipeline.
# That is sufficient here because the matcher compares the LOWER token
# attribute, and it is cheaper than calling nlp(text) for every term.
patterns = [nlp.make_doc(text) for text in terms]
# Register all patterns under a single match ID.
matcher.add("TerminologyList", patterns)

In [10]:
# Build a document from the text to search, then let the phrase matcher
# report where the registered terms occur in it.
# NOTE(review): the first two literals are joined without a space, so the
# text actually reads "side-by-sidephotography" (and "interseting" is
# misspelled). Correcting the missing space would presumably shift the token
# offsets shown in the recorded output -- confirm and re-run before fixing.
review_text = (
    "Glowing review overall, and some really interseting side-by-side"
    "photography tests pitting the iPhone 11 Pro against the "
    "Galaxy Note 10 Plus and last year's iphone XS and Google Pixel 3."
)
text_doc = nlp(review_text)
matches = matcher(text_doc)
print(matches)

[(3766102292120407359, 16, 18), (3766102292120407359, 21, 23), (3766102292120407359, 29, 31), (3766102292120407359, 32, 34)]


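#### Each match is a tuple `(match_id, start, end)`: the ID of the matched pattern list, then the start and end token positions of the phrase, so that `text_doc[start:end]` is the matched text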

In [11]:
# Unpack the first match, look up the string name behind its ID, and show it
# next to the matched slice of the document.
match_id, start, end = matches[0]
rule_name = nlp.vocab.strings[match_id]
matched_span = text_doc[start:end]
print(rule_name, matched_span)

TerminologyList iPhone 11
