In [1]:
import spacy

In [4]:
# Load the small English pipeline and run it over a sentence that contains
# a made-up place name and a film title.
nlp = spacy.load("en_core_web_sm")
text = "West Chestertenfieldville was referenced in Mr. Deeds."
doc = nlp(text)

# Show every entity the stock pipeline found, one "<text> <label>" per line.
for entity in doc.ents:
    print(f"{entity.text} {entity.label_}")

West Chestertenfieldville GPE
Deeds PERSON


In [5]:
# Add an entity_ruler component to the pipeline. No position argument is
# given; in the summary displayed below it is listed after 'ner'.
ruler = nlp.add_pipe("entity_ruler")
# Bare last expression so the notebook displays the pipeline summary
# (what each component assigns, requires and scores).
nlp.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False},
  'entity_ruler': {'assigns': ['doc.ents', 'token.ent_type', 'token.ent_iob'],
   'requires': [],
   'scores': ['ents_f', 'ent

In [6]:
# One phrase pattern: label the exact string "West Chestertenfieldville" as GPE.
patterns = [{"label": "GPE", "pattern": "West Chestertenfieldville"}]
ruler.add_patterns(patterns)

# Re-process the same sentence with the ruler now holding a pattern and list
# the entities again. The recorded output is the same as before the ruler
# had patterns (presumably because this ruler runs after 'ner' -- verify).
doc2 = nlp(text)
for entity in doc2.ents:
    print(f"{entity.text} {entity.label_}")

West Chestertenfieldville GPE
Deeds PERSON


In [7]:
# Build a second, fresh pipeline and this time insert the entity_ruler
# ahead of the statistical 'ner' component.
nlp2 = spacy.load("en_core_web_sm")
ruler = nlp2.add_pipe("entity_ruler", before="ner")

# Reuses the `patterns` list defined in an earlier cell.
ruler.add_patterns(patterns)

doc = nlp2(text)
for entity in doc.ents:
    print(f"{entity.text} {entity.label_}")

West Chestertenfieldville GPE
Deeds PERSON


In [8]:
# Two phrase patterns this time: the place name as GPE and the film title
# under a custom FILM label.
patterns = [
    {"label": "GPE", "pattern": "West Chestertenfieldville"},
    {"label": "FILM", "pattern": "Mr. Deeds"},
]

# Third fresh pipeline, again with the entity_ruler placed before 'ner'.
nlp3 = spacy.load("en_core_web_sm")
ruler = nlp3.add_pipe("entity_ruler", before="ner")
ruler.add_patterns(patterns)

doc = nlp3(text)
for entity in doc.ents:
    print(f"{entity.text} {entity.label_}")

West Chestertenfieldville GPE
Mr. Deeds FILM


In [1]:
import spacy
from spacy.matcher import Matcher

In [2]:
# Token-level matcher that shares the pipeline's vocabulary.
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# A one-token pattern: any token whose LIKE_EMAIL attribute is true.
pattern = [{"LIKE_EMAIL": True}]

# The rule name is kept exactly as spelled ("EMAIL_ADRESS"); the recorded
# output further down prints this same string.
matcher.add("EMAIL_ADRESS", [pattern])

In [3]:
# Run the pipeline on a sentence that ends with an email address, then
# apply the matcher to the resulting doc and keep what it returns.
doc = nlp("This is an email address: wmattingly@aol.com")
matches = matcher(doc)

In [4]:
# Show the raw matcher result. In the recorded output it is a list holding
# one 3-tuple: a large integer followed by the token offsets 6 and 7.
print(matches)

[(2197859665807148658, 6, 7)]


In [5]:
# matches[0][0] is the first element (the large integer) of the first match.
# Looking it up in nlp.vocab and reading .text gives back the rule name the
# pattern was registered under (the recorded output is "EMAIL_ADRESS").
print(nlp.vocab[matches[0][0]].text)

EMAIL_ADRESS


In [6]:
# Read the whole sample article into `text`.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale encoding (assumes the file is UTF-8 -- confirm).
with open("data/wiki_us.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [7]:
# Print the full contents of the file that was just read, as a sanity check.
print (text)

The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a country primarily located in North America. It consists of 50 states, a federal district, five major unincorporated territories, 326 Indian reservations, and some minor possessions.[j] At 3.8 million square miles (9.8 million square kilometers), it is the world's third- or fourth-largest country by total area.[d] The United States shares significant land borders with Canada to the north and Mexico to the south, as well as limited maritime borders with the Bahamas, Cuba, and Russia.[22] With a population of more than 331 million people, it is the third most populous country in the world. The national capital is Washington, D.C., and the most populous city is New York.

Paleo-Indians migrated from Siberia to the North American mainland at least 12,000 years ago, and European colonization began in the 16th century. The United States emerged from the thirteen British colonies est

In [8]:
# Load the en_core_web_sm pipeline into `nlp`. The same model name is loaded
# earlier in this notebook, so this rebinds `nlp`.
nlp = spacy.load('en_core_web_sm')

In [9]:
# New matcher with a single one-token rule: any token tagged POS == PROPN.
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "PROPN"}]
matcher.add("PROPER_NOUN", [pattern])

In [10]:
# Process the article and collect every PROPER_NOUN match.
doc = nlp(text)
matches = matcher(doc)
print(len(matches))

# Preview the first ten matches: the raw match tuple followed by the tokens
# it covers (elements 1 and 2 of the tuple are used as the slice bounds).
for match in matches[:10]:
    span = doc[match[1]:match[2]]
    print(match, span)

97
(451313080118390996, 1, 2) United
(451313080118390996, 2, 3) States
(451313080118390996, 4, 5) America
(451313080118390996, 6, 7) U.S.A.
(451313080118390996, 8, 9) USA
(451313080118390996, 15, 16) United
(451313080118390996, 16, 17) States
(451313080118390996, 18, 19) U.S.
(451313080118390996, 20, 21) US
(451313080118390996, 23, 24) America


In [11]:
# Same PROPER_NOUN rule, but with the "+" operator on the token so a run of
# consecutive PROPN tokens can match as one span.
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "PROPN", "OP": "+"}]
matcher.add("PROPER_NOUN", [pattern])

doc = nlp(text)
matches = matcher(doc)
print(len(matches))

# Preview the first ten matches. In the recorded output the overlapping
# sub-spans ("United", "United States", "States") are all reported.
for match in matches[:10]:
    span = doc[match[1]:match[2]]
    print(match, span)

144
(451313080118390996, 1, 2) United
(451313080118390996, 1, 3) United States
(451313080118390996, 2, 3) States
(451313080118390996, 4, 5) America
(451313080118390996, 6, 7) U.S.A.
(451313080118390996, 8, 9) USA
(451313080118390996, 15, 16) United
(451313080118390996, 15, 17) United States
(451313080118390996, 16, 17) States
(451313080118390996, 18, 19) U.S.


In [None]:
# Same "+" rule once more, now registered with greedy="LONGEST".
# NOTE(review): per the spaCy Matcher.add documentation this filters
# overlapping matches in favour of the longest one -- this cell has no
# recorded output here, so confirm against a fresh run.
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "PROPN", "OP": "+"}]
matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")

doc = nlp(text)
matches = matcher(doc)
print(len(matches))

# Preview the first ten matches: raw match tuple, then the covered tokens.
for match in matches[:10]:
    span = doc[match[1]:match[2]]
    print(match, span)