In [2]:
import spacy
# Load the 'en_core_web_sm' pipeline once; every later cell reuses `nlp`.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Run the pipeline on a sample sentence (the u'' prefix is redundant on Python 3).
doc = nlp("Stephen is planning to host a party next weekend.")

In [4]:
# Show the entity spans the pipeline attached to the sentence.
print(doc.ents)

(Stephen, next weekend)


In [24]:
def show_ents(doc):
    """Print one line per entity in ``doc.ents``, or a notice if there are none.

    Each line contains the entity text, its label padded to 20 characters,
    the ``spacy.explain`` description of that label, and the entity's
    ``(start, end)`` offsets.

    Args:
        doc: An object exposing an ``ents`` sequence of entity spans.

    Returns:
        None. Output is written to stdout.
    """
    # Guard clause: nothing to list.
    if not doc.ents:
        print("No entities found")
        return

    for entity in doc.ents:
        label = entity.label_
        description = spacy.explain(label)
        print(f"{entity}-{label:{20}} {description} Location: ({entity.start},{entity.end})")

In [25]:
# List the entities found in the first sample sentence.
show_ents(doc)

Stephen-PERSON               People, including fictional Location: (0,1)
next weekend-DATE                 Absolute or relative dates or periods Location: (7,9)


In [31]:
# Lemma text of the second entity span (index 1 in doc.ents).
doc.ents[1].lemma_

'next weekend'

In [33]:
# Similarity score between the second and first entity spans.
# NOTE(review): the recorded output echoes this line as part of a warning;
# presumably the small model has no static word vectors, so treat the score
# as illustrative only — confirm against the spaCy similarity docs.
doc.ents[1].similarity(doc.ents[0])

  doc.ents[1].similarity(doc.ents[0])


0.24794980883598328

In [34]:
# Second sample: a sentence mentioning a person, a quantity and a company.
doc2 = nlp("Peter has invested in about 300 shares of Apple to further grow his business.")

In [35]:
# List the entities found in the second sample sentence.
show_ents(doc2)

Peter-PERSON               People, including fictional Location: (0,1)
about 300-CARDINAL             Numerals that do not fall under another type Location: (4,6)
Apple-ORG                  Companies, agencies, institutions, etc. Location: (8,9)


In [39]:
# Third sample: 'jewellery' is the term that gets tagged manually further down.
doc3 = nlp("Laura spends lavishly on the latest jewellery whenever she visits the Walmart supermarket during the summers.")

In [40]:
# Entities before any manual additions to doc3.
show_ents(doc3)

Laura-PERSON               People, including fictional Location: (0,1)
Walmart-ORG                  Companies, agencies, institutions, etc. Location: (11,12)


In [42]:
from spacy.tokens import Span

In [84]:
# Look up the StringStore value for the 'PRODUCT' label and display it.
PRODUCT = doc.vocab.strings["PRODUCT"]
PRODUCT

386

In [85]:
from spacy.matcher import Matcher

In [86]:
# Token-level matcher with one single-token pattern on the lowercase form 'jewellery'.
token_matcher = Matcher(nlp.vocab)
pattern = [dict(LOWER='jewellery')]
token_matcher.add('ProductMatcher', [pattern])

In [87]:
# Run the matcher over doc3 once and take the offsets of the first hit.
# Each match is a (match_id, start, end) tuple, so slicing off the first
# element leaves exactly (start, end). An IndexError here means the
# pattern did not match anything in doc3.
start_idx, end_idx = token_matcher(doc3)[0][1:]

In [88]:
# Wrap the matched tokens in a Span labelled with the PRODUCT value looked up
# earlier in the notebook. (The previous `label=ent_id` referred to a name that
# is never defined, so the cell failed on a fresh kernel.)
new_entity = Span(doc3, start_idx, end_idx, label=PRODUCT)
new_entity

jewellery

In [91]:
# Reassign doc3.ents to the existing entities plus the new span.
# NOTE(review): re-running this cell would add the same span a second time;
# presumably spaCy rejects duplicate/overlapping entity spans — confirm.
doc3.ents = [*doc3.ents, new_entity]

In [92]:
# Entities after the manual addition; the new span should now be listed.
show_ents(doc3)

Laura-PERSON               People, including fictional Location: (0,1)
jewellery-PRODUCT              Objects, vehicles, foods, etc. (not services) Location: (6,7)
Walmart-ORG                  Companies, agencies, institutions, etc. Location: (11,12)


In [99]:
# NOTE: rebinds `doc`; the first sample sentence is no longer reachable under this name.
doc = nlp("The U.S. government is spending $226.79 million on the construction of bridges and highways across the countryside.")

In [100]:
# Entities detected by the pipeline before any manual additions.
show_ents(doc)

U.S.-GPE                  Countries, cities, states Location: (1,2)
$226.79 million-MONEY                Monetary values, including unit Location: (5,8)


In [102]:
# Two single-token patterns (lowercase 'bridges' / 'highways') registered
# under one rule name, 'INFRA'.
pattern1 = [dict(LOWER='bridges')]
pattern2 = [dict(LOWER='highways')]
matcher = Matcher(nlp.vocab)
matcher.add('INFRA', [pattern1, pattern2])

In [103]:
# Apply the INFRA matcher to `doc` and display the resulting match tuples.
# NOTE: this rebinds `token_matcher`, which earlier held a Matcher instance,
# to the list of (match_id, start, end) results; the loop below relies on
# this name.
token_matcher = matcher(doc)
token_matcher

[(17980410162252471113, 12, 13), (17980410162252471113, 14, 15)]

In [105]:
# Look up the StringStore value for the 'FAC' label and display it.
FAC = doc.vocab.strings["FAC"]
FAC

9191306739292312949

In [108]:
# Build one FAC-labelled Span per match, then extend doc.ents in a single
# assignment instead of rebuilding the entity list on every iteration.
# `token_matcher` holds (match_id, start, end) tuples here; the match id is
# not needed, and the previously computed-but-unused locals were dropped.
infra_entities = [
    Span(doc, start, end, label=FAC)
    for _match_id, start, end in token_matcher
]
doc.ents = list(doc.ents) + infra_entities

In [109]:
# Entities after the FAC spans were added.
show_ents(doc)

U.S.-GPE                  Countries, cities, states Location: (1,2)
$226.79 million-MONEY                Monetary values, including unit Location: (5,8)
bridges-FAC                  Buildings, airports, highways, bridges, etc. Location: (12,13)
highways-FAC                  Buildings, airports, highways, bridges, etc. Location: (14,15)


In [123]:
from spacy.matcher import PhraseMatcher

In [124]:
# Phrase-level matcher bound to the pipeline's vocabulary.
phrase_matcher = PhraseMatcher(nlp.vocab)

In [125]:
# NOTE: rebinds `doc` again; this text contains both spellings matched below.
doc = nlp("The post office is closed for today. You can visit the post-office anytime during the next week.")

In [126]:
# Surface forms to be matched as phrases.
phrase_list = ["post office", "post-office"]

In [127]:
# Pattern docs only need to be tokenized for phrase matching on the default
# token-text attribute, so use nlp.make_doc (tokenizer only) rather than
# running the full pipeline on every phrase.
phrase_patterns = [nlp.make_doc(phrase) for phrase in phrase_list]

In [128]:
# Register all phrase patterns under the single rule name 'neworganizations'.
phrase_matcher.add('neworganizations',phrase_patterns)

In [130]:
# Apply the phrase matcher to `doc` and display the resulting match tuples.
found_matches = phrase_matcher(doc)
found_matches

[(10500718827724275468, 1, 3), (10500718827724275468, 12, 15)]

In [132]:
# Look up the StringStore value for the 'ORG' label and display it.
ORG = doc.vocab.strings["ORG"]
ORG

383

In [133]:
# One ORG-labelled Span per phrase match; the match id itself is not needed.
named_entities = [
    Span(doc, start, end, label=ORG)
    for _, start, end in found_matches
]
named_entities

[post office, post-office]

In [134]:
# Reassign doc.ents to the existing entities followed by the new ORG spans.
doc.ents = [*doc.ents, *named_entities]

In [135]:
# Entities after the ORG spans were added.
show_ents(doc)

post office-ORG                  Companies, agencies, institutions, etc. Location: (1,3)
today-DATE                 Absolute or relative dates or periods Location: (6,7)
post-office-ORG                  Companies, agencies, institutions, etc. Location: (12,15)
the next week-DATE                 Absolute or relative dates or periods Location: (17,20)


In [136]:
# Number of entities in doc labelled 'ORG'.
sum(1 for ent in doc.ents if ent.label_ == 'ORG')

2

In [137]:
# Number of entities in doc labelled 'DATE'.
sum(1 for ent in doc.ents if ent.label_ == 'DATE')

2