In [107]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [108]:
#loop to identify the entities in a document and out its details
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text + ' - ' + ent.label_ + ' - ' + str(spacy.explain(ent.label_)))
    else:
        print("No entities found")

In [109]:
doc = nlp(u"Hi, how are you?")
show_ents(doc)

No entities found


In [110]:
doc = nlp(u'May I go to Washington, DC next May to see the Washington Monument?')
show_ents(doc)

Washington, DC - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
the Washington Monument - ORG - Companies, agencies, institutions, etc.


In [111]:
doc = nlp(u'Can I please borrow 500 dollars from you to buy some Microsoft stock?')
show_ents(doc)

500 dollars - MONEY - Monetary values, including unit
Microsoft - ORG - Companies, agencies, institutions, etc.


In [112]:
doc = nlp(u'Tesla to build a U.K. factory for $6 million')
show_ents(doc)

U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [113]:
#adding a named entity to a span

from spacy.tokens import Span

# Get the hash value of the ORG entity label
ORG = doc.vocab.strings[u'ORG']  

In [114]:
# Create a Span for the new entity
new_ent = Span(doc, 0, 1, label=ORG)

# Add the entity to the existing Doc object
doc.ents = list(doc.ents) + [new_ent]

In [115]:
show_ents(doc)

Tesla - ORG - Companies, agencies, institutions, etc.
U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [116]:
doc = nlp(u'Our company plans to introduce a new vaccum cleaner. '
          u'If successful, the vacuum-cleaner will be our first product.')

In [117]:
show_ents(doc)

first - ORDINAL - "first", "second", etc.


In [118]:
# Import PhraseMatcher and create a matcher object
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [119]:
# Create the desired phrase patterns
phrase_list = ['vaccum cleaner', 'vacuum-cleaner']
phrase_patterns = [nlp(text) for text in phrase_list]

In [120]:
# Apply the patterns to our matcher object:
matcher.add('newPhrases', None, *phrase_patterns)

# Apply the matcher to our Doc object:
matches = matcher(doc)

In [121]:
matches

[(10480813636310641909, 7, 9), (10480813636310641909, 14, 17)]

In [122]:
# Here we create Spans from each match, and create named entities from them

#adding a named entity to a span
from spacy.tokens import Span

# Get the hash value of the ORG entity label
PROD = doc.vocab.strings[u'PRODUCT']  

In [123]:
new_ents = [Span(doc, match[1], match[2], label="PROD") for match in matches]

In [124]:
doc.ents = list(doc.ents) + new_ents

In [125]:
show_ents(doc)

vaccum cleaner - PROD - Product, i.e. artificially produced entities including speeches, radio shows, programming languages, contracts, laws and ideas
vacuum-cleaner - PROD - Product, i.e. artificially produced entities including speeches, radio shows, programming languages, contracts, laws and ideas
first - ORDINAL - "first", "second", etc.


In [126]:
doc = nlp(u'Originally priced at $29.50, the sweater was marked down to five dollars.')
show_ents(doc)

29.50 - MONEY - Monetary values, including unit
five dollars - MONEY - Monetary values, including unit


In [127]:
#entities with a particular label
[ent for ent in doc.ents if ent.label_ == "MONEY"]

[29.50, five dollars]

In [128]:
#count number of entities
len([ent for ent in doc.ents if ent.label_ == "MONEY"])

2