In [None]:
import spacy
# Load the 'en_core_web_sm' pipeline once; later cells parse text by calling `nlp(...)`.
nlp = spacy.load('en_core_web_sm')


In [None]:
def show_ents(doc):
    """Print every named entity in ``doc`` as ``text - label - explanation``.

    The explanation is whatever ``spacy.explain`` returns for the entity
    label, converted with ``str``. When ``doc.ents`` is empty, the single
    line "no entities found" is printed instead.
    """
    if not doc.ents:
        print("no entities found")
        return
    for ent in doc.ents:
        description = str(spacy.explain(ent.label_))
        print(' - '.join((ent.text, ent.label_, description)))

In [None]:
# "May" appears both as a verb and as a month, and "Washington" both as a city and as a monument.
doc = nlp('May I go to Washington, DC next May to see the Washington Monument?')
show_ents(doc)

Washington, DC - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
the Washington Monument - ORG - Companies, agencies, institutions, etc.


In [None]:
# One line per entity: text, token span, character span, label.
for ent in doc.ents:
    print(
        ent.text,
        ent.start, ent.end,            # token offsets
        ent.start_char, ent.end_char,  # character offsets
        ent.label_,
    )

Washington, DC 4 7 12 26 GPE
next May 7 9 27 35 DATE
the Washington Monument 11 14 43 66 ORG


In [None]:
# Adding a named entity by hand: start from a sentence and list what the model finds.
doc = nlp('Tesla to build a U.K. factory for $6 million')
show_ents(doc)


U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [None]:
from spacy.tokens import Span

# Look up the hash value the vocab stores for the ORG label.
ORG = doc.vocab.strings['ORG']

# Token 0 ("Tesla") becomes a one-token span carrying that label.
new_ent = Span(doc, 0, 1, label=ORG)

# doc.ents is replaced wholesale: the existing entities plus the new one.
doc.ents = [*doc.ents, new_ent]


In [None]:
# List the entities again, now that `doc.ents` has been reassigned by hand.
show_ents(doc)

Tesla - ORG - Companies, agencies, institutions, etc.
U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [None]:
# Two sentences mentioning "vacuum cleaner".
# The first literal ends with a space: adjacent string literals are joined
# verbatim, and without it the text would read "...vacuum cleaner.If successful...".
doc = nlp(u'Our company plans to introduce a new vacuum cleaner. '
          u'If successful, the vacuum cleaner will be our first product.')
show_ents(doc)

first - ORDINAL - "first", "second", etc.


In [None]:
# Import PhraseMatcher and create a matcher object that shares the pipeline's vocab:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [None]:
# Phrases to look for, each converted to a Doc by running it through `nlp`.
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
phrase_patterns = list(map(nlp, phrase_list))

In [None]:
# Register the phrase patterns with the matcher under the key 'newproduct'.
# NOTE(review): (key, None, *patterns) looks like the spaCy v2 call form; spaCy v3
# expects matcher.add(key, patterns) — confirm against the installed version.
matcher.add('newproduct', None, *phrase_patterns)

# Apply the matcher to our doc object:
matches = matcher(doc)

# Show the matches; each one is a (match_id, start, end) tuple:
matches

[(2689272359382549672, 7, 9), (2689272359382549672, 14, 16)]

In [None]:
# Turn every PhraseMatcher hit into a PRODUCT entity on the doc.
from spacy.tokens import Span

PROD = doc.vocab.strings['PRODUCT']

# Each match is (match_id, start, end); only the two boundaries are needed for the Span.
new_ents = [Span(doc, start, end, label=PROD) for _, start, end in matches]

doc.ents = [*doc.ents, *new_ents]

In [None]:
# A sentence containing two monetary amounts.
doc = nlp('Originally priced at $29.50, the sweater was marked down to five dollars.')
show_ents(doc)

29.50 - MONEY - Monetary values, including unit
five dollars - MONEY - Monetary values, including unit


In [None]:
# Count the entities labelled MONEY.
sum(1 for ent in doc.ents if ent.label_ == 'MONEY')

2

Problem with line breaks: check whether a newline (`\n`) inside the text changes which entities are detected.

In [None]:
# Same sentence as before, but with a newline after the comma.
doc = nlp('Originally priced at $29.50,\nthe sweater was marked down to five dollars.')
show_ents(doc)

29.50 - MONEY - Monetary values, including unit
five dollars - MONEY - Monetary values, including unit


In [None]:
from spacy import displacy

In [None]:
# Two sentences, rendered with displaCy's entity visualiser.
doc = nlp(
    'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. '
    'by contrast, Sony sold 7 thousand walkman music players'
)
displacy.render(doc, style='ent')

Viewing the entities sentence by sentence

In [None]:
# Re-parse each sentence on its own and render its entities separately.
for sent in doc.sents:
    sentence_doc = nlp(sent.text)
    displacy.render(sentence_doc, style='ent')

In [None]:
# Second example: same first sentence, different second sentence.
doc2 = nlp(
    'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. '
    'By contrast, my kids sold a lot of lemonade.'
)

In [None]:
# Render every sentence of doc2, whether or not it contains entities.
for sent in doc2.sents:
    sentence_doc = nlp(sent.text)
    displacy.render(sentence_doc, style='ent')



In [None]:
# Avoid the warning: only hand a sentence to displaCy when it has entities,
# otherwise print its plain text.
for sent in doc2.sents:
    docx = nlp(sent.text)
    if not docx.ents:
        print(docx.text)
        continue
    displacy.render(docx, style='ent')

By contrast, my kids sold a lot of lemonade.


In [None]:
# Restrict the visualisation to selected entity labels.
options = dict(ents=['ORG', 'CARDINAL'])
displacy.render(doc, style='ent', options=options)


In [None]:
# Custom colour for ORG entities, passed to displaCy through the 'colors' option.
# The value is a CSS gradient, so the function name has to be spelled
# `linear-gradient`; the previous 'lineradient' spelling is not valid CSS.
colors = {'ORG': 'linear-gradient(90deg, #aa9cfc, #fc9ce7)'}
options = {'ents': ['ORG', 'CARDINAL'], 'colors': colors}
displacy.render(doc, style='ent', options=options)

In [None]:
# Serve the same visualisation over HTTP instead of rendering it inline.
# NOTE(review): this presumably blocks the cell until the server is stopped — confirm.
displacy.serve(doc, style='ent', options=options)




Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.
