In [1]:
# POS tagging and NER

import spacy
nlp = spacy.load('en_core_web_sm')

In [2]:
doc = nlp(u'The quick brown fox jumped over the lazy fox.')

In [13]:
for token in doc:
    print(f"{token.text:{10}} {token.pos_:{5}} {token.tag_:{5}} {spacy.explain(token.tag_)}")

The        DET   DT    determiner
quick      ADJ   JJ    adjective
brown      ADJ   JJ    adjective
fox        NOUN  NN    noun, singular or mass
jumped     VERB  VBD   verb, past tense
over       ADP   IN    conjunction, subordinating or preposition
the        DET   DT    determiner
lazy       ADJ   JJ    adjective
fox        NOUN  NN    noun, singular or mass
.          PUNCT .     punctuation mark, sentence closer


In [30]:
# POS counts 
pos_counts = doc.count_by(spacy.attrs.POS)

In [51]:
pos_counts  
# returns a dict of POStag index in the vocab and the count of it

{96: 1, 83: 3, 99: 1, 84: 1, 89: 2, 91: 2}

In [52]:
print(doc[2].text, doc[2].pos, doc[2].pos_)

brown 83 ADJ


In [53]:
doc.vocab[83].text  
# 83 is an ADJ and there are 3 of them in the sentence


'ADJ'

In [56]:
# frequency of pos tags

for k,v in sorted(pos_counts.items()):
    print(f'{k}. {doc.vocab[k].text:{6}} {v}')

83. ADJ    3
84. ADP    1
89. DET    2
91. NOUN   2
96. PUNCT  1
99. VERB   1


In [57]:
TAG_counts = doc.count_by(spacy.attrs.TAG)

for k,v in sorted(TAG_counts.items()): 
    print(f'{k}. {doc.vocab[k].text:{6}} {v}')



1292078113972184607. IN     1
10554686591937588953. JJ     3
12646065887601541794. .      1
15267657372422890137. DT     2
15308085513773655218. NN     2
17109001835818727656. VBD    1


In [58]:
DEP_counts = doc.count_by(spacy.attrs.DEP)

In [62]:
for k,v in sorted(DEP_counts.items()):
    print(f'{k}. {doc.vocab[k].text:{6}} {v}')

399. amod   3
412. det    2
426. nsubj  1
436. pobj   1
440. prep   1
442. punct  1
8206900633647566924. ROOT   1


In [69]:
# Visualizing POS 

from spacy import displacy
displacy.render(doc,style='dep', jupyter=True)

options = {'distace':110, 'compact':'True', 'color':'yellow', 'bg':'cyan', 'font':'Times'}

displacy.render(doc,jupyter=True, options=options,)

In [71]:
# NER 

def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text + ' - '+ ent.label_+ ' - ' + 
                  str(spacy.explain(ent.label_)))
    else:
        print('No entities found')

In [87]:
doc = nlp(u'Hi! How are you? I am Pri and I am so glad to meet The Flash.\
May I go to Washington D.C.? and see the Mr. Washington? Can I have Apple stock\
of 500 dollars worth as of today? BigOY')

In [88]:
show_ents(doc)

Pri - NORP - Nationalities or religious or political groups
Flash - PERSON - People, including fictional
Washington D.C. - GPE - Countries, cities, states
Washington - PERSON - People, including fictional
Apple - ORG - Companies, agencies, institutions, etc.
500 dollars - MONEY - Monetary values, including unit
today - DATE - Absolute or relative dates or periods


In [89]:
# Adding new entities 

from spacy.tokens import Span

In [101]:
ORGy = doc.vocab.strings[u'ORG']

In [102]:
ORGy  # hash value of ORG

381

In [103]:
doc[44]

BigOY

In [104]:
# Create a span for ORG

new_ent = Span(doc,44,45,label=ORGy) 
# 44 is where BigOY appears, 45 not including, label 


In [105]:
doc.ents = list(doc.ents) + [new_ent]

In [106]:
doc.ents

(Pri, Flash, Washington D.C., Washington, Apple, 500 dollars, today, BigOY)

In [107]:
show_ents(doc)

# not sure why the given ORGy wasn't shpwn

Pri - NORP - Nationalities or religious or political groups
Flash - PERSON - People, including fictional
Washington D.C. - GPE - Countries, cities, states
Washington - PERSON - People, including fictional
Apple - ORG - Companies, agencies, institutions, etc.
500 dollars - MONEY - Monetary values, including unit
today - DATE - Absolute or relative dates or periods
BigOY - ORG - Companies, agencies, institutions, etc.


In [118]:
# Adding NE to all matching spans 

# multiple NE

doc1 = nlp(u'Our company created a new soft ware. '
          u'Voila is the name of the new software. '
          u'new soft-ware!')


In [119]:
doc1

Our company created a new soft ware. Voila is the name of the new software. new soft-ware!

In [120]:
show_ents(doc1)

Voila - PERSON - People, including fictional


In [121]:
from spacy.matcher import PhraseMatcher
p_matcher = PhraseMatcher(nlp.vocab)

In [122]:
# create a desired list of matchers 

phr_list = ['software', 'soft ware', 'soft-ware']

phr_pattern = [nlp(text) for text in phr_list]  

In [123]:
p_matcher.add('newprod', None, *phr_pattern)

In [124]:
found_matches = p_matcher(doc1)

In [127]:
found_matches 
# three matches, 2nd and 3rd are the spans (starts and ends of each match)

[(7467514733791696242, 5, 7),
 (7467514733791696242, 15, 16),
 (7467514733791696242, 18, 21)]

In [132]:
# create span for each match and create NE from them 

from spacy.tokens import Span

In [137]:
PROD = doc1.vocab.strings[u'PRODUCT']

In [138]:
new_ents = [Span(doc1, match[1], match[2],label=PROD) for match in found_matches]

In [140]:
doc1.ents = list(doc1.ents) + new_ents

In [147]:
show_ents(doc)

Pri - NORP - Nationalities or religious or political groups
Flash - PERSON - People, including fictional
Washington D.C. - GPE - Countries, cities, states
Washington - PERSON - People, including fictional
Apple - ORG - Companies, agencies, institutions, etc.
500 dollars - MONEY - Monetary values, including unit
today - DATE - Absolute or relative dates or periods
BigOY - ORG - Companies, agencies, institutions, etc.


In [166]:
# Freq or specific tags 

print([ent for ent in doc.ents if ent.label_ =='PERSON'])

# count
len([ent for ent in doc1.ents if ent.label_ == 'PRODUCT'])

[Flash, Washington]


3

In [167]:
## Displaying 

displacy.render(doc, style='ent',jupyter=True)

In [168]:
displacy.render(doc1, style='ent', jupyter=True)

In [169]:
for sent in doc.sents:
    displacy.render(nlp(sent.text), style='ent', jupyter=True)

  "__main__", mod_spec)


  "__main__", mod_spec)


  "__main__", mod_spec)


In [172]:
# Options 

options = {'ents': ['PERSON', 'MONEY']}

In [173]:
for sent in doc.sents:
    displacy.render(nlp(sent.text), style='ent', jupyter=True, options=options)

  "__main__", mod_spec)


  "__main__", mod_spec)


  "__main__", mod_spec)


In [180]:
colors = {'PERSON':'red'}

options = {'ents':['PRODUCT', 'MONEY'],'colors':colors}

In [181]:
for sent in doc.sents:
    displacy.render(nlp(sent.text), style='ent', jupyter=True, options=options)

  "__main__", mod_spec)


  "__main__", mod_spec)


  "__main__", mod_spec)
