<a href="https://colab.research.google.com/github/VishalMaurya/NLPwithPython/blob/master/Course/NLP_02_Part-speech-tagging/POS_tag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Perform standard imports
import spacy
nlp = spacy.load('en_core_web_sm')

In [0]:
# Create a simple Doc object
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")

# Coarse-grained Part-of-speech Tags

In [13]:
print(f'text    POS tag     POS-ID reference\n')
for token in doc:
  print(f'{token.text:{8}} {token.pos_:{8}} {token.pos:{8}} {spacy.explain(token.pos_)}')

text    POS tag     POS-ID reference

The      DET            90 determiner
quick    ADJ            84 adjective
brown    ADJ            84 adjective
fox      NOUN           92 noun
jumped   VERB          100 verb
over     ADP            85 adposition
the      DET            90 determiner
lazy     ADJ            84 adjective
dog      NOUN           92 noun
's       PART           94 particle
back     NOUN           92 noun
.        PUNCT          97 punctuation


In [16]:
# count by POS tag present in doc
doc.count_by(spacy.attrs.POS)

{84: 3, 85: 1, 90: 2, 92: 3, 94: 1, 97: 1, 100: 1}

In [21]:
for key, value in sorted(doc.count_by(spacy.attrs.POS).items()):
  print(f'{key} {doc.vocab[key].text} {value}')

84 ADJ 3
85 ADP 1
90 DET 2
92 NOUN 3
94 PART 1
97 PUNCT 1
100 VERB 1


# Fine-grained Part-of-speech Tags

In [52]:
print(f'text        TAG        TAG-ID     reference\n')
for token in doc:
  print(f'{token.text:{10}} {token.tag_:<{5}} {token.tag:>{22}} {spacy.explain(token.tag_)}')

text        TAG        TAG-ID     reference

The        DT      15267657372422890137 determiner
quick      JJ      10554686591937588953 adjective
brown      JJ      10554686591937588953 adjective
fox        NN      15308085513773655218 noun, singular or mass
jumped     VBD     17109001835818727656 verb, past tense
over       IN       1292078113972184607 conjunction, subordinating or preposition
the        DT      15267657372422890137 determiner
lazy       JJ      10554686591937588953 adjective
dog        NN      15308085513773655218 noun, singular or mass
's         POS                       74 possessive ending
back       NN      15308085513773655218 noun, singular or mass
.          .       12646065887601541794 punctuation mark, sentence closer


In [38]:
# count by TAG tag present in doc
doc.count_by(spacy.attrs.TAG)

{74: 1,
 1292078113972184607: 1,
 10554686591937588953: 3,
 12646065887601541794: 1,
 15267657372422890137: 2,
 15308085513773655218: 3,
 17109001835818727656: 1}

In [46]:
for key, value in sorted(doc.count_by(spacy.attrs.TAG).items()):
  print(f'{key:<{25}} {doc.vocab[key].text:>{10}} {value}')

74                               POS 1
1292078113972184607               IN 1
10554686591937588953              JJ 3
12646065887601541794               . 1
15267657372422890137              DT 2
15308085513773655218              NN 3
17109001835818727656             VBD 1


# DEP tag

In [58]:
print(f'text    DEP tag              DEP-ID       reference\n')
for token in doc:
  print(f'{token.text:{8}} {token.dep_:{8}} {token.dep:{22}} {spacy.explain(token.dep_)}')

text    DEP tag              DEP-ID       reference

The      det                         415 determiner
quick    amod                        402 adjectival modifier
brown    amod                        402 adjectival modifier
fox      nsubj                       429 nominal subject
jumped   ROOT        8206900633647566924 None
over     prep                        443 prepositional modifier
the      det                         415 determiner
lazy     amod                        402 adjectival modifier
dog      poss                        440 possession modifier
's       case        8110129090154140942 case marking
back     pobj                        439 object of preposition
.        punct                       445 punctuation


In [59]:
# count by DEP tag present in doc
doc.count_by(spacy.attrs.DEP)

{402: 3,
 415: 2,
 429: 1,
 439: 1,
 440: 1,
 443: 1,
 445: 1,
 8110129090154140942: 1,
 8206900633647566924: 1}

In [60]:
for key, value in sorted(doc.count_by(spacy.attrs.DEP).items()):
  print(f'{key:<{25}} {doc.vocab[key].text:>{10}} {value}')

402                             amod 3
415                              det 2
429                            nsubj 1
439                             pobj 1
440                             poss 1
443                             prep 1
445                            punct 1
8110129090154140942             case 1
8206900633647566924             ROOT 1


# lemma tag

In [61]:
print(f'text    lemma tag              lemma-ID       reference\n')
for token in doc:
  print(f'{token.text:{8}} {token.lemma_:{8}} {token.lemma:{22}} {spacy.explain(token.lemma_)}')

text    lemma tag              lemma-ID       reference

The      the         7425985699627899538 None
quick    quick      12442504647632856847 None
brown    brown      15566893785393968193 None
fox      fox         4333436952782779665 None
jumped   jump        7269679109085336853 None
over     over        5456543204961066030 None
the      the         7425985699627899538 None
lazy     lazy        8463806658378306174 None
dog      dog         7562983679033046312 None
's       's         16428057658620181782 None
back     back       15255859468896132977 None
.        .          12646065887601541794 punctuation mark, sentence closer


In [62]:
# count by DEP tag present in doc
doc.count_by(spacy.attrs.LEMMA)

{4333436952782779665: 1,
 5456543204961066030: 1,
 7269679109085336853: 1,
 7425985699627899538: 2,
 7562983679033046312: 1,
 8463806658378306174: 1,
 12442504647632856847: 1,
 12646065887601541794: 1,
 15255859468896132977: 1,
 15566893785393968193: 1,
 16428057658620181782: 1}

In [63]:
for key, value in sorted(doc.count_by(spacy.attrs.LEMMA).items()):
  print(f'{key:<{25}} {doc.vocab[key].text:>{10}} {value}')

4333436952782779665              fox 1
5456543204961066030             over 1
7269679109085336853             jump 1
7425985699627899538              the 2
7562983679033046312              dog 1
8463806658378306174             lazy 1
12442504647632856847           quick 1
12646065887601541794               . 1
15255859468896132977            back 1
15566893785393968193           brown 1
16428057658620181782              's 1


# Visualization POS

In [0]:
from spacy import displacy

In [77]:
# Render the dependency parse immediately inside Jupyter:
displacy.render(doc, style='dep', jupyter=True, options={'distance': 110})


In [0]:
# render by serving on port
# displacy.serve(doc, style='dep', options={'distance': 110})

In [80]:
options = {'distance': 110, 'compact': 'True', 'color': 'yellow', 'bg': '#00cc', 'font': 'Times'}

displacy.render(doc, style='dep', jupyter=True, options=options)

# Name Entity Recognition

In [0]:
# Write a function to display basic entity info:
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))
    else:
        print('No named entities found.')

In [106]:
doc = nlp(u"The new consumer prices will be Rs 45.20 per kg in Delhi and Rs 51.35 per kg in Noida, Greater Noida and Ghaziabad, it said. \"The price of CNG being supplied by IGL in other areas shall remain unchanged as of now.\"")

show_ents(doc)

45.20 per kg - QUANTITY - Measurements, as of weight or distance
Delhi - GPE - Countries, cities, states
Rs - ORG - Companies, agencies, institutions, etc.
51.35 - PRODUCT - Objects, vehicles, foods, etc. (not services)
Noida - GPE - Countries, cities, states
Noida - PERSON - People, including fictional
Ghaziabad - GPE - Countries, cities, states
CNG - ORG - Companies, agencies, institutions, etc.
IGL - ORG - Companies, agencies, institutions, etc.


In [95]:
displacy.render(doc, style='ent', jupyter=True, options={'distance': 110})


In [96]:
def show_ents2(doc):
  for ent in doc.ents:
      print(ent.text,'----',ent.start, ent.end, ent.start_char, ent.end_char, ent.label_)

show_ents2(doc)

45.20 per kg ---- 7 10 35 47 QUANTITY
Delhi ---- 11 12 51 56 GPE
Rs ---- 13 14 61 63 ORG
51.35 ---- 14 15 64 69 PRODUCT
Noida ---- 18 19 80 85 GPE
Noida ---- 21 22 95 100 PERSON
Ghaziabad ---- 23 24 105 114 GPE
CNG ---- 32 33 139 142 ORG
IGL ---- 36 37 161 164 ORG


In [97]:
from spacy.strings import StringStore

rate_hash = StringStore([u'RATE']) # <-- match id
nlp.vocab.strings.add('RATE')

6378230409799124499

In [0]:
from spacy.tokens import Span

# Get the hash value of the ORG entity label
RATE = doc.vocab.strings[u'RATE']  

# Create a Span for the new entity
new_ent = Span(doc, 15, 17, label=RATE)

# Add the entity to the existing Doc object
doc.ents = list(doc.ents) + [new_ent]

In [102]:
displacy.render(doc, style='ent', jupyter=True, options={'distance': 110})


In [0]:
# for all phrase matching

doc = nlp(u"The new consumer prices will be Rs 45.20 per kg in Delhi and Rs 51.35 per kg in Noida, Greater Noida and Ghaziabad, it said. \"The price of CNG being supplied by IGL in other areas shall remain unchanged as of now.\"")

# Import PhraseMatcher and create a matcher object:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [0]:
# Create the desired phrase patterns:
phrase_list = ['per kg']
phrase_patterns = [nlp(text) for text in phrase_list]

In [116]:
# Apply the patterns to our matcher object:
matcher.add('newrate', None, *phrase_patterns)

# Apply the matcher to our Doc object:
matches = matcher(doc)

# See what matches occur:
matches

[(7295095847136214177, 8, 10), (7295095847136214177, 15, 17)]

In [110]:
list(doc.ents)[1:]

[Delhi, Rs, 51.35, Noida, Noida, Ghaziabad, CNG, IGL]

In [0]:
# Here we create Spans from each match, and create named entities from them:
from spacy.tokens import Span

PROD = doc.vocab.strings[u'RATE']

new_ents = [Span(doc, match[1],match[2],label=PROD) for match in matches]

doc.ents = list(doc.ents)[1:] + new_ents

In [118]:
show_ents(doc)

per kg - RATE - None
Delhi - GPE - Countries, cities, states
Rs - ORG - Companies, agencies, institutions, etc.
51.35 - PRODUCT - Objects, vehicles, foods, etc. (not services)
per kg - RATE - None
Noida - GPE - Countries, cities, states
Noida - PERSON - People, including fictional
Ghaziabad - GPE - Countries, cities, states
CNG - ORG - Companies, agencies, institutions, etc.
IGL - ORG - Companies, agencies, institutions, etc.


In [119]:
displacy.render(doc, style='ent', jupyter=True, options={'distance': 110})
