In [1]:
# Environment setup: install the third-party packages this notebook imports.
# The leading "!" hands each line to the system shell; -q keeps pip's output quiet.
! pip install -q spacy 
! pip install -q tabulate
# Fetch spaCy's large English model, loaded further down via spacy.load('en_core_web_lg').
# NOTE: this is a big download (the log below reports 852.3MB).
! python -m spacy download en_core_web_lg


import nltk
# NLTK resources needed by the pipeline cells below:
# 'punkt' is the tokenizer data, 'averaged_perceptron_tagger' is the POS tagger model.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

from collections import Counter
import spacy
from tabulate import tabulate
# Load the spaCy English model once; later cells reuse `nlp` for
# lemmatisation, dependency parsing and named-entity recognition.
nlp = spacy.load('en_core_web_lg')


Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.0.0/en_core_web_lg-2.0.0.tar.gz
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.0.0/en_core_web_lg-2.0.0.tar.gz (852.3MB)
[K    75% |██████�

[K    100% |████████████████████████████████| 852.3MB 33.7MB/s 
[?25hInstalling collected packages: en-core-web-lg
  Running setup.py install for en-core-web-lg ... [?25l- \ | / - \ | / - \ | / done
[?25hSuccessfully installed en-core-web-lg-2.0.0

[93m    Linking successful[0m
    /usr/local/lib/python2.7/dist-packages/en_core_web_lg -->
    /usr/local/lib/python2.7/dist-packages/spacy/data/en_core_web_lg

    You can now load the model via spacy.load('en_core_web_lg')

[nltk_data] Downloading package punkt to /content/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /content/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


# Basic NLP pipeline


*   Sentence tokenization
*   Word tokenization
*   Part-of-speech tagging
*   Noun extraction
*   Verb extraction



In [0]:
# Sample paragraph used as input for the NLTK steps in this cell and for the
# spaCy noun count later on. The u prefix makes it a unicode string on Python 2.
text = u"""
Dealing with textual data is very crucial so to handle these text data we need some 
basic text processing steps. Most of the processing steps covered in this section are 
commonly used in NLP and involve the combination of several steps into a single 
executable flow. This is usually referred to as the NLP pipeline. These flow 
can be a combination of tokenization, stemming, word frequency, parts of 
speech tagging, etc.
"""

# POS tags that count as nouns / verbs when filtering tagged tokens.
NOUN_TAGS = ('NN', 'NNS', 'NNP', 'NNPS')
VERB_TAGS = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')

# Step 1: split the paragraph into sentences.
sentenses = nltk.sent_tokenize(text)

# Step 2: split every sentence into word tokens (one list per sentence).
words = list(map(nltk.word_tokenize, sentenses))

# Step 3: tag every token, giving a list of (token, tag) pairs per sentence.
tagged_wt = list(map(nltk.pos_tag, words))

# Step 4: keep only the tag sequence of each sentence.
patternPOS = [[pos for tok, pos in sent] for sent in tagged_wt]

# Step 5: per sentence, keep the tokens whose tag is a noun tag.
nouns = [[tok for tok, pos in sent if pos in NOUN_TAGS] for sent in tagged_wt]

# Step 6: per sentence, keep the tokens whose tag is a verb tag.
verbs = [[tok for tok, pos in sent if pos in VERB_TAGS] for sent in tagged_wt]





In [8]:
# Show the result of every pipeline stage.
# Each line is emitted through a single-argument print(...) call so the cell
# parses on Python 3 as well as Python 2 (a multi-argument print(...) would be
# printed as a tuple by Python 2). "{} {}" reproduces the single space the
# print statement placed between the label and the value.
for label, value in [
    ("Sentences are:", sentenses),
    ("Words are: ", words),
    ("POS are: ", tagged_wt),
    ("POS pattern : ", patternPOS),
    ("Extracted nouns are: ", nouns),
    ("Extracted verbs are: ", verbs),
]:
    print("{} {}".format(label, value))


Sentences are: [u'\nDealing with textual data is very crucial so to handle these text data we need some \nbasic text processing steps.', u'Most of the processing steps covered in this section are \ncommonly used in NLP and involve the combination of several steps into a single \nexecutable flow.', u'This is usually referred to as the NLP pipeline.', u'These flow \ncan be a combination of tokenization, stemming, word frequency, parts of \nspeech tagging, etc.']
Words are:  [[u'Dealing', u'with', u'textual', u'data', u'is', u'very', u'crucial', u'so', u'to', u'handle', u'these', u'text', u'data', u'we', u'need', u'some', u'basic', u'text', u'processing', u'steps', u'.'], [u'Most', u'of', u'the', u'processing', u'steps', u'covered', u'in', u'this', u'section', u'are', u'commonly', u'used', u'in', u'NLP', u'and', u'involve', u'the', u'combination', u'of', u'several', u'steps', u'into', u'a', u'single', u'executable', u'flow', u'.'], [u'This', u'is', u'usually', u'referred', u'to', u'as', u

In [4]:
# Run the full spaCy pipeline over the sample paragraph.
doc = nlp(text)

# Tally the lemma of every token that spaCy tagged as a noun.
noun_counter = Counter()
noun_counter.update(tok.lemma_ for tok in doc if tok.pos_ == 'NOUN')

# Display the five most frequent noun lemmas as a table.
top_nouns = noun_counter.most_common(5)
print(tabulate(top_nouns, headers=['Noun', 'Count']))

Noun           Count
-----------  -------
step               3
combination        2
text               2
processing         2
datum              2


# Dependency parsing

In [5]:
# Parse the third sample sentence and draw its dependency tree inline.
# 'distance' is a displaCy layout option (word spacing, per the displaCy docs).
dep_options = {'distance' : 140}
doc = nlp(sentenses[2])
spacy.displacy.render(doc, style='dep', jupyter=True, options=dep_options)

# Named Entity Extraction

In [6]:
# Named-entity recognition on the second sample sentence.
# Alternative input to try: nlp(u"Jill laughed at John Johnson.")
doc = nlp(sentenses[1])

# Table 1: one row per entity span the model found (text and label).
entity_types = ((ent.text, ent.label_) for ent in doc.ents)
print(tabulate(entity_types, headers=['Entity', 'Entity Type']))

# Blank separator line between the two tables. A bare `print()` is parsed by
# Python 2 as the print statement applied to an empty tuple and emits "()";
# printing an empty string gives a blank line on both Python 2 and Python 3.
print('')

# Table 2: one row per token with its IOB marker and entity type
# (the entity type is empty for tokens outside any entity).
token_entity_info = ((token.text, token.ent_iob_, token.ent_type_,) for token in doc)
print(tabulate(token_entity_info, headers=['Token', 'IOB Annotation', 'Entity Type']))

Entity    Entity Type
--------  -------------
NLP       ORG
()
Token        IOB Annotation    Entity Type
-----------  ----------------  -------------
Most         O
of           O
the          O
processing   O
steps        O
covered      O
in           O
this         O
section      O
are          O
             O
commonly     O
used         O
in           O
NLP          B                 ORG
and          O
involve      O
the          O
combination  O
of           O
several      O
steps        O
into         O
a            O
single       O
             O
executable   O
flow         O
.            O


In [7]:
# Entity extraction on a short sentence containing a person and a place.
sample_sentence = u"My name is Jack and I live in India."
doc = nlp(sample_sentence)

# Pair each detected entity span with its label, then render the pairs as a table.
entity_types = ((span.text, span.label_) for span in doc.ents)
entity_table = tabulate(entity_types, headers=['Entity', 'Entity Type'])
print(entity_table)

Entity    Entity Type
--------  -------------
Jack      PERSON
India     GPE
