<a href="https://colab.research.google.com/github/VishalMaurya/NLPwithPython/blob/master/Course/NLP_01_python_basic/Python_basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# spaCy Basics
spaCy (https://spacy.io/) is an open-source Python library that parses and "understands" large volumes of text. Separate models are available that cater to specific languages (English, French, German, etc.).

In [0]:
import numpy as np
import pandas as pd
import spacy

In [0]:
nlp = spacy.load('en_core_web_sm')

In [0]:
# create doc object
doc = nlp(u'Punjab CM Capt Amarinder Singh has invited President Ram Nath Kovind, Prime Minister Narendra Modi and former Prime Minister Manmohan Singh to join the first all-party group that would attend the historic event of the opening of Kartarpur Corridor in Pakistan.')

In [4]:
for token in doc:
  print(f'{token.text:{10}} {token.pos_:{10}} {token.dep_:{10}} {spacy.explain(token.dep_):}')

Punjab     PROPN      compound   compound
CM         PROPN      compound   compound
Capt       PROPN      compound   compound
Amarinder  PROPN      compound   compound
Singh      PROPN      nsubj      nominal subject
has        VERB       aux        auxiliary
invited    VERB       ROOT       None
President  PROPN      compound   compound
Ram        PROPN      compound   compound
Nath       PROPN      compound   compound
Kovind     PROPN      dobj       direct object
,          PUNCT      punct      punctuation
Prime      PROPN      compound   compound
Minister   PROPN      compound   compound
Narendra   PROPN      compound   compound
Modi       PROPN      conj       conjunct
and        CCONJ      cc         coordinating conjunction
former     ADJ        amod       adjectival modifier
Prime      PROPN      compound   compound
Minister   PROPN      compound   compound
Manmohan   PROPN      compound   compound
Singh      PROPN      conj       conjunct
to         PART       aux        auxi

In [5]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7fa656c5ca90>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7fa656628708>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7fa656628768>)]

# Tokenization

In [6]:
for token in doc:
  print(f'{token.text:{10}} {token.tag_:{10}} {token.lemma_:{10}} {token.is_alpha:{10}} {spacy.explain(token.tag_):>{25}}')

Punjab     NNP        Punjab              1     noun, proper singular
CM         NNP        CM                  1     noun, proper singular
Capt       NNP        Capt                1     noun, proper singular
Amarinder  NNP        Amarinder           1     noun, proper singular
Singh      NNP        Singh               1     noun, proper singular
has        VBZ        have                1 verb, 3rd person singular present
invited    VBN        invite              1     verb, past participle
President  NNP        President           1     noun, proper singular
Ram        NNP        Ram                 1     noun, proper singular
Nath       NNP        Nath                1     noun, proper singular
Kovind     NNP        Kovind              1     noun, proper singular
,          ,          ,                   0   punctuation mark, comma
Prime      NNP        Prime               1     noun, proper singular
Minister   NNP        Minister            1     noun, proper singular
Narendra   N

In [7]:
doc2 = nlp(u'In nutshell, Pakistan came out of the cold to shoot itself in foot and give India yet another sweet victory.\
 The previous one for India was in the International Court of Justice (ICJ) which stayed execution of Kulbhushan Jadhav, the former Indian Navy officer, and ordered review of the judgment of a Pakistan military court.')
for sent in doc2.sents:
  print(sent)

In nutshell, Pakistan came out of the cold to shoot itself in foot and give India yet another sweet victory.
The previous one for India was in the International Court of Justice (ICJ) which stayed execution of Kulbhushan Jadhav, the former Indian Navy officer, and ordered review of the judgment of a Pakistan military court.


In [8]:
span_doc = doc2[30:34]
span_doc

International Court of Justice

In [9]:
print(type(span_doc))
print(type(doc2))

<class 'spacy.tokens.span.Span'>
<class 'spacy.tokens.doc.Doc'>


In [10]:
for entity in doc2.ents:
  print(entity) 
  print(entity.label_) 
  print(spacy.explain(entity.label_))
  print('\n')

Pakistan
GPE
Countries, cities, states


India
GPE
Countries, cities, states


India
GPE
Countries, cities, states


the International Court of Justice
ORG
Companies, agencies, institutions, etc.


ICJ
ORG
Companies, agencies, institutions, etc.


Kulbhushan Jadhav
PERSON
People, including fictional


Indian
NORP
Nationalities or religious or political groups


Navy
ORG
Companies, agencies, institutions, etc.


Pakistan
GPE
Countries, cities, states




In [11]:
for chunk in doc2.noun_chunks:
  print(chunk)

nutshell
Pakistan
the cold
itself
foot
India
yet another sweet victory
The previous one
India
Justice
execution
Kulbhushan Jadhav
the former Indian Navy officer
review
the judgment
a Pakistan military court


In [12]:
spacy.displacy.render(doc2, style='ent', jupyter=True)

In [13]:
spacy.displacy.render(doc2, style='dep', jupyter=True, options={'distance':80})


# Stemming

In [0]:
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

In [15]:
doc2

In nutshell, Pakistan came out of the cold to shoot itself in foot and give India yet another sweet victory. The previous one for India was in the International Court of Justice (ICJ) which stayed execution of Kulbhushan Jadhav, the former Indian Navy officer, and ordered review of the judgment of a Pakistan military court.

In [16]:
Porter = PorterStemmer()
Snowball = SnowballStemmer(language='english')
for token in doc2:
  print(token.text,'----->',Porter.stem(token.text),'|',Snowball.stem(token.text))

In -----> In | in
nutshell -----> nutshel | nutshel
, -----> , | ,
Pakistan -----> pakistan | pakistan
came -----> came | came
out -----> out | out
of -----> of | of
the -----> the | the
cold -----> cold | cold
to -----> to | to
shoot -----> shoot | shoot
itself -----> itself | itself
in -----> in | in
foot -----> foot | foot
and -----> and | and
give -----> give | give
India -----> india | india
yet -----> yet | yet
another -----> anoth | anoth
sweet -----> sweet | sweet
victory -----> victori | victori
. -----> . | .
The -----> the | the
previous -----> previou | previous
one -----> one | one
for -----> for | for
India -----> india | india
was -----> wa | was
in -----> in | in
the -----> the | the
International -----> intern | intern
Court -----> court | court
of -----> of | of
Justice -----> justic | justic
( -----> ( | (
ICJ -----> icj | icj
) -----> ) | )
which -----> which | which
stayed -----> stay | stay
execution -----> execut | execut
of -----> of | of
Kulbhushan -----> kulbh

# Lemmatization

In [20]:
doc3 = nlp(u"I am a runner running in a race because I love to run since I ran today")

for token in doc3:
    print(token.text, '\t', token.pos_, '\t', token.lemma, '\t', token.lemma_)

I 	 PRON 	 561228191312463089 	 -PRON-
am 	 VERB 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 ADP 	 16950148841647037698 	 because
I 	 PRON 	 561228191312463089 	 -PRON-
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 ADP 	 10066841407251338481 	 since
I 	 PRON 	 561228191312463089 	 -PRON-
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


# Stopwords

In [25]:
# total number of stopwords in spaCy
(nlp.Defaults.stop_words)

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [29]:
word_list = ['ran', 'parsimonious', 'witch']
for word in word_list:
  print(nlp.vocab[word].is_stop)

False
False
False


In [27]:
 nlp.vocab['since'].is_stop

True

In [0]:
# add a new stop word
nlp.Defaults.stop_words.add('btw')

In [36]:
len(nlp.Defaults.stop_words)

326

In [0]:
nlp.Defaults.stop_words.remove('btw')

# Vocabulary & Matcher

In [0]:
from spacy.matcher import Matcher, PhraseMatcher

# word matching
matcher = Matcher(nlp.vocab)

# phrase matching
Pmathcer = PhraseMatcher(nlp.vocab)

In [0]:
doc5 = nlp(u'Solar-powered energy runs solar-powered cars.')

In [0]:
pattern = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP':'*'}, {'LOWER': 'powered', 'OP':'*'}]

# Add the new set of patterns to the 'SolarPower' matcher:
matcher.add('SolarPower', None, pattern)

In [42]:
matcher(doc5)

[(8656102463236116519, 0, 1),
 (8656102463236116519, 0, 2),
 (8656102463236116519, 0, 3),
 (8656102463236116519, 5, 6),
 (8656102463236116519, 5, 7),
 (8656102463236116519, 5, 8)]