In [1]:
import spacy

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


In [2]:
english = spacy.load("en_core_web_sm")

In [3]:
doc1 = "We're moving to U.S. !"

In [4]:
doc1

"We're moving to U.S. !"

In [5]:
doc1.split(" ")

["We're", 'moving', 'to', 'U.S.', '!']

In [6]:
e1=english(doc1)

In [7]:
e1[2]

moving

In [8]:
# spaCy splits the contraction "We're" into two tokens but keeps "U.S." whole.
for tok in e1:
    print(tok.text)

We
're
moving
to
U.S.
!


In [9]:
doc2 = english("We're here to help! , can you send email to support@lwindia.com or visit website http://www.lwindia.com!")

In [10]:
# The e-mail address and the URL each come out as a single token.
for token in doc2:
    print(token.text)

We
're
here
to
help
!
,
can
you
send
email
to
support@lwindia.com
or
visit
website
http://www.lwindia.com
!


In [11]:
doc3 = english("A 10km Mumbai cab ride costs me $10.50")

In [12]:
# "10km" is split into number + unit and "$10.50" into symbol + amount.
for tok in doc3:
    print(tok.text)

A
10
km
Mumbai
cab
ride
costs
me
$
10.50


In [13]:
doc4 = english("Let's visit St. Clara in the U.S. valley next year.")

In [14]:
# The abbreviations "St." and "U.S." keep their dots; only the final full stop is split off.
for tok in doc4:
    print(tok.text)

Let
's
visit
St.
Clara
in
the
U.S.
valley
next
year
.


In [15]:
doc5 = english("Microsoft to build a Hong Kong factory for $7 million.")

In [16]:
# Printing a token shows its text; "Hong" and "Kong" are separate tokens here.
for tok in doc5:
    print(tok)

Microsoft
to
build
a
Hong
Kong
factory
for
$
7
million
.


In [17]:
# Named entities: each entity span followed by its label.
for ent in doc5.ents:
    print(ent, ent.label_, sep=" : ")

Microsoft : ORG
Hong Kong : GPE
$7 million : MONEY


In [18]:
doc7 = english("Autonomous cars shift insurance liability towards manufacturers.")

In [19]:
# One token per line for the sentence stored in doc7.
for tok in doc7:
    print(tok)

Autonomous
cars
shift
insurance
liability
towards
manufacturers
.


In [20]:
# Noun chunks: the noun phrases spaCy finds in doc7.
for chunk in doc7.noun_chunks:
    print(chunk)

Autonomous cars
insurance liability
manufacturers


In [21]:
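'''
spaCy does not support stemming.

Stemming: reducing a word to a base form by stripping affixes. Ex. running => run

spaCy provides lemmatization instead (token.lemma_): each word is mapped to its dictionary form. Ex. saw => see
'''

'\nspaCy does not support stemming.\n\nStemming: reducing a word to a base form by stripping affixes. Ex. running => run\n\nspaCy provides lemmatization instead (token.lemma_): each word is mapped to its dictionary form. Ex. saw => see\n'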

In [22]:
# Using spaCy: `lemma` prints as a large integer, `lemma_` as the readable lemma.
for tok in doc7:
    print(tok.text, tok.pos_, tok.lemma, tok.lemma_)

Autonomous ADJ 9558915150818865765 autonomous
cars NOUN 17545852598994811774 car
shift VERB 8057115410453277513 shift
insurance NOUN 1031890164806390884 insurance
liability NOUN 14514743720633195770 liability
towards ADP 9315050841437086371 towards
manufacturers NOUN 2527826152045135642 manufacturer
. PUNCT 12646065887601541794 .


In [23]:
# TODO: supply the text for doc10. The original cell was the bare assignment
# `doc10 = `, which raised a SyntaxError and stopped a top-to-bottom run.
doc10 = None

In [58]:
words = ["ran", "run", "running"]

In [83]:
# Stemming using nltk library

import nltk

In [61]:
# Using PorterStemmer
from nltk.stem.porter import PorterStemmer

In [62]:
pstem = PorterStemmer()

In [63]:
pstem.stem("run")

'run'

In [64]:
pstem.stem("running")

'run'

In [65]:
# The Porter stemmer does not reduce the irregular form 'ran' to its base word 'run'.
pstem.stem("ran")

'ran'

In [67]:
pstem.stem("runs")

'run'

In [68]:
# The stem is not always a real word: 'easily' becomes 'easili'.
pstem.stem("easily")

'easili'

In [84]:
# Using Snowball Stemmer

from nltk.stem.snowball import SnowballStemmer

In [70]:
snowball = SnowballStemmer(language="english")

In [71]:
snowball.stem("running")

'run'

In [73]:
# The Snowball stemmer does not reduce 'ran' to 'run' either.
snowball.stem("ran")

'ran'

In [74]:
snowball.stem("fairly")

'fair'

In [81]:
doc11 = english("I saw eight ice yesterday.")

In [82]:
# Token text next to its lemma (e.g. "saw" -> "see").
for tok in doc11:
    print(f"{tok.text}  :  {tok.lemma_}")

I  :  I
saw  :  see
eight  :  eight
ice  :  ice
yesterday  :  yesterday
.  :  .


In [85]:
# All the stop words.
# Stop words: very common words (e.g. 'a', 'about') that add little meaning and are usually filtered out.
english.Defaults.stop_words

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [86]:
len(english.Defaults.stop_words)

326

In [88]:
len(english.vocab)

812

In [89]:
english.vocab['can'].is_stop

True

In [90]:
english.vocab['run'].is_stop

False

In [95]:
english.vocab['os'].is_stop

False

In [24]:
# Register a custom stop word.
# Adding the word to the Defaults set is not enough on its own when the lexeme
# already exists in the vocab: 'os' was looked up in an earlier cell, and an
# existing lexeme keeps the is_stop flag it was created with. Setting the flag
# on the lexeme as well makes the result independent of cell execution order.

english.Defaults.stop_words.add('os')
english.vocab['os'].is_stop = True

In [25]:
english.vocab['os'].is_stop

True