In [1]:
import spacy

# Load spaCy's small English pipeline and parse a sample headline.
# NOTE(review): the sample text contains typos ("Home Kong", "millon"); the
# recorded output reflects them (the MONEY entity is only "$6").
nlp = spacy.load("en_core_web_sm")
doc8 = nlp("Apple to build a Home Kong factory for $6 millon")

# Print every token on a single line, separated by " | ".
for token in doc8:
    print(token.text, end=" | ")
print("\n----")

# List each named entity with its label and spaCy's explanation of that label.
for ent in doc8.ents:
    print(f"{ent.text} - {ent.label_} - {spacy.explain(ent.label_)}")

Apple | to | build | a | Home | Kong | factory | for | $ | 6 | millon | 
----
Apple - ORG - Companies, agencies, institutions, etc.
Home Kong - GPE - Countries, cities, states
$6 - MONEY - Monetary values, including unit


In [2]:
# Noun chunks
# Doc.noun_chunks yields the noun phrases found in the sentence
# (for example "Autonomous cars").
doc9 = nlp("Autonomous cars shift insurance liability toward manufactures.")
for chunk in doc9.noun_chunks:
    print(chunk.text)

Autonomous cars
insurance liability
manufactures


In [3]:
doc10 = nlp("Red cars do not carry higher insurance rates.")
# Each chunk groups a noun with the words attached to it, e.g. "Red cars".
for chunk in doc10.noun_chunks:
    print(chunk.text)

Red cars
higher insurance rates


In [None]:
"""
Stemming is a somewhat crude method for cataloging related words; it essentially chops off letters
from the end until the stem is reached.
"""

In [8]:
# Import the toolkit and the Porter stemmer.
# PorterStemmer is the only name from nltk.stem.porter used in the cells shown
# here, so it is imported by name instead of via `import *`, which would pull
# every public name of that module into the notebook namespace.
import nltk
from nltk.stem.porter import PorterStemmer

In [9]:
# Create one Porter stemmer instance; later cells reuse it via p_stemmer.stem().
p_stemmer = PorterStemmer()

In [10]:
# Sample vocabulary: inflections of "run" plus two adverbs.
words = [
    "run", "runner", "running", "ran", "runs",
    "easily", "fairly",
]

In [11]:
# Show each word next to its Porter stem.
for word in words:
    print(f"{word} --> {p_stemmer.stem(word)}")

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fairli


In [None]:
"""
Note how the stemmer recognizes "runner" as a noun, not a verb form or participle. Also,
the adverbs "easily" and "fairly" are stemmed to the unusual roots "easili" and "fairli".

"""

In [13]:
from nltk.stem.snowball import SnowballStemmer

# The Snowball stemmer must be told which language to stem; English is used here.
s_stemmer = SnowballStemmer(language="english")

In [14]:
# Same sample words as used with the Porter stemmer.
words = [
    "run", "runner", "running", "ran", "runs",
    "easily", "fairly",
]
# Alternative sample to try:
# words = ['generous','generation','generously','generate']

In [15]:
# Show each word next to its Snowball stem.
for word in words:
    print(f"{word} --> {s_stemmer.stem(word)}")

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fair


In [None]:
# "runner" is treated as a word in its own right, so neither stemmer reduces it to "run".

In [18]:
phrase = "I am meeting him tomorrow at the meeting"
# Stem every whitespace-separated word of the phrase with the Porter stemmer.
for word in phrase.split():
    print(f"{word} --> {p_stemmer.stem(word)}")

I --> i
am --> am
meeting --> meet
him --> him
tomorrow --> tomorrow
at --> at
the --> the
meeting --> meet


In [None]:
# Here the word "meeting" appears twice - once as a verb and once as a noun - and yet the stemmer treats both equally.

In [None]:
"""
Lemmatization checks the context: each word is converted to its base (dictionary) form, which
depends on how the word is used in the sentence.
"""

In [None]:
"""

"""

In [19]:
# Perform standard imports.
# NOTE(review): spaCy and this same pipeline were already loaded in the first
# cell; presumably the load is repeated so this section can be run on its own.
import spacy 
nlp = spacy.load('en_core_web_sm')

In [21]:
doc1 = nlp("I am runner running in a race because I love to run since I ran today")
# One row per token: text, part-of-speech tag, lemma hash and lemma string,
# joined by " \t ".
for token in doc1:
    print(token.text, token.pos_, token.lemma, token.lemma_, sep=" \t ")

I 	 PRON 	 4690420944186131903 	 I
am 	 AUX 	 10382539506755952630 	 be
runner 	 NOUN 	 12640964157389618806 	 runner
running 	 VERB 	 12767647472892411841 	 run
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 4690420944186131903 	 I
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 4690420944186131903 	 I
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


In [None]:
"""
In the function below we use an f-string to format the printed text by setting minimum field
widths and adding a left-align to the lemma hash value.
"""

In [24]:
def show_lemmas(text):
    """Print one aligned row per token: text, POS tag, lemma hash and lemma.

    Args:
        text: An iterable of tokens exposing ``text``, ``pos_``, ``lemma`` and
            ``lemma_`` attributes (the cells here pass a parsed spaCy document).
    """
    for tok in text:
        # Minimum field widths: 12 for the text, 6 for the POS tag and 22
        # (explicitly left-aligned) for the lemma hash.
        word_col = f"{tok.text:12}"
        pos_col = f"{tok.pos_:6}"
        hash_col = f"{tok.lemma:<22}"
        # print() joins the columns with single spaces; the lemma string is
        # printed last, unpadded.
        print(word_col, pos_col, hash_col, tok.lemma_)

In [25]:
# Irregular forms: "saw" and "mice" are lemmatized to "see" and "mouse".
doc2 = nlp("I saw eighteen mice today!")
show_lemmas(doc2)

I            PRON   4690420944186131903    I
saw          VERB   11925638236994514241   see
eighteen     NUM    9609336664675087640    eighteen
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today
!            PUNCT  17494803046312582752   !


In [26]:
# "meeting" is lemmatized by part of speech: the verb becomes "meet",
# the noun stays "meeting".
doc3 = nlp("I am meeting him tomorrow at the meeting.")
show_lemmas(doc3)

I            PRON   4690420944186131903    I
am           AUX    10382539506755952630   be
meeting      VERB   6880656908171229526    meet
him          PRON   1655312771067108281    he
tomorrow     NOUN   3573583789758258062    tomorrow
at           ADP    11667289587015813222   at
the          DET    7425985699627899538    the
meeting      NOUN   14798207169164081740   meeting
.            PUNCT  12646065887601541794   .


In [27]:
# The contraction "'s" becomes its own token and is lemmatized to "be".
doc4 = nlp("That's an enormous automobile")
show_lemmas(doc4)

That         PRON   4380130941430378203    that
's           AUX    10382539506755952630   be
an           DET    15099054000809333061   an
enormous     ADJ    17917224542039855524   enormous
automobile   NOUN   7211811266693931283    automobile


In [None]:
"""
Stop words: words like "a" and "the" appear so frequently that they don't require tagging as
thoroughly as nouns, verbs and modifiers.
"""

In [None]:
# Stop words are mostly very common function words (articles, pronouns, prepositions, ...) rather than content words such as nouns.

In [29]:
# Perform standard imports.
# NOTE(review): repeats the import and pipeline load already done in earlier cells.
import spacy 
nlp = spacy.load('en_core_web_sm')

In [30]:
# Print the set of all default spaCy stop words for the English pipeline.
print(nlp.Defaults.stop_words)
# The default set contains 326 stop words (see the len() call in the next cell).

{'still', 'after', 'twelve', 'whenever', 'together', 'anyone', 'due', 'next', 'us', 'empty', 'many', 'so', 'in', 'sometime', 'much', 'serious', 'name', 'thus', 'hence', 'ten', 'made', 'am', '’s', 'its', 'hereupon', 'indeed', 'front', 'it', 'everywhere', 'such', 'noone', 'you', 'because', 'down', 'yourself', 'becomes', 'really', 'just', 'thereupon', 'whither', 'anywhere', 'yourselves', 'same', 'several', "'s", 'or', 'through', 'there', 'former', 'thru', 'my', 'your', 'back', 'without', 'ca', "'ve", 'was', "n't", 'whose', 'then', 'becoming', 'take', 'cannot', 'her', 'somewhere', 'him', 'is', 'already', 'here', 'during', 'our', 'mine', 'latterly', 'more', 'neither', 'almost', 'make', 'enough', 'whereby', 'among', 'when', 'four', 'out', 'why', 'every', 'few', 'using', 'perhaps', 'afterwards', 'may', 'yours', 'part', 'beyond', 'anyway', "'m", 'not', 're', 'which', 'across', 'most', '’d', 'another', 'rather', 'bottom', 'amount', 'did', 'twenty', 'along', 'side', 'meanwhile', 'give', 'them', 

In [31]:
# Count the default stop words.
len(nlp.Defaults.stop_words)

326

In [None]:
# To check whether a word is a stop word:

In [32]:
# Look the word up in the vocabulary and read its stop-word flag.
nlp.vocab['myself'].is_stop

True

In [33]:
# A word that is not flagged as a stop word reports False.
nlp.vocab['mystery'].is_stop

False

In [None]:
# To add a stop word:

In [34]:
# Add the word to the set of stop words. Use lowercase!
nlp.Defaults.stop_words.add('btw')
# Also set the is_stop flag on the word's lexeme in the vocabulary.
nlp.vocab['btw'].is_stop = True

In [35]:
# The set has grown by one entry after adding 'btw'.
len(nlp.Defaults.stop_words)

327

In [36]:
# Confirm that 'btw' is now flagged as a stop word.
nlp.vocab['btw'].is_stop

True

In [37]:
# To remove a stop word:

In [39]:
# Suppose 'beyond' should no longer be treated as a stop word.
# discard() is used instead of remove() so that re-running this cell does not
# raise KeyError once 'beyond' has already been taken out of the set.
nlp.Defaults.stop_words.discard('beyond')
# Remove the stop-word flag from the lexeme as well.
nlp.vocab['beyond'].is_stop = False