## Tokenization

In [1]:
!pip install nltk



In [6]:
import nltk
# Fetch the 'punkt' tokenizer data into the local nltk_data directory.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead — confirm against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/srinidhisunkara/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# Small two-line sample text used by all the tokenizer demos.
# Built from adjacent literals so the leading/trailing spaces and the newline are explicit.
corpus = (
    " Hello Welcome, to Srinidhi's NLP.\n"
    "Please do read all the content I post! to become an expert. "
)

In [8]:
# Tokenization
# Paragraph --> sentences: split the corpus into a list of sentence strings

from nltk.tokenize import sent_tokenize
documents = sent_tokenize(corpus)

In [9]:
# Print each sentence found by the sentence tokenizer on its own line
for sentence in documents:
    print(sentence)

 Hello Welcome, to Srinidhi's NLP.
Please do read all the content I post!
to become an expert.


In [10]:
# Paragraph --> words
# Sentence --> words (same function)

from nltk.tokenize import word_tokenize

word_tokenize(corpus)

# Everything is split into separate tokens except 's, which stays together as a single token.

['Hello',
 'Welcome',
 ',',
 'to',
 'Srinidhi',
 "'s",
 'NLP',
 '.',
 'Please',
 'do',
 'read',
 'all',
 'the',
 'content',
 'I',
 'post',
 '!',
 'to',
 'become',
 'an',
 'expert',
 '.']

In [12]:
# Show every sentence next to its own word-level tokens
for sentence in documents:
    tokens = word_tokenize(sentence)
    print(f"{sentence}  {tokens}")

 Hello Welcome, to Srinidhi's NLP.  ['Hello', 'Welcome', ',', 'to', 'Srinidhi', "'s", 'NLP', '.']
Please do read all the content I post!  ['Please', 'do', 'read', 'all', 'the', 'content', 'I', 'post', '!']
to become an expert.  ['to', 'become', 'an', 'expert', '.']


In [18]:
from nltk.tokenize import wordpunct_tokenize
wordpunct_tokenize(corpus)

# Here even 's is separated: the apostrophe and the s become two tokens

['Hello',
 'Welcome',
 ',',
 'to',
 'Srinidhi',
 "'",
 's',
 'NLP',
 '.',
 'Please',
 'do',
 'read',
 'all',
 'the',
 'content',
 'I',
 'post',
 '!',
 'to',
 'become',
 'an',
 'expert',
 '.']

In [19]:
from nltk.tokenize import TreebankWordTokenizer
tokenizer=TreebankWordTokenizer()
tokenizer.tokenize(corpus)

# 's stays as one token, but a '.' stays attached to the word before it (e.g. 'NLP.'), except for the last word of the last sentence

['Hello',
 'Welcome',
 ',',
 'to',
 'Srinidhi',
 "'s",
 'NLP.',
 'Please',
 'do',
 'read',
 'all',
 'the',
 'content',
 'I',
 'post',
 '!',
 'to',
 'become',
 'an',
 'expert',
 '.']

## Stemming - Text Preprocessing
-- Stemming is the process of reducing a word to its word stem, i.e. the base form that affixes (suffixes and prefixes) attach to, or to the root of the word, known as the lemma. Stemming is important in natural language understanding (NLU) and natural language processing (NLP).

In [35]:
# Sample review words, e.g. eating / eats / eaten should all reduce to the stem "eat"
words = [
    "eating",
    "eats",
    "eaten",
    "writing",
    "writes",
    "programming",
    "programs",
    "history",
    "finally",
    "finalized",
]

#### PorterStemmer

In [36]:
from nltk.stem import PorterStemmer
# Porter stemmer instance used by the stemming examples
stemming=PorterStemmer()

In [37]:
# Stem every sample word with the Porter stemmer.
# Major disadvantage of stemming: for some words the form changes, so the stem
# may no longer carry the exact meaning, e.g. history---->histori
for word in words:
    stem = stemming.stem(word)
    print(f"{word}---->{stem}")

eating---->eat
eats---->eat
eaten---->eaten
writing---->write
writes---->write
programming---->program
programs---->program
history---->histori
finally---->final
finalized---->final


In [38]:
# Case where stemming does not work well: the resulting stem is not a valid word
stemming.stem("congratulations")

'congratul'

In [39]:
# Case where stemming works: the resulting stem is a valid word
stemming.stem('sitting')

'sit'

### RegexpStemmer class
-- NLTK's `RegexpStemmer` class lets us easily implement regular-expression stemmer algorithms. It takes a single regular expression and removes any prefix or suffix that matches the expression.

In [40]:
from nltk.stem import RegexpStemmer
# Strip a trailing "ing", "s", "e" or "able"; every alternative is anchored to the end of the word with $.
# NOTE(review): min=4 presumably leaves words shorter than 4 characters unstemmed — confirm in the NLTK docs.
reg_stemmer = RegexpStemmer('ing$|s$|e$|able$', min=4)

In [41]:
# The trailing "ing" matches the ing$ alternative and is removed
reg_stemmer.stem('eating')

'eat'

In [42]:
# Only the trailing "ing" is stripped; the leading "ing" stays because every alternative is anchored with $
reg_stemmer.stem('ingeating')

'ingeat'

### Snowball Stemmer 
-- better than porter stemmer

In [43]:
from nltk.stem import SnowballStemmer
# Snowball stemmer configured for English
snowballstemmer = SnowballStemmer('english')

In [44]:
# Stem the same sample words with the Snowball stemmer for comparison
for word in words:
    stem = snowballstemmer.stem(word)
    print(f"{word}------>{stem}")

eating------>eat
eats------>eat
eaten------>eaten
writing------>write
writes------>write
programming------>program
programs------>program
history------>histori
finally------>final
finalized------>final


In [45]:
# Difference between the Snowball and Porter stemmers: Porter results for two adverbs
for adverb in ('fairly', 'sportingly'):
    print(stemming.stem(adverb))

fairli
sportingli


In [46]:
# Snowball results for the same two adverbs
for adverb in ('fairly', 'sportingly'):
    print(snowballstemmer.stem(adverb))

fair
sport


- Disadvantage of stemming: for some words the form changes, so the stem is not a valid word
- Lemmatization solves those problems

## Lemmatizer
- The output of lemmatization is called a lemma (root word) rather than a stem
- We get a valid word, with no change in form
- It takes more time to run than stemming
- Used in Q&A, chatbots, and text summarization

In [52]:
from nltk.stem import WordNetLemmatizer

# Download the corpora used together with the WordNet lemmatizer, in the same order as before
for resource in ('wordnet', 'omw-1.4'):
    nltk.download(resource)

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/srinidhisunkara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/srinidhisunkara/nltk_data...


In [56]:
'''
POS 
Noun - n - default
Verb - v
Adjective - a
Adverb - r
'''
# Lemmatize the same word under each part-of-speech tag to compare the results
for pos in ('n', 'v', 'a', 'r'):
    lemma = lemmatizer.lemmatize("going", pos=pos)
    print(f"going---->{pos}----->{lemma}")

going---->n----->going
going---->v----->go
going---->a----->going
going---->r----->going


In [59]:
# Lemmatize the sample words, treating each one as a verb
for word in words:
    lemma = lemmatizer.lemmatize(word, pos='v')
    print(f"{word}----->{lemma}")

eating----->eat
eats----->eat
eaten----->eat
writing----->write
writes----->write
programming----->program
programs----->program
history----->history
finally----->finally
finalized----->finalize


## Stopwords

In [66]:
# Multi-paragraph sample text used for the stop-word examples
paragraph = '''
I have three visions for India.

In 3000 years of our history, people from all over the world have come and invaded us, captured our lands, conquered our minds. From Alexander onwards, the Greeks, the Portuguese, the British, the French, the Dutch—all of them came and looted us, took over what was ours. Yet, we have not conquered anyone. We have not grabbed their land, culture, and history, nor tried to enforce our way of life on them.

Why? Because we respect the freedom of others.

That is why my first vision is that of freedom.
I believe that India got its first vision of freedom in 1857, when we started the war of independence. It is this freedom that we must protect and nurture if we are to build a self-reliant and developed India.

My second vision for India’s development.
For fifty years, we have been a developing nation. It is time we see ourselves as a developed nation. We are among the top five nations in the world in terms of GDP. We have ten percent growth rate in most areas. Our poverty levels are falling, our achievements are being recognized worldwide. Yet, we lack the confidence to see ourselves as a developed nation, self-reliant and self-assured.

Isn’t this incorrect?

I have a third vision.
India must stand up to the world. Because I believe that unless India stands up to the world, no one will respect us. Only strength respects strength. We must be strong not only as a military power but also as an economic power. Both must go hand-in-hand.

I want to ask you, what kind of a nation are we building?
There was a time when we were the most advanced and prosperous nation in the world. We had great minds, wealth, and culture. But over the years, we lost confidence, and today, even after so many advancements, we hesitate to call ourselves developed.

Why?

We are obsessed with foreign things. We want foreign televisions, foreign shirts, foreign technology. Why this obsession? It is self-respect that we are lacking. If we do not respect ourselves, how can we expect others to respect us?

I was in Hyderabad giving a lecture when a 14-year-old girl asked me for my autograph. I asked her what her goal in life was. She replied, "I want to live in a developed India."

For her, and for all young people, I have this dream:
A developed India. An India that is strong. An India that is self-reliant.

Let us all work together and make this dream a reality.

Jai Hind!
'''
# Words like "i", "the", "she", "have" don't play any significant role in the meaning, so they are treated as stop words

In [63]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
# Download NLTK's stop-word lists into the local nltk_data directory
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/srinidhisunkara/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [64]:
# print the stopwords available in the corpus
# Display the English stop words that ship with NLTK's stopwords corpus
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [65]:
# Porter stemmer instance for the stop-word filtering + stemming pass
stemmer = PorterStemmer()

In [71]:
# Split the sample text into a list of sentences and display it
sentences = nltk.sent_tokenize(paragraph)
sentences

['\nI have three visions for India.',
 'In 3000 years of our history, people from all over the world have come and invaded us, captured our lands, conquered our minds.',
 'From Alexander onwards, the Greeks, the Portuguese, the British, the French, the Dutch—all of them came and looted us, took over what was ours.',
 'Yet, we have not conquered anyone.',
 'We have not grabbed their land, culture, and history, nor tried to enforce our way of life on them.',
 'Why?',
 'Because we respect the freedom of others.',
 'That is why my first vision is that of freedom.',
 'I believe that India got its first vision of freedom in 1857, when we started the war of independence.',
 'It is this freedom that we must protect and nurture if we are to build a self-reliant and developed India.',
 'My second vision for India’s development.',
 'For fifty years, we have been a developing nation.',
 'It is time we see ourselves as a developed nation.',
 'We are among the top five nations in the world in terms 

In [69]:
## Remove the stop words from every sentence, then stem what is left

# Build the stop-word set once up front; previously it was rebuilt for every
# single token inside the comprehension, which is loop-invariant work.
english_stopwords = set(stopwords.words('english'))

# NOTE(review): the membership test is case-sensitive, so capitalised stop words
# such as "I" or "We" are kept (visible in the displayed result) — lowercase the
# token before comparing if that is not intended.
# NOTE: `sentences` is overwritten in place, so re-running this cell stems text that
# was already stemmed; re-create `sentences` with nltk.sent_tokenize(paragraph) first.
for i, raw_sentence in enumerate(sentences):
    words = nltk.word_tokenize(raw_sentence)
    words = [stemmer.stem(word) for word in words if word not in english_stopwords]  # keep only non-stop words and stem them
    sentences[i] = ' '.join(words)  # join the remaining words back into one sentence string

In [70]:
# Inspect the sentences after stop-word removal and stemming
sentences

['i three vision india .',
 'in 3000 year histori , peopl world come invad us , captur land , conquer mind .',
 'from alexand onward , greek , portugues , british , french , dutch—al came loot us , took .',
 'yet , conquer anyon .',
 'we grab land , cultur , histori , tri enforc way life .',
 'whi ?',
 'becaus respect freedom other .',
 'that first vision freedom .',
 'i believ india got first vision freedom 1857 , start war independ .',
 'it freedom must protect nurtur build self-reli develop india .',
 'my second vision india ’ develop .',
 'for fifti year , develop nation .',
 'it time see develop nation .',
 'we among top five nation world term gdp .',
 'we ten percent growth rate area .',
 'our poverti level fall , achiev recogn worldwid .',
 'yet , lack confid see develop nation , self-reli self-assur .',
 'isn ’ incorrect ?',
 'i third vision .',
 'india must stand world .',
 'becaus i believ unless india stand world , one respect us .',
 'onli strength respect strength .',
 'we

In [74]:
## Applying the Snowball stemmer after stop-word removal
sentences = nltk.sent_tokenize(paragraph)  # start again from the raw text
stemmer = SnowballStemmer('english')  # note: rebinds the name `stemmer` used by the Porter cell

# Build the stop-word set once up front; previously it was rebuilt for every
# single token inside the comprehension, which is loop-invariant work.
english_stopwords = set(stopwords.words('english'))

# NOTE(review): the membership test is case-sensitive, so capitalised stop words
# such as "I" or "We" are kept — lowercase the token before comparing if that is not intended.
for i, raw_sentence in enumerate(sentences):
    words = nltk.word_tokenize(raw_sentence)
    words = [stemmer.stem(word) for word in words if word not in english_stopwords]  # keep only non-stop words and stem them
    sentences[i] = ' '.join(words)  # join the remaining words back into one sentence string
sentences

['i three vision india .',
 'in 3000 year histori , peopl world come invad us , captur land , conquer mind .',
 'from alexand onward , greek , portugues , british , french , dutch—al came loot us , took .',
 'yet , conquer anyon .',
 'we grab land , cultur , histori , tri enforc way life .',
 'whi ?',
 'becaus respect freedom other .',
 'that first vision freedom .',
 'i believ india got first vision freedom 1857 , start war independ .',
 'it freedom must protect nurtur build self-reli develop india .',
 'my second vision india ’ develop .',
 'for fifti year , develop nation .',
 'it time see develop nation .',
 'we among top five nation world term gdp .',
 'we ten percent growth rate area .',
 'our poverti level fall , achiev recogn worldwid .',
 'yet , lack confid see develop nation , self-reli self-assur .',
 'isn ’ incorrect ?',
 'i third vision .',
 'india must stand world .',
 'becaus i believ unless india stand world , one respect us .',
 'onli strength respect strength .',
 'we

In [82]:
# Applying lemmatization (instead of stemming) after stop-word removal
sentences = nltk.sent_tokenize(paragraph)  # start again from the raw text
lemmatizer = WordNetLemmatizer()

# Build the stop-word set once up front; previously it was rebuilt for every
# single token inside the comprehension, which is loop-invariant work.
english_stopwords = set(stopwords.words('english'))

# NOTE(review): the membership test is case-sensitive, so capitalised stop words
# such as "I" or "We" are kept (visible in the displayed result) — lowercase the
# token before comparing if that is not intended.
for i, raw_sentence in enumerate(sentences):
    words = nltk.word_tokenize(raw_sentence)
    words = [lemmatizer.lemmatize(word, pos='v') for word in words if word not in english_stopwords]  # keep only non-stop words and lemmatize them as verbs
    sentences[i] = ' '.join(words)  # join the remaining words back into one sentence string
sentences

['I three visions India .',
 'In 3000 years history , people world come invade us , capture land , conquer mind .',
 'From Alexander onwards , Greeks , Portuguese , British , French , Dutch—all come loot us , take .',
 'Yet , conquer anyone .',
 'We grab land , culture , history , try enforce way life .',
 'Why ?',
 'Because respect freedom others .',
 'That first vision freedom .',
 'I believe India get first vision freedom 1857 , start war independence .',
 'It freedom must protect nurture build self-reliant develop India .',
 'My second vision India ’ development .',
 'For fifty years , develop nation .',
 'It time see develop nation .',
 'We among top five nations world term GDP .',
 'We ten percent growth rate areas .',
 'Our poverty level fall , achievements recognize worldwide .',
 'Yet , lack confidence see develop nation , self-reliant self-assured .',
 'Isn ’ incorrect ?',
 'I third vision .',
 'India must stand world .',
 'Because I believe unless India stand world , one res