In [8]:
import nltk
from nltk.tokenize import word_tokenize


import pandas as pd

import os

### Taking a Sample text as string

In [10]:
text_string = """Fortunato and Montresor are descending into the damp vaults under the river bed. Montresor strategically planned all these; and following his friend investigative Fortunato is in the excitement of the amontillado. They reach the final end of the vault. Fortunato is heavily intoxicated by Montresor as a part of his plan. There in the centre of the small cabin was sitting a wooden cask. It was framed by gold lining and decorated with priceless jewels; and there were skulls and human-sized bones scattered here and there inside that cabin. Fortunato not in the presence of mind to fear or appreciate the terrific elegance of the scene they have arrived at; he asked “ Give me amontillado “. Montresor replied in a wittily crooked manner “ Let me repeat the consequences or shall I say the ritual of drinking this elixir of other worldly experience.” He went on, “ This cask in front of you is no ordinary cask of wine, it captures the mysteries of the universe; those who can tolerate such energy shall live a million years and those who can’t shall sacrifice their life to the cask of amontillado.”
“ Hence I ask you again. And it is also my duty as a servant to the Lord Odallitnoma! Do you must drink the amontillado, Fortunato? “ asked Montresor. “ Yes! Amontillado! “ said Fortunato in an eerie voice; his eyes brightly locked towards the cask.
Montresor poured some of the blood red liquid in a purple glass which was carefully placed in a pocket on the side of the cask. Fortunato looked mesmerizingly at the wine and carefully took the glass from Montresor’s hands. He drank the wine and immediately he closed his eyes; his hands stuck in the same position glass in front of his mouth. He opened his eyes after a few seconds. Surprisingly his intoxication had seemed to have gone away. He spoke in his normal voice, “ Montresor! “.
“How do you feel?” curiously asked Montresor.
“I don’t feel my body. Also why are you asking me? Why don't you taste for yourself? What is the drink ? What is this place? “
“It is part of the amontillado ritual. For this drink to stay in existence every year on the day of the blood moon one soul must be sacrificed, or else  the drink will evaporate inside the empty cask. Your soul has left your body already. Now, if you are strong enough you will suffer immortality and if you are weak then your body will spill like wine and will be soaked into the gravel underneath.” Montresor looked at Fortunato,” And why can't I drink the wine?.. I already did. Five Hundred years ago. Now I am cursed with my body as the servant; to keep the cycle of sacrifice continued. If I drink it again I will be transformed into liquid and infused into the cask.”
Suddenly there is a bright red light from inside the cask. Fortunato’s hands and limbs were falling from his body. He shouted in pain. As soon as his limbs fell to the ground it transformed into red liquid. Pieces by pieces all his body parts and finally his head converted into thick red liquid. To the surprise of Montresor, Fortunato’s liquid body was not soaked into the ground. Instead it started moving upwards, towards the golden cask. The cask was still glowing red. The red liquid on the ground was slowly absorbed by the cask and after completion the red glow went dim. Montresor remembered; few centuries ago one stranger gave him a bottle of wine and forbade him to open that, but out of curiosity he opened that and liked the taste of the drink.
"""

### Tokenizing text with word-tokenizer

In [25]:
tokens = word_tokenize(text_string)
print(tokens)
print(len(tokens))

['Fortunato', 'and', 'Montresor', 'are', 'descending', 'into', 'the', 'damp', 'vaults', 'under', 'the', 'river', 'bed', '.', 'Montresor', 'strategically', 'planned', 'all', 'these', ';', 'and', 'following', 'his', 'friend', 'investigative', 'Fortunato', 'is', 'in', 'the', 'excitement', 'of', 'the', 'amontillado', '.', 'They', 'reach', 'the', 'final', 'end', 'of', 'the', 'vault', '.', 'Fortunato', 'is', 'heavily', 'intoxicated', 'by', 'Montresor', 'as', 'a', 'part', 'of', 'his', 'plan', '.', 'There', 'in', 'the', 'centre', 'of', 'the', 'small', 'cabin', 'was', 'sitting', 'a', 'wooden', 'cask', '.', 'It', 'was', 'framed', 'by', 'gold', 'lining', 'and', 'decorated', 'with', 'priceless', 'jewels', ';', 'and', 'there', 'were', 'skulls', 'and', 'human-sized', 'bones', 'scattered', 'here', 'and', 'there', 'inside', 'that', 'cabin', '.', 'Fortunato', 'not', 'in', 'the', 'presence', 'of', 'mind', 'to', 'fear', 'or', 'appreciate', 'the', 'terrific', 'elegance', 'of', 'the', 'scene', 'they', 'hav

### Converting to lowercase

In [27]:
tokens_lower = [i.lower() for i in tokens]

###### Checking frequency

In [34]:
from nltk.probability import FreqDist
dic = FreqDist()

for i in tokens_lower:
    dic[i] += 1

print(len(dic))
dic

284


FreqDist({'the': 49, '.': 34, 'of': 22, 'and': 19, '“': 14, 'his': 13, 'in': 13, 'montresor': 12, 'cask': 11, 'fortunato': 10, ...})

In [33]:
top10 = dic.most_common(10)
top10

[('the', 196),
 ('.', 136),
 ('of', 88),
 ('and', 76),
 ('“', 56),
 ('his', 52),
 ('in', 52),
 ('montresor', 48),
 ('cask', 44),
 ('fortunato', 40)]

### Removing Stop-Words

In [45]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
print(stop_words[:10])
stop_words = stop_words[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [49]:
tokens_stopwords_removed = []
for i in tokens_lower:
    if i in stop_words:
        continue
    else:
        tokens_stopwords_removed.append(i)

len(tokens_stopwords_removed)

682

###### Punctuation fixing

In [93]:
import re
punc = re.compile(r'[-,!?;:()''|0-9]')
tkn_punc_removed = []
for i in tokens_lower:
    ii = punc.sub("",i)
    if len(ii)>0:
        tkn_punc_removed.append(ii)

len(tkn_punc_removed)

676

###### Creating N-Grams

In [55]:
ngram_tkn = list(nltk.ngrams(tokens_lower, 4))
ngram_tkn[:6]

[('fortunato', 'and', 'montresor', 'are'),
 ('and', 'montresor', 'are', 'descending'),
 ('montresor', 'are', 'descending', 'into'),
 ('are', 'descending', 'into', 'the'),
 ('descending', 'into', 'the', 'damp'),
 ('into', 'the', 'damp', 'vaults')]

### Stemming

###### Porter stemmer

In [70]:
from nltk.stem import PorterStemmer
p_stemmer = PorterStemmer()

In [85]:
for i in tokens_stopwords_removed[:60]:
    stm = p_stemmer.stem(i)
    if len(i)!=len(stm):
        print(i,":",stm)

descending : descend
vaults : vault
strategically : strateg
planned : plan
following : follow
his : hi
investigative : investig
excitement : excit
intoxicated : intox
his : hi
centre : centr


###### Lancaster Stemmer

In [83]:
from nltk.stem import LancasterStemmer
l_stemmer = LancasterStemmer()

In [86]:
for i in tokens_stopwords_removed[:60]:
    stm = l_stemmer.stem(i)
    if len(i)!=len(stm):
        print(i,":",stm)

montresor : montres
are : ar
descending : descend
vaults : vault
under : und
river : riv
montresor : montres
planned : plan
all : al
these : thes
following : follow
investigative : investig
excitement : excit
final : fin
heavily : heavy
intoxicated : intox
montresor : montres
there : ther
centre : cent


###### SnowBall stemmer

In [81]:
from nltk.stem import SnowballStemmer
s_stemmer = SnowballStemmer('english')

In [87]:
for i in tokens_stopwords_removed[:60]:
    stm = s_stemmer.stem(i)
    if len(i)!=len(stm):
        print(i,":",stm)

descending : descend
vaults : vault
strategically : strateg
planned : plan
following : follow
investigative : investig
excitement : excit
intoxicated : intox
centre : centr


### Lemmatization
(without taking part of speech context)

In [88]:
from nltk.stem import wordnet
from nltk.stem import WordNetLemmatizer

lemzer = WordNetLemmatizer()

In [100]:
lemzer.lemmatize('drives')

'drive'

In [101]:
for i in set(tkn_punc_removed):
    lm = lemzer.lemmatize(i)
    if i!=lm:
        print(i,":",lm)

captures : capture
vaults : vault
limbs : limb
seconds : second
was : wa
pieces : piece
has : ha
consequences : consequence
skulls : skull
bones : bone
mysteries : mystery
eyes : eye
parts : part
centuries : century
years : year
hands : hand
as : a
jewels : jewel
