# Pre-Processing

In [14]:
import nltk

### Tokenization – learning to use the inbuilt tokenizers of NLTK

In [1]:
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize


In [4]:
lTokenizer = LineTokenizer()
print("Line tokenizer output :",lTokenizer.tokenize("My name is Maximus Decimus Meridius, commander of the Armies of the North, General of the Felix Legions and loyal servant to the true emperor, Marcus Aurelius. \nFather to a murdered son, husband to a murdered wife. \nAnd I will have my vengeance, in this life or the next."))

Line tokenizer output : ['My name is Maximus Decimus Meridius, commander of the Armies of the North, General of the Felix Legions and loyal servant to the true emperor, Marcus Aurelius. ', 'Father to a murdered son, husband to a murdered wife. ', 'And I will have my vengeance, in this life or the next.']


In [5]:
rawText = "By 11 o'clock on Sunday, the doctor shall open the dispensary."
sTokenizer = SpaceTokenizer()
print("Space Tokenizer output :",sTokenizer.tokenize(rawText))

Space Tokenizer output : ['By', '11', "o'clock", 'on', 'Sunday,', 'the', 'doctor', 'shall', 'open', 'the', 'dispensary.']


In [12]:
print("Word Tokenizer output :", word_tokenize(rawText))

Word Tokenizer output : ['By', '11', "o'clock", 'on', 'Sunday', ',', 'the', 'doctor', 'shall', 'open', 'the', 'dispensary', '.']


In [7]:
tTokenizer = TweetTokenizer()
print("Tweet Tokenizer output :",tTokenizer.tokenize("This is a cooool #dummysmiley: :-) :-P <3"))

Tweet Tokenizer output : ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3']


### Stemming – learning to use the inbuilt stemmers of NLTK

In [14]:
from nltk import PorterStemmer, LancasterStemmer, word_tokenize

In [16]:
raw = "My name is Maximus Decimus Meridius, commander of the Armies of the North, General of the Felix Legions and loyal servant to the true emperor, Marcus Aurelius. Father to a murdered son, husband to a murdered wife. And I will have my vengeance, in this life or the next."
tokens = word_tokenize(raw)

In [18]:
porter = PorterStemmer()
pStems = [porter.stem(t) for t in tokens]
print(pStems)

['my', 'name', 'is', 'maximu', 'decimu', 'meridiu', ',', 'command', 'of', 'the', 'armi', 'of', 'the', 'north', ',', 'gener', 'of', 'the', 'felix', 'legion', 'and', 'loyal', 'servant', 'to', 'the', 'true', 'emperor', ',', 'marcu', 'aureliu', '.', 'father', 'to', 'a', 'murder', 'son', ',', 'husband', 'to', 'a', 'murder', 'wife', '.', 'and', 'i', 'will', 'have', 'my', 'vengeanc', ',', 'in', 'thi', 'life', 'or', 'the', 'next', '.']


In [19]:
lancaster = LancasterStemmer()
lStems = [lancaster.stem(t) for t in tokens]
print(lStems)

['my', 'nam', 'is', 'maxim', 'decim', 'meridi', ',', 'command', 'of', 'the', 'army', 'of', 'the', 'nor', ',', 'gen', 'of', 'the', 'felix', 'leg', 'and', 'loy', 'serv', 'to', 'the', 'tru', 'emp', ',', 'marc', 'aureli', '.', 'fath', 'to', 'a', 'murd', 'son', ',', 'husband', 'to', 'a', 'murd', 'wif', '.', 'and', 'i', 'wil', 'hav', 'my', 'veng', ',', 'in', 'thi', 'lif', 'or', 'the', 'next', '.']


### Lemmatization – learning to use the WordnetLemmatizer of NLTK

In [4]:
from nltk import WordNetLemmatizer,word_tokenize, PorterStemmer

In [5]:
raw = "My name is Maximus Decimus Meridius, commander of the armies of the north, General of the Felix legions and loyal servant to the true emperor, Marcus Aurelius. Father to a murdered son, husband to a murdered wife. And I will have my vengeance, in this life or the next."
tokens = word_tokenize(raw)

In [6]:
porter = PorterStemmer()
stems = [porter.stem(t) for t in tokens]
print(stems)

['my', 'name', 'is', 'maximu', 'decimu', 'meridiu', ',', 'command', 'of', 'the', 'armi', 'of', 'the', 'north', ',', 'gener', 'of', 'the', 'felix', 'legion', 'and', 'loyal', 'servant', 'to', 'the', 'true', 'emperor', ',', 'marcu', 'aureliu', '.', 'father', 'to', 'a', 'murder', 'son', ',', 'husband', 'to', 'a', 'murder', 'wife', '.', 'and', 'i', 'will', 'have', 'my', 'vengeanc', ',', 'in', 'thi', 'life', 'or', 'the', 'next', '.']


In [8]:
lemmatizer = WordNetLemmatizer()
lemmas = [lemmatizer.lemmatize(t) for t in tokens]
print(lemmas)

['My', 'name', 'is', 'Maximus', 'Decimus', 'Meridius', ',', 'commander', 'of', 'the', 'army', 'of', 'the', 'north', ',', 'General', 'of', 'the', 'Felix', 'legion', 'and', 'loyal', 'servant', 'to', 'the', 'true', 'emperor', ',', 'Marcus', 'Aurelius', '.', 'Father', 'to', 'a', 'murdered', 'son', ',', 'husband', 'to', 'a', 'murdered', 'wife', '.', 'And', 'I', 'will', 'have', 'my', 'vengeance', ',', 'in', 'this', 'life', 'or', 'the', 'next', '.']


### Stopwords – learning to use the stopwords corpus and seeing the difference it can make

In [11]:
from nltk.corpus import gutenberg
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [12]:
gb_words = gutenberg.words('bible-kjv.txt')
words_filtered = [e for e in gb_words if len(e) >=3]

In [16]:
stopwords = nltk.corpus.stopwords.words('english')
words = [w for w in words_filtered if w.lower() not in stopwords]

In [17]:
fdistPlain = nltk.FreqDist(words)
fdist = nltk.FreqDist(gb_words)

In [20]:
print('Following are the most common 10 words in the bag')
print(fdist.most_common(10))
print('Following are the most common 10 words in the bag minus the stopwords')
print(fdistPlain.most_common(10))

Following are the most common 10 words in the bag
[(',', 70509), ('the', 62103), (':', 43766), ('and', 38847), ('of', 34480), ('.', 26160), ('to', 13396), ('And', 12846), ('that', 12576), ('in', 12331)]
Following are the most common 10 words in the bag minus the stopwords
[('shall', 9760), ('unto', 8940), ('LORD', 6651), ('thou', 4890), ('thy', 4450), ('God', 4115), ('said', 3995), ('thee', 3827), ('upon', 2730), ('man', 2721)]


### Edit distance – writing your own algorithm to find edit distance between two strings

In [21]:
from nltk.metrics.distance import edit_distance

In [32]:
def my_edit_distance(str1, str2):
    m = len(str1) +1
    n = len(str2) +1
    table = {}
    for i in range(m): table[i,0] = i
    for j in range(n): table[0,j] = j
    for i in range(1, m):
        for j in range(1, n):
            cost = 0 if str1[i-1] == str2[j-1] else 1
            table[i,j] = min(table[i, j-1]+1, table[i-1, j]+1, table[i-1,j-1]+cost)
    return table[i,j]

In [33]:
print("Our Algorithm :",my_edit_distance("hand", "and"))
print("NLTK Algorithm :",edit_distance("hand", "and"))

Our Algorithm : 1
NLTK Algorithm : 1


### Processing two short stories and extracting the common vocabulary between two of them

In [None]:
story1 = """In a far away kingdom, there was a river. This river
was home to many golden swans. The swans spent most of their time
on the banks of the river. Every six months, the swans would leave
a golden feather as a fee for using the lake. The soldiers of the
kingdom would collect the feathers and deposit them in the royal
treasury.
One day, a homeless bird saw the river. "The water in this river
seems so cool and soothing. I will make my home here," thought the
bird.
As soon as the bird settled down near the river, the golden swans
noticed her. They came shouting. "This river belongs to us. We pay
a golden feather to the King to use this river. You can not live
here."
"I am homeless, brothers. I too will pay the rent. Please give me
shelter," the bird pleaded. "How will you pay the rent? You do not
have golden feathers," said the swans laughing. They further added,
"Stop dreaming and leave once." The humble bird pleaded many times.
But the arrogant swans drove the bird away.
"I will teach them a lesson!" decided the humiliated bird.
She went to the King and said, "O King! The swans in your river are
impolite and unkind. I begged for shelter but they said that they
had purchased the river with golden feathers."
The King was angry with the arrogant swans for having insulted the
homeless bird. He ordered his soldiers to bring the arrogant swans
to his court. In no time, all the golden swans were brought to the
King’s court.
"Do you think the royal treasury depends upon your golden feathers?
You can not decide who lives by the river. Leave the river at once
or you all will be beheaded!" shouted the King.
The swans shivered with fear on hearing the King. They flew away
never to return. The bird built her home near the river and lived
there happily forever. The bird gave shelter to all other birds in
the river. """

In [35]:
story2 = """Long time ago, there lived a King. He was lazy and
liked all the comforts of life. He never carried out his duties as
a King. “Our King does not take care of our needs. He also ignores
the affairs of his kingdom." The people complained.
One day, the King went into the forest to hunt. After having
wandered for quite sometime, he became thirsty. To his relief, he
spotted a lake. As he was drinking water, he suddenly saw a golden
swan come out of the lake and perch on a stone. “Oh! A golden swan.
I must capture it," thought the King.
But as soon as he held his bow up, the swan disappeared. And the
King heard a voice, “I am the Golden Swan. If you want to capture
me, you must come to heaven."
Surprised, the King said, “Please show me the way to heaven." “Do
good deeds, serve your people and the messenger from heaven would
come to fetch you to heaven," replied the voice.
The selfish King, eager to capture the Swan, tried doing some good
deeds in his Kingdom. “Now, I suppose a messenger will come to take
me to heaven," he thought. But, no messenger came.
The King then disguised himself and went out into the street. There
he tried helping an old man. But the old man became angry and said,
“You need not try to help. I am in this miserable state because of
out selfish King. He has done nothing for his people."
Suddenly, the King heard the golden swan’s voice, “Do good deeds and
you will come to heaven." It dawned on the King that by doing
selfish acts, he will not go to heaven.
He realized that his people needed him and carrying out his duties
was the only way to heaven. After that day he became a responsible
King.
"""

In [36]:
story1 = story1.replace(",", "").replace("\n", "").replace('.','').replace('"', '').replace("!","").replace("?","").casefold()
story2 = story2.replace(",", "").replace("\n", "").replace('.','').replace('"', '').replace("!","").replace("?","").casefold()

In [38]:
story1_words = story1.split(" ")
print("First Story words :",story1_words)
print('\n\n')
story2_words = story2.split(" ")
print("Second Story words :",story2_words)

First Story words : ['in', 'a', 'far', 'away', 'kingdom', 'there', 'was', 'a', 'river', 'this', 'river', 'was', 'home', 'to', 'many', 'golden', 'swans', 'the', 'swans', 'spent', 'most', 'of', 'their', 'time', 'on', 'the', 'banks', 'of', 'the', 'river', 'every', 'six', 'months', 'the', 'swans', 'would', 'leave', 'a', 'golden', 'feather', 'as', 'a', 'fee', 'for', 'using', 'the', 'lake', 'the', 'soldiers', 'of', 'the', 'kingdom', 'would', 'collect', 'the', 'feathers', 'and', 'deposit', 'them', 'in', 'the', 'royal', 'treasury', 'one', 'day', 'a', 'homeless', 'bird', 'saw', 'the', 'river', 'the', 'water', 'in', 'this', 'river', 'seems', 'so', 'cool', 'and', 'soothing', 'i', 'will', 'make', 'my', 'home', 'here', 'thought', 'the', 'bird', 'as', 'soon', 'as', 'the', 'bird', 'settled', 'down', 'near', 'the', 'river', 'the', 'golden', 'swans', 'noticed', 'her', 'they', 'came', 'shouting', 'this', 'river', 'belongs', 'to', 'us', 'we', 'pay', 'a', 'golden', 'feather', 'to', 'the', 'king', 'to', 'u

In [39]:
story1_vocab = set(story1_words)
print("First Story vocabulary :",story1_vocab)
story2_vocab = set(story2_words)
print("Second Story vocabulary",story2_vocab)

First Story vocabulary : {'', 'them', 'timesbut', 'brothers', 'no', 'flew', 'would', 'far', 'at', 'thought', 'said', 'how', 'but', 'happily', 'awayi', 'was', 'built', 'unkind', 'noticed', 'think', 'feather', 'on', 'homeless', 'away', 'have', 'kingthe', 'king', 'live', 'as', 'saw', 'having', 'soon', 'bird', 'laughing', 'please', 'once', 'leave', 'all', 'courtdo', 'further', 'one', 'pay', 'birdshe', 'spent', 'court', 'their', 'feathers', 'shouting', 'use', 'angry', 'settled', 'rent', 'ordered', 'feathersyou', 'months', 'using', 'me', 'drove', 'inthe', 'fee', 'thehomeless', 'her', 'who', 'most', 'too', 'added', 'brought', 'theyhad', 'shivered', 'soldiers', 'lesson', 'theking’s', 'near', 'arrogant', 'banks', 'for', 'cool', 'belongs', 'swansto', 'deposit', 'treasury', 'humiliated', 'birds', 'i', 'they', 'seems', 'that', 'can', 'insulted', 'be', 'other', 'us', 'golden', 'beheaded', 'every', 'awaynever', 'a', 'kingdom', 'shelter', 'bring', 'day', 'areimpolite', 'hearing', 'shouted', 'do', 'so

In [40]:
common_vocab = story1_vocab & story2_vocab
print("Common Vocabulary :",common_vocab)

Common Vocabulary : {'no', 'am', 'all', 'thought', 'said', 'but', 'i', 'to', 'lake', 'water', 'this', 'was', 'that', 'angry', 'time', 'will', 'me', 'of', 'golden', 'he', 'on', 'a', 'his', 'your', 'kingdom', 'day', 'king', 'as', 'saw', 'there', 'the', 'by', 'went', 'and', 'you', 'soon', 'in', 'not', 'for'}
