# Stemming

In [34]:
# Sample sentence used by the tokenization, stemming and lemmatization cells below.
text = "It originated from the idea that there are readers who prefer learning new skills from the comforts of their drawing rooms"
text  # a bare name on the last line of a cell displays its value

'It originated from the idea that there are readers who prefer learning new skills from the comforts of their drawing rooms'

In [35]:
import nltk
# Split the sample sentence into a list of word tokens; later cells reuse `tokens`.
tokens = nltk.word_tokenize(text)
tokens  # display the token list

['It',
 'originated',
 'from',
 'the',
 'idea',
 'that',
 'there',
 'are',
 'readers',
 'who',
 'prefer',
 'learning',
 'new',
 'skills',
 'from',
 'the',
 'comforts',
 'of',
 'their',
 'drawing',
 'rooms']

In [36]:
from nltk.stem.porter import PorterStemmer 
# Run every token through the Porter stemmer and collect the stems in order.
porter_stemmer = PorterStemmer()
stems = [porter_stemmer.stem(token) for token in tokens]
stems

['It',
 'origin',
 'from',
 'the',
 'idea',
 'that',
 'there',
 'are',
 'reader',
 'who',
 'prefer',
 'learn',
 'new',
 'skill',
 'from',
 'the',
 'comfort',
 'of',
 'their',
 'draw',
 'room']

# Lemmatization

- First, we tag each token with its part of speech (POS), because the lemmatizer needs the POS of a word to lemmatize it correctly.
- Then we use a helper function to convert those tags into the POS codes that the lemmatizer accepts.
- Finally, we call the NLTK lemmatizer on each token together with its converted POS code.

In [37]:
# Tokenize the sentence again, then tag every token with its part of speech.
tokens = nltk.word_tokenize(text)
# `pos` is a list of (token, tag) pairs, e.g. ('originated', 'VBD').
pos = nltk.pos_tag(tokens)
pos  # display the tagged tokens

[('It', 'PRP'),
 ('originated', 'VBD'),
 ('from', 'IN'),
 ('the', 'DT'),
 ('idea', 'NN'),
 ('that', 'IN'),
 ('there', 'EX'),
 ('are', 'VBP'),
 ('readers', 'NNS'),
 ('who', 'WP'),
 ('prefer', 'VBP'),
 ('learning', 'VBG'),
 ('new', 'JJ'),
 ('skills', 'NNS'),
 ('from', 'IN'),
 ('the', 'DT'),
 ('comforts', 'NNS'),
 ('of', 'IN'),
 ('their', 'PRP$'),
 ('drawing', 'NN'),
 ('rooms', 'NNS')]

In [38]:
def findpos(x):
    """Map a POS tag string to the single-letter POS code used by the lemmatizer.

    Tags starting with "V" map to "v" (verb), "J" to "a" (adjective) and
    "R" to "r" (adverb). Every other tag, including those starting with
    "N" and the empty string, maps to "n" (noun).
    """
    prefix_to_code = (("V", "v"), ("J", "a"), ("R", "r"))
    for prefix, code in prefix_to_code:
        if x.startswith(prefix):
            return code
    # Noun is the fallback for everything that is not a verb, adjective or adverb.
    return "n"

# Convert the tag of every (token, tag) pair into the lemmatizer's POS code,
# keeping the same order as `pos`.
x = [findpos(tagged[1]) for tagged in pos]
print(x)

['n', 'v', 'n', 'n', 'n', 'n', 'n', 'v', 'n', 'n', 'v', 'v', 'a', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']


In [39]:
from nltk.stem import WordNetLemmatizer
# Lemmatize each token using the POS code found at the same position in `x`.
wnl = WordNetLemmatizer()
lemma = [wnl.lemmatize(tagged[0], x[idx]) for idx, tagged in enumerate(pos)]
lemma

['It',
 'originate',
 'from',
 'the',
 'idea',
 'that',
 'there',
 'be',
 'reader',
 'who',
 'prefer',
 'learn',
 'new',
 'skill',
 'from',
 'the',
 'comfort',
 'of',
 'their',
 'drawing',
 'room']

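# Levenshtein Edit Distance
- Initialize the first column with the row index (the cost of deleting that many characters from the source)
- Initialize the first row with the column index (the cost of inserting that many characters of the target)
- Fill in the remaining cells with dynamic programming: each cell is the cheapest of an insertion (+1), a deletion (+1), or a substitution (+2 when the characters differ, +0 when they match)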

In [40]:
def edit_distance(source, destn, sub_cost=2):
    """Return the minimum edit distance between ``source`` and ``destn``.

    Insertions and deletions cost 1 each. Replacing one character with a
    different one costs ``sub_cost``; matching characters cost nothing.

    Args:
        source: The sequence to transform (typically a string).
        destn: The sequence to transform ``source`` into.
        sub_cost: Cost of substituting one character for another. The
            default of 2 treats a substitution as a deletion plus an
            insertion; pass 1 for the classic Levenshtein distance.

    Returns:
        The total cost of the cheapest sequence of edits. If either input
        is empty, this is the length of the other input.
    """
    rows = len(source) + 1
    cols = len(destn) + 1

    # dist[i][j] is the cost of turning source[:i] into destn[:j].
    dist = [[0] * cols for _ in range(rows)]
    for i in range(rows):
        dist[i][0] = i  # delete all i leading characters of source
    for j in range(cols):
        dist[0][j] = j  # insert all j leading characters of destn

    for i in range(1, rows):
        for j in range(1, cols):
            cost = 0 if source[i - 1] == destn[j - 1] else sub_cost
            dist[i][j] = min(
                dist[i][j - 1] + 1,         # insertion
                dist[i - 1][j] + 1,         # deletion
                dist[i - 1][j - 1] + cost,  # match or substitution
            )

    # Read the answer from the bottom-right cell explicitly instead of
    # relying on whatever values the loop variables happen to hold.
    return dist[rows - 1][cols - 1]

# Despite its name, `table` holds the single distance value returned by
# edit_distance, not the whole dynamic-programming table.
table = edit_distance(source='execution', destn='intention')
table

8