# Stemming Words with NLTK

In [1]:
from nltk import word_tokenize
from nltk.stem import *

import pandas as pd

### PorterStemmer
* PorterStemmer uses Suffix Stripping to produce stems.

In [2]:
# Sample vocabulary, organised by word family so related forms sit together.
# The flattened order below is the order every stemmer in this notebook sees.
_word_families = [
    ('overwhelming', 'overwhelmingly'),
    ('hushed', 'hush'),
    ('functional', 'functionally'),
    ('lying', 'lied'),
    ('fairly',),
    ('destabilize', 'stability'),
    ('friendship', 'friendships', 'friendly', 'friendless'),
    ('connect', 'connections', 'connected'),
    ('the', 'these', 'those'),
    ('motivational', 'motivate', 'motivating'),
]

# Flatten the families into the single list of words used by later cells.
input_tokens = [word for family in _word_families for word in family]

In [3]:
# Build one Porter stemmer and run every sample word through it,
# keeping the stems in the same order as `input_tokens`.
ps = PorterStemmer()

ps_stemmed_tokens = [ps.stem(word) for word in input_tokens]

In [4]:
# Put each original word next to its Porter stem so they can be compared row by row.
porter_columns = {
    'words': input_tokens,
    'Porter Stemmer': ps_stemmed_tokens,
}
stems_df = pd.DataFrame(data=porter_columns)

# Bare last expression: the notebook renders the frame as a table.
stems_df

Unnamed: 0,words,Porter Stemmer
0,overwhelming,overwhelm
1,overwhelmingly,overwhelmingli
2,hushed,hush
3,hush,hush
4,functional,function
5,functionally,function
6,lying,lie
7,lied,lie
8,fairly,fairli
9,destabilize,destabil


### LancasterStemmer
* The LancasterStemmer (Paice-Husk stemmer) is an iterative algorithm with rules saved externally.
* LancasterStemmer is simple, but because it applies its rules iteratively it stems aggressively, so over-stemming may occur.
* Over-stemming produces stems that are not real linguistic roots and may carry no meaning of their own.

In [5]:
# Build one Lancaster stemmer and run every sample word through it,
# keeping the stems in the same order as `input_tokens`.
ls = LancasterStemmer()

ls_stemmed_tokens = [ls.stem(word) for word in input_tokens]

In [6]:
# Put each original word next to its Lancaster stem.
lancaster_columns = {
    'words': input_tokens,
    'Lancaster Stemmer': ls_stemmed_tokens,
}
stems_df = pd.DataFrame(data=lancaster_columns)

# Bare last expression: the notebook renders the frame as a table.
stems_df

Unnamed: 0,words,Lancaster Stemmer
0,overwhelming,overwhelm
1,overwhelmingly,overwhelm
2,hushed,hush
3,hush,hush
4,functional,funct
5,functionally,funct
6,lying,lying
7,lied,lied
8,fairly,fair
9,destabilize,dest


In [7]:
# Side-by-side comparison of the Porter and Lancaster stems for every word.
# Column order is: original word, Porter stem, Lancaster stem.
comparison_columns = {
    'words': input_tokens,
    'Porter Stemmer': ps_stemmed_tokens,
    'Lancaster Stemmer': ls_stemmed_tokens,
}
stems_df = pd.DataFrame(data=comparison_columns)

# Bare last expression: the notebook renders the frame as a table.
stems_df

Unnamed: 0,words,Porter Stemmer,Lancaster Stemmer
0,overwhelming,overwhelm,overwhelm
1,overwhelmingly,overwhelmingli,overwhelm
2,hushed,hush,hush
3,hush,hush,hush
4,functional,function,funct
5,functionally,function,funct
6,lying,lie,lying
7,lied,lie,lied
8,fairly,fairli,fair
9,destabilize,destabil,dest


### SnowballStemmer
* Snowball is a framework for writing stemming rules for any language, which is why NLTK provides `SnowballStemmer`: it offers stemmers for many non-English languages as well as English.

In [8]:
# Show the language names that SnowballStemmer accepts (a tuple of strings).
print(SnowballStemmer.languages)

('arabic', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish')


In [9]:
# Build an English Snowball stemmer and run every sample word through it,
# keeping the stems in the same order as `input_tokens`.
ss = SnowballStemmer('english')

ss_stemmed_tokens = [ss.stem(word) for word in input_tokens]

In [10]:
# Put each original word next to its Snowball stem.
snowball_columns = {
    'words': input_tokens,
    'Snowball Stemmer': ss_stemmed_tokens,
}
stems_df = pd.DataFrame(data=snowball_columns)

# Bare last expression: the notebook renders the frame as a table.
stems_df

Unnamed: 0,words,Snowball Stemmer
0,overwhelming,overwhelm
1,overwhelmingly,overwhelm
2,hushed,hush
3,hush,hush
4,functional,function
5,functionally,function
6,lying,lie
7,lied,lie
8,fairly,fair
9,destabilize,destabil


In [11]:
# Final comparison table: one row per word, one column per stemmer.
# Column order is: original word, Porter, Lancaster, Snowball.
all_stemmer_columns = {
    'words': input_tokens,
    'Porter Stemmer': ps_stemmed_tokens,
    'Lancaster Stemmer': ls_stemmed_tokens,
    'Snowball Stemmer': ss_stemmed_tokens,
}
stems_df = pd.DataFrame(data=all_stemmer_columns)

# Bare last expression: the notebook renders the frame as a table.
stems_df

Unnamed: 0,words,Porter Stemmer,Lancaster Stemmer,Snowball Stemmer
0,overwhelming,overwhelm,overwhelm,overwhelm
1,overwhelmingly,overwhelmingli,overwhelm,overwhelm
2,hushed,hush,hush,hush
3,hush,hush,hush,hush
4,functional,function,funct,function
5,functionally,function,funct,function
6,lying,lie,lying,lie
7,lied,lie,lied,lie
8,fairly,fairli,fair,fair
9,destabilize,destabil,dest,destabil


In [12]:
# Load the sample text that the following cells tokenize and stem.
# The path is resolved relative to the kernel's current working directory.
STEMMING_TEXT_PATH = './datasets/stemming.txt'

try:
    # Pin the encoding so decoding does not depend on the platform's locale
    # default. NOTE(review): assumes the sample file is UTF-8 — confirm.
    with open(STEMMING_TEXT_PATH, 'r', encoding='utf-8') as f:
        file_contents = f.read()
except FileNotFoundError as err:
    # Keep the exception type, errno and filename, but explain how to fix it:
    # the cells below all depend on `file_contents` being defined.
    raise FileNotFoundError(
        err.errno,
        "Sample text for the stemming demo is missing; create the 'datasets' "
        "folder in the working directory and place 'stemming.txt' in it",
        STEMMING_TEXT_PATH,
    ) from err

print(file_contents)

FileNotFoundError: [Errno 2] No such file or directory: './datasets/stemming.txt'

In [None]:
# Split the loaded text into tokens with NLTK's word_tokenize.
# NOTE(review): word_tokenize presumably needs NLTK's tokenizer data to be
# installed in the environment — confirm before a fresh run.
word_tokens = word_tokenize(file_contents)

In [None]:
# Rebind `ss` to an English Snowball stemmer that is asked to ignore stopwords,
# then stem every token of the loaded text in order.
# NOTE(review): per the NLTK docs, ignore_stopwords=True returns stopwords
# unchanged and appears to require the NLTK stopwords corpus — confirm it is
# available in the environment.
ss = SnowballStemmer('english', ignore_stopwords=True)

ss_stemmed_words = [ss.stem(token) for token in word_tokens]

In [None]:
# Join the stemmed tokens into one space-separated string; as the cell's
# last expression, the result is shown as the cell output.
" ".join(ss_stemmed_words)

# Done !