# Porter Stemmer

## Importing Packages and Downloading Resources

In [2]:
import nltk
from nltk.stem import PorterStemmer
# Fetch the NLTK data used by the cells below.
# - 'stopwords': the English stopword list used for filtering.
# - 'punkt': tokenizer models loaded by nltk.word_tokenize on older NLTK releases.
# - 'punkt_tab': the replacement tokenizer data that recent NLTK releases load
#   instead of 'punkt'; without it word_tokenize raises LookupError there.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Setting Stopwords for the English language only

In [3]:
# Load NLTK's English stopword list and keep it as a set so that the
# membership checks done while filtering tokens are constant-time.
english_stopword_list = nltk.corpus.stopwords.words('english')
stopwords = set(english_stopword_list)

## Sentence (text content) for processing

In [4]:
# Sample passage used as input for the rest of the notebook. It spans two
# lines of text separated by a single newline character.
sentence = (
    "An exception represents a run-time error that halts the normal execution "
    "at a particular line and transfers control to error handling code. "
    "This section just introduces the most basic uses of exceptions.\n"
    "For example a run-time error might be that a variable used in the program "
    "does not have a value (ValueError .. you've probably seen that one a few times), "
    "or a file open operation error because a file does not exist (IOError). "
    "Learn more in the exceptions tutorial and see the entire exception list."
)

## Tokenizing the given sentence

In [5]:
# Split the passage into word and punctuation tokens using NLTK's
# recommended word tokenizer, then show the resulting list.
tokens = nltk.tokenize.word_tokenize(sentence)
print(f"Tokens found in the sentence:  {tokens}")

Tokens found in the sentence:  ['An', 'exception', 'represents', 'a', 'run-time', 'error', 'that', 'halts', 'the', 'normal', 'execution', 'at', 'a', 'particular', 'line', 'and', 'transfers', 'control', 'to', 'error', 'handling', 'code', '.', 'This', 'section', 'just', 'introduces', 'the', 'most', 'basic', 'uses', 'of', 'exceptions', '.', 'For', 'example', 'a', 'run-time', 'error', 'might', 'be', 'that', 'a', 'variable', 'used', 'in', 'the', 'program', 'does', 'not', 'have', 'a', 'value', '(', 'ValueError', '..', 'you', "'ve", 'probably', 'seen', 'that', 'one', 'a', 'few', 'times', ')', ',', 'or', 'a', 'file', 'open', 'operation', 'error', 'because', 'a', 'file', 'does', 'not', 'exist', '(', 'IOError', ')', '.', 'Learn', 'more', 'in', 'the', 'exceptions', 'tutorial', 'and', 'see', 'the', 'entire', 'exception', 'list', '.']


## Filtering Tokens, based on stopwords

In [6]:
# Keep every token whose lowercase form is not an English stopword.
# Original casing and token order are preserved; punctuation tokens are
# not in the stopword set, so they pass through unchanged.
tokens2 = []
for token in tokens:
    if token.lower() in stopwords:
        continue
    tokens2.append(token)
print(f"Tokens found (excluding stopwords):  {tokens2}")

Tokens found (excluding stopwords):  ['exception', 'represents', 'run-time', 'error', 'halts', 'normal', 'execution', 'particular', 'line', 'transfers', 'control', 'error', 'handling', 'code', '.', 'section', 'introduces', 'basic', 'uses', 'exceptions', '.', 'example', 'run-time', 'error', 'might', 'variable', 'used', 'program', 'value', '(', 'ValueError', '..', "'ve", 'probably', 'seen', 'one', 'times', ')', ',', 'file', 'open', 'operation', 'error', 'file', 'exist', '(', 'IOError', ')', '.', 'Learn', 'exceptions', 'tutorial', 'see', 'entire', 'exception', 'list', '.']


## Displaying Stems for each word (filtered)

### Initializing PorterStemmer

In [7]:
# Create a single Porter stemmer instance (default settings); the display
# loop that follows calls ps.stem() on each filtered token.
ps = PorterStemmer()

### Displaying Stems for the filtered words

In [10]:
# Print a two-column table: each filtered token next to its Porter stem.
# Both columns are left-aligned and space-padded to 25 characters, with
# " :  " between them.
print("Roots for each word (filtered):")
print(f"{'Word':<25} :  {'Stem':<25}")
for word in tokens2:
    print(f"{word:<25} :  {ps.stem(word):<25}")

Roots for each word (filtered):
Word                      :  Stem                     
exception                 :  except                   
represents                :  repres                   
run-time                  :  run-tim                  
error                     :  error                    
halts                     :  halt                     
normal                    :  normal                   
execution                 :  execut                   
particular                :  particular               
line                      :  line                     
transfers                 :  transfer                 
control                   :  control                  
error                     :  error                    
handling                  :  handl                    
code                      :  code                     
.                         :  .                        
section                   :  section                  
introduces                :  intr