In [2]:
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer    # nltk.stem is a package that performs stemming using different classes

# Stemming

In [7]:
# PorterStemmer uses suffix stripping (removing suffixes from a word) to produce stems. The Porter algorithm does not follow linguistic rules; instead it uses a set of five rules for different cases that are applied
# in phases (step by step) to generate stems. SnowballStemmer can be used to create non-English stemmers.

# The LancasterStemmer (Paice-Husk stemmer) is an iterative algorithm with rules saved externally. LancasterStemmer is simple, but its repeated iterations make it a heavy stemmer, so over-stemming may occur.
# Over-stemming produces stems that are not linguistically valid or that have no meaning.

In [3]:
porter = PorterStemmer()        # Porter stemmer instance, reused by later cells
lancaster = LancasterStemmer()  # Lancaster stemmer instance, reused by later cells

# Run the same sample words through each stemmer so the stems can be compared.
sample_words = ("cats", "trouble", "troubling", "troubled")
for title, stemmer in (("Porter Stemmer", porter), ("Lancaster Stemmer", lancaster)):
    print(title)
    for sample in sample_words:
        print(stemmer.stem(sample))

In [6]:
# Side-by-side table: each word next to its PorterStemmer and LancasterStemmer stem.
word_list = [
    "friend", "friendship", "friends", "friendships", "stabil",
    "destabilize", "misunderstanding", "railroad", "moonlight", "football",
]
row_template = "{0:20}{1:20}{2:20}"  # three left-aligned columns, 20 characters wide
print(row_template.format("Word", "Porter Stemmer", "lancaster Stemmer"))
for entry in word_list:
    porter_stem = porter.stem(entry)
    lancaster_stem = lancaster.stem(entry)
    print(row_template.format(entry, porter_stem, lancaster_stem))

## Stemming Sentences

In [15]:
sentence = (
    "Pythoners are very intelligent and work very pythonly "
    "and now they are pythoning their way to success."
)
# Passing the whole sentence to the stemmer treats it as a single word,
# so the individual words inside it are not stemmed.
porter.stem(sentence)

In [16]:
# We need to stem each word in the sentence and return the combined result. To separate the sentence into words, you can use a tokenizer; the NLTK tokenizer splits the sentence into words as shown below.
# You can wrap this in a function: pass the sentence to the function, and it returns the stemmed sentence.

from nltk.tokenize import sent_tokenize, word_tokenize
def stemSentence(sentence):
    """Stem every token of ``sentence`` with the Porter stemmer.

    The sentence is split with ``word_tokenize``, each token is passed
    through the notebook-level ``porter`` stemmer, and the resulting stems
    are joined with single spaces.

    Args:
        sentence: Text to tokenize and stem.

    Returns:
        The stemmed tokens separated by single spaces, without a trailing
        space. An empty string is returned when there are no tokens.
    """
    # " ".join places separators only between stems, so no trailing space is left.
    return " ".join(porter.stem(token) for token in word_tokenize(sentence))

# Stem the sample sentence token by token and print the combined result.
x = stemSentence(sentence)
print(x)

In [None]:
# NLTK provides not only the two English stemmers shown here (PorterStemmer and LancasterStemmer) but also many non-English stemmers, such as SnowballStemmer, ISRIStemmer, and RSLPStemmer.

# Lemmatization 

In [None]:
# Lemmatization reduces inflected words properly, ensuring that the root word belongs to the language. In lemmatization, the root word is called a lemma.
# A lemma (plural: lemmas or lemmata) is the canonical form, dictionary form, or citation form of a set of words.

In [17]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

sentence = "He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun."
punctuations="?:!.,;"

# Build a filtered list instead of calling remove() while iterating over the
# same list: removing during iteration skips the element that follows each
# removal, so adjacent punctuation tokens would be left behind.
# A set gives exact single-character membership; `word in punctuations` on the
# string would be a substring test.
punctuation_marks = set(punctuations)
sentence_words = [
    word for word in nltk.word_tokenize(sentence) if word not in punctuation_marks
]

# Two-column table: each token next to its lemma (no POS passed to lemmatize).
print("{0:20}{1:20}".format("Word","Lemma"))
for word in sentence_words:
    print("{0:20}{1:20}".format(word, wordnet_lemmatizer.lemmatize(word)))
    
# In the above output, you may be wondering why no actual root form has been given for any word. This is because the words are lemmatized without context. You need to provide the context
# in which you want to lemmatize, that is, the part of speech (POS). This is done by passing a value for the pos parameter of wordnet_lemmatizer.lemmatize.

In [18]:
# Lemmatize each token again, this time passing pos="v" to the lemmatizer.
verb_row = "{0:20}{1:20}"
for token in sentence_words:
    verb_lemma = wordnet_lemmatizer.lemmatize(token, pos="v")
    print(verb_row.format(token, verb_lemma))