In [None]:
# Stemming : Stemming is the process of producing morphological variants of a root/base word. 
# Stemming programs are commonly referred to as stemming algorithms or stemmers. 

# Errors in Stemming:
# There are mainly two errors in stemming – Overstemming and Understemming. 
# Overstemming occurs when two words are stemmed to same root that are of different stems. 
# Under-stemming occurs when two words are stemmed to same root that are not of different stems.



In [None]:
# Stemming : Stemming is a technique to remove affixes from a word, ending up with the stem.

# Lemmatization : problem with stemming is that often, stemmed words do not carry any meaning.
# Lemmatization deals with such cases where it returns base form of words that carries dectionary meaning

In [None]:
# One major difference with stemming is that lemmatize takes a part of speech parameter, 
# “pos” If not supplied, the default is “noun.”

### Porter Stemmer

##### PorterStemmer removes 's', 'es','e','ed','al', and so on

In [11]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

text = "My name is maximum Decimus Meridius. Commander of the Armier of the North, General of the Felix Legions \
        and loyal servant to the true emperor, Mercus Aurelius. \nFather to a murdered son. husband to a murdered\
        wife. \nAnd I will have my vengeance, in this"

token = [v for v in word_tokenize(text)]

print(token,"\n\n")

print(PorterStemmer().stem(text))


['My', 'name', 'is', 'maximum', 'Decimus', 'Meridius', '.', 'Commander', 'of', 'the', 'Armier', 'of', 'the', 'North', ',', 'General', 'of', 'the', 'Felix', 'Legions', 'and', 'loyal', 'servant', 'to', 'the', 'true', 'emperor', ',', 'Mercus', 'Aurelius', '.', 'Father', 'to', 'a', 'murdered', 'son', '.', 'husband', 'to', 'a', 'murdered', 'wife', '.', 'And', 'I', 'will', 'have', 'my', 'vengeance', ',', 'in', 'this'] 


my name is maximum decimus meridius. commander of the armier of the north, general of the felix legions         and loyal servant to the true emperor, mercus aurelius. 
father to a murdered son. husband to a murdered        wife. 
and i will have my vengeance, in thi


### Lancester Stemmer
#### Suffixes are droped biger than Porter .'us', 'e','th','eral','ered'

In [12]:
from nltk.stem import LancasterStemmer
from nltk.tokenize import word_tokenize

text = "My name is maximum Decimus Meridius. Commander of the Armier of the North, General of the Felix Legions \
        and loyal servant to the true emperor, Mercus Aurelius. \nFather to a murdered son. husband to a murdered\
        wife. \nAnd I will have my vengeance, in this"

token = [v for v in word_tokenize(text)]

print(token,"\n\n")

print(LancasterStemmer().stem(text))

['My', 'name', 'is', 'maximum', 'Decimus', 'Meridius', '.', 'Commander', 'of', 'the', 'Armier', 'of', 'the', 'North', ',', 'General', 'of', 'the', 'Felix', 'Legions', 'and', 'loyal', 'servant', 'to', 'the', 'true', 'emperor', ',', 'Mercus', 'Aurelius', '.', 'Father', 'to', 'a', 'murdered', 'son', '.', 'husband', 'to', 'a', 'murdered', 'wife', '.', 'And', 'I', 'will', 'have', 'my', 'vengeance', ',', 'in', 'this'] 


my name is maximum decimus meridius. commander of the armier of the north, general of the felix legions         and loyal servant to the true emperor, mercus aurelius. 
father to a murdered son. husband to a murdered        wife. 
and i will have my vengeance, in this


### RegexpStemmer :Stemming using Regualr Expression

In [13]:
# example 01

import nltk

from nltk.stem import RegexpStemmer

reg_exp_stemmer = RegexpStemmer('able$|ing$') 

'''removes the suffixes albe and ing, 
if present in a word and min specifies the minimum
length of the stemmed word '''

print(reg_exp_stemmer.stem('flying'))

print(reg_exp_stemmer.stem('capable'))

print(reg_exp_stemmer.stem('doing'))

fly
cap
do


In [14]:
# Example 2

from nltk.stem import RegexpStemmer

reg_stemmer = RegexpStemmer('ing$',min =4)

sentence = 'I love Playing football'

' '.join([reg_stemmer.stem(x) for x in sentence.split()])


'I love Play football'

### Simple_Stemmer

In [15]:
import nltk
def simple_stemmer(text):
    stem = [nltk.stem.PorterStemmer().stem(v) for v in text.split()]
    stem = ' '.join(stem)
    return stem

simple_stemmer("My system keeps crashing his crashed yesterday, ours ctashed daily")


'my system keep crash hi crash yesterday, our ctash daili'

### Snowball Stemmer

In [10]:
from nltk.stem import SnowballStemmer

ss = SnowballStemmer("german")
print("Supported Language:",SnowballStemmer.languages)

# Stemming on German language
ss.stem('autobahnen')

# autobahnen = Cars
# springen = jumping
ss.stem('springen')


Supported Language: ('arabic', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish')


'spring'

In [2]:
from nltk.stem import PorterStemmer

words= ["wait", "waiting", "waited", "waits"]
ps = PorterStemmer()
for w in words:
    print(ps.stem(w))

wait
wait
wait
wait


In [4]:
words = ["like","liked","liking"]

ps = PorterStemmer()
for w in words:
    print(ps.stem(w))# NOise Removal    

like
like
like


In [5]:
# Noise Remove
noise_list = ['is','a','this','....']
text = "this is a sample text"
words = text.split()
noise_free_words = [word for word in words if word not in noise_list]
    
noise_free_text = " ".join(noise_free_words)

print(noise_free_text)#

sample text


In [7]:
from nltk.tokenize import word_tokenize

# Stemming a sentence
new_text = "It is important to by very pythonly while you are pythoning with python. All pythoners have pythoned \
            poorly at least once."

words = word_tokenize(new_text)

for w in words:
    print(ps.stem(w))

it
is
import
to
by
veri
pythonli
while
you
are
python
with
python
.
all
python
have
python
poorli
at
least
onc
.


In [8]:
# Sample code to remove a regex pattern 

import re
text = "remove this #hashtag from analytics vidhya"
regex_pattern = "#[\w]*"

urls = re.finditer(regex_pattern,text)

for i in urls:
    inputtext = re.sub(i.group().strip(),'',text)
print(inputtext)

remove this  from analytics vidhya


In [10]:
# Lemmatizing  is like Stemming but it gives Actual real use full words 
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

ps = PorterStemmer()
print(ps.stem("this"))
print(ps.stem("believes"))

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("this"))
print(lemmatizer.lemmatize("believes"))

# if we look at above example of steming and lematization output the target words 
# we are getting lematization are meaning full than stemming

print(lemmatizer.lemmatize("this",pos='v')) # adjective
print(lemmatizer.lemmatize("crossing",pos='a')) # adjective
print(lemmatizer.lemmatize("crossing",pos='v')) # verb
print(lemmatizer.lemmatize("crossing",pos='n')) # verb
print(lemmatizer.lemmatize("crossing",pos='r')) # verb


thi
believ
this
belief
this
crossing
cross
crossing
crossing


In [11]:
# Stemming Non-English Words
# SnowballStemmer can stem 13 languages besides the English language. The supported languages are:
from nltk.stem import SnowballStemmer
print(SnowballStemmer.languages)

('arabic', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish')


In [12]:
# Stemming and Lemmatization Difference
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer 
stemmer = PorterStemmer() 
lemmatizer = WordNetLemmatizer() 
print(stemmer.stem('stones')) 
print(stemmer.stem('speaking')) 
print(stemmer.stem('bedroom')) 
print(stemmer.stem('jokes')) 
print(stemmer.stem('lisa')) 
print(stemmer.stem('purple')) 
print('----------------------') 
print(lemmatizer.lemmatize('stones')) 
print(lemmatizer.lemmatize('speaking'))
print(lemmatizer.lemmatize('bedroom'))
print(lemmatizer.lemmatize('jokes'))
print(lemmatizer.lemmatize('lisa'))
print(lemmatizer.lemmatize('purple'))

stone
speak
bedroom
joke
lisa
purpl
----------------------
stone
speaking
bedroom
joke
lisa
purple
