#### Stemming:- using fixed set of rules to bring the word to base form, ex:- ing, able, ed removal
#### lemmatiztion:- using knowledge of language to derive the base word

In [1]:
import nltk
import spacy

In [2]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

In [6]:
text  = ["eating", "going", "went", "worked", "ability"]
for et in text:
    print(stemmer.stem(et))

eat
go
went
work
abil


#### We can see that the nltk stemming is not identifying the language properly, distorted the actual meaning of the word

### So although the stemmer is dumb but its faster and for simple smaller texts it works

# Using Spacy

In [35]:
nlp = spacy.load("en_core_web_sm")

In [36]:
text = "eating ate agility dialing dialed went going"
doc = nlp(text)
for item in doc:
    print(item, " | ", item.lemma_)

eating  |  eat
ate  |  eat
agility  |  agility
dialing  |  dialing
dialed  |  dial
went  |  go
going  |  go


In [37]:
## We can see that the words have been mapped properly using ligustic knowledge

In [38]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [39]:
# using attribute_ruler component, we can acutally modify the way lemmatizer works

In [40]:
doc = nlp("bro don't say no to brah, I am exhausted")
for token in doc:
    print(token.lemma_)
# we see that the bro and brah are treated differently here.

bro
do
not
say
no
to
brah
,
I
be
exhaust


In [41]:
ar = nlp.get_pipe('attribute_ruler')
ar.add( [ [{"TEXT":"bro"}], [{"TEXT": "brah"}]], {"LEMMA":"Brother"})
doc = nlp("bro don't say no to brah, I am exhausted")

for token in doc:
    print(token.text, " | ", token.lemma_)

bro  |  Brother
do  |  do
n't  |  not
say  |  say
no  |  no
to  |  to
brah  |  Brother
,  |  ,
I  |  I
am  |  be
exhausted  |  exhaust


In [42]:
# we customized our nlp pipeline to treat the lemmatizer as per our need.

# Exercise

## Exercise1:

* Convert these list of words into base form using Stemming and Lemmatization and observe the transformations
* Write a short note on the words that have different base words using stemming and Lemmatization

In [43]:
#using stemming in nltk
lst_words = ['running', 'painting', 'walking', 'dressing', 'likely', 'children', 'whom', 'good', 'ate', 'fishing']

In [45]:
# importing the necessary libraries again:
import nltk
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

In [46]:
for word in lst_words:
    print(stemmer.stem(word))

run
paint
walk
dress
like
children
whom
good
ate
fish


In [48]:
# importing the libraries again:
import spacy
nlp = spacy.load("en_core_web_sm")


In [49]:
#using lemmatization in spacy
doc = nlp("running painting walking dressing likely children who good ate fishing")

In [51]:
for token in doc:
    print(token, " | ", token.lemma_)

running  |  run
painting  |  paint
walking  |  walk
dressing  |  dress
likely  |  likely
children  |  child
who  |  who
good  |  good
ate  |  eat
fishing  |  fishing


In [54]:
# Observing the difference in the two techniques:
org =  ['running', 'painting', 'walking', 'dressing', 'likely', 'children', 'whom', 'good', 'ate', 'fishing']
stem = []
lemmet = []
for token in org:
    stem.append(stemmer.stem(token))
for token in doc:
    lemmet.append(token.lemma_)
# ans = pd.Dataframe(org, stem, lemmet)

In [58]:
print(type(org), type(stem), type(lemmet))

<class 'list'> <class 'list'> <class 'list'>


In [66]:
# converting all the list elements to dataframe/ tabular form to see the difference.
df  = pd.DataFrame(list(zip(org,stem,lemmet)), columns = ['orginal', 'stemmed form', 'lemmetized'])
df

Unnamed: 0,orginal,stemmed form,lemmetized
0,running,run,run
1,painting,paint,paint
2,walking,walk,walk
3,dressing,dress,dress
4,likely,like,likely
5,children,children,child
6,whom,whom,who
7,good,good,good
8,ate,ate,eat
9,fishing,fish,fishing


## Exercise2:

convert the given text into it's base form using both stemming and lemmatization

In [123]:
text = """Latha is very multi talented girl. She is good at many skills like dancing, running, singing, playing. She also likes eating Pav Bhagi. she has a habit of fishing and swimming too. Besides all this, she is a wonderful at cooking too.
"""

In [124]:
#using stemming in nltk
#step1: Word tokenizing
#step2: getting the base form for each token using stemmer
#step3: joining all words in a list into string using 'join()'

In [125]:
# converting the paragraph to tokens:
org_word_token = nltk.word_tokenize(text)
stem_word = []
lemmet_word = []
for token in org_word_token:
    stem_word.append(stemmer.stem(token))
doc = nlp(text)
for sent in doc.sents:
    for token in sent:
        lemmet_word.append(token.lemma_)
    

In [126]:
' '.join(stem_word)

'latha is veri multi talent girl . she is good at mani skill like danc , run , sing , play . she also like eat pav bhagi . she ha a habit of fish and swim too . besid all thi , she is a wonder at cook too .'

In [127]:
pd.DataFrame(list(zip(org_word_token, stem_word, lemmet_word)), columns= ['original', 'stemmed', 'lemmetized'])

Unnamed: 0,original,stemmed,lemmetized
0,Latha,latha,Latha
1,is,is,be
2,very,veri,very
3,multi,multi,multi
4,talented,talent,talented
5,girl,girl,girl
6,.,.,.
7,She,she,she
8,is,is,be
9,good,good,good
