# <b style="color:green"> Text Preprocessing</b>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
adf = pd.read_csv("../data/IMDB_Dataset.csv")
adf.head()
df = adf

In [3]:
df.shape

(50000, 2)

In [4]:
df['review'][3]

"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."

## <b style="color:red">Lowercasing</b>

In [5]:
data = df['review'][3].lower()
data

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.<br /><br />ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

In [6]:
df['review'] = df['review'].str.lower()
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


## <b style="color:red">Remove HTML Tags</b>

In [7]:
import re
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)


In [8]:
df['review'] = df['review'].apply(remove_html_tags)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [9]:
df['review'][3]

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

## <b style="color:red">Remove URLs</b>

In [10]:
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)


In [11]:
url1 = "Check out my notebook https://www.kaggle.com/campusx/notebook8223fc1labb"
url2 = "Check out my notebook http://www.kaggle.com/campusx/notebook8223fc1labb"
url3 = "Google Search here www.google.com"
url4 = "For notebook click https://www.kaggle.com/campusx/notebook8223fc1labb to search check www.google.com"

url1 = remove_url(url1)
print(url1)
url2 = remove_url(url2)
print(url2)
url3 = remove_url(url3)
print(url3)
url4 = remove_url(url4)
print(url4)

Check out my notebook 
Check out my notebook 
Google Search here 
For notebook click  to search check 


## <b style="color:red">Remove Punctuation</b>

In [12]:
import string, time

In [13]:
exclude = string.punctuation
exclude

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

### **Method:1**

In [14]:
def remove_punc(text):
    for char in exclude:
        text = text.replace(char, '')
    return text


In [15]:
text = "String. Width, length, height, are use to take volume?"
text = remove_punc(text)
text

'String Width length height are use to take volume'

In [16]:
start = time.time()
print(remove_punc(text))
t1 = time.time() - start
print(t1)
print(t1*50000)

String Width length height are use to take volume
0.0
0.0


### **Method:2**

In [17]:
def remove_punc1(text):
    return text.translate(str.maketrans('', '', exclude))


In [18]:
start = time.time()
print(remove_punc1(text))
t2 = time.time() - start
print(t2)
print(t2*50000)

String Width length height are use to take volume
0.0
0.0


In [19]:
df = pd.read_csv("../data/twitter-sentiment-hated-speech/train.csv")
df.head(7)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
5,6,0,[2/2] huge fan fare and big talking before the...
6,7,0,@user camping tomorrow @user @user @user @use...


In [20]:
df['tweet'][5]

'[2/2] huge fan fare and big talking before they leave. chaos and pay disputes when they get there. #allshowandnogo  '

In [21]:
df['tweet'] = df['tweet'].apply(remove_punc1)
df.head(7)

Unnamed: 0,id,label,tweet
0,1,0,user when a father is dysfunctional and is so...
1,2,0,user user thanks for lyft credit i cant use ca...
2,3,0,bihday your majesty
3,4,0,model i love u take with u all the time in u...
4,5,0,factsguide society now motivation
5,6,0,22 huge fan fare and big talking before they l...
6,7,0,user camping tomorrow user user user user use...


In [22]:
df['tweet'][5]

'22 huge fan fare and big talking before they leave chaos and pay disputes when they get there allshowandnogo  '

## <b style="color:red">Chat Word Treatment</b>
- gn : good night
- asap : as soon as possible

In [23]:
chat_sort_cut = '''AFAIK=As Far As I Know
AFK=Away From Keyboard
ASAP=As Soon As Possible
ATK=At The Keyboard
ATM=At The Moment
A3=Anytime, Anywhere, Anyplace
BAK=Back At Keyboard
BBL=Be Back Later
BBS=Be Back Soon
BFN=Bye For Now
B4N=Bye For Now
BRB=Be Right Back
BRT=Be Right There
BTW=By The Way
B4=Before
B4N=Bye For Now
CU=See You
CUL8R=See You Later
CYA=See You
FAQ=Frequently Asked Questions
FC=Fingers Crossed
FWIW=For What It's Worth
FYI=For Your Information
GAL=Get A Life
GG=Good Game
GN=Good Night
GMTA=Great Minds Think Alike
GR8=Great!
G9=Genius
IC=I See
ICQ=I Seek you (also a chat program)
ILU=I Love You
IMHO=In My Honest/Humble Opinion
IMO=In My Opinion
IOW=In Other Words
IRL=In Real Life
KISS=Keep It Simple, Stupid
LDR=Long Distance Relationship
LMAO=Laugh My Ass Off
LOL=Laughing Out Loud
LTNS=Long Time No See
L8R=Later
MTE=My Thoughts Exactly
M8=Mate
NRN=No Reply Necessary
OIC=Oh I See
PITA=Pain In The Ass
PRT=Party
PRW=Parents Are Watching
QPSA?=Que Pasa?
ROFL=Rolling On The Floor Laughing
ROFLOL=Rolling On The Floor Laughing Out Loud
ROTFLMAO=Rolling On The Floor Laughing My A.. Off
SK8=Skate
STATS=Your sex and age
ASL=Age, Sex, Location
THX=Thank You
TTFN=Ta-Ta For Now!
TTYL=Talk To You Later
U=You
U2=You Too
U4E=Yours For Ever
WB=Welcome Back
WTF=What The Fuck
WTG=Way To Go!
WUF=Where Are You From?
W8=Wait
TFW=That feeling when. TFW internet slang often goes in a caption to an image.
MFW=My face when
MRW =My reaction when
IFYP=I feel your pain
LOL=Laughing out loud
TNTL=Trying not to laugh
JK=Just kidding
IDC=I don’t care
ILY=I love you
IMU=I miss you
ADIH=Another day in hell
IDC=I don’t care
ZZZ=Sleeping, bored, tired
WYWH=Wish you were here
TIME=Tears in my eyes
BAE=Before anyone else
FIMH=Forever in my heart
BSAAW=Big smile and a wink
BWL=Bursting with laughter
LMAO=Laughing my ass off
BFF=Best friends forever
CSL=Can’t stop laughing'''

In [24]:
chat_sort_cut = re.findall("\S.*=*\S", chat_sort_cut)
chat_sort_cut

['AFAIK=As Far As I Know',
 'AFK=Away From Keyboard',
 'ASAP=As Soon As Possible',
 'ATK=At The Keyboard',
 'ATM=At The Moment',
 'A3=Anytime, Anywhere, Anyplace',
 'BAK=Back At Keyboard',
 'BBL=Be Back Later',
 'BBS=Be Back Soon',
 'BFN=Bye For Now',
 'B4N=Bye For Now',
 'BRB=Be Right Back',
 'BRT=Be Right There',
 'BTW=By The Way',
 'B4=Before',
 'B4N=Bye For Now',
 'CU=See You',
 'CUL8R=See You Later',
 'CYA=See You',
 'FAQ=Frequently Asked Questions',
 'FC=Fingers Crossed',
 "FWIW=For What It's Worth",
 'FYI=For Your Information',
 'GAL=Get A Life',
 'GG=Good Game',
 'GN=Good Night',
 'GMTA=Great Minds Think Alike',
 'GR8=Great!',
 'G9=Genius',
 'IC=I See',
 'ICQ=I Seek you (also a chat program)',
 'ILU=I Love You',
 'IMHO=In My Honest/Humble Opinion',
 'IMO=In My Opinion',
 'IOW=In Other Words',
 'IRL=In Real Life',
 'KISS=Keep It Simple, Stupid',
 'LDR=Long Distance Relationship',
 'LMAO=Laugh My Ass Off',
 'LOL=Laughing Out Loud',
 'LTNS=Long Time No See',
 'L8R=Later',
 'MTE=My

In [25]:
chat_words = {}
for chat in chat_sort_cut:
    lst = chat.split('=')
    chat_words[lst[0]] = lst[1]
chat_words

{'AFAIK': 'As Far As I Know',
 'AFK': 'Away From Keyboard',
 'ASAP': 'As Soon As Possible',
 'ATK': 'At The Keyboard',
 'ATM': 'At The Moment',
 'A3': 'Anytime, Anywhere, Anyplace',
 'BAK': 'Back At Keyboard',
 'BBL': 'Be Back Later',
 'BBS': 'Be Back Soon',
 'BFN': 'Bye For Now',
 'B4N': 'Bye For Now',
 'BRB': 'Be Right Back',
 'BRT': 'Be Right There',
 'BTW': 'By The Way',
 'B4': 'Before',
 'CU': 'See You',
 'CUL8R': 'See You Later',
 'CYA': 'See You',
 'FAQ': 'Frequently Asked Questions',
 'FC': 'Fingers Crossed',
 'FWIW': "For What It's Worth",
 'FYI': 'For Your Information',
 'GAL': 'Get A Life',
 'GG': 'Good Game',
 'GN': 'Good Night',
 'GMTA': 'Great Minds Think Alike',
 'GR8': 'Great!',
 'G9': 'Genius',
 'IC': 'I See',
 'ICQ': 'I Seek you (also a chat program)',
 'ILU': 'I Love You',
 'IMHO': 'In My Honest/Humble Opinion',
 'IMO': 'In My Opinion',
 'IOW': 'In Other Words',
 'IRL': 'In Real Life',
 'KISS': 'Keep It Simple, Stupid',
 'LDR': 'Long Distance Relationship',
 'LMAO': 

In [26]:
def chat_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)


In [27]:
msg = chat_conversion('IMHO he the BFF')
print(msg)
msg = chat_conversion('FYI ILU W8 TTYL IMU')
print(msg)

In My Honest/Humble Opinion he the Best friends forever
For Your Information I Love You Wait Talk To You Later I miss you


In [28]:
df['tweet'] = df['tweet'].str.lower()
df['tweet'] = df['tweet'].apply(remove_punc1)
df['tweet'] = df['tweet'].apply(chat_conversion)
df['tweet'].head()

0    user when a father is dysfunctional and is so ...
1    user user thanks for lyft credit i cant use ca...
2                                  bihday your majesty
3    model i love You take with You all the Tears i...
4                    factsguide society now motivation
Name: tweet, dtype: object

## <b style="color:red">Spelling Correction</b>

In [29]:
from textblob import TextBlob
# !conda install -c conda-forge textblob

In [30]:
text = "Please read the note book, ands also liake the notebook."
# note book ----> notebook
textblb = TextBlob(text)
text = textblb.correct().string
text

'Please read the note book, and also like the notebook.'

## <b style="color:red">Remove Stop words</b>
- a, the, of, are, my
- Use `nltk` library
- Do not remove stop words in `POS Tagging`

In [31]:
from nltk.corpus import stopwords

In [32]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [33]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [34]:
def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
    return " ".join(new_text)

In [35]:
remove_stopwords("probabiliy my all-time favorite movies, a story of selflessness, sacrifice and dedication to a noble cause, but preachy boring."
                )

'probabiliy  all-time favorite movies,  story  selflessness, sacrifice  dedication   noble cause,  preachy boring.'

In [36]:
df = pd.read_csv("../data/IMDB_Dataset.csv", nrows=100)
df['review'] = df['review'].apply(remove_html_tags)
df['review'] = df['review'].apply(remove_stopwords)
df['review']

0     One    reviewers  mentioned   watching  1 Oz e...
1     A wonderful little production. The filming tec...
2     I thought    wonderful way  spend time    hot ...
3     Basically there's  family   little boy (Jake) ...
4     Petter Mattei's "Love   Time  Money"   visuall...
                            ...                        
95    Daniel Day-Lewis    versatile actor alive. Eng...
96    My guess would    originally going    least tw...
97    Well, I like  watch bad horror B-Movies, cause...
98    This IS  worst movie I  ever seen,  well as,  ...
99    I    Mario fan   long  I  remember, I   fond m...
Name: review, Length: 100, dtype: object

## <b style="color:red">Handling Emojis</b>
- Remove with textual data
- Replace with emotion

### **To Remove Emojis**

In [37]:
# To remove emojis
import re
def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emotions
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"  
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

# Example usage:
text_with_emojis = "I love Python! 😍🐍. This movie is amazing! 🍿🎬. Just finished my workout 💪🏋️‍♀️"
clean_text = remove_emoji(text_with_emojis)
print(clean_text)
 

I love Python! . This movie is amazing! . Just finished my workout ‍


### **Replace With emojis Meaning**

In [38]:
# Replace with meaning
import emoji
print(emoji.demojize('Pushpa name sun ke 🌸 smjha kya 🔥 hai main.'))
print(emoji.demojize( "You are not a friend, you are a 🐍."))


Pushpa name sun ke :cherry_blossom: smjha kya :fire: hai main.
You are not a friend, you are a :snake:.


## <b style="color:red">Tokenization</b>
- I am an Indian.   >-------->>>     "I" "am", "an", "India"
- Word level tokenization. Sentence level tokenization.
- __Why tokenization?__
- text classification : Costumer chat : Support, Sales
  - Get number of unique words.
  - eg. : I am new in new delhi.  >-------->>> I:1, am:2, new:3, in:4, delhi:5
- __Problem in tokenization.__
  - _Prefix_ : Characters at the begining. : `$("` : eg. $10
  - _Suffix_ : Characters at the end. : `km),.!"` : eg. 23Km
  - _Infix_ : Characters in between. : `- -- / ...` : eg. New-York
  - _Exception_ : Special-case rule to split a string into several tokens or prevent a token from being split when punctuation rules are applied. : eg. Let's, U.S.


### 1. Using the split function

In [39]:
# word tokenization
sent1 = 'I am going to delhi'
sent1 = sent1.split()
print(sent1)

# sentence tokenization
sent2 = 'I am going to delhi. I will stay there for 3 days. Let\'s hope the trip to be great'
sent2 = sent2.split('.')
print(sent2)

# problem with split function
sent3 = 'I am going to delhi!'
sent3 = sent3.split()
print(sent3)

sent4 = 'Where do think I should go? I have 3 day holiday'
sent4 = sent4.split('.')
print(sent4)

['I', 'am', 'going', 'to', 'delhi']
['I am going to delhi', ' I will stay there for 3 days', " Let's hope the trip to be great"]
['I', 'am', 'going', 'to', 'delhi!']
['Where do think I should go? I have 3 day holiday']


### 2. Using Regular Expression (Regex)

In [40]:
import re

sent11 = "I am going to delhi!"
tokens = re.findall("[\w']+", sent11)
print(tokens, end="\n\n")

sent12 = """Lorem Ipsum is simply dummy text of the printing and typesetting industry?
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
when an unknown printer took a galley of type and scrambled it to make a type specimen book."""
sent12 = re.compile('[.,!?]').split(sent12)
# sent12 = re.split(r'[.!?]', sent12)
sent12 = [sent.strip() for sent in sent12 if sent.strip()]
print(sent12, end="\n\n")
for x in sent12:
    print(x)


['I', 'am', 'going', 'to', 'delhi']

['Lorem Ipsum is simply dummy text of the printing and typesetting industry', "Lorem Ipsum has been the industry's standard dummy text ever since the 1500s", 'when an unknown printer took a galley of type and scrambled it to make a type specimen book']

Lorem Ipsum is simply dummy text of the printing and typesetting industry
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s
when an unknown printer took a galley of type and scrambled it to make a type specimen book


### 3. NLTK

In [41]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [42]:
from nltk.tokenize import word_tokenize, sent_tokenize

# word_tokenize
sentence1 = 'I am going to visit delhi!'
sentence1 = word_tokenize(sentence1)
print(sentence1, end="\n\n")

sentence2 = """Lorem Ipsum is simply dummy text of the printing and typesetting industry?
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
when an unknown printer took a galley of type and scrambled it to make a type specimen book."""
sentence2 = sent_tokenize(sentence2)
print(sentence2, end="\n")
for x in sentence2:
    print(x)
print()

s1 = 'I have a Ph.D in A.I.'
s2 = "We're here to help! mail us at nashik@gmail.com"
s3 = 'A 5km ride cost $10.50.'

s1 = word_tokenize(s1)
print(s1)
s2 = word_tokenize(s2)
print(s2)
s3 = word_tokenize(s3)
print(s3)

['I', 'am', 'going', 'to', 'visit', 'delhi', '!']

['Lorem Ipsum is simply dummy text of the printing and typesetting industry?', "Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,\nwhen an unknown printer took a galley of type and scrambled it to make a type specimen book."]
Lorem Ipsum is simply dummy text of the printing and typesetting industry?
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s,
when an unknown printer took a galley of type and scrambled it to make a type specimen book.

['I', 'have', 'a', 'Ph.D', 'in', 'A.I', '.']
['We', "'re", 'here', 'to', 'help', '!', 'mail', 'us', 'at', 'nashik', '@', 'gmail.com']
['A', '5km', 'ride', 'cost', '$', '10.50', '.']


### 4. Spacy

In [43]:
# !python -m spacy download en_core_web_sm

In [49]:
import spacy

nlp = spacy.load("en_core_web_sm")

s1 = 'I have a Ph.D in A.I.'
s2 = "We're here to help! mail us at nashik2023@gmail.com"
s3 = 'A 5km ride cost $10.50.'
s4 = 'I am going to visit delhi!'

doc1 = nlp(s1)
for token in doc1:
    print(token, end="\t")
print()
doc2 = nlp(s2)
for token in doc2:
    print(token, end="\t")
print()
doc3 = nlp(s3)
for token in doc3:
    print(token, end="\t")
print()
doc4 = nlp(s4)
for token in doc4:
    print(token, end="\t")
print()

I	have	a	Ph	.	D	in	A.I.	
We	're	here	to	help	!	mail	us	at	nashik2023@gmail.com	
A	5	km	ride	cost	$	10.50	.	
I	am	going	to	visit	delhi	!	


## <b style="color:red">Stemming & Lemmatization</b>
- __Inflection__ : In grammer, inflection is the modification of a word to express different grammatical categories such as tense, case, voice, aspect, person, number, gender, and mood.
- A little change in word to change the meaning. eg.
  - `walk >--->>> walk, walking, walked walks`
  - `do >--->>> does, undo, undoable`
- <b style="color:blue">__Stemming__</b> : Stemming is the process of reducing _inflection_ in words to their root forms such as mapping a group of words to the same stem even if the stem itself is not a valid word in the Language.
  - walking >--->>> walk
  - It is text preprocessing technique.
  - Use in IR Systems : Information Retrival System
  - For stemming we use `nltk` library.
  - NLTK -
    - Porter Stemming
    - Snow Ball Stemmer
- Some times the output of __stemming__ will not be a english word. To solve this problem we use __lemmatization__. Work of __lemmatization__ is same as __stemming__.
- Stemming is faster than lemmatization.
- <b style="color:blue">__Lemmatization__</b> : Lemmatization, unlike Stemming, reduces the inflected words properly ensuring that the root word belongs to the language. In Lemmatization root word is called __Lemma__. A lemma (plural lemmas or lemmata) is the canonical form, dictionary from, or citation form of a set of words.
- Stemming use algoriths to eliminate prefix, infix or postfix. Where Lemmatization use a dictionary to search the word. We use __WordNet Lemmatizer__ dictionary, which is a __Lexical Dictionary__. Stemming is faster than Lemmatization.

### 1. Stemming

In [50]:
# Stemming
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])


In [57]:
sample = "walk walks walking walked"
sample = stem_words(sample)
print(sample)

sample = "I am going to read a books"
sample = stem_words(sample)
print(sample)

walk walk walk walk
i am go to read a book


### 2. Lemmatization

In [64]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\kumar\AppData\Roaming\nltk_data...


True

In [66]:
# Lemmatization
import nltk
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

sentence = "He was running and eating at some time. He has bad habbit of swimming after playing long hours in Sun."
punctuations = "?:!.,;"
sentence_words = nltk.word_tokenize(sentence)
for word in sentence_words:
    if word in punctuations:
        sentence_words.remove(word)

sentence_words
print("{0:20}{1:20}".format("Word", "Lemma"))
for word in sentence_words:
    print("{0:20}{1:20}".format(word, wordnet_lemmatizer.lemmatize(word, pos='v'))) 
    # pov='v' // pos:part of speech, v:verb


Word                Lemma               
He                  He                  
was                 be                  
running             run                 
and                 and                 
eating              eat                 
at                  at                  
some                some                
time                time                
He                  He                  
has                 have                
bad                 bad                 
habbit              habbit              
of                  of                  
swimming            swim                
after               after               
playing             play                
long                long                
hours               hours               
in                  in                  
Sun                 Sun                 
