In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("IMDB_Dataset.csv")

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Lowercasing

In [5]:
df['review'] = df['review'].str.lower()

# Remove HTML Tags

In [6]:
import re
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

In [7]:
text = '<p>Hello <b>Rahul</b>! Welcome to <a href="https://example.com">our website</a>.</p> <div>This is a <span style="color:red;">sample</span> paragraph with <i>HTML</i> tags.</div>'

remove_html_tags(text)

'Hello Rahul! Welcome to our website. This is a sample paragraph with HTML tags.'

In [9]:
df['review'] = df['review'].apply(remove_html_tags)

# Remove URLs

In [10]:
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

In [11]:
text1 = 'Check out my notebook https://www.kaggle.com/campusx/notebook8223fc1abb'
text2 = 'Check out my notebook http://www.kaggle.com/campusx/notebook8223fc1abb'
text3 = 'Google search here www.google.com'
text4 = 'For notebook click https://www.kaggle.com/campusx/notebook8223fc1abb to search check www.google.com'

In [13]:
remove_url(text4)

'For notebook click  to search check '

# Remove Punctuation

In [16]:
import string, time
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [17]:
exclude = string.punctuation

In [18]:
def remove_punc(text):
    for char in exclude:
        text = text.replace(char, '')
    return text

In [19]:
text = 'string. With. Punctuation?'

In [20]:
remove_punc(text)

'string With Punctuation'

In [25]:
start = time.perf_counter()
remove_punc(text)
time1 = time.perf_counter() - start

print("Time taken to process one input:", time1, "sec")
print("So, time taken to process 50K inputs:", time1 * 50000, "sec")

Time taken to process one input: 6.160000339150429e-05 sec
So, time taken to process 50K inputs: 3.0800001695752144 sec


In [26]:
# Faster method
def remove_punc1(text):
    return text.translate(str.maketrans('', '', exclude))

In [27]:
start = time.perf_counter()
remove_punc1(text)
time1 = time.perf_counter() - start

print("Time taken to process one input:", time1, "sec")
print("So, time taken to process 50K inputs:", time1 * 50000, "sec")

Time taken to process one input: 4.499999340623617e-05 sec
So, time taken to process 50K inputs: 2.2499996703118086 sec


# Chat Word treatment

In [29]:
chat_words = {
    "A3": "Anytime, Anywhere, Anyplace",
    "ADIH": "Another Day In Hell",
    "AFK": "Away From Keyboard",
    "AFAIK": "As Far As I Know",
    "ASAP": "As Soon As Possible",
    "ASL": "Age, Sex, Location",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "BAE": "Before Anyone Else",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRUH": "Bro",
    "BRT": "Be Right There",
    "BSAAW": "Big Smile And A Wink",
    "BTW": "By The Way",
    "BWL": "Bursting With Laughter",
    "CSL": "Can‚Äôt Stop Laughing",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "DM": "Direct Message",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FIMH": "Forever In My Heart",
    "FOMO": "Fear Of Missing Out",
    "FR": "For Real",
    "FWIW": "For What It's Worth",
    "FYP": "For You Page",
    "FYI": "For Your Information",
    "G9": "Genius",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GMTA": "Great Minds Think Alike",
    "GN": "Good Night",
    "GOAT": "Greatest Of All Time",
    "GR8": "Great!",
    "HBD": "Happy Birthday",
    "IC": "I See",
    "ICQ": "I Seek You",
    "IDC": "I Don‚Äôt Care",
    "IDK": "I Don't Know",
    "IFYP": "I Feel Your Pain",
    "ILU": "I Love You",
    "ILY": "I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMU": "I Miss You",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "IYKYK": "If You Know, You Know",
    "JK": "Just Kidding",
    "KISS": "Keep It Simple, Stupid",
    "L": "Loss",
    "L8R": "Later",
    "LDR": "Long Distance Relationship",
    "LMK": "Let Me Know",
    "LMAO": "Laughing My A** Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "M8": "Mate",
    "MFW": "My Face When",
    "MID": "Mediocre",
    "MRW": "My Reaction When",
    "MTE": "My Thoughts Exactly",
    "NVM": "Never Mind",
    "NRN": "No Reply Necessary",
    "NPC": "Non-Player Character",
    "OIC": "Oh I See",
    "OP": "Overpowered",
    "PITA": "Pain In The A**",
    "POV": "Point Of View",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A** Off",
    "RN": "Right Now",
    "SK8": "Skate",
    "STATS": "Your Sex And Age",
    "SUS": "Suspicious",
    "TBH": "To Be Honest",
    "TFW": "That Feeling When",
    "THX": "Thank You",
    "TIME": "Tears In My Eyes",
    "TLDR": "Too Long, Didn‚Äôt Read",
    "TNTL": "Trying Not To Laugh",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "W": "Win",
    "W8": "Wait...",
    "WB": "Welcome Back",
    "WTF": "What The F**k",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "WYD": "What You Doing?",
    "WYWH": "Wish You Were Here",
    "ZZZ": "Sleeping, Bored, Tired"
}

In [30]:
def chat_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

In [35]:
chat_conversion('FYI New Delhi is the capital of India')

'For Your Information New Delhi is the capital of India'

# Spelling Correction

In [36]:
from textblob import TextBlob

In [37]:
incorrect_text = 'ceertain conditionas duriing seveal ggenerations aree moodified in the saame maner.'
textblb = TextBlob(incorrect_text)
textblb.correct().string

'certain conditions during several generations are modified in the same manner.'

# Removing Stopwords

In [38]:
from nltk.corpus import stopwords

In [39]:
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [40]:
def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word not in stopwords.words('english'):
            new_text.append(word)
    return ' '.join(new_text)

In [41]:
remove_stopwords("This is a simple sentence that is written for the purpose of demonstrating stopwords in a clear way.")

'This simple sentence written purpose demonstrating stopwords clear way.'

In [44]:
df.shape

(50000, 2)

# Handling Emojis

In [48]:
# Remove Emoji

def remove_emoji(text):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [49]:
remove_emoji("Loved the movie. It was üòçüòò")

'Loved the movie. It was '

In [50]:
remove_emoji("Lmao üòÇüòÅ")

'Lmao '

In [51]:
# Replace Emoji

import emoji
print(emoji.demojize("Loved the movie. It was üòçüòò"))

Loved the movie. It was :smiling_face_with_heart-eyes::face_blowing_a_kiss:


In [52]:
print(emoji.demojize('Python is üî•'))

Python is :fire:


# Tokenization

### 1. Using the split() function

In [53]:
# word tokenization
sent1 = "i am going to delhi"
sent1.split()

['i', 'am', 'going', 'to', 'delhi']

In [54]:
# sentence tokenization
sent2 = "I am going to Delhi. I am gonna stay there for 3 days"
sent2.split('.')

['I am going to Delhi', ' I am gonna stay there for 3 days']

In [57]:
# Problems with split function
sent3 = "I am going to Delhi!"
sent3.split()

['I', 'am', 'going', 'to', 'Delhi!']

In [58]:
# See. Delhi! is appearning together instead of 'Delhi' and '!'

In [59]:
sent4 = "Where do you think I should go? I have 3 days leave"
sent4.split(".")
# We can see in the output that split function method is no tdoing the desired sentence tokenization

['Where do you think I should go? I have 3 days leave']

### 2. Regular Expression

In [60]:
import re
sent3 = 'I am going to delhi!'
tokens = re.findall("[\w]+", sent3)
tokens

['I', 'am', 'going', 'to', 'delhi']

In [61]:
text = """Lorem Ipsum is simply dummy text of the printing and typesetting industry? 
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, 
when an unknown printer took a galley of type and scrambled it to make a type specimen book."""
sentences = re.compile('[.!?] ').split(text)
sentences

['Lorem Ipsum is simply dummy text of the printing and typesetting industry',
 "\nLorem Ipsum has been the industry's standard dummy text ever since the 1500s, \nwhen an unknown printer took a galley of type and scrambled it to make a type specimen book."]

In [62]:
# But still we have to design regex based on our data. It is more convenient to use standard libraries like NLTK and Spacy.

### 3. NLTK

In [63]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [64]:
sent1 = "I am going to Delhi!"
word_tokenize(sent1)

['I', 'am', 'going', 'to', 'Delhi', '!']

In [65]:
text = """Lorem Ipsum is simply dummy text of the printing and typesetting industry? 
Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, 
when an unknown printer took a galley of type and scrambled it to make a type specimen book."""
sent_tokenize(text)

['Lorem Ipsum is simply dummy text of the printing and typesetting industry?',
 "Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, \nwhen an unknown printer took a galley of type and scrambled it to make a type specimen book."]

In [73]:
sent5 = 'I have a Ph.D in A.I'
sent6 = "We're here to help! mail us at nks@gmail.com"
sent7 = 'A 5km ride cost $10.50'
print(word_tokenize(sent5))
print(word_tokenize(sent6))
print(word_tokenize(sent7))

['I', 'have', 'a', 'Ph.D', 'in', 'A.I']
['We', "'re", 'here', 'to', 'help', '!', 'mail', 'us', 'at', 'nks', '@', 'gmail.com']
['A', '5km', 'ride', 'cost', '$', '10.50']


In [77]:
# As we can see above in the second output, NLTK is also not perfect as it is splitting the EmailID. It can make mistakes. Spacy generally performs better. But NLTK is aso very good as we can see.

### 4. Spacy

In [80]:
# import spacy
# nlp = spacy.load("en_core_web_sm")
# nlp(sent5)

# spacy import is resulting in some error because of some conflict with PyTorch. Need some fix.

# Stemming

In [81]:
from nltk.stem.porter import PorterStemmer

In [82]:
ps = PorterStemmer()

def stem_words(text):
    return " ".join(ps.stem(word) for word in text.split())

In [85]:
sample = "walk walks walking walked"
stem_words(sample)

'walk walk walk walk'

In [87]:
sample = "study studies studying"
stem_words(sample)

'studi studi studi'

In [88]:
# See. That's the problem with stemming. It doesn't always produce a real word. 
# That's why Lemmatization is used which produces real words but takes more time than stemming.
# So, if we need to present the reduced words to the user, then we must use lemmatization.

# Lemmatization

In [89]:
import nltk

In [None]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

sentence = "He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun."
punctuations="?:!.,;"
sentence_words = nltk.word_tokenize(sentence)
for word in sentence_words:
    if word in punctuations:
        sentence_words.remove(word)

sentence_words
print("{0:20}{1:20}".format("Word", "Lemma"))
for word in sentence_words:
    print ("{0:20}{1:20}".format(word,wordnet_lemmatizer.lemmatize(word, pos = 'v')))  # by pos parameter, we are specifying part of speech ki aapko kis form mein word chahiye. pos = 'v' means hume words verb form mein chahiye.

Word                Lemma               
He                  He                  
was                 be                  
running             run                 
and                 and                 
eating              eat                 
at                  at                  
same                same                
time                time                
He                  He                  
has                 have                
bad                 bad                 
habit               habit               
of                  of                  
swimming            swim                
after               after               
playing             play                
long                long                
hours               hours               
in                  in                  
the                 the                 
Sun                 Sun                 
