# **Dataset**

In [1]:
import pandas as pd

In [6]:
# Try reading the CSV with the 'python' engine and skipping bad lines
df = pd.read_csv("/content/IMDB Dataset.csv", engine='python', on_bad_lines='skip')

In [7]:
df.shape

(15744, 2)

In [8]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# **Converting into Lower Case**

In [9]:
df['review'][3]

"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."

In [10]:
# converting this column into lower case
df['review'][3].lower()

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.<br /><br />ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

In [11]:
# converting every column into lower case
df['review'] = df['review'].str.lower()

In [12]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


# **Remove HTML Tags**

In [13]:

import re
def remove_html_tags(text):
  pattern = re.compile('<.*?>')
  return pattern.sub('r', text)

In [14]:
text = "basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.<br /><br />ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life."

In [15]:
remove_html_tags(text)

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.rrthis movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.rrok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life."

In [16]:
df['review'] = df['review'].apply(remove_html_tags)

In [17]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. rrthe filming t...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


# **Remove URLs**

In [18]:
def remove_url(text):
  pattern = re.compile(r'https?://\S+|www\.\S+')
  return pattern.sub(r'', text)

In [19]:
text1 = "For more design inspiration, visit https://dribbble.com."
text2 = "Visit our portfolio at https://satishnaidu.dev for more details."
text3 = "Google search here www.google.com."
text4 = "I uploaded the project to GitHub: https://github.com/username/project-name."

In [20]:
remove_url(text1)

'For more design inspiration, visit '

In [21]:
remove_url(text2)

'Visit our portfolio at  for more details.'

In [22]:
df['review'] = df['review'].apply(remove_url)

# **Remove Punctuation**

In [23]:
import string
import time
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [24]:
exclude = string.punctuation

In [25]:
# step-1
def remove_punc(text):
  for char in exclude:
    text = text.replace(char, '')
  return text

In [26]:
text = 'string. With. Punctuation?'

In [27]:
remove_punc(text)

'string With Punctuation'

In [28]:
# how much time it taking to remove
start = time.time()
print(remove_punc(text))
time1 = time.time() - start
print(time1)

string With Punctuation
0.0001800060272216797


In [29]:
# step-2
def remove_punc1(text):
  return text.translate(str.maketrans('', '', exclude))

In [30]:
# how much time it taking to remove
start = time.time()
print(remove_punc1(text))
time2 = time.time() - start
print(time2)

string With Punctuation
0.0001392364501953125


In [31]:
print(time2/time1) # step-2 is faster compared to step-1

0.7735099337748345


In [32]:
df['review'] = df['review'].apply(remove_punc)

In [33]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production rrthe filming te...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


# **Chat word treatment**

In [34]:
# function to convert text data into python dictionary
slang_dict = {}

with open("slang.txt", "r", encoding="utf-8") as file:
    for line in file:
        line = line.strip()
        if not line:
            continue  # skip empty lines
        # Try splitting by common delimiters
        for delimiter in ['=', ':', '\t', '–', '—', '-']:
            if delimiter in line:
                parts = line.split(delimiter, 1)
                key = parts[0].strip()
                value = parts[1].strip()
                slang_dict[key] = value
                break  # Stop after the first matching delimiter

# Optional: Print the result
for k, v in slang_dict.items():
    print(f"{k} => {v}")


AFAIK => As Far As I Know
AFK => Away From Keyboard
ASAP => As Soon As Possible
ATK => At The Keyboard
ATM => At The Moment
A3 => Anytime, Anywhere, Anyplace
BAK => Back At Keyboard
BBL => Be Back Later
BBS => Be Back Soon
BFN => Bye For Now
B4N => Bye For Now
BRB => Be Right Back
BRT => Be Right There
BTW => By The Way
B4 => Before
CU => See You
CUL8R => See You Later
CYA => See You
FAQ => Frequently Asked Questions
FC => Fingers Crossed
FWIW => For What It's Worth
FYI => For Your Information
GAL => Get A Life
GG => Good Game
GN => Good Night
GMTA => Great Minds Think Alike
GR8 => Great!
G9 => Genius
IC => I See
ICQ => I Seek you (also a chat program)
ILU => ILU: I Love You
IMHO => In My Honest/Humble Opinion
IMO => In My Opinion
IOW => In Other Words
IRL => In Real Life
KISS => Keep It Simple, Stupid
LDR => Long Distance Relationship
LMAO => Laughing my a** off
LOL => Laughing out loud
LTNS => Long Time No See
L8R => Later
MTE => My Thoughts Exactly
M8 => Mate
NRN => No Reply Necessa

In [35]:
def chat_conversion(text):
  new_text = []
  for w in text.split():
    if w.upper() in slang_dict:
       new_text.append(slang_dict[w.upper()])
    else:
      new_text.append(w)
  return " ".join(new_text)

In [36]:
chat_conversion("IMHO he is the best")

'In My Honest/Humble Opinion he is the best'

In [37]:
chat_conversion("FYI delhi is the capital of india")

'For Your Information delhi is the capital of india'

In [38]:
# Our dataset does not have these type of words

# **Spelling Correction**

In [39]:
from textblob import TextBlob

In [40]:
incorrect_text = "Yesturday I went to the libary to borow some books about artifisial inteligence."
textBlb = TextBlob(incorrect_text)
textBlb.correct().string

'Yesterday I went to the library to brow some books about artificial intelligence.'

# **Removing Stop words**

In [41]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [42]:
from nltk.corpus import stopwords

In [43]:
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [44]:
def remove_stopwords(text):
  new_text = []
  for word in text.split():
    if word in stopwords.words('english'):
      new_text.append('')
    else:
      new_text.append(word)
  x = new_text[:]
  new_text.clear()
  return " ".join(x)

In [45]:
remove_stopwords("She went to the market in the morning because it was the only time she had before work.")

'She went   market   morning      time    work.'

In [46]:
df['review'] = df['review'].apply(remove_stopwords)

In [50]:
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz e...,positive
1,wonderful little production rrthe filming tec...,positive
2,thought wonderful way spend time hot s...,positive
3,basically theres family little boy jake thi...,negative
4,petter matteis love time money visually s...,positive


# **Removing Emojis**

In [51]:
# function to remove emojis
import re

def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [52]:
remove_emoji("Good morning! ☀️😊 I hope you have a great day ahead. 💪✨ Don’t forget to smile 😄 and stay positive! 🌈❤️ Let’s make today amazing! 🚀🔥")

'Good morning!  I hope you have a great day ahead.  Don’t forget to smile  and stay positive!  Let’s make today amazing! '

In [53]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/590.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m583.7/590.6 kB[0m [31m26.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1


In [54]:
# replacing emoji with text
import emoji
print(emoji.demojize('I cannot believe 😲 how fast the week went by.'))

I cannot believe :astonished_face: how fast the week went by.


# **Tokenization**

**1.Using the Split function**

In [55]:
# word tokenization
sent1 = 'I am going to delhi'
sent1.split()

['I', 'am', 'going', 'to', 'delhi']

In [56]:
# sentence tokenization
sent2 = 'I am going to delhi. I will stay there for 3 days. Let\'s hope the trip to be great'
sent2.split('.')

['I am going to delhi',
 ' I will stay there for 3 days',
 " Let's hope the trip to be great"]

In [57]:
# problems with split function
sent3 = 'I am going to delhi!' # it wont split ! mark.
sent3.split()

['I', 'am', 'going', 'to', 'delhi!']

In [59]:
# problems with split function
sent2 = 'I am going to delhi. I will stay there for 3 days? Let\'s hope the trip to be great'
sent2.split('.')

['I am going to delhi',
 " I will stay there for 3 days? Let's hope the trip to be great"]

**2.Regular Expression**

In [60]:
import re
sent3 = 'I am going to delhi!'
tokens = re.findall("[\w']+", sent3)
tokens

['I', 'am', 'going', 'to', 'delhi']

In [62]:
text = "I am going to delhi. I will stay there for 3 days? Let\'s hope the trip to be great"
sentences = re.compile('[.!?] ').split(text)
sentences

['I am going to delhi',
 'I will stay there for 3 days',
 "Let's hope the trip to be great"]

In [63]:
# We have to make patterns, it is very much complex. But, compared to split it is good.

**3.NLTK**

In [66]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [67]:
from nltk.tokenize import word_tokenize,sent_tokenize

In [68]:
sent1 = 'I am going to delhi!'
word_tokenize(sent1)

['I', 'am', 'going', 'to', 'delhi', '!']

In [69]:
text = "I am going to delhi. I will stay there for 3 days? Let\'s hope the trip to be great"
sent_tokenize(text)

['I am going to delhi.',
 'I will stay there for 3 days?',
 "Let's hope the trip to be great"]

In [70]:
sent5 = 'I have a ph.D in A.I'
sent6 = 'we re here to help! mail us john.jay@example.com'
sent7 = 'A 5km ride cost $10.50'

In [71]:
word_tokenize(sent5)

['I', 'have', 'a', 'ph.D', 'in', 'A.I']

In [72]:
word_tokenize(sent6)

['we',
 're',
 'here',
 'to',
 'help',
 '!',
 'mail',
 'us',
 'john.jay',
 '@',
 'example.com']

In [74]:
word_tokenize(sent7)

['A', '5km', 'ride', 'cost', '$', '10.50']

In [75]:
# NLTK has some issues but it is better and simple compared to regular expression

**4.Spacy**

In [76]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [77]:
# convert sentences into documents
doc1 = nlp(sent5)
doc2 = nlp(sent6)
doc3 = nlp(sent7)
doc4 = nlp(sent1)

In [79]:
for token in doc2:
  print(token)

we
re
here
to
help
!
mail
us
john.jay@example.com


In [80]:
for token in doc3:
  print(token)

A
5
km
ride
cost
$
10.50


In [81]:
for token in doc4:
  print(token)

I
am
going
to
delhi
!


In [82]:
# Spacy is better than nltk, but one thing you have to consider that no library performs 100per, every library fails at some point

In [90]:
df['review'] = df['review'].apply(nlp)

In [91]:
df.head()

Unnamed: 0,review,sentiment
0,"(one, , reviewers, , mentioned, , watchi...",positive
1,"( , wonderful, little, production, rrthe, film...",positive
2,"( , thought, , wonderful, way, , spend, ti...",positive
3,"(basically, there, s, , family, , little, b...",negative
4,"(petter, matteis, love, , time, , money, ...",positive


# **Stemming**

In [83]:
from nltk.stem.porter import PorterStemmer

In [84]:
ps = PorterStemmer()
def stem_words(text):
  return " ".join([ps.stem(word) for word in text.split()])

In [85]:
sample = "walk walks walking walked"
stem_words(sample)

'walk walk walk walk'

In [86]:
text = "The children were playing happily in the playground. They enjoyed running, jumping, and shouting while their parents watched them from a distance. Everyone was smiling and talking about their plans for the upcoming holidays."
print(text)

The children were playing happily in the playground. They enjoyed running, jumping, and shouting while their parents watched them from a distance. Everyone was smiling and talking about their plans for the upcoming holidays.


In [87]:
stem_words(text)

'the children were play happili in the playground. they enjoy running, jumping, and shout while their parent watch them from a distance. everyon wa smile and talk about their plan for the upcom holidays.'

In [89]:
# stemming doesnt gives correct form of words so, instead of it we can use Lemmatization
# use case
# stemming -> Use when you no need to show output to user -> fast
# lemmatization -> Use when you have to show output to user -> slow

# **Lemmatization**

In [93]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [94]:
from nltk.stem import WordNetLemmatizer
import nltk

wordnet_lemmatizer = WordNetLemmatizer()

sentence = "He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun."
punctuations = "?!.;,"

sentence_words = nltk.word_tokenize(sentence)

for word in sentence_words:
    if word in punctuations:
        sentence_words.remove(word)

print("{0:20}{1:20}".format("Word", "Lemma"))
for word in sentence_words:
    print("{0:20}{1:20}".format(word, wordnet_lemmatizer.lemmatize(word)))

Word                Lemma               
He                  He                  
was                 wa                  
running             running             
and                 and                 
eating              eating              
at                  at                  
same                same                
time                time                
He                  He                  
has                 ha                  
bad                 bad                 
habit               habit               
of                  of                  
swimming            swimming            
after               after               
playing             playing             
long                long                
hours               hour                
in                  in                  
the                 the                 
Sun                 Sun                 


In [99]:
# Lemmetization on Parts of speech(pos)
for word in sentence_words:
    print("{0:20}{1:20}".format(word, wordnet_lemmatizer.lemmatize(word, pos='v')))

He                  He                  
was                 be                  
running             run                 
and                 and                 
eating              eat                 
at                  at                  
same                same                
time                time                
He                  He                  
has                 have                
bad                 bad                 
habit               habit               
of                  of                  
swimming            swim                
after               after               
playing             play                
long                long                
hours               hours               
in                  in                  
the                 the                 
Sun                 Sun                 
