#### Dataset Link
https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [3]:
import pandas as pd

##### Dataset Import

In [4]:
data_path = "IMDB Dataset.csv"

In [5]:
df = pd.read_csv(data_path)

In [6]:
df.shape

(50000, 2)

In [7]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
df = df.head(100)

In [9]:
df.shape

(100, 2)

In [10]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


##### Lower Case Conversion

In [12]:
df['review'][5]

'Probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it\'s not preachy or boring. It just never gets old, despite my having seen it some 15 or more times in the last 25 years. Paul Lukas\' performance brings tears to my eyes, and Bette Davis, in one of her very few truly sympathetic roles, is a delight. The kids are, as grandma says, more like "dressed-up midgets" than children, but that only makes them more fun to watch. And the mother\'s slow awakening to what\'s happening in the world and under her own roof is believable and startling. If I had a dozen thumbs, they\'d all be "up" for this movie.'

In [14]:
df['review'] = df['review'].str.lower()

In [15]:
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive
...,...,...
95,daniel day-lewis is the most versatile actor a...,positive
96,my guess would be this was originally going to...,negative
97,"well, i like to watch bad horror b-movies, cau...",negative
98,"this is the worst movie i have ever seen, as w...",negative


In [16]:
df['review'][5]

'probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it\'s not preachy or boring. it just never gets old, despite my having seen it some 15 or more times in the last 25 years. paul lukas\' performance brings tears to my eyes, and bette davis, in one of her very few truly sympathetic roles, is a delight. the kids are, as grandma says, more like "dressed-up midgets" than children, but that only makes them more fun to watch. and the mother\'s slow awakening to what\'s happening in the world and under her own roof is believable and startling. if i had a dozen thumbs, they\'d all be "up" for this movie.'

##### Remove HTML Tags

In [19]:
import re # Python re (regular expression) module

In [20]:
def remove_html_tags(texts):
    pattern = re.compile('<.*?>') # The pattern <.*?> is used to match HTML tags
    return pattern.sub(r'', texts) # Substitute Matches with an Empty String

In [21]:
text = "<html><body><p> Movie 1</p><p> Actor - Aamir Khan</p><p> Click here to <a href='http://google.com'>download</a></p></body></html>"

In [22]:
remove_html_tags(text)

' Movie 1 Actor - Aamir Khan Click here to download'

In [23]:
df['review'] = df['review'].apply(remove_html_tags)

In [24]:
df['review'][8]

"encouraged by the positive comments about this film on here i was looking forward to watching this film. bad mistake. i've seen 950+ films and this is truly one of the worst of them - it's awful in almost every way: editing, pacing, storyline, 'acting,' soundtrack (the film's only song - a lame country tune - is played no less than four times). the film looks cheap and nasty and is boring in the extreme. rarely have i been so happy to see the end credits of a film. the only thing that prevents me giving this a 1-score is harvey keitel - while this is far from his best performance he at least seems to be making a bit of an effort. one for keitel obsessives only."

##### Remove URL

In [26]:
def remove_URL(texts):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', texts)

In [28]:
text1 = 'Youtube : https://www.youtube.com/'
text2 = 'Facebook : https://www.facebook.com/'

In [29]:
remove_URL(text1)

'Youtube : '

In [30]:
remove_URL(text2)

'Facebook : '

##### Punctuation Handling

In [74]:
import string  # Import the string module to access predefined constants
import time    # Import the time module to measure execution time

In [75]:
exclude = string.punctuation
exclude

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [76]:
def remove_punctuation(texts):
    for char in exclude:
        texts = texts.replace(char, '')
    return texts

In [77]:
text = "string!!. With. Punctuation? It's ok !!"

In [78]:
start = time.time()  # Start the timer
print(remove_punctuation(text))
time1 = time.time() - start  # Calculate the elapsed time
print(time1 * 50000)  # Simulate the time it would take to process 50,000 strings

string With Punctuation Its ok 
10.561943054199219


In [81]:
def remove_punctuation1(texts):
    return texts.translate(str.maketrans('', '', string.punctuation))

In [82]:
start = time.time()
print(remove_punctuation1(text))
time1 = time.time() - start
print(time1 * 50000)

string With Punctuation Its ok 
12.969970703125


In [83]:
df['review'][9]

'if you like original gut wrenching laughter you will like this movie. if you are young or old then you will love this movie, hell even my mom liked it.great camp!!!'

In [84]:
remove_punctuation(df['review'][9])

'if you like original gut wrenching laughter you will like this movie if you are young or old then you will love this movie hell even my mom liked itgreat camp'

In [85]:
remove_punctuation1(df['review'][9])

'if you like original gut wrenching laughter you will like this movie if you are young or old then you will love this movie hell even my mom liked itgreat camp'

##### Chat Conversion Handle

In [88]:
chat_words = {
    'AFAIK':'As Far As I Know',
    'AFK':'Away From Keyboard',
    'ASAP':'As Soon As Possible'
}


{
    "FYI": "For Your Information",
    "ASAP": "As Soon As Possible",
    "BRB": "Be Right Back",
    "BTW": "By The Way",
    "OMG": "Oh My God",
    "IMO": "In My Opinion",
    "LOL": "Laugh Out Loud",
    "TTYL": "Talk To You Later",
    "GTG": "Got To Go",
    "TTYT": "Talk To You Tomorrow",
    "IDK": "I Don't Know",
    "TMI": "Too Much Information",
    "IMHO": "In My Humble Opinion",
    "ICYMI": "In Case You Missed It",
    "AFAIK": "As Far As I Know",
    "BTW": "By The Way",
    "FAQ": "Frequently Asked Questions",
    "TGIF": "Thank God It's Friday",
    "FYA": "For Your Action",
    "ICYMI": "In Case You Missed It",
}

{'FYI': 'For Your Information',
 'ASAP': 'As Soon As Possible',
 'BRB': 'Be Right Back',
 'BTW': 'By The Way',
 'OMG': 'Oh My God',
 'IMO': 'In My Opinion',
 'LOL': 'Laugh Out Loud',
 'TTYL': 'Talk To You Later',
 'GTG': 'Got To Go',
 'TTYT': 'Talk To You Tomorrow',
 'IDK': "I Don't Know",
 'TMI': 'Too Much Information',
 'IMHO': 'In My Humble Opinion',
 'ICYMI': 'In Case You Missed It',
 'AFAIK': 'As Far As I Know',
 'FAQ': 'Frequently Asked Questions',
 'TGIF': "Thank God It's Friday",
 'FYA': 'For Your Action'}

In [89]:
def chat_conversion(texts):
    new_text = []

    for w in texts.split():
        if w.upper() in chat_words:
            new_text.append(chat_words[w.upper()])
        
        else:
            new_text.append(w)

    return " ".join(new_text)

In [90]:
chat_conversion('Do this work ASAP')

'Do this work As Soon As Possible'

##### Incorrect Text Handling

In [94]:
from textblob import TextBlob

In [95]:
incorrect_text = 'ceertain conditionas duriing seveal ggenerations aree moodified in the saame maner.'

In [96]:
textBLB = TextBlob(incorrect_text)
textBLB.correct().string

'certain conditions during several generations are modified in the same manner.'

##### Stopwords

In [99]:
from nltk.corpus import stopwords
import nltk

In [100]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sifat\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [101]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [102]:
len(stopwords.words('english'))

179

In [103]:
def remove_stopwords(texts):
    new_text = []

    for w in texts.split():
        if w in stopwords.words('english'):
            new_text.append('')
        
        else:
            new_text.append(w)

    x = new_text[:]
    new_text.clear()
    return " ".join(x)

In [104]:
df['review'][11]

"i saw this movie when i was about 12 when it came out. i recall the scariest scene was the big bird eating men dangling helplessly from parachutes right out of the air. the horror. the horror.as a young kid going to these cheesy b films on saturday afternoons, i still was tired of the formula for these monster type movies that usually included the hero, a beautiful woman who might be the daughter of a professor and a happy resolution when the monster died in the end. i didn't care much for the romantic angle as a 12 year old and the predictable plots. i love them now for the unintentional humor.but, about a year or so later, i saw psycho when it came out and i loved that the star, janet leigh, was bumped off early in the film. i sat up and took notice at that point. since screenwriters are making up the story, make it up to be as scary as possible and not from a well-worn formula. there are no rules."

In [105]:
remove_stopwords(df['review'][11])

' saw  movie     12   came out.  recall  scariest scene   big bird eating men dangling helplessly  parachutes right    air.  horror.  horror.as  young kid going   cheesy b films  saturday afternoons,  still  tired   formula   monster type movies  usually included  hero,  beautiful woman  might   daughter   professor   happy resolution   monster died   end.   care much   romantic angle   12 year old   predictable plots.  love     unintentional humor.but,   year   later,  saw psycho   came    loved   star, janet leigh,  bumped  early   film.  sat   took notice   point. since screenwriters  making   story, make      scary  possible     well-worn formula.    rules.'

In [106]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [107]:
df['review'].apply(remove_stopwords)

0     one    reviewers  mentioned   watching  1 oz e...
1      wonderful little production.  filming techniq...
2      thought    wonderful way  spend time    hot s...
3     basically there's  family   little boy (jake) ...
4     petter mattei's "love   time  money"   visuall...
                            ...                        
95    daniel day-lewis    versatile actor alive. eng...
96     guess would    originally going    least two ...
97    well,  like  watch bad horror b-movies, cause ...
98       worst movie   ever seen,  well as,  worst  ...
99        mario fan   long    remember,    fond memo...
Name: review, Length: 100, dtype: object

##### Remove Emoji Handle

In [108]:
import re
def remove_emoji(texts):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [109]:
remove_emoji("Loved the movie. It was 😘😘")

"string!!. With. Punctuation? It's ok !!"

In [110]:
remove_emoji("Lmao 😂😂")

"string!!. With. Punctuation? It's ok !!"

In [115]:
import emoji

In [116]:
print(emoji.demojize('Python is 🔥'))

Python is :fire:


In [117]:
print(emoji.demojize('Loved the movie. It was 😘'))

Loved the movie. It was :face_blowing_a_kiss:
