In [None]:
import re
import string
from textblob import TextBlob
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from nltk.corpus import stopwords
import nltk
import emoji

# Download the NLTK stopwords if you haven't already
nltk.download('stopwords')

# Define a translation table to remove punctuation
translator = str.maketrans('', '', string.punctuation)

# Dictionary for common chat words
chat_words = {
    'AFAIK': 'As Far As I Know',
    'AFK': 'Away From Keyboard',
    'ASAP': 'As Soon As Possible',
    'FYI': 'For Your Information',
    'BRB': 'Be Right Back',
    'BTW': 'By The Way',
    'OMG': 'Oh My God',
    'IMO': 'In My Opinion',
    'LOL': 'Laugh Out Loud',
    'TTYL': 'Talk To You Later',
    'GTG': 'Got To Go',
    'TTYT': 'Talk To You Tomorrow',
    'IDK': "I Don't Know",
    'TMI': 'Too Much Information',
    'IMHO': 'In My Humble Opinion',
    'ICYMI': 'In Case You Missed It',
    'FAQ': 'Frequently Asked Questions',
    'TGIF': "Thank God It's Friday",
    'FYA': 'For Your Action'
}


def preprocess_text(text):
    # Print execution start
    print("Starting preprocessing...")
    # Step 1: Remove HTML tags
    def remove_html_tags(text):
        clean = re.compile('<.*?>')
        cleaned_text = re.sub(clean, '', text)
        normalized_text = re.sub(r'\s+', ' ', cleaned_text).strip()
        return normalized_text

    # Step 2: Remove URLs
    def remove_url(text):
        text_without_url = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        cleaned_text = ' '.join(text_without_url.split())
        return cleaned_text

    # Step 3: Convert to lowercase
    def to_lowercase(text):
        return text.lower()
    
    # Step 4: Remove stopwords
    def remove_stopwords(text):
        new_text = []
        for word in text.split():
            if word not in stopwords.words('english'):
                new_text.append(word)
        return " ".join(new_text)

    # Step 5: Replace chat words with full forms
    def chat_conversion(text):
        new_text = []
        for word in text.split():
            if word.upper() in chat_words:
                new_text.append(chat_words[word.upper()])
            else:
                new_text.append(word)
        return " ".join(new_text)
    
    # Step 6: Remove/Handle emojis
    def remove_emoji(text):
        # Demojize the text, converting emojis to their corresponding text descriptions
        result = emoji.demojize(text)
        # Replace colons and underscores with spaces
        result = result.replace(":", "").replace("_", " ")
        return result



    # Apply all preprocessing steps
    text = remove_html_tags(text)
    text = remove_url(text)
    text = to_lowercase(text)  # Convert to lowercase
    text = remove_stopwords(text)  # Remove stopwords
    text = chat_conversion(text)
    text = remove_emoji(text)  # Remove emojis

    # Print execution end
    print("Preprocessing completed.")

    return text



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [57]:
df = pd.read_csv("D:\WorkSpace\GenAI\GenAI\datasets\IMDB Dataset.csv")

In [72]:
df1 = df.sample(n=500, random_state=1)
df1

Unnamed: 0,review,sentiment
26247,With No Dead Heroes you get stupid lines like ...,negative
44630,"I've seen a few movies in my time, but this on...",positive
46103,Florence Chadwick was actually the far more ac...,negative
16668,Ridiculous horror film about a wealthy man (Jo...,negative
12196,"Well, if you are one of those Katana's film-nu...",positive
...,...,...
43321,Although I'm grateful this obscure gem of 70's...,positive
6151,I had a lot of expectations from this movie an...,negative
37927,Pretty funny stuff. Charlie was still working ...,positive
21146,Spoilers I guess.<br /><br /> The absolutely a...,negative


In [73]:
new_data = {
    'review': "I loved This movie, it was awesomeüòòüòòüëçüëç‚úåÔ∏è‚úåÔ∏èüòäüòä‚ù§Ô∏è‚ù§Ô∏èüëâüëâüíïüíï",
    'sentiment': "Positive"
}

In [74]:
new_data_df = pd.DataFrame([new_data])  # Create a DataFrame from the new data
df1 = pd.concat([df1, new_data_df], ignore_index=True)  # Concatenate the DataFrames

In [75]:
df1

Unnamed: 0,review,sentiment
0,With No Dead Heroes you get stupid lines like ...,negative
1,"I've seen a few movies in my time, but this on...",positive
2,Florence Chadwick was actually the far more ac...,negative
3,Ridiculous horror film about a wealthy man (Jo...,negative
4,"Well, if you are one of those Katana's film-nu...",positive
...,...,...
496,I had a lot of expectations from this movie an...,negative
497,Pretty funny stuff. Charlie was still working ...,positive
498,Spoilers I guess.<br /><br /> The absolutely a...,negative
499,This movie has EVERY clich√© of every terrorism...,negative


In [68]:
df1['review'][500]

'I loved This movie, it was awesomeüòòüòòüëçüëç‚úåÔ∏è‚úåÔ∏èüòäüòä‚ù§Ô∏è‚ù§Ô∏èüëâüëâüíïüíï'

In [76]:
df1['review'] = df1['review'].apply(preprocess_text)

Starting preprocessing...
Preprocessing completed.
Starting preprocessing...
Preprocessing completed.
Starting preprocessing...
Preprocessing completed.
Starting preprocessing...
Preprocessing completed.
Starting preprocessing...
Preprocessing completed.
Starting preprocessing...
Preprocessing completed.
Starting preprocessing...
Preprocessing completed.
Starting preprocessing...
Preprocessing completed.
Starting preprocessing...
Preprocessing completed.
Starting preprocessing...
Preprocessing completed.
Starting preprocessing...
Preprocessing completed.
Starting preprocessing...
Preprocessing completed.
Starting preprocessing...
Preprocessing completed.
Starting preprocessing...
Preprocessing completed.
Starting preprocessing...
Preprocessing completed.
Starting preprocessing...
Preprocessing completed.
Starting preprocessing...
Preprocessing completed.
Starting preprocessing...
Preprocessing completed.
Starting preprocessing...
Preprocessing completed.
Starting preprocessing...
Prepr

In [88]:
df1['review'][500]

'loved movie awesomeface blowing a kissface blowing a kissthumbs upthumbs upvictory handvictory handsmiling face with smiling eyessmiling face with smiling eyesred heartred heartbackhand index pointing rightbackhand index pointing righttwo heartstwo hearts'

In [78]:
df1['review'] = df1['review'].str.translate(translator)

In [89]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [91]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [92]:
df1['sentence_tokens'] = df1['review'].apply(word_tokenize)

In [93]:
df1

Unnamed: 0,review,sentiment,sentence_tokens
0,dead heroes get stupid lines like woefully aby...,negative,"[dead, heroes, get, stupid, lines, like, woefu..."
1,ive seen movies time one exceptional watch tru...,positive,"[ive, seen, movies, time, one, exceptional, wa..."
2,florence chadwick actually far accomplished sw...,negative,"[florence, chadwick, actually, far, accomplish..."
3,ridiculous horror film wealthy man john carrad...,negative,"[ridiculous, horror, film, wealthy, man, john,..."
4,well one katanas filmnuts just like me sure ap...,positive,"[well, one, katanas, filmnuts, just, like, me,..."
...,...,...,...
496,lot expectations movie since yashraj filmjimmy...,negative,"[lot, expectations, movie, since, yashraj, fil..."
497,pretty funny stuff charlie still working towar...,positive,"[pretty, funny, stuff, charlie, still, working..."
498,spoilers guess absolutely absurd logic ending ...,negative,"[spoilers, guess, absolutely, absurd, logic, e..."
499,movie every clich√© every terrorism airliner cr...,negative,"[movie, every, clich√©, every, terrorism, airli..."


In [94]:
from nltk.stem.porter import PorterStemmer

In [95]:
ps = PorterStemmer()
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [96]:
df1['stemmed'] = df1['review'].apply(stem_words)

In [97]:
df1

Unnamed: 0,review,sentiment,sentence_tokens,stemmed
0,dead heroes get stupid lines like woefully aby...,negative,"[dead, heroes, get, stupid, lines, like, woefu...",dead hero get stupid line like woefulli abysm ...
1,ive seen movies time one exceptional watch tru...,positive,"[ive, seen, movies, time, one, exceptional, wa...",ive seen movi time one except watch truli appr...
2,florence chadwick actually far accomplished sw...,negative,"[florence, chadwick, actually, far, accomplish...",florenc chadwick actual far accomplish swimmer...
3,ridiculous horror film wealthy man john carrad...,negative,"[ridiculous, horror, film, wealthy, man, john,...",ridicul horror film wealthi man john carradin ...
4,well one katanas filmnuts just like me sure ap...,positive,"[well, one, katanas, filmnuts, just, like, me,...",well one katana filmnut just like me sure appr...
...,...,...,...,...
496,lot expectations movie since yashraj filmjimmy...,negative,"[lot, expectations, movie, since, yashraj, fil...",lot expect movi sinc yashraj filmjimmi oper ca...
497,pretty funny stuff charlie still working towar...,positive,"[pretty, funny, stuff, charlie, still, working...",pretti funni stuff charli still work toward pe...
498,spoilers guess absolutely absurd logic ending ...,negative,"[spoilers, guess, absolutely, absurd, logic, e...",spoiler guess absolut absurd logic end ruin en...
499,movie every clich√© every terrorism airliner cr...,negative,"[movie, every, clich√©, every, terrorism, airli...",movi everi clich√© everi terror airlin crisi mo...


In [100]:
import nltk
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
wordnet_lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [101]:
# Define a function to lemmatize words in the review column
def lemmatize_review(review):
    # Lemmatize each word in the review
    return ' '.join([wordnet_lemmatizer.lemmatize(word, pos='v') for word in review.split()])

# Apply the function to create a new column 'lemmatized'
df1['lemmatized'] = df1['review'].apply(lemmatize_review)

In [102]:
df1

Unnamed: 0,review,sentiment,sentence_tokens,stemmed,lemmatized
0,dead heroes get stupid lines like woefully aby...,negative,"[dead, heroes, get, stupid, lines, like, woefu...",dead hero get stupid line like woefulli abysm ...,dead heroes get stupid line like woefully abys...
1,ive seen movies time one exceptional watch tru...,positive,"[ive, seen, movies, time, one, exceptional, wa...",ive seen movi time one except watch truli appr...,ive see movies time one exceptional watch trul...
2,florence chadwick actually far accomplished sw...,negative,"[florence, chadwick, actually, far, accomplish...",florenc chadwick actual far accomplish swimmer...,florence chadwick actually far accomplish swim...
3,ridiculous horror film wealthy man john carrad...,negative,"[ridiculous, horror, film, wealthy, man, john,...",ridicul horror film wealthi man john carradin ...,ridiculous horror film wealthy man john carrad...
4,well one katanas filmnuts just like me sure ap...,positive,"[well, one, katanas, filmnuts, just, like, me,...",well one katana filmnut just like me sure appr...,well one katanas filmnuts just like me sure ap...
...,...,...,...,...,...
496,lot expectations movie since yashraj filmjimmy...,negative,"[lot, expectations, movie, since, yashraj, fil...",lot expect movi sinc yashraj filmjimmi oper ca...,lot expectations movie since yashraj filmjimmy...
497,pretty funny stuff charlie still working towar...,positive,"[pretty, funny, stuff, charlie, still, working...",pretti funni stuff charli still work toward pe...,pretty funny stuff charlie still work towards ...
498,spoilers guess absolutely absurd logic ending ...,negative,"[spoilers, guess, absolutely, absurd, logic, e...",spoiler guess absolut absurd logic end ruin en...,spoilers guess absolutely absurd logic end rui...
499,movie every clich√© every terrorism airliner cr...,negative,"[movie, every, clich√©, every, terrorism, airli...",movi everi clich√© everi terror airlin crisi mo...,movie every clich√© every terrorism airliner cr...
