In [1]:
# Importing the `re` module for regular expression operations (e.g., text preprocessing).

import re

# Importing `pandas` for data manipulation and analysis.

import pandas as pd

# Importing `numpy` for numerical computations and handling arrays.

import numpy as np

# Importing `LabelEncoder` from sklearn for encoding target labels into numerical format.

from sklearn.preprocessing import LabelEncoder

# Importing `train_test_split` to split the dataset into training and testing subsets.

from sklearn.model_selection import train_test_split

# Importing the main `keras` library to build and train deep learning models.

import keras

# Importing `classification_report` to evaluate the performance of the model by generating detailed classification metrics.

from sklearn.metrics import classification_report

# Importing `accuracy_score` to compute the accuracy of the classification model.

from sklearn.metrics import accuracy_score

# Importing `math` for mathematical operations (e.g., logarithms, square roots, etc.).

import math

# Importing `nltk`, a natural language processing library, for tasks such as tokenization, stemming, and stopword removal.

import nltk

In [2]:
# Ignore the Warning

import warnings
warnings.filterwarnings("ignore")

## Load the dataset

In [3]:
df=pd.read_csv("C:/Users/Pramoda A S/Desktop/AIML Documents/DataSets in CSV files/Movie Review Dataset.csv")
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [4]:
# Pick any random Review

df['review'][3]

"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."

In [5]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [6]:
#sentiment count

df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

## Data Preprocessing

### Remove HTML tags, URLs, non-alphanumeric characters & Convert to lowercasev

In [7]:
import re



def remove_tags(string):

    removelist = ""  # Add any characters you'd like to keep

    # Remove HTML tags

    result = re.sub(r'<[^>]+>', '', string)

    # Remove URLs

    result = re.sub(r'https?://\S+', '', result)

    # Remove non-alphanumeric characters (except for those in the removelist)

    result = re.sub(r'[^a-zA-Z0-9' + removelist + r'\s]', ' ', result)

    # Convert to lowercase

    result = result.lower()

    return result

In [8]:
string = "<html><body><p> Movie 1</p><p> Actor - Aamir Khan</p><p> Click here to <a href='http://google.com'>download</a></p></body></html>"

print(remove_tags(string))

 movie 1 actor   aamir khan click here to download


In [9]:
#Suppose we have the FOllowing Text With URL.

string1 = 'Check out my notebook https://www.kaggle.com/campusx/notebook8223fc1abb'

print(remove_tags(string1))

check out my notebook 


In [10]:
# Text With Punctuation.

string2 = "The quick brown fox jumps over the lazy dog. However, the dog doesn't seem impressed! Oh no, it just yawned. How disappointing! Maybe a squirrel would elicit a reaction. Alas, the fox is out of luck."

# Remove Punctuation.

print(remove_tags(string2))

the quick brown fox jumps over the lazy dog  however  the dog doesn t seem impressed  oh no  it just yawned  how disappointing  maybe a squirrel would elicit a reaction  alas  the fox is out of luck 


In [11]:
# Apply Function to Remove HTML Tags in our Dataset Colum Review.

df['review'] = df['review'].apply(remove_tags)

In [12]:
df['review'][3]

'basically there s a family where a little boy  jake  thinks there s a zombie in his closet   his parents are fighting all the time this movie is slower than a soap opera    and suddenly  jake decides to become rambo and kill the zombie ok  first of all when you re going to make a film you must decide if its a thriller or a drama  as a drama the movie is watchable  parents are divorcing   arguing like in real life  and then we have jake with his closet which totally ruins all the film  i expected to see a boogeyman similar movie  and instead i watched a drama with some meaningless thriller spots 3 out of 10 just for the well playing parents   descent dialogs  as for the shots with jake  just ignore them '

### Remove stopwords

In [13]:
#nltk.download('stopwords')

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

df['review'] = df['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [14]:
import nltk

from nltk.tokenize import WhitespaceTokenizer

from nltk.stem import WordNetLemmatizer



# Download required NLTK resources

#nltk.download('wordnet')

#nltk.download('omw-1.4')



# Initialize tokenizer and lemmatizer

w_tokenizer = WhitespaceTokenizer()

lemmatizer = WordNetLemmatizer()

### Lemmatization

In [18]:
# Define the lemmatization function

def lemmatize_text(text):

    # Tokenize and lemmatize in a single step using a list comprehension

    lemmatized = [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

    # Join the lemmatized tokens back into a string

    return ' '.join(lemmatized)



# Apply the function to the 'review' column

df['review'] = df['review'].apply(lemmatize_text)

In [19]:
# Initialize a variable to store the total word count

s = 0.0

# Loop through each review in the 'review' column

for i in df['review']:

    # Split the review into a list of words

    word_list = i.split()

    # Add the number of words in the current review to the total word count

    s = s + len(word_list)

# Calculate the average review length by dividing the total word count by the number of reviews

print("Average length of each review : ", s / df.shape[0])

# Initialize a counter for positive sentiment reviews

pos = 0

# Loop through each row in the DataFrame

for i in range(df.shape[0]):

    # Check if the sentiment of the current review is 'positive'

    if df.iloc[i]['sentiment'] == 'positive':

        # Increment the positive sentiment counter

        pos = pos + 1

# Calculate the number of negative sentiment reviews

neg = df.shape[0] - pos

# Calculate and print the percentage of positive sentiment reviews

print("Percentage of reviews with positive sentiment is " + str(pos / df.shape[0] * 100) + "%")

# Calculate and print the percentage of negative sentiment reviews

print("Percentage of reviews with negative sentiment is " + str(neg / df.shape[0] * 100) + "%")

Average length of each review :  119.5824
Percentage of reviews with positive sentiment is 50.0%
Percentage of reviews with negative sentiment is 50.0%


In [20]:
df.head()

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching 1 oz episode h...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically family little boy jake think zombie ...,negative
4,petter mattei love time money visually stunnin...,positive


### Encoding Labels and Split train and test

In [21]:
reviews = df['review'].values

labels = df['sentiment'].values

encoder = LabelEncoder()

encoded_labels = encoder.fit_transform(labels)

In [22]:
#split the dataset

#train dataset

train_reviews=df.review[:40000]

train_sentiments=df.sentiment[:40000]

#test dataset

test_reviews=df.review[40000:]

test_sentiments=df.sentiment[40000:]

print(train_reviews.shape,train_sentiments.shape)

print(test_reviews.shape,test_sentiments.shape)

(40000,) (40000,)
(10000,) (10000,)


## Spelling Correction
### involves identifying and correcting misspelled words in a piece of text. It's a crucial step in many Natural Language Processing (NLP) tasks like text cleaning, search engines, chatbots, and more. Correcting spelling mistakes ensures the data is clean, improves readability, and enhances the performance of machine learning models

## How Spelling Correction Works

## Detect Misspelled Words

## Identify words in the text that are not valid according to a dictionary or linguistic model.

## Suggest Corrections

## Provide the closest valid word(s) based on the identified error.

## Replace the Word

## Replace the misspelled word with the most suitable correction.

In [23]:
from spellchecker import SpellChecker



def correct_spelling(text):

    spell = SpellChecker()

    words = text.split()

    corrected_words = []



    for word in words:

        corrected = spell.correction(word)

        # If correction is None, use the original word

        corrected_words.append(corrected if corrected else word)



    return " ".join(corrected_words)



# Apply spelling correction

train_reviews = [correct_spelling(review) for review in train_reviews]

test_reviews = [correct_spelling(review) for review in test_reviews] 

ModuleNotFoundError: No module named 'indexer'

## Building the model

### Bags of words

In [24]:
# Import the required class

from sklearn.feature_extraction.text import CountVectorizer



# CountVectorizer for bag-of-words representation

# Using valid min_df and max_df values

cv = CountVectorizer(min_df=1, max_df=0.95, binary=False, ngram_range=(1, 3))



# Fit and transform the train reviews

cv_train_reviews = cv.fit_transform(train_reviews)



# Transform the test reviews

cv_test_reviews = cv.transform(test_reviews)



# Output the shapes

print('BOW_cv_train:', cv_train_reviews.shape)

print('BOW_cv_test:', cv_test_reviews.shape)



# Vocabulary size

vocab = cv.get_feature_names_out()

print('Vocabulary size:', len(vocab))

BOW_cv_train: (40000, 6856109)
BOW_cv_test: (10000, 6856109)
Vocabulary size: 6856109


## Tfidf vectorizer

In [None]:
# Import the required class

from sklearn.feature_extraction.text import TfidfVectorizer



# Initialize TfidfVectorizer

tv = TfidfVectorizer(min_df=1, max_df=0.95, use_idf=True, ngram_range=(1, 3))



# Fit and transform the train reviews

tv_train_reviews = tv.fit_transform(train_reviews)



# Transform the test reviews

tv_test_reviews = tv.transform(test_reviews)



# Output the shapes

print('Tfidf_train:', tv_train_reviews.shape)

print('Tfidf_test:', tv_test_reviews.shape)



# Vocabulary size

vocab = tv.get_feature_names_out()

print('Vocabulary size:', len(vocab))

In [None]:
# Import necessary libraries

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.naive_bayes import MultinomialNB

from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, classification_report

from sklearn.preprocessing import LabelEncoder


In [None]:
# Assuming you have already encoded your labels

# reviews = data['review'].values

# labels = data['sentiment'].values



# Encode sentiments

encoder = LabelEncoder()

encoded_labels = encoder.fit_transform(labels)


In [None]:
# Split the dataset into train and test

train_reviews = df['review'][:40000]

train_sentiments = encoded_labels[:40000]

test_reviews = df['review'][40000:]

test_sentiments = encoded_labels[40000:]

In [None]:
# Ensure shapes

print(train_reviews.shape, train_sentiments.shape)

print(test_reviews.shape, test_sentiments.shape)


In [None]:
# Vectorize the reviews (you can choose CountVectorizer or TfidfVectorizer)

# Using CountVectorizer

cv = CountVectorizer(min_df=1, max_df=0.95, ngram_range=(1, 3))

cv_train_reviews = cv.fit_transform(train_reviews)

cv_test_reviews = cv.transform(test_reviews)

In [None]:
# Or use TfidfVectorizer

# tv = TfidfVectorizer(min_df=1, max_df=0.95, ngram_range=(1, 3))

# tv_train_reviews = tv.fit_transform(train_reviews)

# tv_test_reviews = tv.transform(test_reviews)



# Train-Test Split (if not already split)

X_train, X_val, y_train, y_val = train_test_split(cv_train_reviews, train_sentiments, test_size=0.2, random_state=42)


In [None]:
# Initialize the models

logreg = LogisticRegression(max_iter=1000)

nb = MultinomialNB()

svc = SVC(kernel='linear')

In [None]:
logreg.fit(X_train, y_train)

In [None]:
# Train the models

logreg.fit(X_train, y_train)

nb.fit(X_train, y_train)

svc.fit(X_train, y_train)


In [None]:
# Make predictions

logreg_preds = logreg.predict(X_val)

nb_preds = nb.predict(X_val)

svc_preds = svc.predict(X_val)


In [None]:
# Evaluate the models

print("Logistic Regression Accuracy: ", accuracy_score(y_val, logreg_preds))

print("Naive Bayes Accuracy: ", accuracy_score(y_val, nb_preds))

print("SVM Accuracy: ", accuracy_score(y_val, svc_preds))

In [None]:
# Print classification reports

print("Logistic Regression Classification Report: ")

print(classification_report(y_val, logreg_preds))

print("Naive Bayes Classification Report: ")

print(classification_report(y_val, nb_preds))

print("SVM Classification Report: ")

print(classification_report(y_val, svc_preds))