In [103]:
#importing necessary libs
import pandas as pd             
import numpy as np               
import re                        
import string                   
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [104]:
#reading data
data = pd.read_csv("IMDB Dataset.csv")

In [105]:
#exploring data
data.shape
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [106]:
#exploring values of the two columns
# data[data['sentiment']=='positive'].shape[0]
# data[data['sentiment']=='negative'].shape[0]
data['sentiment'].value_counts()
data['review'].isna()

0        False
1        False
2        False
3        False
4        False
         ...  
49995    False
49996    False
49997    False
49998    False
49999    False
Name: review, Length: 50000, dtype: bool

In [107]:

# def clean_text(text):
#     text = re.sub('<.*?>', '', text) 
#     text = re.sub(f"[{re.escape(string.punctuation)}]", "", text) 
#     text = text.lower()  
#     return text

# df['clean_review'] = df['review'].apply(clean_text)
# df.drop(columns=['review'], inplace=True)

# df[['review', 'clean_review']].head()


In [108]:
#some statistics about data
data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [109]:
# Text Cleaning : HTML Tags, LowerCase, Numbers, Stopwords, Lemmatization
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')  #for lemmatizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = re.sub('<.*?>', '', text) #HTML tags removal
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text) # punctuation removal
    text = text.lower() #all text to lowercase
    
    # remove numbers
    text = re.sub(r'\d+', '', text)
    
    #split text into words, remove stopwords, and lemmatize 
    words = text.split()
    cleaned_words = [
        lemmatizer.lemmatize(word)
        for word in words
        if word not in stop_words
    ]
    
    return " ".join(cleaned_words)



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\VICTUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\VICTUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\VICTUS\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [110]:

# incorrect_text = 'I aam not shure if this is a gaood movie or noit, but I thhink it is.'
# # Calling function
# textBlb1 = TextBlob(incorrect_text)
# # Corrected Text
# print(incorrect_text)
# print(textBlb1.correct().string)
# for i in range(len(data)):
#     if data['clean_review'][i] is not None:
#         data['clean_review'][i] = TextBlob(data['clean_review'][i]).correct()
# # data = TextBlob(data['clean_review'])
# data['clean_review']

In [111]:
#Splitting data into training and testing sets
X = data['review']
y = data['sentiment']

#Split data into trainging and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [112]:
#Vectorizing text data.
vectoried = TfidfVectorizer(max_features=5000) # we can limit the number of features to the most frequent 5000 words.
X_train_vectoried = vectoried.fit_transform(X_train)
X_test_vectoried = vectoried.transform(X_test)


In [113]:
#Training the Naive Bayes model
model = MultinomialNB()
model.fit(X_train_vectoried , y_train) #train the model using the training data
y_pred = model.predict(X_test_vectoried) #make predictions on the test data

#Evaluating the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

Confusion Matrix:
 [[4229  732]
 [ 751 4288]]
Classification Report:
               precision    recall  f1-score   support

    negative       0.85      0.85      0.85      4961
    positive       0.85      0.85      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

Accuracy Score: 0.8517
