# Importing The Libraries

In [8]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

# Importing The Dataset

In [11]:
df = pd.read_csv(r"C:\Users\itsso\Downloads\Movies Dataset\IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Data Preprocessing

In [12]:
# Tokenize the data
df['review'] = df['review'].apply(word_tokenize)

# Remove stop words
stop_words = set(stopwords.words('english'))
df['review'] = df['review'].apply(lambda x: [word for word in x if word not in stop_words])

# Convert text to lowercase
df['review'] = df['review'].apply(lambda x: [word.lower() for word in x])

# Join the tokenized words into string
df['review']=df['review'].apply(lambda x: ' '.join(x))

# Map sentiment labels to numerical values
df['sentiment'] = df['sentiment'].map({'negative': 0, 'positive': 1})

# Vectorize the text data
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['review'])
y = df['sentiment']

# Split the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,1
1,a wonderful little production . < br / > < br ...,1
2,i thought wonderful way spend time hot summer ...,1
3,basically 's family little boy ( jake ) thinks...,0
4,petter mattei 's `` love time money '' visuall...,1


# Training The Models

In [16]:
# Naive Bayes Classifier
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Random Forest Classifier
rf = RandomForestClassifier(n_estimators=10)
rf.fit(X_train, y_train)

# KNN Classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# XGBoosting Classifier
xgb = xgb.XGBClassifier()
xgb.fit(X_train, y_train)

# Logistic Regression Classifier
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Decision Tree Classifier
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

# Testing The Models

In [32]:
# Predictions on the test data
y_pred_nb = nb.predict(X_test)
y_pred_rf = rf.predict(X_test)
y_pred_knn = knn.predict(X_test)
y_pred_xgb = xgb.predict(X_test)
y_pred_lr = lr.predict(X_test)
y_pred_dt = dt.predict(X_test)

# Evaluate the models
models = ['Naive Bayes', 'Random Forest', 'KNN', 'XGBoosting', 'Logistic Regression', 'Decision Tree']
accuracy = [accuracy_score(y_test, y_pred_nb), accuracy_score(y_test, y_pred_rf), accuracy_score(y_test, y_pred_knn), 
              accuracy_score(y_test, y_pred_xgb), accuracy_score(y_test, y_pred_lr), accuracy_score(y_test, y_pred_dt)]
precision = [precision_score(y_test, y_pred_nb), precision_score(y_test, y_pred_rf), precision_score(y_test, y_pred_knn), 
              precision_score(y_test, y_pred_xgb), precision_score(y_test, y_pred_lr), precision_score(y_test, y_pred_dt)]
recall = [recall_score(y_test, y_pred_nb), recall_score(y_test, y_pred_rf), recall_score(y_test, y_pred_knn), 
           recall_score(y_test, y_pred_xgb), recall_score(y_test, y_pred_lr), recall_score(y_test, y_pred_dt)]
f1 = [f1_score(y_test, y_pred_nb), f1_score(y_test, y_pred_rf), f1_score(y_test, y_pred_knn), 
             f1_score(y_test, y_pred_xgb), f1_score(y_test, y_pred_lr), f1_score(y_test, y_pred_dt)]

# compare the models through dataframe
result = pd.DataFrame({
    'Model': models,
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1
})

result

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Naive Bayes,0.8655,0.880668,0.847986,0.864018
1,Random Forest,0.7669,0.813716,0.696964,0.750828
2,KNN,0.7751,0.763009,0.803136,0.782558
3,XGBoosting,0.8595,0.849289,0.876761,0.862806
4,Logistic Regression,0.8988,0.888482,0.913872,0.900998
5,Decision Tree,0.7224,0.726708,0.719786,0.72323
