In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [4]:
df = pd.read_csv("IMDB Dataset.csv")

## Data preprocessing

In [5]:
df.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace=True)

  df.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace=True)


In [6]:
df.drop_duplicates(inplace=True)

In [7]:
# Converting reviews to lowercase
df['review'] = df['review'].str.lower()

In [8]:

# Removing HTML tags from reviews
import re

def remove_html(text):
    pattern = r'<.*?>'   
    text = re.sub(pattern, '', text)
    return text

df['review'] = df['review'].apply(remove_html)


In [9]:
df.head(2)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production. the filming tec...,1


In [10]:
import nltk

In [11]:
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

df['review'] = df['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kimay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
def remove_urls(text):
    pattern = r'https?://\S+|www\.\S+'
    return re.sub(pattern, '', text)

df['review'] = df['review'].astype(str).apply(remove_urls)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [14]:
X = df['review']
y = df['sentiment']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### TF-IDF

In [16]:
#transform the text data into numerical features using TF-IDF vectorization
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

### ML Models

In [17]:
#Logistic Regression Model
log_model = LogisticRegression(max_iter=500)
log_model.fit(X_train_tfidf, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,500


In [18]:
y_pred = log_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8846


In [19]:
# Random Forest Model

from sklearn.ensemble import RandomForestClassifier

rnd_model = RandomForestClassifier(n_estimators=100, random_state=42)
rnd_model.fit(X_train_tfidf, y_train)

KeyboardInterrupt: 

In [None]:
y_pred = rnd_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8357


In [None]:
#Decision Tree Model
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_tfidf, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [None]:
y_pred = dt_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.7174


In [None]:
# XGBoost Model
from xgboost import XGBClassifier

xgb_model = XGBClassifier(use_label_encoder= False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train_tfidf, y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [None]:
y_pred = xgb_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8491


In [None]:
#KNN Model
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_tfidf, y_train)


0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [None]:
y_pred = knn_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.7358


In [None]:
#SVM Model
from sklearn.svm import SVC

svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_tfidf, y_train)

NameError: name 'X_train_tfidf' is not defined

In [None]:
y_pred = svm_model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

### DL tf-Keras Model