In [1]:
import pandas as pd
import numpy as np

In [2]:
## Load the dataset
df=pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
df.shape

(50000, 2)

In [4]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [6]:
df['sentiment'].unique()

array(['positive', 'negative'], dtype=object)

In [8]:
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

# Text PreProcessing

In [7]:
df['review'].head()

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. <br /><br />The...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
Name: review, dtype: object

In [10]:
## lower all the cases
df['review']=df['review'].str.lower()

In [11]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [12]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
stops = set(stopwords.words('english'))   # load once
pattern_special = re.compile(r'[^a-zA-Z0-9\s-]+')
pattern_url = re.compile(r'(http|https|ftp|ssh)://\S+')
pattern_html = re.compile(r'<.*?>')

## Removing the Special Characters
df['review'] = df['review'].apply(lambda x: pattern_special.sub('', x))
## Remove the stopwords
df['review'] = df['review'].apply(lambda x: " ".join([y for y in x.split() if y not in stops]))
## Remove url
df['review'] = df['review'].apply(lambda x: pattern_url.sub('', x))
## Remove html tags 
df['review'] = df['review'].apply(lambda x: pattern_html.sub('', x))
## Remove any additional spaces
df['review'] = df['review'].apply(lambda x: " ".join(x.split()))

In [18]:
df['review']

0        one reviewers mentioned watching 1 oz episode ...
1        wonderful little production br br filming tech...
2        thought wonderful way spend time hot summer we...
3        basically theres family little boy jake thinks...
4        petter matteis love time money visually stunni...
                               ...                        
49995    thought movie right good job wasnt creative or...
49996    bad plot bad dialogue bad acting idiotic direc...
49997    catholic taught parochial elementary schools n...
49998    im going disagree previous comment side maltin...
49999    one expects star trek movies high art fans exp...
Name: review, Length: 50000, dtype: object

In [20]:
df['sentiment'] = df['sentiment'].map({'negative':0, 'positive':1})

In [21]:
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,1
1,wonderful little production br br filming tech...,1
2,thought wonderful way spend time hot summer we...,1
3,basically theres family little boy jake thinks...,0
4,petter matteis love time money visually stunni...,1


In [22]:
## Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(df['review'],df['sentiment'],test_size=0.2)

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [26]:
## TF-IDF 
tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1,2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [27]:
## Logistic Regression
log_reg = LogisticRegression(max_iter=200)
log_reg.fit(X_train_tfidf, y_train)
y_pred_lr = log_reg.predict(X_test_tfidf)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))

Logistic Regression Accuracy: 0.902


In [28]:
# Linear SVM
svm = LinearSVC()
svm.fit(X_train_tfidf, y_train)
y_pred_svm = svm.predict(X_test_tfidf)
print("Linear SVM Accuracy:", accuracy_score(y_test, y_pred_svm))

Linear SVM Accuracy: 0.8967


In [29]:
## Multinomial Naive Bayes
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
y_pred_nb = nb.predict(X_test_tfidf)
print("MultinomialNB Accuracy:", accuracy_score(y_test, y_pred_nb))

MultinomialNB Accuracy: 0.879


In [30]:
print("\nClassification Report (SVM):\n", classification_report(y_test, y_pred_svm))


Classification Report (SVM):
               precision    recall  f1-score   support

           0       0.90      0.89      0.90      4982
           1       0.90      0.90      0.90      5018

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [32]:
import joblib

# Save models
joblib.dump(log_reg, "model/log_reg.pkl")
joblib.dump(svm, "model/svm.pkl")
joblib.dump(nb, "model/nb.pkl")

['model/nb.pkl']

In [33]:
joblib.dump(tfidf, "model/vectorizer.pkl")
print("✅ Models and vectorizer saved successfully!")

✅ Models and vectorizer saved successfully!
