In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [31]:
data_path = os.path.join('../data/raw/IMDB DataSet.csv')
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [32]:
df.isna().sum()

review       0
sentiment    0
dtype: int64

In [33]:
print("Before dropping duplicates: ",df.duplicated().sum())
df.drop_duplicates(inplace=True)
print("After dropping duplicates",df.duplicated().sum())


Before dropping duplicates:  418
After dropping duplicates 0


In [34]:
import re
import contractions

def clean_text(text: str) -> str:
    """
    Clean review text for sentiment analysis.
    - Replace contractions (can't → can not)
    - Lowercase
    - Remove HTML tags
    - Remove punctuation, digits, and special characters
    - Collapse multiple spaces
    """

    text = contractions.fix(text)                   #fix contractions first                

        
    text = text.lower()                              # lowercase
    text = re.sub(r"<.*?>", " ", text)               # remove HTML tags
    text = re.sub(r"can't", "can not", text)         # handle contractions
    text = re.sub(r"n't", " not", text)              # isn't → is not
    text = re.sub(r"[^a-z\s]", " ", text)            # keep only letters + spaces
    text = re.sub(r"\s+", " ", text).strip()         # collapse spaces


    return text



In [35]:
df["clean"] = df["review"].astype(str).apply(clean_text)

df.head()

Unnamed: 0,review,sentiment,clean
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production the filming tech...
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,negative,basically there is a family where a little boy...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei s love in the time of money is a...


In [40]:
from sklearn.model_selection import train_test_split

X = df["clean"]
y = df["sentiment"].map({"positive":1,
                         "negative": 0 })

X_train , X_test, y_train ,y_test = train_test_split( 
    X,  y ,
    test_size=0.2,      # 20% test data
    random_state=42,    # reproductibility
    stratify=y          # Keeps positives and negative balanced
 )

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1,2),
    max_df = 0.95,
    min_df=5
)

X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

In [43]:
print(X_train_vec)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 4539969 stored elements and shape (39665, 119066)>
  Coords	Values
  (0, 52832)	0.4588512690402721
  (0, 116285)	0.3269117278016237
  (0, 116884)	0.0474934701684599
  (0, 1850)	0.14896397885168416
  (0, 13065)	0.06387157230821774
  (0, 66385)	0.09826143048800716
  (0, 32663)	0.11913774762232543
  (0, 118355)	0.09368059471009366
  (0, 92816)	0.12086370560111727
  (0, 77270)	0.03821717336153929
  (0, 24592)	0.14369700271636424
  (0, 84644)	0.10189140174199611
  (0, 9830)	0.41337114124665325
  (0, 5221)	0.1135338973715151
  (0, 74110)	0.0885421459232696
  (0, 18951)	0.15083229774629042
  (0, 10636)	0.08375574855793659
  (0, 76935)	0.11014610451355632
  (0, 93888)	0.06834988469437019
  (0, 50043)	0.0868747851925366
  (0, 50870)	0.11968478034225341
  (0, 46756)	0.06639670763961718
  (0, 107317)	0.06744625545620994
  (0, 86393)	0.22185532091845495
  (0, 34854)	0.08850312406639448
  :	:
  (39664, 70800)	0.0822219515510303
  (39664,

In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Train
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_vec, y_train)

# Predict
y_pred = clf.predict(X_test_vec)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.8982555208228294
              precision    recall  f1-score   support

           0       0.91      0.88      0.90      4940
           1       0.89      0.91      0.90      4977

    accuracy                           0.90      9917
   macro avg       0.90      0.90      0.90      9917
weighted avg       0.90      0.90      0.90      9917



In [45]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_vec, y_train)
print("Accuracy:", nb.score(X_test_vec, y_test))


Accuracy: 0.8809115659977816


In [46]:
from sklearn.svm import LinearSVC
svm = LinearSVC()
svm.fit(X_train_vec, y_train)
print("Accuracy:", svm.score(X_test_vec, y_test))


Accuracy: 0.903599878995664


In [61]:
def sample_test(sampleText):

    sampleText_vec = tfidf.transform([sampleText])
    sample_prediction = clf.predict(sampleText_vec)
    if sample_prediction == 1:
        print("Positive Review")
    else:
        print("Neagtive Review")
    return sample_prediction

sample_test("do not watch ")


Positive Review


array([1])