In [3]:
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from imblearn.over_sampling import SMOTE

In [4]:
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to C:\Users\Suraj
[nltk_data]     Khodade\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Suraj
[nltk_data]     Khodade\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Suraj
[nltk_data]     Khodade\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
df = pd.read_csv("IMDB Dataset.csv")

In [6]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
df["sentiment"].unique()

array(['positive', 'negative'], dtype=object)

In [8]:

df["label"] = df["sentiment"].map({"positive": 1, "negative": 0})

In [9]:
df.head()

Unnamed: 0,review,sentiment,label
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [10]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

In [11]:

def preprocess(text):
    tokens = nltk.word_tokenize(str(text).lower())
    tokens = (lemmatizer.lemmatize(t) for t in tokens if t.isalpha() and t not in stop_words)
    return " ".join(tokens)

In [12]:
df['processed_review'] =  df['review'].apply(preprocess)  # Fixed typo

In [13]:
df.head()

Unnamed: 0,review,sentiment,label,processed_review
0,One of the other reviewers has mentioned that ...,positive,1,one reviewer mentioned watching oz episode hoo...
1,A wonderful little production. <br /><br />The...,positive,1,wonderful little production br br filming tech...
2,I thought this was a wonderful way to spend ti...,positive,1,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,0,basically family little boy jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1,petter mattei love time money visually stunnin...


In [14]:
X_train, X_test, y_train, y_test = train_test_split(df['processed_review'], df['label'], random_state=42, test_size=0.2, stratify=df['label'])

In [15]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((40000,), (10000,), (40000,), (10000,))

In [16]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)  # Use transform, not fit_transform

In [17]:
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train_vect, y_train)

In [28]:
X_res.shape, y_res.shape

((40000, 5000), (40000,))

In [18]:
model = LogisticRegression(max_iter=200, class_weight="balanced")
model.fit(X_res, y_res)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,200


In [19]:
y_pred = model.predict(X_test_vect)

In [20]:
# Evaluation
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=["negative", "positive"]))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

print("Accuracy:", model.score(X_test_vect, y_test))

Classification Report:
               precision    recall  f1-score   support

    negative       0.89      0.88      0.88      5000
    positive       0.88      0.90      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

Confusion Matrix:
 [[4376  624]
 [ 515 4485]]
Accuracy: 0.8861


In [21]:
# -------------------------
# Explainability: top words
# -------------------------
feature_names = vectorizer.get_feature_names_out()
coefs = model.coef_[0]

# Top 15 words for positive and negative
top_pos = sorted(zip(coefs, feature_names), reverse=True)[:15]
top_neg = sorted(zip(coefs, feature_names))[:15]

print("\nTop Positive Words:")
for weight, word in top_pos:
    print(f"{word}: {weight:.3f}")

print("\nTop Negative Words:")
for weight, word in top_neg:
    print(f"{word}: {weight:.3f}")


Top Positive Words:
great: 6.979
excellent: 6.825
perfect: 5.468
amazing: 5.009
best: 4.714
wonderful: 4.603
hilarious: 4.526
loved: 4.522
favorite: 4.476
enjoyed: 4.117
today: 4.040
brilliant: 3.995
superb: 3.953
highly: 3.941
definitely: 3.904

Top Negative Words:
worst: -10.435
waste: -8.356
awful: -7.665
bad: -7.462
boring: -6.046
poor: -5.730
terrible: -5.588
poorly: -5.305
horrible: -5.218
nothing: -5.199
dull: -5.177
worse: -5.072
fails: -4.881
disappointing: -4.631
unfortunately: -4.565


In [22]:
## Test 
review = "This is the very interesting sci fi moview with good suspense. I love this moview "

# Preprocess and vectorize the review
review_processed = preprocess(review)
review_vect = vectorizer.transform([review_processed])
pred = model.predict(review_vect)
print("Predicted label:", "positive" if pred[0]==1 else "negative")

Predicted label: positive


In [23]:
## Test 
review = "This is the very boring sci fi moview with good suspense. I don't love this moview "

# Preprocess and vectorize the review
review_processed = preprocess(review)
review_vect = vectorizer.transform([review_processed])
pred = model.predict(review_vect)
print("Predicted label:", "positive" if pred[0]==1 else "negative")

Predicted label: negative


In [24]:
## create a pickel file
import pickle
with open("sentiment_model.pkl", "wb") as f:
    pickle.dump((model, vectorizer, preprocess), f)
    

In [27]:
## test with pkl
with open("sentiment_model.pkl", "rb") as f:
    model_loaded, vectorizer_loaded, preprocess_loaded = pickle.load(f)

# Test the loaded model
test_comments = [
    "I love the new design of the platform!",
    "The recent changes have made it best.",
    "It's okay, not great but not terrible either."
]

# Preprocess and vectorize the test comments
X_test = vectorizer_loaded.transform([preprocess_loaded(comment) for comment in test_comments])

# Get predictions
predictions = model_loaded.predict(X_test)
print(predictions)

for p in predictions:
    print("positive" if p==1 else "negative")

[1 1 0]
positive
positive
negative
