### Data Loading

In [1]:
import pandas as pd
import numpy as np

# Load the IMDB review dataset; the cells below use its 'review' and
# 'sentiment' columns.
df = pd.read_csv('IMDB_Dataset.csv')

# Encode the string labels as integers (negative -> 0, positive -> 1) so they
# can be used as a numeric classification target.
df['sentiment'] = df['sentiment'].map({'negative': 0, 'positive': 1})

# Series.map() turns every value that is not a key of the dict into NaN.
# Fail fast here rather than handing NaN targets to the model later.
assert df['sentiment'].notna().all(), "Unexpected sentiment label in IMDB_Dataset.csv"

df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


### Data Understanding

In [2]:
# (rows, columns) of the loaded frame.
df.shape

(50000, 2)

In [3]:
# Per-column count of missing values (isna is pandas' alias of isnull).
df.isna().sum()

review       0
sentiment    0
dtype: int64

In [5]:
# Class balance: number of reviews per encoded sentiment label.
df['sentiment'].value_counts()

sentiment
1    25000
0    25000
Name: count, dtype: int64

### Text Preprocessing

In [6]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora used below. quiet=True suppresses the downloader's
# progress log, which otherwise writes the local nltk_data directory (an
# absolute, user-specific path) into the saved notebook output.
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# English stop words, held in a set for fast membership tests in clean_text.
stop_words = set(stopwords.words('english'))
# One shared lemmatizer instance, reused for every token.
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize a raw review into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase, replace HTML tags with a space, delete every
    character that is not a-z or whitespace, split on whitespace, drop tokens
    found in ``stop_words`` and lemmatize the remaining ones with
    ``lemmatizer``.

    Args:
        text: Raw review text. Non-string values (e.g. ``None`` or the NaN
            pandas uses for a missing review) are treated as empty text.

    Returns:
        str: The cleaned tokens joined by single spaces; ``''`` when nothing
        survives cleaning.
    """
    # Non-text values have no .lower(); return empty text instead of raising
    # so a single bad row or bad user input does not abort the whole run.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Tags such as <br /> become a space so the neighbouring words stay apart.
    text = re.sub(r'<.*?>', ' ', text)
    # Non-letters are deleted, not replaced by a space: "there's" becomes
    # "theres", so contractions are compared against stop_words in fused form.
    text = re.sub(r'[^a-z\s]', '', text)

    words = text.split()
    cleaned_words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]

    return ' '.join(cleaned_words)

# Run the cleaner over every review and store the result beside the raw text.
df['cleaned_review'] = df['review'].map(clean_text)
# Preview raw and cleaned text side by side.
df.loc[:, ['review', 'cleaned_review']].head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ELCOT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ELCOT\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,review,cleaned_review
0,One of the other reviewers has mentioned that ...,one reviewer mentioned watching oz episode you...
1,A wonderful little production. <br /><br />The...,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,basically there family little boy jake think t...
4,"Petter Mattei's ""Love in the Time of Money"" is...",petter matteis love time money visually stunni...


### Vectorization and Splitting

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Features are the cleaned review strings; the target is the encoded label.
X, y = df['cleaned_review'], df['sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Unigram + bigram TF-IDF features, capped at 10,000 vocabulary entries.
# NOTE(review): clean_text removes NLTK stop words before this step; that
# list appears to include words such as "not" and "very", in which case
# bigrams like "not good" never reach the vectorizer -- confirm.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)

# Fit vocabulary and IDF weights on the training split only, then reuse them
# for the test split so the test reviews do not influence the features.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(f"Vocabulary size: {len(tfidf.vocabulary_)}")

Vocabulary size: 10000


### Model Training and Evaluation

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Logistic regression on the TF-IDF features using the liblinear solver.
# C is the inverse regularisation strength, so C=5.0 regularises less than
# scikit-learn's default of 1.0.
model = LogisticRegression(max_iter=1000, C=5.0, solver='liblinear')
model.fit(X_train_tfidf, y_train)

# Score the model on the held-out split.
y_pred = model.predict(X_test_tfidf)
print(f"Model Accuracy: {accuracy_score(y_test, y_pred):.2%}")
# Concatenate instead of passing two print() arguments: the default separator
# put a space in front of the report and shifted its header row one column
# to the right of the rows beneath it.
print("\nDetailed Report:\n" + classification_report(y_test, y_pred))

Model Accuracy: 89.49%

Detailed Report:
               precision    recall  f1-score   support

           0       0.90      0.89      0.89      4961
           1       0.89      0.90      0.90      5039

    accuracy                           0.89     10000
   macro avg       0.90      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



### Custom Prediction and Deployment

In [9]:
import pickle

def predict_sentiment(text):
    """Classify one raw review string as "Positive" or "Negative".

    The text goes through the same ``clean_text`` preprocessing and fitted
    ``tfidf`` vectorizer used for training before ``model`` predicts on it.

    Args:
        text: Raw review text.

    Returns:
        str: "Positive" when the predicted label equals 1, otherwise
        "Negative".
    """
    # transform() expects an iterable of documents, hence the one-item list.
    features = tfidf.transform([clean_text(text)])
    label = model.predict(features)[0]
    if label == 1:
        return "Positive"
    return "Negative"

# Smoke-test the full pipeline on two hand-written reviews.
print(f"Sample 1: {predict_sentiment('The cinematography was breathtaking!')}")
print(f"Sample 2: {predict_sentiment('A total waste of money and time.')}")

# Persist the fitted classifier and vectorizer; predict_sentiment needs both.
# The context managers close (and flush) each file as soon as it is written,
# instead of leaving the handles open until garbage collection.
# NOTE: only unpickle these files from a trusted location -- pickle.load can
# execute arbitrary code.
with open("sentiment_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
with open("tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)
print("\nFiles saved: sentiment_model.pkl, tfidf_vectorizer.pkl")

Sample 1: Positive
Sample 2: Negative

Files saved: sentiment_model.pkl, tfidf_vectorizer.pkl
