In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV
# Fetch every NLTK resource this notebook relies on:
#   - 'stopwords' backs stopwords.words('english')
#   - word_tokenize needs the Punkt sentence tokenizer data; recent NLTK
#     releases (3.9+) look it up as 'punkt_tab', older ones as 'punkt'.
# Without the Punkt data, the first word_tokenize call raises LookupError
# on a fresh environment.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Read the reviews CSV into a DataFrame and preview the first rows
dataset_path = "IMDB Dataset.csv"
df = pd.read_csv(dataset_path)
print(df.head())

In [None]:
# Preprocess Text
print(df.columns)
if 'review' not in df.columns or 'sentiment' not in df.columns:
    raise ValueError("Dataset must have 'review' and 'sentiment' columns.")

# Encode the labels as 1 (positive) / 0 (negative).
# Guarded so the cell is idempotent: mapping an already-encoded 0/1 column
# through the string dictionary again would turn every label into NaN.
if not df['sentiment'].isin([0, 1]).all():
    encoded_sentiment = df['sentiment'].map({'positive': 1, 'negative': 0})
    unmapped = encoded_sentiment.isna()
    if unmapped.any():
        # Fail here, before df is modified, instead of letting NaN labels
        # surface later as an opaque error during model fitting.
        unexpected = df.loc[unmapped, 'sentiment'].unique()[:5].tolist()
        raise ValueError(
            f"Unexpected values in 'sentiment' column (showing up to 5): {unexpected}"
        )
    df['sentiment'] = encoded_sentiment.astype(int)

# English stop words, kept as a set for O(1) membership tests in preprocess_text
stop_words = set(stopwords.words('english'))

# Matches HTML tags (e.g. "<br />") in already-lowercased text. Requiring a
# letter right after "<" avoids swallowing plain text such as "<3".
_HTML_TAG_RE = re.compile(r'<\s*/?\s*[a-z][^<>]*>')
# Translation table that deletes ASCII punctuation; built once, not per call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalize one review into a space-separated string of content words.

    Steps, in order: lowercase, replace HTML tags with a space, drop digits,
    delete ASCII punctuation, tokenize with NLTK's ``word_tokenize`` and
    remove tokens found in the module-level ``stop_words`` set.

    Args:
        text: Raw review text. ``None`` and NaN (a missing CSV cell) are
            treated as an empty review; other non-string values are
            converted with ``str()``.

    Returns:
        The cleaned tokens joined by single spaces ("" if nothing is left).
    """
    # Missing values: NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    # Tags must be replaced by a space *before* punctuation is deleted,
    # otherwise "movie.<br />great" collapses into "moviebr great".
    text = _HTML_TAG_RE.sub(' ', text)
    text = re.sub(r'\d+', '', text)
    text = text.translate(_PUNCTUATION_TABLE)
    words = word_tokenize(text)  # Tokenize
    return " ".join(word for word in words if word not in stop_words)
# Clean every review, then show raw and cleaned text side by side
df['cleaned_review'] = df['review'].map(preprocess_text)
print(df[['review', 'cleaned_review']].head())

In [None]:
# Vectorize the cleaned text so the models can learn from it.
#
# The train/test split is decided first (on row positions) and the TF-IDF
# vocabulary / IDF weights are fitted on the training rows only. Fitting the
# vectorizer on the full corpus would leak test-set statistics into training
# and make the reported test metrics optimistic.
# The shuffle depends only on the number of rows and random_state, so the
# rows land in the same train/test sets as splitting (x, y) directly would.
row_positions = np.arange(len(df))
train_rows, test_rows = train_test_split(row_positions, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(max_features=5000)
vectorizer.fit(df['cleaned_review'].iloc[train_rows])

# x / y still cover the full dataset (later cells inspect them); the
# train/test matrices are row slices of the same transformed matrix.
x = vectorizer.transform(df['cleaned_review'])
y = df['sentiment']
x_train, x_test = x[train_rows], x[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]
print(f"Train size: {x_train.shape}, Test size: {x_test.shape}")

In [None]:
# Inspect the TF-IDF feature matrix built in the vectorization cell
print(x)

In [None]:
# Inspect the target: the 'sentiment' column of df
print(y)

In [None]:
# Logistic Regression: fit on the training split, evaluate on the held-out split
log_reg = LogisticRegression()
model1 = log_reg.fit(x_train, y_train)
y_pred_log = model1.predict(x_test)

# Accuracy, precision and recall computed in that order against y_test
accuracy, precision, recall = (
    score(y_test, y_pred_log)
    for score in (accuracy_score, precision_score, recall_score)
)

print(f"Logistic Regression - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")


In [None]:
# Plotting the results
plt.figure(figsize=(10, 4))
plt.scatter(range(50), y_test.values[:50], label="Actual", alpha=0.7, color='blue', marker='o')
plt.scatter(range(50), y_pred_log[:50], label="Predicted", alpha=0.7, color='red', marker='x')
plt.title("Logistic Regression: Actual vs Predicted")
plt.xlabel("Sample Index")
plt.ylabel("Sentiment (0 = Negative, 1 = Positive)")
plt.legend()
plt.show()

In [None]:
# Naive Bayes (Multinomial): train on the TF-IDF training matrix and
# score the predictions for the test matrix
nb = MultinomialNB()
model2 = nb.fit(x_train, y_train)
y_pred_nb = model2.predict(x_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_nb)
precision = precision_score(y_true=y_test, y_pred=y_pred_nb)
recall = recall_score(y_true=y_test, y_pred=y_pred_nb)

print(f"Naive Bayes - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")


In [None]:
# Plot the results
plt.figure(figsize=(10, 4))
plt.scatter(range(50), y_test.values[:50], label="Actual", alpha=0.7, color='blue', marker='o')
plt.scatter(range(50), y_pred_nb[:50], label="Predicted", alpha=0.7, color='red', marker='x')
plt.title("Logistic Regression: Actual vs Predicted")
plt.xlabel("Sample Index")
plt.ylabel("Sentiment (0 = Negative, 1 = Positive)")
plt.legend()
plt.show()

In [None]:
# Logistic Regression Hyper Parameter Tuning
# 5-fold cross-validated grid search over the inverse regularization
# strength C, scored by accuracy.
log_reg_params = {
    'C': [0.01, 0.1, 1, 10, 100]
}
# max_iter is raised above scikit-learn's default of 100: weakly regularized
# fits (large C) can hit that cap before the solver converges, which would
# bias the comparison between grid points. Fits that already converge stop
# early, so they are unaffected.
log_reg = LogisticRegression(max_iter=1000)
log_reg_grid = GridSearchCV(log_reg, log_reg_params, cv=5, scoring='accuracy', n_jobs=-1)
model3 = log_reg_grid.fit(x_train, y_train)

# Evaluate the refitted best estimator on the held-out test split
log_reg_pred_ht = model3.predict(x_test)
accuracy = accuracy_score(y_test, log_reg_pred_ht)
precision = precision_score(y_test, log_reg_pred_ht)
recall = recall_score(y_test, log_reg_pred_ht)
print(f"Logistic Regression HyperParameter Tuned - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")
print(f"Best Logistic Regression Params: {model3.best_params_}")
# best_score_ is the mean cross-validated accuracy on the training split,
# not the test accuracy printed above
print(f"Best Logistic Regression Accuracy: {model3.best_score_:.4f}")

In [None]:
# Plot the results
plt.figure(figsize=(10, 4))
plt.scatter(range(50), y_test.values[:50], label="Actual", alpha=0.7, color='blue', marker='o')
plt.scatter(range(50), log_reg_pred_ht[:50], label="Predicted", alpha=0.7, color='red', marker='x')
plt.title("Logistic Regression: Actual vs Predicted")
plt.xlabel("Sample Index")
plt.ylabel("Sentiment (0 = Negative, 1 = Positive)")
plt.legend()
plt.show()

In [None]:
# Naive Bayes Hyper Parameter Tuning
# Grid-search the smoothing parameter alpha with 5-fold CV (accuracy),
# then score the winning model on the test split.
nb_params = {'alpha': [0.01, 0.1, 1, 10]}
nb = MultinomialNB()
nb_grid = GridSearchCV(
    estimator=nb,
    param_grid=nb_params,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
model4 = nb_grid.fit(x_train, y_train)
nb_pred_ht = model4.predict(x_test)

accuracy, precision, recall = (
    score(y_test, nb_pred_ht)
    for score in (accuracy_score, precision_score, recall_score)
)
print(f"Naive Bayes Hyperparameter Tuned - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")
print(f"Best Naive Bayes Params: {model4.best_params_}")
print(f"Best Naive Bayes Accuracy: {model4.best_score_:.4f}")

In [None]:
# Plot the results: actual labels vs. predictions of the tuned
# Naive Bayes model (grid-search winner).
# Show at most 50 samples; capping at len(y_test) keeps both scatter calls
# the same length when the test split is smaller than 50 rows.
n_shown = min(50, len(y_test))
sample_positions = range(n_shown)
plt.figure(figsize=(10, 4))
plt.scatter(sample_positions, y_test.values[:n_shown], label="Actual", alpha=0.7, color='blue', marker='o')
plt.scatter(sample_positions, nb_pred_ht[:n_shown], label="Predicted", alpha=0.7, color='red', marker='x')
# Title names the model actually plotted (nb_pred_ht comes from the tuned MultinomialNB)
plt.title("Naive Bayes (Hyperparameter Tuned): Actual vs Predicted")
plt.xlabel("Sample Index")
plt.ylabel("Sentiment (0 = Negative, 1 = Positive)")
plt.legend()
plt.show()

In [None]:
# Test the model
def classify_review(review):
    """Print the sentiment that each of the four fitted models assigns to a review.

    The review is cleaned with ``preprocess_text``, turned into features by
    the module-level ``vectorizer`` and passed to ``model1`` .. ``model4``.
    Results are printed; nothing is returned.

    Args:
        review: Raw review text to classify.
    """
    features = vectorizer.transform([preprocess_text(review)])
    labelled_models = (
        ("Logistic Regression", model1),
        ("Naïve Bayes", model2),
        ("Logistic Regression (Hyperparameter Tuned)", model3),
        ("Naïve Bayes (Hyperparameter Tuned)", model4),
    )
    # Run every model before printing anything, so a failing model does not
    # leave a half-printed report behind.
    verdicts = [(title, clf.predict(features)[0]) for title, clf in labelled_models]

    sentiment_map = {0: "Negative", 1: "Positive"}
    print("\n--- Sentiment Classification Results ---")
    for title, predicted_label in verdicts:
        print(f"{title}: {sentiment_map[predicted_label]}")
# Interactive demo: classify a review typed by the user.
# Surrounding whitespace is stripped, and blank input is rejected: a review
# with no text would be vectorized to an all-zero row and the printed
# predictions would not reflect any review content.
user_review = input("Enter a review: ").strip()
if user_review:
    print(f"Review: {user_review}")
    classify_review(user_review)
else:
    print("No review text entered; nothing to classify.")