In [None]:
# ---------------------------
# Import Required Libraries
# ---------------------------

import nltk
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.corpus import movie_reviews, stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# ---------------------------
# Download NLTK Datasets
# ---------------------------

# Fetch the two NLTK resources this notebook reads: the `movie_reviews`
# corpus (used when loading the documents) and the `stopwords` word lists
# (used by the preprocessing step).
nltk.download('movie_reviews')
nltk.download('stopwords')

In [None]:
# ---------------------------
# Load IMDb Movie Reviews
# ---------------------------

documents = []

# Load every review as a (list_of_words, category) pair; the category name
# reported by the corpus is later used as the class label.
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        words = list(movie_reviews.words(fileid))
        documents.append((words, category))

# The corpus is read one category at a time, so shuffle to interleave the
# classes. A dedicated, seeded generator keeps the resulting order -- and
# therefore the later train/test split and every reported metric -- identical
# on each Restart & Run All, without touching the global `random` state.
random.Random(42).shuffle(documents)

In [None]:
# ---------------------------
# Preprocessing Setup
# ---------------------------

# Shared preprocessing resources: a Porter stemmer and the English stopword
# list held as a set for fast membership checks.
stemmer = PorterStemmer()
stop_words = {*stopwords.words('english')}


def preprocess(words):
    """Turn a sequence of tokens into one cleaned, space-joined string.

    Tokens that are not purely alphabetic are discarded, the rest are
    lower-cased, tokens found in the module-level ``stop_words`` set are
    removed, and each survivor is stemmed with the module-level ``stemmer``.

    Parameters
    ----------
    words : iterable of str
        Already-tokenized text (one token per element).

    Returns
    -------
    str
        The stemmed tokens joined by single spaces ('' if nothing survives).
    """
    # The alphabetic check is made on the token as given, before lower-casing.
    lowered = (token.lower() for token in words if token.isalpha())
    stems = [stemmer.stem(token) for token in lowered if token not in stop_words]
    return ' '.join(stems)
            

# Build the two parallel lists used for modelling: one cleaned string per
# review and the matching category label, in the same (shuffled) order.
texts = list(map(preprocess, (tokens for tokens, _ in documents)))
labels = [sentiment for _, sentiment in documents]

In [None]:
# ---------------------------
# TF-IDF Vectorization
# ---------------------------

y = labels

# Split the raw texts *before* fitting the vectorizer. Fitting TF-IDF on the
# whole corpus lets the vocabulary selection and IDF weights see the test
# reviews, which leaks test information into training and inflates the
# reported scores. test_size and random_state are unchanged.
texts_train, texts_test, y_train, y_test = train_test_split(
    texts, y, test_size=0.2, random_state=42
)

# Learn the vocabulary (capped at 5000 terms) and IDF weights from the
# training texts only, then apply that fitted transform to the test texts.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)

# Full-corpus matrix, kept under its original name for any cell that still
# refers to `X`; it is produced by the train-fitted vectorizer.
X = vectorizer.transform(texts)

In [None]:
# ---------------------------
# Train Sentiment Classifier
# ---------------------------

# Fit a logistic-regression classifier on the training split, allowing up to
# 1000 solver iterations.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
model  # show the fitted estimator as the cell output

In [None]:
# ---------------------------
# Evaluate the Model
# ---------------------------

y_pred = model.predict(X_test)

# Build the confusion matrix once, with an explicit label order, so its rows
# and columns are guaranteed to line up with the tick labels taken from
# model.classes_ (rather than relying on the implicit ordering of whatever
# labels happen to appear in y_test / y_pred).
class_names = model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_names)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\n Confusion Matrix:\n", cm)
print("\n Classification Report:\n", classification_report(y_test, y_pred))

# Heatmap of the same matrix: rows are true labels, columns are predictions.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm,
            annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names,
            ax=ax)
ax.set_title("Confusion Matrix Heatmap")
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
plt.show()

In [None]:
# ---------------------------
# Predict Custom Sentiment
# ---------------------------

# Function to predict sentiment of a custom text input
def predict_sentiment(text):
    """Predict the sentiment label for a raw text string.

    The text is tokenized, cleaned with ``preprocess``, vectorized with the
    fitted module-level ``vectorizer`` and classified by the fitted
    module-level ``model``.

    Parameters
    ----------
    text : str
        Free-form review text.

    Returns
    -------
    The predicted class label (the first element of ``model.predict``).
    """
    # Whitespace splitting leaves punctuation glued to words ("movie!",
    # "amazing."), and preprocess() drops every token that is not purely
    # alphabetic -- so those words were silently discarded. wordpunct_tokenize
    # separates punctuation into its own tokens instead.
    # NOTE(review): this is believed to match the default tokenizer of NLTK's
    # plaintext corpus readers (and hence the training tokens) -- confirm.
    tokens = nltk.wordpunct_tokenize(text)
    cleaned = preprocess(tokens)
    vec = vectorizer.transform([cleaned])
    return model.predict(vec)[0]

# Smoke-test the predictor on one clearly positive and one clearly negative
# sentence.
for sample_text in (
    "I really loved the movie! It was amazing.",
    "The movie was terrible and boring.",
):
    print("Sentiment:", predict_sentiment(sample_text))

In [None]:
# ---------------------------
# Save Model and Vectorizer
# ---------------------------

import joblib

# Persist the fitted classifier and the fitted TF-IDF vectorizer as two
# separate files in the current working directory. Both are needed to score
# new text: the model is applied to vectors produced by this vectorizer
# (see predict_sentiment).
joblib.dump(model, 'sentiment_model.pkl')
joblib.dump(vectorizer, 'tfidf.pkl')