# 🎬 Sentiment Analyzer - Movie Review Classification

This notebook demonstrates sentiment analysis using Natural Language Processing (NLP) and machine learning to classify movie reviews as positive or negative.

**Author:** Shams Rupak  
**GitHub:** https://github.com/ShamsRupak

## 📦 Import Required Libraries

In [None]:
import nltk
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import random
import re
from wordcloud import WordCloud
import warnings
warnings.filterwarnings('ignore')

# Set style for better visualizations
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*'.
# Fall back to the pre-3.6 name so the notebook also runs on older installs.
_darkgrid_style = 'seaborn-v0_8-darkgrid'
if _darkgrid_style not in plt.style.available:
    _darkgrid_style = 'seaborn-darkgrid'
plt.style.use(_darkgrid_style)
sns.set_palette("husl")

## 📥 Download NLTK Data

In [None]:
# Download required NLTK data
print("📥 Downloading required NLTK data...")
# word_tokenize loads the 'punkt_tab' resource on recent NLTK releases (3.9+),
# while older releases load 'punkt'. Fetch both so tokenization works on either.
for resource in ('movie_reviews', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

from nltk.corpus import movie_reviews, stopwords
from nltk.tokenize import word_tokenize

## 🧹 Define Text Preprocessing Function

In [None]:
# Lazily populated cache so the stop-word corpus is read once, not once per review.
_ENGLISH_STOP_WORDS = None


def _get_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Preprocess text by cleaning, lowercasing, and removing stopwords.

    Args:
        text: Raw review text (str).

    Returns:
        A single space-joined string of the remaining tokens. May be empty
        when every token is a stop word or shorter than three characters.
    """
    # Convert to lowercase
    text = text.lower()

    # Remove special characters and digits (they are deleted, not replaced by a space)
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords and short words (fewer than three characters)
    stop_words = _get_stop_words()
    tokens = [word for word in tokens if word not in stop_words and len(word) > 2]

    return ' '.join(tokens)

## 📊 Load and Prepare Movie Reviews Dataset

In [None]:
print("📊 Loading movie reviews dataset...")


def _labelled_reviews(category, sentiment):
    """Pair the raw text of every file in an NLTK category with a sentiment label."""
    return [(movie_reviews.raw(fileid), sentiment)
            for fileid in movie_reviews.fileids(category)]


# Raw texts for both corpus categories, positives first
positive_reviews = _labelled_reviews('pos', 'positive')
negative_reviews = _labelled_reviews('neg', 'negative')
all_reviews = positive_reviews + negative_reviews

# Build the frame and shuffle it in one chain (fixed seed keeps the order reproducible)
df = (
    pd.DataFrame(all_reviews, columns=['review', 'sentiment'])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print(f"✅ Loaded {len(df)} reviews")
print(f"\n📈 Dataset distribution:")
print(df['sentiment'].value_counts())

## 📊 Visualize Dataset Distribution

In [None]:
# Plot sentiment distribution
# value_counts() orders tied classes arbitrarily, so positional colours could paint
# "negative" green. Pin the bar order and look each colour up by its label instead.
sentiment_colors = {'positive': '#28a745', 'negative': '#dc3545'}
sentiment_counts = (
    df['sentiment']
    .value_counts()
    .reindex(list(sentiment_colors), fill_value=0)
)

plt.figure(figsize=(8, 6))
sentiment_counts.plot(
    kind='bar',
    color=[sentiment_colors[label] for label in sentiment_counts.index],
)
plt.title('Distribution of Movie Review Sentiments', fontsize=16, fontweight='bold')
plt.xlabel('Sentiment', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.xticks(rotation=0)

# Add value labels on bars (reuses the counts computed above)
for i, v in enumerate(sentiment_counts):
    plt.text(i, v + 10, str(v), ha='center', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

## 🔧 Preprocess Reviews

In [None]:
print("🔧 Preprocessing text...")
# Clean every review; the result is stored next to the raw text
df['processed_review'] = df['review'].map(preprocess_text)

# Print the first two rows as a before/after comparison
print("\n📝 Sample preprocessed reviews:")
for idx in range(2):
    sample = df.loc[idx]
    print(f"\n{sample['sentiment'].upper()} Review:")
    print(f"Original: {sample['review'][:100]}...")
    print(f"Processed: {sample['processed_review'][:100]}...")

## ☁️ Create Word Clouds

In [None]:
# Side-by-side word clouds: positive reviews on the left, negative on the right
fig, axes = plt.subplots(1, 2, figsize=(16, 8))

# Concatenate the processed reviews of each class into one long string
positive_text = ' '.join(df.loc[df['sentiment'] == 'positive', 'processed_review'])
negative_text = ' '.join(df.loc[df['sentiment'] == 'negative', 'processed_review'])

# Both clouds share the same canvas settings and differ only in colormap
cloud_options = dict(width=800, height=400, background_color='white')
positive_wordcloud = WordCloud(colormap='Greens', **cloud_options).generate(positive_text)
negative_wordcloud = WordCloud(colormap='Reds', **cloud_options).generate(negative_text)

panels = (
    (axes[0], positive_wordcloud, 'Positive Reviews Word Cloud 😊'),
    (axes[1], negative_wordcloud, 'Negative Reviews Word Cloud 😞'),
)
for ax, cloud, title in panels:
    ax.imshow(cloud, interpolation='bilinear')
    ax.set_title(title, fontsize=16, fontweight='bold')
    ax.axis('off')

plt.tight_layout()
plt.show()

## 🔄 Split Data and Convert Labels

In [None]:
# Encode the sentiment strings as integers: negative -> 0, positive -> 1
label_by_sentiment = {'negative': 0, 'positive': 1}
df['label'] = df['sentiment'].map(label_by_sentiment)

# Features are the cleaned texts, targets are the integer labels
X, y = df['processed_review'], df['label']

# 80/20 split, stratified on the label, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"✂️ Data split:")
print(f"📊 Training samples: {len(X_train)}")
print(f"📊 Testing samples: {len(X_test)}")

## 🔤 Feature Extraction using TF-IDF

In [None]:
# TF-IDF vectorizer capped at 5000 terms, using scikit-learn's English stop-word list
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Learn the vocabulary on the training texts only, then reuse it for the test texts
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

n_features = X_train_tfidf.shape[1]
print(f"✅ TF-IDF Feature dimensions: {n_features}")

# Vocabulary terms, in feature-column order (names only, no weights)
feature_names = tfidf_vectorizer.get_feature_names_out()
print(f"\n📝 Sample features: {list(feature_names[:10])}")

## 🤖 Train Multiple Models

In [None]:
# Candidate classifiers, keyed by the name used in printouts and plots
models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42)
}

# Per-model fitted estimator, test-set predictions and accuracy
results = {}

for name, model in models.items():
    print(f"\n🤖 Training {name}...")

    # Fit on the training matrix and predict the held-out reviews
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)

    results[name] = dict(model=model, predictions=y_pred, accuracy=accuracy)

    print(f"✅ {name} Accuracy: {accuracy:.2%}")

## 📊 Compare Model Performance

In [None]:
# Create comparison plot
plt.figure(figsize=(10, 6))
models_names = list(results.keys())
accuracies = [results[name]['accuracy'] for name in models_names]

bars = plt.bar(models_names, accuracies, color=['#3498db', '#e74c3c'])
plt.title('Model Performance Comparison', fontsize=16, fontweight='bold')
plt.ylabel('Accuracy', fontsize=12)

# Derive the y-range from the measured accuracies instead of a fixed 0.8-0.9 window,
# which clipped or hid bars whenever a model scored outside it. The axis is still
# zoomed in, so bar heights show differences, not absolute magnitudes.
if accuracies:
    y_padding = 0.05
    plt.ylim(max(0.0, min(accuracies) - y_padding), max(accuracies) + y_padding)

# Add value labels on bars
for bar, acc in zip(bars, accuracies):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.002, 
             f'{acc:.2%}', ha='center', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

## 🔢 Confusion Matrix for Best Model

In [None]:
# Select the entry with the highest test accuracy (first one wins on a tie)
best_model_name, best_entry = max(results.items(), key=lambda item: item[1]['accuracy'])
best_model = best_entry['model']
best_predictions = best_entry['predictions']

print(f"🏆 Best Model: {best_model_name}")

# Rows are actual classes, columns are predicted classes
cm = confusion_matrix(y_test, best_predictions)

# Draw the matrix as an annotated heatmap
class_names = ['Negative', 'Positive']
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
heatmap_ax.set_title(f'Confusion Matrix - {best_model_name}', fontsize=16, fontweight='bold')
heatmap_ax.set_ylabel('Actual', fontsize=12)
heatmap_ax.set_xlabel('Predicted', fontsize=12)
plt.tight_layout()
plt.show()

# Per-class precision / recall / F1 for the same predictions
print("\n📈 Classification Report:")
report = classification_report(
    y_test,
    best_predictions,
    target_names=['Negative 😞', 'Positive 😊'],
)
print(report)

## 💬 Interactive Sentiment Prediction

In [None]:
def predict_sentiment(text, model, vectorizer):
    """Classify one piece of text and report how confident the model is.

    Args:
        text: Raw review text; it is cleaned with ``preprocess_text`` first.
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        vectorizer: Fitted vectorizer exposing ``transform``.

    Returns:
        A ``(sentiment, confidence)`` tuple: the label string and the highest
        class probability expressed as a percentage.
    """
    # Clean the text and vectorize it as a one-document batch
    features = vectorizer.transform([preprocess_text(text)])

    # Predicted class and per-class probabilities for that single document
    predicted_label = model.predict(features)[0]
    class_probabilities = model.predict_proba(features)[0]

    if predicted_label == 1:
        sentiment = "Positive 😊"
    else:
        sentiment = "Negative 😞"

    confidence = max(class_probabilities) * 100
    return sentiment, confidence

# Three hand-written reviews: clearly positive, clearly negative, and mixed
sample_reviews = [
    "This movie was absolutely fantastic! The acting was superb and the plot kept me engaged throughout.",
    "Terrible movie. The plot made no sense and the acting was wooden. Complete waste of time.",
    "The movie had some good moments but overall felt rushed and incomplete."
]

print("🎬 SAMPLE PREDICTIONS\n" + "="*50)

# Width of the text confidence bar, in characters
bar_length = 30

for review in sample_reviews:
    sentiment, confidence = predict_sentiment(review, best_model, tfidf_vectorizer)

    # Scale the confidence percentage to a filled/empty block bar
    filled = int(bar_length * confidence / 100)
    bar = "█" * filled + "░" * (bar_length - filled)

    print(f"\n📝 Review: {review}")
    print(f"🎭 Sentiment: {sentiment}")
    print(f"💪 Confidence: {confidence:.1f}%")
    print(f"📊 [{bar}] {confidence:.1f}%")
    print("-"*50)

## 📈 Feature Importance Analysis

In [None]:
# Show which vocabulary terms carry the largest Logistic Regression coefficients
if 'Logistic Regression' in results:
    lr_model = results['Logistic Regression']['model']
    coefficients = lr_model.coef_[0]

    # Sort once in ascending order: the tail holds the largest coefficients,
    # the head holds the smallest (most negative) ones
    ranked_idx = np.argsort(coefficients)
    top_positive_idx = ranked_idx[-20:]
    top_negative_idx = ranked_idx[:20]

    top_positive_features = [(feature_names[i], coefficients[i]) for i in top_positive_idx]
    top_negative_features = [(feature_names[i], coefficients[i]) for i in top_negative_idx]

    # One horizontal bar chart per class, drawn side by side
    fig, axes = plt.subplots(1, 2, figsize=(16, 8))
    panels = (
        (top_positive_features, 'green', 'Top Words for Positive Sentiment'),
        (top_negative_features, 'red', 'Top Words for Negative Sentiment'),
    )
    for ax, (features, bar_color, title) in zip(axes, panels):
        words, scores = zip(*features)
        ax.barh(words, scores, color=bar_color)
        ax.set_title(title, fontsize=14, fontweight='bold')
        ax.set_xlabel('Coefficient Value')

    plt.tight_layout()
    plt.show()

## 🎯 Your Turn - Try Your Own Review!

Run the cell below and enter your own movie review to see the sentiment prediction:

In [None]:
# Interactive input
try:
    # Strip surrounding whitespace so a blank or space-only entry counts as "no review"
    user_review = input("💬 Enter your movie review: ").strip()
except (EOFError, NotImplementedError):
    # No stdin is available, e.g. when the notebook is executed non-interactively
    # ("Run All" through nbconvert/papermill). IPython's StdinNotImplementedError
    # subclasses NotImplementedError. Skip the cell rather than abort the run.
    user_review = ""

if user_review:
    sentiment, confidence = predict_sentiment(user_review, best_model, tfidf_vectorizer)
    
    print("\n" + "="*50)
    print(f"📊 ANALYSIS RESULTS")
    print("="*50)
    print(f"🎭 Sentiment: {sentiment}")
    print(f"💪 Confidence: {confidence:.1f}%")
    
    # Visual confidence bar
    bar_length = 30
    filled = int(bar_length * confidence / 100)
    bar = "█" * filled + "░" * (bar_length - filled)
    print(f"📊 [{bar}] {confidence:.1f}%")
    
    # Turn the numeric confidence into a short verbal verdict
    if confidence > 90:
        print("✨ Very confident prediction!")
    elif confidence > 70:
        print("👍 Fairly confident prediction.")
    else:
        print("🤔 Low confidence - the review might be ambiguous.")
    print("="*50)
else:
    print("⚠️ No review entered - skipping prediction.")

## 🎉 Conclusion

We've successfully built a sentiment analyzer for movie reviews that:
- Achieves ~85% accuracy on test data
- Can classify reviews as positive or negative with confidence scores
- Uses TF-IDF for feature extraction
- Compares Naive Bayes and Logistic Regression models

Feel free to experiment with different preprocessing techniques, feature extraction methods, or models to improve the performance!

---
Made with ❤️ by Shams Rupak