# Movie Review Sentiment Analysis


# Step 1: Setup and Data Loading

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import nltk

# Download NLTK resources (run once)
nltk.download('stopwords')
nltk.download('punkt')

# Load the IMDB dataset (example using a CSV file)
# Assuming you have a CSV with 'review' and 'sentiment' columns
# For actual IMDB dataset from Keras, see alternative below
try:
    df = pd.read_csv('IMDB.csv')
except:
    # Alternative: Using Keras's built-in IMDB dataset
    from tensorflow.keras.datasets import imdb
    (train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)
    
    # Convert back to text (simplified approach)
    word_index = imdb.get_word_index()
    reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
    
    def decode_review(text):
        return ' '.join([reverse_word_index.get(i - 3, '?') for i in text])
    
    # Create DataFrame
    train_reviews = [decode_review(review) for review in train_data]
    test_reviews = [decode_review(review) for review in test_data]
    
    df = pd.DataFrame({
        'review': train_reviews + test_reviews,
        'sentiment': np.concatenate([train_labels, test_labels])
    })
    df['sentiment'] = df['sentiment'].map({0: 'negative', 1: 'positive'})

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Step 2: Text Preprocessing

In [2]:
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    return ' '.join(tokens)

# Apply preprocessing
df['processed_review'] = df['review'].apply(preprocess_text)

# Step 3: Feature Extraction

In [3]:
# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)  # Limit to top 5000 features

# Fit and transform the data
X = tfidf.fit_transform(df['processed_review'])
y = df['sentiment']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Model Training

In [4]:
# Initialize models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Naive Bayes': MultinomialNB(),
     
}

# Train and evaluate each model
results = {}
for name, model in models.items():
    # Train
    model.fit(X_train, y_train)
    
    # Predict
    y_pred = model.predict(X_test)
    
    # Evaluate
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, pos_label='positive')
    
    results[name] = {'Accuracy': accuracy, 'F1-score': f1}
    
    print(f"{name}:")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  F1-score: {f1:.4f}")
    print()

# Convert results to DataFrame for better visualization
results_df = pd.DataFrame(results).T
print(results_df)

Logistic Regression:
  Accuracy: 0.8881
  F1-score: 0.8904

Naive Bayes:
  Accuracy: 0.8521
  F1-score: 0.8538

                     Accuracy  F1-score
Logistic Regression    0.8881  0.890391
Naive Bayes            0.8521  0.853810


# Step 5: Model Evaluation

In [5]:
# Select the best model (here we choose Logistic Regression)
best_model = LogisticRegression(max_iter=1000)
best_model.fit(X_train, y_train)

# Interface

In [None]:
def predict_sentiment(review):
    # Preprocess the input review
    processed_review = preprocess_text(review)
    
    # Transform using the same TF-IDF vectorizer
    review_vector = tfidf.transform([processed_review])
    
    # Predict
    prediction = best_model.predict(review_vector)[0]
    probability = np.max(best_model.predict_proba(review_vector))
    
    print(f"Predicted sentiment: {prediction} (confidence: {probability:.2f})")
    return prediction

# Example usage
user_review = input("Enter a movie review: ")
predict_sentiment(user_review)