In [3]:
# Step 1: Import Required Libraries
import nltk
import pandas as pd
import numpy as np
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

In [4]:
# Step 2: Load the IMDB dataset
def load_imdb_data():
    """Fetch the NLTK ``movie_reviews`` corpus and return it as a DataFrame.

    Returns:
        pandas.DataFrame with two columns: ``review`` (the tokens of one
        corpus file joined with single spaces) and ``sentiment`` (the corpus
        category that file is listed under).
    """
    # The corpus has to be downloaded before its reader can be imported/used.
    nltk.download('movie_reviews')
    from nltk.corpus import movie_reviews

    # Iterate categories first, then the files of each category, pairing
    # every file's token list with the category it belongs to.
    rows = []
    for label in movie_reviews.categories():
        for file_id in movie_reviews.fileids(label):
            tokens = list(movie_reviews.words(file_id))
            rows.append((tokens, label))

    frame = pd.DataFrame(rows, columns=['review', 'sentiment'])
    # Collapse each token list into a single space-separated string.
    frame['review'] = frame['review'].apply(' '.join)
    return frame

# Preprocess the text
def preprocess_text(text):
    """Normalize a review string before vectorization.

    Lowercases the text, then removes every ASCII punctuation character
    (``string.punctuation``) and every run of digits.

    Args:
        text: The raw review string.

    Returns:
        The cleaned string. Whitespace is not touched, so removed tokens may
        leave consecutive or trailing spaces behind.
    """
    text = text.lower()
    # re.escape keeps every punctuation character literal inside the class.
    # Unescaped, the sequence backslash + ']' is parsed as an escaped bracket,
    # so the backslash itself was never stripped from the text.
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    # Raw string: a bare backslash-d in a normal literal is an invalid escape
    # sequence and makes Python emit a warning when the cell is compiled.
    text = re.sub(r'\d+', '', text)
    return text

# Load the corpus, then overwrite each raw review with its cleaned form
df = load_imdb_data()
df['review'] = [preprocess_text(raw_review) for raw_review in df['review']]

  text = re.sub('\d+', '', text)  # Remove numbers
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\raipr\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [5]:
# Step 3: Convert Text Data into Numerical Features
# TF-IDF over the cleaned reviews: English stop words removed and the
# vocabulary limited to 5000 features.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split, so its vocabulary and IDF weights also see the test reviews.
label_to_int = {'neg': 0, 'pos': 1}
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(df['review'])
y = df['sentiment'].map(label_to_int)  # binary target: 1 = 'pos', 0 = 'neg'

In [7]:
# Step 4: Split Data into Training and Testing Sets
# 30% of the rows are held out for testing; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [8]:
# Step 5: Train and Evaluate Models
def train_and_evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit ``model`` on the training split and report its test-split quality.

    Prints the model name, the accuracy to four decimals, and the
    classification report for the test predictions.

    Args:
        model: Estimator object exposing ``fit`` and ``predict``.
        X_train: Training features passed to ``model.fit``.
        X_test: Test features passed to ``model.predict``.
        y_train: Training labels passed to ``model.fit``.
        y_test: True labels the predictions are scored against.
        model_name: Label used in the printed header.

    Returns:
        The value of ``accuracy_score(y_test, predictions)``.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    accuracy = accuracy_score(y_test, predicted)
    # Build every output line first, then emit them in order.
    output_lines = (
        f'\nModel: {model_name}',
        f'Accuracy: {accuracy:.4f}',
        classification_report(y_test, predicted),
    )
    for line in output_lines:
        print(line)
    return accuracy

# Classifiers to compare, keyed by display name. They are only instantiated
# here; fitting happens inside train_and_evaluate_model.
models = dict([
    ('Multinomial Naive Bayes', MultinomialNB()),
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('Support Vector Machine', SVC()),
    ('K-Nearest Neighbors', KNeighborsClassifier(n_neighbors=5)),
])

In [9]:
# Fit and score every candidate, remembering its test accuracy by name
results = {}
for name, model in models.items():
    score = train_and_evaluate_model(
        model, X_train, X_test, y_train, y_test, model_name=name
    )
    results[name] = score

# Step 6: Compare Results
summary_df = pd.DataFrame(
    {
        'Model': list(results.keys()),
        'Accuracy': list(results.values()),
    }
)
print('\nSummary Table:\n', summary_df)


Model: Multinomial Naive Bayes
Accuracy: 0.7950
              precision    recall  f1-score   support

           0       0.77      0.84      0.81       302
           1       0.83      0.74      0.78       298

    accuracy                           0.80       600
   macro avg       0.80      0.79      0.79       600
weighted avg       0.80      0.80      0.79       600


Model: Logistic Regression
Accuracy: 0.8200
              precision    recall  f1-score   support

           0       0.82      0.82      0.82       302
           1       0.82      0.82      0.82       298

    accuracy                           0.82       600
   macro avg       0.82      0.82      0.82       600
weighted avg       0.82      0.82      0.82       600


Model: Support Vector Machine
Accuracy: 0.8117
              precision    recall  f1-score   support

           0       0.81      0.81      0.81       302
           1       0.81      0.81      0.81       298

    accuracy                           0

***Logistic Regression outperformed the other models with an accuracy of 82%. This is because it effectively handles high-dimensional sparse text data, avoids overfitting, and is well-suited for binary classification.***