In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import re

In [22]:
# Load the dataset
file_path = r"C:\Users\prana\Downloads\Aditi\Pranav\Portfolio_Projects\Sentiment Analysis\IMDB_Dataset.csv"
data = pd.read_csv(file_path)
# Define a function for data cleaning
def clean_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters, punctuation, and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text)
    return text

# Apply data cleaning to the sentiment column
data['sentiment'] = data['sentiment'].apply(clean_text)

In [23]:
# Preprocessing
X = data['review']
y = data['sentiment']

In [24]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
# Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [26]:
# Model training and evaluation
models = {
    "Multinomial Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
}

In [27]:
for model_name, model in models.items():
    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} accuracy: {accuracy}")

Multinomial Naive Bayes accuracy: 0.8517
Logistic Regression accuracy: 0.8959
