In [8]:
import re
import string
import nltk
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [9]:
df = pd.read_csv(r"C:\Users\neel\OneDrive\Desktop\dataset\movies.csv", encoding='ISO-8859-1')

In [10]:
def to_lower(text):
    return text.lower()

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_extra_spaces(text):
    return re.sub(r'\s+', ' ', text).strip()

def tokenize(text):
    return nltk.word_tokenize(text)

def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words]

def lemmatize(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in tokens]

def remove_urls(text):
    return re.sub(r'http\S+', '', text)

def remove_mentions(text):
    return re.sub(r'@\w+', '', text)

def remove_hashtags(text):
    return re.sub(r'#\w+', '', text)

def remove_special_characters(text):
    return re.sub(r'[^a-zA-Z0-9 ]', '', text)

def preprocess_text(text):
    text = to_lower(text)
    text = remove_numbers(text)
    text = remove_punctuation(text)
    text = remove_extra_spaces(text)
    text = remove_urls(text)
    text = remove_mentions(text)
    text = remove_hashtags(text)
    text = remove_special_characters(text)
    tokens = tokenize(text)
    tokens = remove_stopwords(tokens)
    tokens = lemmatize(tokens)
    return ' '.join(tokens)

df['processed_text'] = df['review'].apply(preprocess_text)

In [11]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['processed_text'])
y = df['sentiment']

def train_and_evaluate(models, X, y):
    """Train and test multiple models and evaluate accuracy."""
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(f'{name} Accuracy: {accuracy:.4f}')
        print(f'Classification Report for {name}:\n{classification_report(y_test, y_pred)}')

In [16]:
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression()
}

In [17]:

train_and_evaluate(models, X, y)

Naive Bayes Accuracy: 0.8614
Classification Report for Naive Bayes:
              precision    recall  f1-score   support

    negative       0.83      0.89      0.86      4745
    positive       0.89      0.83      0.86      5010

    accuracy                           0.86      9755
   macro avg       0.86      0.86      0.86      9755
weighted avg       0.86      0.86      0.86      9755

Logistic Regression Accuracy: 0.8938
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

    negative       0.90      0.88      0.89      4745
    positive       0.89      0.91      0.90      5010

    accuracy                           0.89      9755
   macro avg       0.89      0.89      0.89      9755
weighted avg       0.89      0.89      0.89      9755

