In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [9]:
# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\prana\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
file_path = r"C:\Users\prana\Downloads\Aditi\Pranav\Portfolio_Projects\Sentiment Analysis\IMDB_Dataset.csv"
data = pd.read_csv(file_path)

In [11]:
# Data Cleaning and Preprocessing
def clean_text(text):
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # Remove special characters and punctuation
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])  # Remove stop words
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])  # Lemmatization
    return text

In [13]:
# Feature Engineering
X = data['review']
y = data['sentiment']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [16]:
# Model Selection and Hyperparameter Tuning
parameters = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30]
}
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf, parameters, cv=5, n_jobs=-1)
grid_search.fit(X_train_vec, y_train)
best_rf = grid_search.best_estimator_

# Model Evaluation
y_pred = best_rf.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

Random Forest accuracy: 0.8606
              precision    recall  f1-score   support

    negative       0.86      0.86      0.86      4961
    positive       0.86      0.86      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

