In [1]:
# Step 1: Install necessary packages (run this in the first cell if not already installed)
!pip install nltk scikit-learn




In [2]:
# Step 2: Import libraries
import nltk
import random
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Fetch the `movie_reviews` corpus data through NLTK's downloader, then import
# the corpus reader that the dataset-loading step uses.
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews


[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\venka\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\movie_reviews.zip.


In [4]:
# Step 3: Load the NLTK `movie_reviews` corpus into a DataFrame
SHUFFLE_SEED = 42  # fixed so the row order (and hence the later train/test split) is repeatable

# One (token list, category) pair per corpus file, visited category by category.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# The list above is grouped by category, so mix it up. A private, seeded generator
# is used instead of the module-level `random.shuffle` so that re-running the
# notebook yields the same order without touching the global random state.
random.Random(SHUFFLE_SEED).shuffle(documents)

# Convert to DataFrame: tokens are re-joined with single spaces to form the review
# text; the label is 1 for the 'pos' category and 0 for any other category.
reviews = [" ".join(words) for words, _ in documents]
sentiments = [1 if label == 'pos' else 0 for _, label in documents]
df = pd.DataFrame({'review': reviews, 'sentiment': sentiments})
df.head()


Unnamed: 0,review,sentiment
0,"apparently , director tony kaye had a major ba...",1
1,running time approximately 1hr 40mins reviewed...,0
2,according to hollywood movies made in last few...,1
3,if chris farley had strapped some fake mutton ...,1
4,"allen , star of many a brian depalma movie in ...",1


In [5]:
# Step 4: Separate the review text (inputs) from the sentiment labels (targets),
# then hold out part of the data for evaluation
X, y = df['review'], df['sentiment']

# 20% of the rows go to the test set; random_state pins the shuffle used by the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [6]:
# Step 5: Turn the raw review text into TF-IDF feature matrices
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)

# The vectorizer is fitted on the training reviews only; the test reviews are
# merely transformed with the already-fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [7]:
# Step 6: Fit a logistic-regression classifier (library default settings)
# on the TF-IDF features of the training split
model = LogisticRegression()
model.fit(X=X_train_tfidf, y=y_train)


In [8]:
# Step 7: Score the trained model on the held-out test split
y_pred = model.predict(X_test_tfidf)

# Compute each metric once, then report them together.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", report)


Accuracy: 0.7975

Confusion Matrix:
 [[157  41]
 [ 40 162]]

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.79      0.79       198
           1       0.80      0.80      0.80       202

    accuracy                           0.80       400
   macro avg       0.80      0.80      0.80       400
weighted avg       0.80      0.80      0.80       400

