# 🎬 Task 2: IMDB Sentiment Analysis using Logistic Regression
In this task, we classify IMDB movie reviews as positive or negative using Logistic Regression after text cleaning and TF-IDF vectorization.

In [3]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [2]:
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Taha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Taha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Taha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## 📥 Load the Dataset
Ensure 'IMDB.csv' is in the working directory with columns: `review`, `sentiment`.

In [4]:
# Load and check data
df = pd.read_csv("IMDB.csv")
df = df[['review', 'sentiment']]
print(df.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


## 🧼 Label Encoding & Cleaning
- Map sentiment to binary (positive: 1, negative: 0)
- Clean text using regex, remove HTML, tokenize, and lemmatize

In [11]:
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})
df.dropna(subset=['label'], inplace=True)
print(df['label'].value_counts())

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
tokenizer = TreebankWordTokenizer()

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'<.*?>', '', text)
    tokens = tokenizer.tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

# Apply cleaning
df['clean_text'] = df['review'].apply(clean_text)
print(df[['clean_text', 'label']].head())

label
1    25000
0    25000
Name: count, dtype: int64
                                          clean_text  label
0  one reviewer mentioned watching oz episode you...      1
1  wonderful little production br br filming tech...      1
2  thought wonderful way spend time hot summer we...      1
3  basically there family little boy jake think t...      0
4  petter matteis love time money visually stunni...      1


## 🔠 TF-IDF Vectorization and Splitting

In [15]:
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['clean_text'])
y = df['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Classification Report:\n")
print(classification_report(y_test, y_pred))

Classification Report:

              precision    recall  f1-score   support

           0       0.89      0.87      0.88      4961
           1       0.88      0.90      0.89      5039

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



## 🧪 Predict Sentiment on Custom Text

In [16]:
def predict_sentiment(text):
    cleaned = clean_text(text)
    vectorized = vectorizer.transform([cleaned])
    prediction = model.predict(vectorized)[0]
    return "Positive" if prediction == 1 else "Negative"

# Example Predictions
print(predict_sentiment("I hated this movie. It was boring and slow."))
print(predict_sentiment("This movie was amazing, the acting was top-notch!"))

Negative
Positive
