# Title: Sentiment Analysis
# Author: Touseef Asif
# Objective : To build a Python-based Sentiment Analysis model for classifying IMDB reviews as positive or negative.

# Task 02

# Import Required Libraries

In [7]:
# Import libraries
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
import nltk

# Download necessary NLTK resources
nltk.download('stopwords')

# Display stopwords for reference
stop_words = set(stopwords.words('english'))
print("Sample stopwords:", list(stop_words)[:10])

Sample stopwords: ['mightn', 'our', 'an', 'further', 'theirs', 'isn', "she's", 'didn', 'so', "should've"]


[nltk_data] Downloading package stopwords to C:\Users\Touseef
[nltk_data]     Asif\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load the Dataset

In [3]:
# Step 2: Download NLTK Resources
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to C:\Users\Touseef
[nltk_data]     Asif\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Touseef
[nltk_data]     Asif\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Load the Dataset

In [8]:
# Load the dataset (update the file path)
file_path = "IMDB Dataset.csv"  # Update with your dataset path
df = pd.read_csv(file_path)

# Display the first few rows
print("First 5 rows of the dataset:")
print(df.head())

# Check for null values and basic info
print("\nDataset Info:")
df.info()

# Check for null values
print("\nNumber of null values in each column:")
print(df.isnull().sum())

First 5 rows of the dataset:
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB

Number of null values in each column:
review       0
sentiment    0
dtype: int64


# Preprocessing the Text

**Text preprocessing involves cleaning and preparing the text for analysis.**

**1**. Convert Text to Lowercase
**2**. Remove Stopwords and Special Characters
**3**. Tokenization (splitting text into words)

In [9]:
# Step 1: Convert text to lowercase
df['review'] = df['review'].str.lower()

# Step 2: Remove special characters
df['review'] = df['review'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))

# Step 3: Remove stopwords
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

df['review'] = df['review'].apply(remove_stopwords)

# Display preprocessed data
print("Preprocessed text sample:")
print(df['review'].head())

Preprocessed text sample:
0    one reviewers mentioned watching oz episode yo...
1    wonderful little production br br filming tech...
2    thought wonderful way spend time hot summer we...
3    basically theres family little boy jake thinks...
4    petter matteis love time money visually stunni...
Name: review, dtype: object


# Feature Engineering (Convert Text to Numerical Format)

In [10]:
# Convert text data to numerical format using CountVectorizer
vectorizer = CountVectorizer(max_features=5000)  # Use the top 5000 features
X = vectorizer.fit_transform(df['review']).toarray()

# Display feature names and sample data
print("Top 10 features:", vectorizer.get_feature_names_out()[:10])
print("\nShape of feature matrix:", X.shape)

# Target variable (positive or negative)
y = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)  # 1: Positive, 0: Negative

Top 10 features: ['aaron' 'abandoned' 'abc' 'abilities' 'ability' 'able' 'aboutbr'
 'absence' 'absent' 'absolute']

Shape of feature matrix: (50000, 5000)


# Splitting the Dataset

In [11]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display split information
print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)

Training data shape: (40000, 5000)
Testing data shape: (10000, 5000)


# Model Training

In [12]:
# Train a Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Display model training status
print("Model trained successfully!")

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model trained successfully!


# Model Evaluation

In [13]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Optional: Display a classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.88

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.87      0.87      4961
           1       0.87      0.88      0.88      5039

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



# Predict Sentiment for a New Review

In [29]:
# Function to preprocess and predict sentiment for new reviews
def predict_sentiment(review):
    # Preprocess the review
    review = review.lower()
    review = re.sub(r'[^a-zA-Z\s]', '', review)
    review = remove_stopwords(review)
    review_vectorized = vectorizer.transform([review]).toarray()
    
    # Predict sentiment
    prediction = model.predict(review_vectorized)
    return "Positive" if prediction[0] == 1 else "Negative"

# Example: Test the function
new_review = "This movie was absolutely amazing! I loved it."
print("New Review Sentiment:", predict_sentiment(new_review))

New Review Sentiment: Positive
