# Sentiment Analysis on the IMDB Movie Review Dataset

In [6]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sb
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK English stop-word list is available (a download is only
# needed the first time this runs on a machine).
# The list contains very common words like 'the', 'a', 'is', etc.
# The lookup below is used purely as a probe: LookupError is the only
# exception handled, and it triggers the download of the 'stopwords' corpus.
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')

ModuleNotFoundError: No module named 'nltk'

In [2]:
# Load the IMDB review data; the CSV path is relative, so the file is looked
# up in the notebook's working directory.
df = pd.read_csv("IMDB_Dataset.csv")

# Preview the first five rows to confirm the file was parsed as expected.
print(df.head(n=5))

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [None]:
# Structural overview of the frame (columns, non-null counts, dtypes).
print("\nDataset Information:")
df.info()

# Check the distribution of the 'sentiment' column, i.e. how many reviews
# fall under each label.
print("\nSentiment Distribution:")
sentiment_counts = df['sentiment'].value_counts()
print(sentiment_counts)


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB

Sentiment Distribution:
sentiment
positive    25000
negative    25000
Name: count, dtype: int64


### Data Preprocessing and Cleaning

In [5]:
# Collect the English stop words into a set (not a list) so that the
# per-word membership test inside clean_text is a hash lookup.
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

def clean_text(text, stopword_set=None):
    """
    Normalize one raw review into a space-separated string of kept tokens.

    Steps, in order:
      1. Replace HTML tags (anything matching ``<...>``) with a space.
      2. Lowercase the text.
      3. Delete every character that is not a lowercase letter, a digit or
         whitespace.
      4. Split on whitespace and drop tokens found in the stop-word set.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (e.g. ``None`` or a NaN coming
        from a DataFrame column) is treated as an empty review.
    stopword_set : collection of str, optional
        Tokens to drop in step 4. When omitted, the notebook-level
        ``stop_words`` set is used, so existing ``clean_text(text)`` calls
        behave as before.

    Returns
    -------
    str
        The kept tokens joined by single spaces; ``''`` if nothing is left.

    Notes
    -----
    Punctuation is deleted *before* the stop-word filter runs, so a
    contraction such as "don't" reaches the filter as "dont" and is only
    removed if that apostrophe-free form is itself in the stop-word set.
    """
    # re.sub raises TypeError on non-strings (float NaN, None); treat such
    # values as an empty review instead of aborting a whole .apply() run.
    if not isinstance(text, str):
        return ''

    if stopword_set is None:
        stopword_set = stop_words

    # Replace tags with a space, not '': with '' the words on either side of
    # a tag get fused ("movie.<br /><br />Loved" -> "movieloved").
    text = re.sub(r'<.*?>', ' ', text)

    # Convert to lowercase
    text = text.lower()

    # Remove special characters (the text is already lowercase, so a-z suffices)
    text = re.sub(r'[^a-z0-9\s]', '', text)

    # Remove stopwords; split() also collapses the extra whitespace left by
    # the substitutions above.
    return ' '.join(word for word in text.split() if word not in stopword_set)

In [6]:
# Run clean_text over every review and store the result in a new column,
# leaving the raw 'review' column untouched.
print("\nStarting text cleaning...(This may take a while)")
cleaned_reviews = df['review'].apply(clean_text)
df['cleaned_review'] = cleaned_reviews
print("Text cleaning complete!")


Starting text cleaning...(This may take a while)
Text cleaning complete!


In [7]:
# Show the raw and cleaned text side by side for the first rows.
print("\nCleaning Data:")
comparison_columns = ['review', 'cleaned_review']
print(df[comparison_columns].head())


Cleaning Data:
                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                      cleaned_review  
0  one reviewers mentioned watching 1 oz episode ...  
1  wonderful little production filming technique ...  
2  thought wonderful way spend time hot summer we...  
3  basically theres family little boy jake thinks...  
4  petter matteis love time money visually stunni...  


### Convert Text to Numerical Format (Vectorization)

In [8]:
# Convert the labels to numerical values (positive -> 1, negative -> 0)
label_to_id = {'positive': 1, 'negative': 0}
df['sentiment_numeric'] = df['sentiment'].map(label_to_id)

# Series.map turns every label that is missing from the mapping into NaN.
# Fail fast with a readable message instead of letting NaN targets reach the
# stratified split or the classifier.
unmapped_mask = df['sentiment_numeric'].isna()
if unmapped_mask.any():
    unexpected_labels = sorted(df.loc[unmapped_mask, 'sentiment'].astype(str).unique())
    raise ValueError(f"Unexpected sentiment labels: {unexpected_labels}")

# Split dataset as features (X) and target (y)
X = df['cleaned_review']
y = df['sentiment_numeric']

# Split the dataset into training and testing sets:
# 20% held out for testing, fixed seed for reproducibility, and stratified
# on y so both splits keep the same label proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

### Creating the Model and Training

In [9]:
# TF-IDF vectorizer; max_features=5000 limits the vocabulary to the 5000
# most common words.
vectorizer = TfidfVectorizer(max_features=5000)

# Logistic regression classifier with its default settings.
log_reg = LogisticRegression()

# Chain both steps so fitting and predicting always vectorize first.
pipeline_steps = [
    ('vectorizer', vectorizer),
    ('log_reg', log_reg),
]
pipeline = Pipeline(pipeline_steps)

# Fit the whole pipeline on the training split.
print("\nTraining model...")
pipeline.fit(X_train, y_train)
print("Model training complete!")


Training model...
Model training complete!


### Evaluating the Performance of the Model

In [10]:
# Predict labels for the held-out test split.
y_pred = pipeline.predict(X_test)

# Overall accuracy on the test split.
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy of Model: {accuracy:.3f}")

# Per-class precision / recall / F1. The display names are listed in the
# order of the numeric labels: 0 -> 'Negative', 1 -> 'Positive'.
class_names = ['Negative', 'Positive']
print("\nClassification Report:")
report = classification_report(y_test, y_pred, target_names=class_names)
print(report)


Accuracy of Model: 0.890

Classification Report:
              precision    recall  f1-score   support

    Negative       0.90      0.88      0.89      5000
    Positive       0.88      0.90      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



### Testing the Model with New Reviews

In [13]:
# New reviews for testing the model
new_reviews = [
    "This movie was amazing, I loved every minute of it.",
    "The plot was confusing and the acting was bad.",
    "The movie was okay, nothing special.",
    "The acting was great, but the plot was bad.",
    "The movie was great, I loved it!"
]

# The pipeline was fitted on the 'cleaned_review' column, i.e. on text that
# had already gone through clean_text. Apply the same preprocessing here so
# the model sees new text in the form it was trained on (raw text would be
# tokenized differently, e.g. around HTML tags and contractions).
cleaned_new_reviews = [clean_text(review) for review in new_reviews]

# Make predictions on the new reviews
predicted_sentiments = pipeline.predict(cleaned_new_reviews)

# Get the predicted probabilities; one row per review, with column 0 for
# label 0 (negative) and column 1 for label 1 (positive).
predicted_propabilities = pipeline.predict_proba(cleaned_new_reviews)

# Report against the original (uncleaned) text so the output stays readable.
for review, sentiment, probs in zip(new_reviews, predicted_sentiments, predicted_propabilities):
    sentiment_label = 'Positive' if sentiment == 1 else 'Negative'
    print(f"\nReview: {review}")
    print(f"-->Predicted Sentiment: {sentiment_label}")
    print(f"    (Confidence: Negative={probs[0]:.2f}, Positive={probs[1]:.2f})")



Review: This movie was amazing, I loved every minute of it.
-->Predicted Sentiment: Positive
    (Confidence: Negative=0.01, Positive=0.99)

Review: The plot was confusing and the acting was bad.
-->Predicted Sentiment: Negative
    (Confidence: Negative=0.99, Positive=0.01)

Review: The movie was okay, nothing special.
-->Predicted Sentiment: Negative
    (Confidence: Negative=0.97, Positive=0.03)

Review: The acting was great, but the plot was bad.
-->Predicted Sentiment: Negative
    (Confidence: Negative=0.89, Positive=0.11)

Review: The movie was great, I loved it!
-->Predicted Sentiment: Positive
    (Confidence: Negative=0.00, Positive=1.00)
