In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# 1.1 Import Libraries
import pandas as pd
import numpy as np
import re # Regular expressions for text cleaning
import nltk # Natural Language Toolkit
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer # Optional: for stemming
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib # To save/load models and vectorizer (optional)

# Download necessary NLTK data (if not already downloaded)
nltk.download('stopwords')

print("Libraries imported successfully.")

Libraries imported successfully.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
try:
    df = pd.read_csv('/content/drive/MyDrive/Machine_Learning/IMDB_Dataset.csv')
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: 'IMDB Dataset.csv' not found. Please download it from Kaggle and place it in the correct directory.")
    # You might want to exit or handle this error more robustly
    exit() # Simple exit if file not found

# Display basic info and first few rows
print("\nDataset Info:")
df.info()

print("\nFirst 5 rows:")
print(df.head())

# Map sentiment labels to numerical values
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

print("\nSentiment value counts (1: positive, 0: negative):")
print(df['sentiment'].value_counts())

Dataset loaded successfully.

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB

First 5 rows:
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive

Sentiment value counts (1: positive, 0: negative):
sentiment
1    25000
0    25000
Name: count, dtype: int64


In [6]:
stop_words = set(stopwords.words('english'))
# ps = PorterStemmer() # Initialize stemmer if using

def preprocess_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove punctuation and numbers, keep only letters
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    # Convert to lowercase
    text = text.lower()
    # Tokenize (split into words) and remove stop words
    words = text.split()
    words = [word for word in words if word not in stop_words]
    # Optional: Stemming
    # words = [ps.stem(word) for word in words]
    # Join words back into a string
    text = ' '.join(words)
    return text

# Apply the preprocessing function to the 'review' column
print("\nPreprocessing text data... (This may take a few minutes)")
# Create a new column for cleaned reviews
df['cleaned_review'] = df['review'].apply(preprocess_text)
print("Text preprocessing complete.")

# Display original vs cleaned review for one example
print("\nExample Preprocessing:")
print("Original:", df['review'][0][:200] + "...") # Show first 200 chars
print("Cleaned:", df['cleaned_review'][0][:200] + "...")


Preprocessing text data... (This may take a few minutes)
Text preprocessing complete.

Example Preprocessing:
Original: One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me abo...
Cleaned: one reviewers mentioned watching oz episode youll hooked right exactly happened methe first thing struck oz brutality unflinching scenes violence set right word go trust show faint hearted timid show ...


In [7]:
X = df['cleaned_review']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
# stratify=y ensures the proportion of positive/negative reviews is similar in train and test sets

print(f"\nData Split:")
print(f"Training set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")

# 3.2 TF-IDF Vectorization
# Initialize TF-IDF Vectorizer
# max_features limits the vocabulary size to the most frequent terms, useful for large datasets
tfidf_vectorizer = TfidfVectorizer(max_features=5000) # You can tune max_features

# Fit the vectorizer on the training data and transform the training data
print("\nFitting TF-IDF Vectorizer and transforming training data...")
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the test data using the *same* fitted vectorizer
print("Transforming test data...")
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print("TF-IDF transformation complete.")
print(f"Shape of TF-IDF matrix (Train): {X_train_tfidf.shape}") # (num_samples, num_features)
print(f"Shape of TF-IDF matrix (Test): {X_test_tfidf.shape}")


Data Split:
Training set size: 37500 samples
Testing set size: 12500 samples

Fitting TF-IDF Vectorizer and transforming training data...
Transforming test data...
TF-IDF transformation complete.
Shape of TF-IDF matrix (Train): (37500, 5000)
Shape of TF-IDF matrix (Test): (12500, 5000)


In [8]:
log_reg = LogisticRegression(C=1.0, max_iter=1000, random_state=42, solver='liblinear') # liblinear is good for binary classification with larger datasets

print("\nTraining Logistic Regression model...")
log_reg.fit(X_train_tfidf, y_train)
print("Model training complete.")


Training Logistic Regression model...
Model training complete.


In [9]:
print("\nEvaluating model on the test set...")
y_pred = log_reg.predict(X_test_tfidf)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.4f}")

# Display confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
# Format:
# [[TN, FP],
#  [FN, TP]]

# Display classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Negative (0)', 'Positive (1)']))


Evaluating model on the test set...

Accuracy: 0.8882

Confusion Matrix:
[[5501  749]
 [ 648 5602]]

Classification Report:
              precision    recall  f1-score   support

Negative (0)       0.89      0.88      0.89      6250
Positive (1)       0.88      0.90      0.89      6250

    accuracy                           0.89     12500
   macro avg       0.89      0.89      0.89     12500
weighted avg       0.89      0.89      0.89     12500



In [10]:
new_reviews = [
    "This movie was absolutely fantastic! The acting was superb and the storyline kept me engaged throughout.",
    "What a waste of time. The plot was predictable and the characters were incredibly boring. I would not recommend this film.",
    "It was an okay movie, not great but not terrible either. Some good moments but overall quite average."
]

print("\n--- Testing on New Reviews ---")

# Preprocess the new reviews
cleaned_new_reviews = [preprocess_text(review) for review in new_reviews]
print("Cleaned Reviews:", cleaned_new_reviews)

# Transform using the fitted TF-IDF vectorizer
new_reviews_tfidf = tfidf_vectorizer.transform(cleaned_new_reviews)
print("Shape of TF-IDF for new reviews:", new_reviews_tfidf.shape)

# Predict sentiment
new_predictions = log_reg.predict(new_reviews_tfidf)
sentiment_labels = {1: 'Positive', 0: 'Negative'}

# Print results
for review, prediction in zip(new_reviews, new_predictions):
    print(f"\nReview: \"{review[:100]}...\"")
    print(f"Predicted Sentiment: {sentiment_labels[prediction]} ({prediction})")

print("\n--- Project Complete ---")


--- Testing on New Reviews ---
Cleaned Reviews: ['movie absolutely fantastic acting superb storyline kept engaged throughout', 'waste time plot predictable characters incredibly boring would recommend film', 'okay movie great terrible either good moments overall quite average']
Shape of TF-IDF for new reviews: (3, 5000)

Review: "This movie was absolutely fantastic! The acting was superb and the storyline kept me engaged through..."
Predicted Sentiment: Positive (1)

Review: "What a waste of time. The plot was predictable and the characters were incredibly boring. I would no..."
Predicted Sentiment: Negative (0)

Review: "It was an okay movie, not great but not terrible either. Some good moments but overall quite average..."
Predicted Sentiment: Negative (0)

--- Project Complete ---
