In [1]:
# --- Step 0: Import Libraries --- #
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# --- Step 1: Load the Dataset --- #
print("Loading the IMDB dataset...")
file_path = '../datasets/IMDB Dataset.csv'
df = pd.read_csv(file_path)
print("Dataset loaded successfully.")

Loading the IMDB dataset...
Dataset loaded successfully.


In [3]:
# --- Step 2: Text Cleaning Function --- #
# We create a function to handle all our text cleaning steps in one place.
def clean_text(text):
    # 1. Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # 2. Remove non-alphabetic characters and convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    # 3. Remove extra whitespace
    text = ' '.join(text.split())
    return text

print("Applying text cleaning...")
# Apply the cleaning function to all reviews
df['cleaned_review'] = df['review'].apply(clean_text)
print("Text cleaning complete.")


Applying text cleaning...
Text cleaning complete.


In [4]:
# --- Step 3: Define Features (X) and Target (y) --- #
X = df['cleaned_review']
y = df['sentiment']


In [5]:
# --- Step 4: Split the Data --- #
# We split our data into a training set and a testing set.
print("Splitting data...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Data split complete.")

Splitting data...
Data split complete.


In [6]:
# --- Step 5: Vectorization using TF-IDF --- #
print("Vectorizing text data...")
# Create the TF-IDF Vectorizer
# We'll also remove common English "stop words" (like 'the', 'a', 'in')
# and only consider words that appear in at least 5 reviews (min_df=5).
vectorizer = TfidfVectorizer(stop_words='english', min_df=5)

# Fit the vectorizer on the TRAINING data and transform it
X_train_tfidf = vectorizer.fit_transform(X_train)

# Transform the TEST data using the SAME fitted vectorizer
X_test_tfidf = vectorizer.transform(X_test)
print("Vectorization complete.")

Vectorizing text data...
Vectorization complete.


In [7]:
# --- Step 6: Train the Logistic Regression Model --- #
print("Training the sentiment analysis model...")
# Logistic Regression is a great, fast baseline model for text classification.
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_tfidf, y_train)
print("Model training complete.")

Training the sentiment analysis model...


  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)


Model training complete.


In [8]:
# --- Step 7: Evaluate the Model --- #
print("\n--- Model Evaluation ---")
# Make predictions on the unseen test data
predictions = model.predict(X_test_tfidf)

# Calculate the accuracy
accuracy = accuracy_score(y_test, predictions)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Print a detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, predictions))


--- Model Evaluation ---
Model Accuracy: 89.22%

Classification Report:
              precision    recall  f1-score   support

    negative       0.90      0.88      0.89      4961
    positive       0.88      0.91      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [9]:
# --- Step 8: Test with a New, Custom Review --- #
print("\n--- Testing with a new review ---")
new_review_positive = "This was an amazing and brilliant film. I loved every minute of it!"
new_review_negative = "What a complete waste of time. The acting was terrible and the plot was boring."

# Clean and vectorize the new reviews
cleaned_positive = clean_text(new_review_positive)
vectorized_positive = vectorizer.transform([cleaned_positive])

cleaned_negative = clean_text(new_review_negative)
vectorized_negative = vectorizer.transform([cleaned_negative])

# Make predictions
prediction_pos = model.predict(vectorized_positive)
prediction_neg = model.predict(vectorized_negative)

print(f"Review: '{new_review_positive}'")
print(f"Predicted Sentiment: {prediction_pos[0]}\n")

print(f"Review: '{new_review_negative}'")
print(f"Predicted Sentiment: {prediction_neg[0]}")


--- Testing with a new review ---
Review: 'This was an amazing and brilliant film. I loved every minute of it!'
Predicted Sentiment: positive

Review: 'What a complete waste of time. The acting was terrible and the plot was boring.'
Predicted Sentiment: negative
