In [17]:
# ====================================================================
# Section 1: Imports
# ====================================================================

import re

import joblib
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression # You can swap this for NaiveBayes
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Make sure the NLTK stopword corpus is present locally; the download is only
# attempted when the lookup fails, so repeat runs skip it.
def _ensure_nltk_resource(resource_path, package_name):
    """Download `package_name` via NLTK when `resource_path` cannot be found."""
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package_name)


_ensure_nltk_resource('corpora/stopwords', 'stopwords')

# English stopwords held in a set for membership tests during preprocessing.
STOPWORDS = set(stopwords.words('english'))

In [None]:
# 2. Define Preprocessing Function (CRUCIAL: Must match backend's function!)
def preprocess_text(text):
    """Normalize a raw review into a space-joined string of kept tokens.

    Steps, in order: replace HTML tags with a space, replace every character
    that is not an ASCII letter or whitespace with a space, collapse runs of
    whitespace, lowercase, split on whitespace, and drop tokens found in the
    module-level STOPWORDS set.

    Keep this in sync with the backend's preprocessing function.

    Args:
        text: The raw review. Anything that is not a `str` is treated as empty.

    Returns:
        The cleaned text as a single string; "" for non-string input or when
        no tokens survive.
    """
    if not isinstance(text, str):
        return ""

    # Tags such as <br /> become a space so neighbouring words stay separated.
    without_tags = re.sub(r'<.*?>', ' ', text)
    # Punctuation, digits and any other non-letter characters become spaces.
    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', without_tags)
    # Collapse whitespace runs and trim the ends.
    collapsed = re.sub(r'\s+', ' ', letters_only).strip()

    kept_tokens = []
    for token in collapsed.lower().split():
        if token in STOPWORDS:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

# 3. Load Data (full 50K IMDB)
try:
    df = pd.read_csv('IMDB Dataset.csv')
except FileNotFoundError:
    print("ERROR: 'IMDB Dataset.csv' not found. Ensure it is in the same folder.")
    raise

# Map sentiments to integer labels. Series.map turns every value that is not a
# key of the dict (typos, different casing, blanks) into NaN.
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Basic integrity checks
initial_len = len(df)
missing_sent = df['sentiment'].isna().sum()
print(f"Loaded {initial_len} rows. Missing sentiment labels: {missing_sent}")

# Rows without a usable label cannot be stratified on or used for training,
# so remove them here instead of letting NaN reach the split and the model.
if missing_sent:
    df = df.dropna(subset=['sentiment']).reset_index(drop=True)
    print(f"Dropped {missing_sent} rows with missing/unrecognized sentiment labels.")
if df.empty:
    raise ValueError("No rows with a valid 'positive'/'negative' sentiment label remain.")
# A column that held NaN is float; restore an integer label dtype.
df['sentiment'] = df['sentiment'].astype('int64')

# Preprocess reviews (non-string entries become "" inside preprocess_text)
df['cleaned_review'] = df['review'].apply(preprocess_text)

# Show class distribution
class_counts = df['sentiment'].value_counts()
print("Class distribution:")
print(class_counts)

# Optional: For quick experimentation you can uncomment sampling
# df = df.sample(10000, random_state=42)
# print(f"Sampled down to {len(df)} rows for faster iteration")

# Train/Test split (stratified): 80/20, fixed seed, class proportions of y are
# preserved in both parts.
X = df['cleaned_review']
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

In [None]:
# 6. Feature Extraction (TF-IDF)
# Unigrams and bigrams, vocabulary capped at 20,000 terms, and a term must
# appear in at least 2 training documents. The vectorizer is fit on the
# training split only and then applied unchanged to the test split.
tfidf_vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1, 2), min_df=2)
X_train_features = tfidf_vectorizer.fit_transform(X_train)
X_test_features = tfidf_vectorizer.transform(X_test)
print(f"Feature space size: {X_train_features.shape[1]}")

# 7. Model Training (Logistic Regression with class_weight to handle imbalance)
# n_jobs is intentionally not passed. The earlier
# `-1 if hasattr(LogisticRegression, 'n_jobs') else None` probe inspected the
# class, where constructor parameters are not attributes, so it always
# resolved to None, which is the default anyway.
model = LogisticRegression(max_iter=2000, class_weight='balanced')
model.fit(X_train_features, y_train)

# 8. Evaluation on the held-out test split
y_pred = model.predict(X_test_features)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Extra metrics: per-class precision/recall/F1 and the raw confusion counts
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=4))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Model Accuracy on Test Set: 100.00%


In [20]:
# 9. Save Artifacts to the 'backend/' folder
# Persist the fitted classifier and the fitted vectorizer side by side.
for artifact, target_path in (
    (model, '../backend/sentiment_model.pkl'),
    (tfidf_vectorizer, '../backend/tfidf_vectorizer.pkl'),
):
    joblib.dump(artifact, target_path)
print("\nModel and Vectorizer saved successfully to the 'backend/' folder.")


Model and Vectorizer saved successfully to the 'backend/' folder.


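## Next Steps
Retrain on a real labeled sentiment dataset in place of any synthetic data, then restart the kernel and re-run every cell so the recorded outputs and the saved artifacts reflect that model.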