# 📊 IMDb Sentiment Analysis - Data Preprocessing

This notebook demonstrates the complete data preprocessing pipeline for IMDb sentiment analysis, including:
- Data loading and validation
- Text cleaning and preprocessing
- TF-IDF vectorization
- Exploratory data analysis
- Feature engineering


## 1. Import Libraries and Setup


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Set style
# Global plotting defaults for every figure drawn later in the notebook.
# NOTE(review): the order of these two calls looks significant — applying a matplotlib
# style sheet after choosing the seaborn palette would presumably reset the colour
# cycle again; confirm before reordering.
plt.style.use('default')
sns.set_palette("husl")

# Add src to path so the project's preprocessing module can be imported.
# The notebook is expected to be run from notebooks/, which makes ../src the first
# choice; the alternatives cover a kernel started from some other directory.
import sys
import os

current_dir = os.getcwd()
src_path = os.path.join(current_dir, '..', 'src')

# isdir (not exists) so that a stray *file* called "src" is never put on sys.path.
if os.path.isdir(src_path):
    resolved_src = src_path
else:
    print(f"⚠️  Warning: {src_path} not found, trying alternative paths...")
    alt_paths = [
        os.path.join(current_dir, 'src'),
        os.path.join(os.path.dirname(current_dir), 'src'),
        'src'
    ]
    # First existing directory wins; None when nothing matches.
    resolved_src = next((path for path in alt_paths if os.path.isdir(path)), None)

if resolved_src is None:
    print("❌ Could not find src directory")
else:
    # Re-running this cell must not keep stacking the same entry onto sys.path.
    if resolved_src not in sys.path:
        sys.path.insert(0, resolved_src)
    print(f"✅ Added {resolved_src} to Python path")

# Import with error handling.
# PREPROCESSING_AVAILABLE records the outcome so the closing message (and any later
# cell that wants to check) reflects what actually happened.
try:
    from preprocessing import TextPreprocessor, validate_data, load_imdb_data
    PREPROCESSING_AVAILABLE = True
    print("✅ Successfully imported preprocessing modules!")
except ImportError as e:
    PREPROCESSING_AVAILABLE = False
    print(f"❌ Import error: {e}")
    print("Please ensure you're running this notebook from the notebooks/ directory")
    print("and that the src/ directory exists with the required Python files.")
    # Deliberately not re-raised: the notebook is allowed to continue without it.
    print("Continuing with basic imports...")

if PREPROCESSING_AVAILABLE:
    print("✅ Libraries imported successfully!")
else:
    # Do not claim success when the project module is missing.
    print("⚠️  Libraries imported, but the preprocessing module is unavailable; "
          "cells that use it will fail.")


✅ Libraries imported successfully!


## 2. Load and Explore Dataset


In [None]:
# Load the IMDb dataset
# The path is relative to the directory the notebook is run from.
dataset_path = "../IMDB Dataset.csv"
df = load_imdb_data(dataset_path)

# Every later cell indexes into `df`, so stop here with a clear message instead of
# letting the next cell fail with an unrelated-looking error on a None value.
if df is None:
    print("❌ Failed to load dataset. Please check the file path.")
    raise RuntimeError(
        f"Could not load dataset from {dataset_path!r} "
        f"(resolved to {os.path.abspath(dataset_path)!r})"
    )

print("✅ Dataset loaded successfully!")
print(f"📊 Dataset shape: {df.shape}")
print(f"📋 Columns: {list(df.columns)}")
print(f"🔍 Data types:\n{df.dtypes}")


## 3. Data Validation and Quality Assessment


In [None]:
# Data validation report: missing values, duplicates, class balance and review lengths.
print("🔍 Data Validation Report")
print("=" * 50)

# Missing values in the two columns the notebook works with
print("📊 Missing Values Analysis:")
for column in ('review', 'sentiment'):
    n_missing = df[column].isna().sum()
    print(f"Missing values in '{column}': {n_missing}")

# Fully duplicated rows
duplicates = df.duplicated().sum()
print("\n🔄 Duplicate Analysis:")
print(f"Total duplicates: {duplicates}")

# Label counts, then each label's share of all rows
print("\n📈 Class Distribution:")
class_counts = df['sentiment'].value_counts()
print(class_counts)
print("\nClass balance:")
n_rows = len(df)
for sentiment, count in class_counts.items():
    percentage = (count / n_rows) * 100
    print(f"{sentiment}: {count} ({percentage:.1f}%)")

# Character count per review; kept as a column on df because later cells read it
print("\n📏 Text Length Statistics:")
df['text_length'] = df['review'].str.len()
lengths = df['text_length']
print(lengths.describe())

# Reviews at the extremes of the length distribution
short_reviews = lengths.lt(10).sum()
long_reviews = lengths.gt(10000).sum()
print("\n⚠️  Quality Issues:")
print(f"Very short reviews (< 10 chars): {short_reviews}")
print(f"Very long reviews (> 10,000 chars): {long_reviews}")

# Project-level validation helper imported from the preprocessing module
print("\n✅ Data Validation:")
is_valid = validate_data(df)
print(f"Dataset is valid: {is_valid}")


## 4. Exploratory Data Analysis (EDA)


In [None]:
# Print a couple of raw reviews per class for a quick visual check.
print("📝 Sample Reviews:")
print("=" * 80)

# First two reviews of each sentiment
is_positive = df['sentiment'] == 'positive'
is_negative = df['sentiment'] == 'negative'
positive_samples = df.loc[is_positive, 'review'].head(2)
negative_samples = df.loc[is_negative, 'review'].head(2)

for heading, samples in (
    ("🌟 Positive Reviews:", positive_samples),
    ("😞 Negative Reviews:", negative_samples),
):
    print(heading)
    for position, text in enumerate(samples, start=1):
        # Only the first 200 characters of each review are shown
        print(f"{position}. {text[:200]}...")
        print()


In [None]:
# Visualize class distribution (pie) and review length by sentiment (histogram)
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# One colour per sentiment label, shared by both panels. Colours are looked up by label
# rather than by position so that the pie (ordered by value_counts) and the histogram
# (fixed positive/negative order) always agree.
sentiment_colors = {'positive': 'lightblue', 'negative': 'lightcoral'}

# Class distribution pie chart
class_counts = df['sentiment'].value_counts()
pie_colors = [sentiment_colors.get(label, 'lightgray') for label in class_counts.index]
axes[0].pie(class_counts.values, labels=class_counts.index, autopct='%1.1f%%',
            colors=pie_colors, startangle=90)
axes[0].set_title('Class Distribution', fontsize=14, fontweight='bold')

# Text length distribution by sentiment.
# Reads the 'text_length' column that the data-validation cell adds to df.
positive_lengths = df[df['sentiment'] == 'positive']['text_length']
negative_lengths = df[df['sentiment'] == 'negative']['text_length']

# Bin only the range that is displayed; without `range` the 50 bins would be spread
# over the full length range and just a handful would fall inside the visible window.
length_limit = 2000
axes[1].hist([positive_lengths, negative_lengths], bins=50, range=(0, length_limit),
             alpha=0.7, label=['Positive', 'Negative'],
             color=[sentiment_colors['positive'], sentiment_colors['negative']])
axes[1].set_xlabel('Text Length (characters)')
axes[1].set_ylabel('Frequency')
axes[1].set_title('Text Length Distribution by Sentiment', fontsize=14, fontweight='bold')
axes[1].legend()
axes[1].set_xlim(0, length_limit)  # Focus on reasonable range

plt.tight_layout()
plt.show()

# Print statistics (computed on all reviews, not only the plotted range)
print("📊 Text Length Statistics by Sentiment:")
print("=" * 50)
print("Positive Reviews:")
print(positive_lengths.describe())
print("\nNegative Reviews:")
print(negative_lengths.describe())


## 5. Text Preprocessing Pipeline


In [None]:
# Build the text preprocessor and show what its cleaning step does to a few strings.
print("🔧 Initializing Text Preprocessor")
print("=" * 50)

# TF-IDF settings: feature cap, minimum document frequency, unigrams and bigrams
tfidf_settings = {
    'max_features': 5000,
    'min_df': 2,
    'ngram_range': (1, 2),
}
preprocessor = TextPreprocessor(**tfidf_settings)

# Echo the values back from the object itself rather than from the dict above
print("✅ Preprocessor initialized with:")
print(f"   - Max features: {preprocessor.max_features}")
print(f"   - Min document frequency: {preprocessor.min_df}")
print(f"   - N-gram range: {preprocessor.ngram_range}")

# Hand-written examples containing HTML tags, a URL, contractions and punctuation
print("\n🧹 Text Cleaning Demonstration:")
print("=" * 50)

sample_reviews = [
    "<p>This movie was <b>fantastic</b>! It's amazing! Check out http://example.com</p>",
    "Don't watch this terrible movie. It's awful and boring.",
    "Outstanding performances by all actors. Highly recommended!"
]

for sample_number, raw_text in enumerate(sample_reviews, start=1):
    print(f"Sample {sample_number}:")
    print(f"Original:  {raw_text}")
    cleaned = preprocessor.clean_text(raw_text)
    print(f"Cleaned:   {cleaned}")
    print()


In [None]:
# Apply preprocessing to the entire dataset
print("🔄 Applying Preprocessing Pipeline")
print("=" * 50)

# Preprocess the dataframe
# NOTE(review): assumes preprocess_dataframe returns a new frame and leaves
# df['review'] untouched — confirm against the preprocessing module.
df_processed = preprocessor.preprocess_dataframe(
    df,
    text_column='review',
    label_column='sentiment',
    remove_stopwords_flag=True
)

n_removed = len(df) - len(df_processed)
print("\n✅ Preprocessing completed!")
print(f"Original dataset size: {len(df)}")
print(f"Processed dataset size: {len(df_processed)}")
print(f"Removed: {n_removed} samples")

# Show before and after examples
print("\n📝 Before vs After Preprocessing Examples:")
print("=" * 60)

# Pairing row N of df with row N of df_processed is only correct while no rows were
# dropped. When rows were dropped and the processed frame still carries the original
# index labels (i.e. it is not a fresh 0..n-1 range), pair by label instead.
pair_by_label = (
    n_removed != 0
    and df.index.is_unique
    and not df_processed.index.equals(pd.RangeIndex(len(df_processed)))
)
if n_removed != 0 and not pair_by_label:
    # The original rows cannot be identified reliably; fall back to position and say so.
    print("⚠️  Rows were removed and the processed index cannot be mapped back; "
          "the pairs below are matched by position and may not be the same review.")

sample_indices = [0, 100, 1000]  # Sample different reviews
for idx in sample_indices:
    if idx >= len(df) or idx >= len(df_processed):
        continue

    processed_row = df_processed.iloc[idx]
    if pair_by_label:
        row_label = df_processed.index[idx]
        if row_label not in df.index:
            continue
        original = df.loc[row_label, 'review']
    else:
        original = df.iloc[idx]['review']
    processed = processed_row['review']
    sentiment = processed_row['sentiment']

    print(f"Sample {idx} ({sentiment}):")
    print(f"Original:  {original[:150]}...")
    print(f"Processed: {processed[:150]}...")
    print()


## 6. Feature Engineering with TF-IDF


In [None]:
# Run the end-to-end pipeline on the raw frame and report the resulting split.
print("🎯 Preparing Data for Training")
print("=" * 50)

# Column names, split size, seed and stop-word handling passed to prepare_data
pipeline_options = dict(
    text_column='review',
    label_column='sentiment',
    test_size=0.2,
    random_state=42,
    remove_stopwords_flag=True,
)
X_train, X_test, y_train, y_test, fitted_preprocessor = preprocessor.prepare_data(
    df, **pipeline_options
)

print("✅ Data preparation completed!")
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Number of features: {X_train.shape[1]}")
print(f"Training labels shape: {y_train.shape}")
print(f"Test labels shape: {y_test.shape}")

# Class balance of each side of the split
print("\n📊 Class Distribution in Train/Test Sets:")
print("=" * 50)

# The label encoder's classes_ are indexed with the encoded values found in y
class_names = fitted_preprocessor.label_encoder.classes_
print(f"Class names: {class_names}")


def print_class_shares(class_ids, class_totals, n_samples, names):
    """Print one '  name: count (share%)' line per encoded class."""
    for class_id, total in zip(class_ids, class_totals):
        share = (total / n_samples) * 100
        print(f"  {names[class_id]}: {total} ({share:.1f}%)")


# Training set
unique_train, counts_train = np.unique(y_train, return_counts=True)
print("Training set distribution:")
print_class_shares(unique_train, counts_train, len(y_train), class_names)

# Test set
unique_test, counts_test = np.unique(y_test, return_counts=True)
print("Test set distribution:")
print_class_shares(unique_test, counts_test, len(y_test), class_names)


In [None]:
# Analyze TF-IDF features: vocabulary preview, matrix sparsity, highest-scoring terms.
# `feature_names`, `sparsity` and `mean_scores` are reused by later cells.
print("🔍 TF-IDF Feature Analysis")
print("=" * 50)

# Get feature names
feature_names = fitted_preprocessor.get_feature_names()
n_features = len(feature_names)
print(f"Total features created: {n_features}")

# Show sample features
preview_size = 20
print("\n📝 Sample Features:")
print("=" * 30)
print("First 20 features:")
for i, feature in enumerate(feature_names[:preview_size], 1):
    print(f"{i:2d}. {feature}")

print("\nLast 20 features:")
# Number the tail from its real position; deriving the start from the actual slice
# length keeps the numbering right when fewer than 20 features exist.
tail_features = feature_names[-preview_size:]
tail_start = n_features - len(tail_features) + 1
for i, feature in enumerate(tail_features, tail_start):
    print(f"{i:2d}. {feature}")

# Analyze feature sparsity
print("\n📊 Feature Sparsity Analysis:")
print("=" * 40)
total_cells = X_train.shape[0] * X_train.shape[1]
# An empty matrix has no cells to be zero; report 0.0 rather than dividing by zero.
sparsity = 1.0 - (X_train.nnz / float(total_cells)) if total_cells else 0.0
print(f"Sparsity: {sparsity:.3f} ({sparsity*100:.1f}% of values are zero)")
print(f"Non-zero values: {X_train.nnz:,}")
print(f"Total possible values: {total_cells:,}")

# Show most common features (highest mean TF-IDF scores)
print("\n🏆 Most Important Features (by mean TF-IDF score):")
print("=" * 60)

# Column means of the training matrix, flattened to one score per feature
mean_scores = np.array(X_train.mean(axis=0)).flatten()
# argsort is ascending: take the last 20 and reverse for highest-first order
top_indices = np.argsort(mean_scores)[-20:][::-1]

for i, idx in enumerate(top_indices, 1):
    feature = feature_names[idx]
    score = mean_scores[idx]
    print(f"{i:2d}. {feature:25s} (score: {score:.4f})")


## 7. Data Quality Visualization


In [None]:
# Create comprehensive data quality visualizations (2x2 grid).
# Relies on `mean_scores`, `feature_names` and `sparsity` from the TF-IDF analysis cell.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Text length distribution after preprocessing
df_processed['text_length_processed'] = df_processed['review'].str.len()
positive_lengths_proc = df_processed[df_processed['sentiment'] == 'positive']['text_length_processed']
negative_lengths_proc = df_processed[df_processed['sentiment'] == 'negative']['text_length_processed']

# Bin only the displayed range; without `range` the 50 bins would cover the full
# length range and only a few would land inside the visible window.
length_limit = 1000
axes[0, 0].hist([positive_lengths_proc, negative_lengths_proc], bins=50,
                range=(0, length_limit), alpha=0.7,
                label=['Positive', 'Negative'], color=['lightblue', 'lightcoral'])
axes[0, 0].set_xlabel('Text Length (characters)')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].set_title('Text Length Distribution After Preprocessing', fontweight='bold')
axes[0, 0].legend()
axes[0, 0].set_xlim(0, length_limit)

# 2. Feature importance (top 15 features by mean TF-IDF score, highest first)
top_15_indices = np.argsort(mean_scores)[-15:][::-1]
top_15_features = [feature_names[i] for i in top_15_indices]
top_15_scores = [mean_scores[i] for i in top_15_indices]

axes[0, 1].barh(range(len(top_15_features)), top_15_scores, color='skyblue')
axes[0, 1].set_yticks(range(len(top_15_features)))
axes[0, 1].set_yticklabels(top_15_features)
axes[0, 1].set_xlabel('Mean TF-IDF Score')
axes[0, 1].set_title('Top 15 Most Important Features', fontweight='bold')
axes[0, 1].invert_yaxis()  # highest score at the top

# 3. Sparsity visualization
sparsity_data = [sparsity * 100, (1 - sparsity) * 100]
sparsity_labels = ['Zero Values', 'Non-zero Values']
sparsity_colors = ['lightcoral', 'lightgreen']

axes[1, 0].pie(sparsity_data, labels=sparsity_labels, autopct='%1.1f%%',
               colors=sparsity_colors, startangle=90)
axes[1, 0].set_title('Feature Matrix Sparsity', fontweight='bold')

# 4. Train/Test split visualization
train_size = len(y_train)
test_size = len(y_test)
split_data = [train_size, test_size]
split_labels = ['Training Set', 'Test Set']
split_colors = ['lightblue', 'lightgreen']

# Take the ratio in the title from the actual split so it cannot disagree with the
# percentages drawn on the pie.
train_share = 100 * train_size / (train_size + test_size)
axes[1, 1].pie(split_data, labels=split_labels, autopct='%1.1f%%', colors=split_colors, startangle=90)
axes[1, 1].set_title(f'Train/Test Split ({train_share:.0f}/{100 - train_share:.0f})', fontweight='bold')

plt.tight_layout()
plt.show()

# Print summary statistics
print("📊 Data Quality Summary:")
print("=" * 50)
print(f"Original dataset size: {len(df):,}")
print(f"Processed dataset size: {len(df_processed):,}")
print(f"Samples removed: {len(df) - len(df_processed):,}")
print(f"Training samples: {len(y_train):,}")
print(f"Test samples: {len(y_test):,}")
print(f"Features created: {X_train.shape[1]:,}")
print(f"Feature sparsity: {sparsity:.1%}")
print(f"Average text length (original): {df['text_length'].mean():.1f} chars")
print(f"Average text length (processed): {df_processed['text_length_processed'].mean():.1f} chars")


## 8. Summary and Next Steps


In [None]:
# Save the preprocessed data and preprocessor for use in other notebooks
print("💾 Saving Preprocessed Data")
print("=" * 50)

import joblib
import os

# Each artifact path is written once and reused for the directory creation, the dump
# and the log message, so the three cannot drift apart.
PREPROCESSOR_PATH = '../models/preprocessor.joblib'
PROCESSED_DATA_PATH = '../data/processed_data.joblib'

# Create directories if they don't exist
for artifact_path in (PREPROCESSOR_PATH, PROCESSED_DATA_PATH):
    os.makedirs(os.path.dirname(artifact_path), exist_ok=True)

# Save the fitted preprocessor
# NOTE: joblib files are pickle-based; only load these artifacts from a trusted location.
joblib.dump(fitted_preprocessor, PREPROCESSOR_PATH)
print(f"✅ Preprocessor saved to {PREPROCESSOR_PATH}")

# Save the processed data
joblib.dump({
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
    'feature_names': feature_names,
    'class_names': class_names
}, PROCESSED_DATA_PATH)
print(f"✅ Processed data saved to {PROCESSED_DATA_PATH}")

# Report the split ratio that was actually produced instead of a hard-coded one.
train_share = 100 * len(y_train) / (len(y_train) + len(y_test))
split_ratio = f"{train_share:.0f}/{100 - train_share:.0f}"

print("\n🎯 Preprocessing Summary:")
print("=" * 50)
print("✅ Dataset loaded and validated")
print("✅ Text cleaning and preprocessing applied")
print(f"✅ TF-IDF features created ({X_train.shape[1]} features)")
print(f"✅ Train/test split completed ({split_ratio})")
print("✅ Data saved for model training")
print("✅ Ready for next step: Model Training!")

print("\n📋 Key Statistics:")
print(f"   • Original samples: {len(df):,}")
print(f"   • Processed samples: {len(df_processed):,}")
print(f"   • Training samples: {len(y_train):,}")
print(f"   • Test samples: {len(y_test):,}")
print(f"   • Features: {X_train.shape[1]:,}")
print(f"   • Feature sparsity: {sparsity:.1%}")

print("\n🚀 Next Steps:")
print("   1. Run notebook 02_model_training.ipynb")
print("   2. Train multiple ML models")
print("   3. Create ensemble model")
print("   4. Evaluate model performance")
