# 📊 Data Distribution Analysis
## Comprehensive Analysis of the 5,000-Movie IMDB Dataset

This notebook analyzes the distribution, quality, and characteristics of our movie dataset before model training.

**Dataset**: `data/raw/imdb_movies_large.csv`  
**Target Variable**: `first_week` (First week box office income)

## 1. Import Libraries and Load Data

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
# Silence library warnings so they do not clutter the rendered notebook output.
# NOTE(review): this hides every warning category, including deprecation notices
# from pandas/NumPy — consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

# Set visualization style: white-grid background, wide default figures, "husl" palette
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (15, 8)
sns.set_palette("husl")

print("✅ Libraries imported successfully!")

In [None]:
# Load the dataset
df = pd.read_csv('data/raw/imdb_movies_large.csv')

# Parse the dates into a separate Series just for the range report: min()/max()
# on the raw strings compare lexicographically, which is only chronological for
# ISO-8601 text and can fail when values are missing. `df` itself is left exactly
# as loaded so later cells see the same column types.
release_dates = pd.to_datetime(df['release_date'], errors='coerce')

print(f"Dataset Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
print(f"Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
if release_dates.notna().any():
    print(f"\nDate Range: {release_dates.min().date()} to {release_dates.max().date()}")
else:
    print("\nDate Range: unavailable (no parseable release dates)")
print("\n✅ Dataset loaded successfully!")

In [None]:
# Preview the top ten records to sanity-check column names and value formats
df.iloc[:10]

## 2. Dataset Overview

In [None]:
# Column names, dtypes and non-null counts, printed under a banner heading
banner = "=" * 80
print(banner, "DATASET INFORMATION", banner, sep="\n")
df.info()

In [None]:
# Check for missing values
print("=" * 80)
print("MISSING VALUES")
print("=" * 80)

# Per-column null counts and their share of all rows
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100
has_missing = missing > 0  # restrict the report to columns that actually contain nulls
missing_df = pd.DataFrame({
    'Missing Count': missing[has_missing],
    'Percentage': missing_pct[has_missing]
})

if missing_df.empty:
    print("\n✅ No missing values found! Dataset is complete.")
else:
    # Worst-affected columns first
    display(missing_df.sort_values('Percentage', ascending=False))

## 3. Statistical Summary

In [None]:
# Summary statistics for every numeric column, shown with one feature per row
df.describe().transpose()

In [None]:
# Headline money columns, reported in millions of dollars
financial_metrics = ['budget', 'revenue', 'first_week', 'opening_weekend', 'total_gross']

header_rule = "=" * 80
print(header_rule)
print("FINANCIAL METRICS (in millions $)")
print(header_rule)

for metric in financial_metrics:
    # Columns that this dataset does not carry are skipped without comment
    if metric not in df.columns:
        continue

    series = df[metric]
    print(f"\n{metric.upper()}:")
    print(f"  Mean:   ${series.mean()/1e6:>10.2f}M")
    print(f"  Median: ${series.median()/1e6:>10.2f}M")
    print(f"  Std:    ${series.std()/1e6:>10.2f}M")
    print(f"  Min:    ${series.min()/1e6:>10.2f}M")
    print(f"  Max:    ${series.max()/1e6:>10.2f}M")

## 4. Target Variable Analysis

In [None]:
# Summary table for the prediction target: location, spread and upper-tail percentiles
target = 'first_week'
target_values = df[target]
wide_rule = "=" * 80

print(wide_rule)
print("TARGET VARIABLE: first_week (First Week Box Office Income)")
print(wide_rule)

print(f"\n{'Statistic':<30} {'Value'}")
print("-" * 80)
print(f"{'Count':<30} {target_values.count():>15,}")

# Dollar-valued rows that precede the percentile ladder
for label, amount in [
    ('Mean', target_values.mean()),
    ('Median', target_values.median()),
    ('Std Dev', target_values.std()),
    ('Min', target_values.min()),
]:
    print(f"{label:<30} ${amount:>14,.2f}")

# Percentile ladder, denser towards the upper tail
for percentile, fraction in [(25, 0.25), (50, 0.50), (75, 0.75), (90, 0.90), (95, 0.95), (99, 0.99)]:
    label = f"{percentile}th Percentile"
    print(f"{label:<30} ${target_values.quantile(fraction):>14,.2f}")

print(f"{'Max':<30} ${target_values.max():>14,.2f}")

# Spread relative to the mean, as a percentage
variation_pct = (target_values.std() / target_values.mean()) * 100
print(f"{'Coefficient of Variation':<30} {variation_pct:>14.2f}%")

## 5. Distribution Visualizations

In [None]:
# Distribution of target variable (first_week)
# Work on the non-missing values only: scipy's skew/kurtosis return NaN when the
# input contains NaN, and matplotlib's boxplot draws an empty box in that case.
first_week = df['first_week'].dropna()
first_week_m = first_week / 1e6          # same values in millions of dollars
mean_m = first_week.mean() / 1e6
median_m = first_week.median() / 1e6

fig, axes = plt.subplots(2, 2, figsize=(16, 10))

# Histogram with mean/median markers
axes[0, 0].hist(first_week_m, bins=50, color='steelblue', edgecolor='black', alpha=0.7)
axes[0, 0].set_title('Distribution of First Week Income', fontsize=14, fontweight='bold')
axes[0, 0].set_xlabel('First Week Income (Millions $)')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].axvline(mean_m, color='red', linestyle='--', label=f"Mean: ${mean_m:.1f}M")
axes[0, 0].axvline(median_m, color='green', linestyle='--', label=f"Median: ${median_m:.1f}M")
axes[0, 0].legend()
axes[0, 0].grid(alpha=0.3)

# Log-scale histogram (+1 keeps zero-income rows finite)
axes[0, 1].hist(np.log10(first_week + 1), bins=50, color='coral', edgecolor='black', alpha=0.7)
axes[0, 1].set_title('Distribution of First Week Income (Log Scale)', fontsize=14, fontweight='bold')
axes[0, 1].set_xlabel('Log10(First Week Income + 1)')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].grid(alpha=0.3)

# Box plot (vertical is matplotlib's default orientation)
axes[1, 0].boxplot(first_week_m, patch_artist=True,
                   boxprops=dict(facecolor='lightblue', alpha=0.7),
                   medianprops=dict(color='red', linewidth=2))
axes[1, 0].set_title('Box Plot of First Week Income', fontsize=14, fontweight='bold')
axes[1, 0].set_ylabel('First Week Income (Millions $)')
axes[1, 0].grid(alpha=0.3)

# QQ plot against a normal distribution
stats.probplot(first_week, dist="norm", plot=axes[1, 1])
axes[1, 1].set_title('Q-Q Plot (Normal Distribution)', fontsize=14, fontweight='bold')
axes[1, 1].grid(alpha=0.3)

plt.tight_layout()
plt.show()

# Print skewness and kurtosis, then a conclusion that follows from the measured skew
# (same 0.5 / 1.0 cut-offs as the skewness table later in the notebook)
target_skew = stats.skew(first_week)
target_kurt = stats.kurtosis(first_week)
print(f"\nSkewness: {target_skew:.4f}")
print(f"Kurtosis: {target_kurt:.4f}")
if target_skew > 1:
    print("\n⚠️ Distribution is heavily right-skewed - log transformation recommended!")
elif target_skew < -1:
    print("\n⚠️ Distribution is heavily left-skewed - consider a transformation before modelling.")
elif abs(target_skew) >= 0.5:
    print("\n⚠️ Distribution is moderately skewed - a log transformation may still help.")
else:
    print("\n✅ Distribution is approximately symmetric - no skew correction needed.")

In [None]:
# Distribution of key financial features
fig, axes = plt.subplots(2, 2, figsize=(16, 10))

# Three panels share the same layout: histogram plus a dashed line at the mean.
# (axes, column, divisor, title, x-axis label, bar colour, bins, mean-label template)
linear_panels = [
    (axes[0, 0], 'budget', 1e6, 'Budget Distribution', 'Budget (Millions $)', 'green', 50, "Mean: ${:.1f}M"),
    (axes[0, 1], 'revenue', 1e6, 'Revenue Distribution', 'Revenue (Millions $)', 'purple', 50, "Mean: ${:.1f}M"),
    (axes[1, 0], 'imdb_rating', 1, 'IMDB Rating Distribution', 'IMDB Rating', 'orange', 30, "Mean: {:.2f}"),
]

for ax, column, divisor, title, x_label, colour, bins, mean_template in linear_panels:
    values = df[column].dropna() / divisor
    column_mean = values.mean()
    ax.hist(values, bins=bins, color=colour, edgecolor='black', alpha=0.7)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    ax.axvline(column_mean, color='red', linestyle='--', label=mean_template.format(column_mean))
    ax.legend()
    ax.grid(alpha=0.3)

# IMDB Votes (log scale). The +1 matters: log10(0) is -inf, and a non-finite
# value makes the histogram's bin range invalid. It also matches the
# log(x + 1) transform used for the target histogram.
votes = df['imdb_votes'].dropna()
axes[1, 1].hist(np.log10(votes + 1), bins=40, color='teal', edgecolor='black', alpha=0.7)
axes[1, 1].set_title('IMDB Votes Distribution (Log Scale)', fontsize=14, fontweight='bold')
axes[1, 1].set_xlabel('Log10(IMDB Votes + 1)')
axes[1, 1].set_ylabel('Frequency')
axes[1, 1].grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Correlation Analysis

In [None]:
# Calculate correlations with target variable
# `target_correlations` keeps every numeric column (including the target itself,
# whose correlation is 1.0) because later cells read it in that form.
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
target_correlations = df[numerical_cols].corrwith(df['first_week']).sort_values(ascending=False)

# For the printed rankings, leave out the target (it would always top the list)
# and columns whose correlation is undefined (e.g. constant columns).
feature_correlations = target_correlations.drop(labels=['first_week'], errors='ignore').dropna()

print("=" * 80)
print("TOP 20 FEATURES CORRELATED WITH FIRST WEEK INCOME")
print("=" * 80)
print(f"\n{'Feature':<40} {'Correlation':>12}")
print("-" * 80)
for feature, corr in feature_correlations.head(20).items():
    print(f"{feature:<40} {corr:>12.4f}")

# "Least correlated" means closest to zero. The tail of the signed ranking holds
# the most negative values instead, which may be strong relationships, so rank
# by absolute value.
weakest_order = np.argsort(feature_correlations.abs().to_numpy(), kind='stable')
weakest_correlations = feature_correlations.iloc[weakest_order[:10]]

print("\n" + "=" * 80)
print("BOTTOM 10 FEATURES (Least Correlated)")
print("=" * 80)
print(f"\n{'Feature':<40} {'Correlation':>12}")
print("-" * 80)
for feature, corr in weakest_correlations.items():
    print(f"{feature:<40} {corr:>12.4f}")

In [None]:
# Correlation heatmap for top features
top_features = target_correlations.head(15).index.tolist()

plt.figure(figsize=(14, 12))
correlation_matrix = df[top_features].corr()
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0,
            square=True, linewidths=1, cbar_kws={"shrink": 0.8})
plt.title('Correlation Heatmap - Top 15 Features', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

# Collect feature pairs above the threshold. Only the upper triangle is scanned,
# so each pair is reported once and the diagonal (always 1.0) is skipped. NaN
# entries compare False and are ignored without relying on stack() dropping them.
corr_values = correlation_matrix.to_numpy()
high_corr = pd.DataFrame(
    [
        (top_features[i], top_features[j], corr_values[i, j])
        for i in range(len(top_features))
        for j in range(i + 1, len(top_features))
        if corr_values[i, j] > 0.95
    ],
    columns=['feature_a', 'feature_b', 'correlation'],
)

if len(high_corr) > 0:
    print("\n⚠️ WARNING: High correlations detected!")
    print("Features with >0.95 correlation may indicate data leakage:")
    for feature_a, feature_b, value in high_corr.itertuples(index=False):
        print(f"  - {feature_a} ↔ {feature_b}: {value:.3f}")
else:
    print("\n✅ No feature pairs exceed 0.95 correlation among the top features.")

## 7. Relationship Visualizations

In [None]:
# Each candidate driver is plotted against first-week income (millions of dollars)
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
first_week_millions = df['first_week'] / 1e6

# One entry per panel, in row-major order:
# (x values, title, x-axis label, marker colour or None for the default, log-scaled x-axis?)
panel_specs = [
    (df['budget'] / 1e6, 'Budget vs First Week Income', 'Budget (Millions $)', None, False),
    (df['imdb_rating'], 'IMDB Rating vs First Week Income', 'IMDB Rating', 'green', False),
    (df['imdb_votes'], 'IMDB Votes vs First Week Income', 'IMDB Votes (log scale)', 'purple', True),
    (df['popularity'], 'Popularity vs First Week Income', 'Popularity Score', 'orange', False),
    (df['twitter_mentions'], 'Twitter Mentions vs First Week Income', 'Twitter Mentions', 'red', False),
    (df['num_theaters'], 'Number of Theaters vs First Week Income', 'Number of Theaters', 'teal', False),
]

for ax, (x_values, title, x_label, colour, log_x) in zip(axes.flat, panel_specs):
    # Only pass a colour when one was chosen, so the first panel keeps the palette default
    colour_kwargs = {} if colour is None else {'color': colour}
    ax.scatter(x_values, first_week_millions, alpha=0.5, s=30, **colour_kwargs)
    if log_x:
        ax.set_xscale('log')
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel(x_label)
    ax.set_ylabel('First Week (Millions $)')
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 8. Categorical Features Analysis

In [None]:
# Extract year from release_date (adds a 'year' column to df)
df['year'] = pd.to_datetime(df['release_date']).dt.year

# Movies per year
year_counts = df['year'].value_counts().sort_index()

# Take the covered range from the data instead of hard-coding it in the title.
# int() also removes the ".0" that appears when missing dates make the column float.
first_year = int(year_counts.index.min())
last_year = int(year_counts.index.max())

plt.figure(figsize=(16, 6))
plt.bar(year_counts.index, year_counts.values, color='steelblue', edgecolor='black', alpha=0.7)
plt.title(f'Number of Movies per Year ({first_year}-{last_year})', fontsize=16, fontweight='bold')
plt.xlabel('Year', fontsize=12)
plt.ylabel('Number of Movies', fontsize=12)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

print(f"\nTotal years covered: {first_year} - {last_year}")
print(f"Average movies per year: {year_counts.mean():.1f}")

In [None]:
# Fifteen most frequent genre labels, drawn as horizontal bars
genre_counts = df['genre'].value_counts().head(15)
bar_positions = list(range(len(genre_counts)))

fig, ax = plt.subplots(figsize=(14, 8))
ax.barh(bar_positions, genre_counts.values, color='coral', edgecolor='black', alpha=0.7)
ax.set_yticks(bar_positions)
ax.set_yticklabels(genre_counts.index)
ax.set_title('Top 15 Most Common Genres', fontsize=16, fontweight='bold')
ax.set_xlabel('Number of Movies', fontsize=12)
ax.set_ylabel('Genre', fontsize=12)
ax.grid(axis='x', alpha=0.3)
fig.tight_layout()
plt.show()

# Distinct genre labels across the whole dataset, not just the fifteen plotted
print(f"\nTotal unique genres: {df['genre'].nunique()}")

In [None]:
# Sequels vs Original movies
if 'is_sequel' in df.columns:
    sequel_data = df.groupby('is_sequel')['first_week'].agg(['mean', 'median', 'count'])
    # Label each group from its own flag value. Assigning a fixed two-item list
    # raises ValueError whenever only one group is present in the data.
    # assumes is_sequel is a 0/1 (or boolean) flag — TODO confirm
    sequel_data.index = ['Sequel' if flag else 'Original' for flag in sequel_data.index]
    palette = {'Original': 'steelblue', 'Sequel': 'coral'}
    bar_colors = [palette[label] for label in sequel_data.index]

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Count
    axes[0].bar(sequel_data.index, sequel_data['count'], color=bar_colors,
                edgecolor='black', alpha=0.7)
    axes[0].set_title('Original vs Sequel Movies', fontsize=14, fontweight='bold')
    axes[0].set_ylabel('Number of Movies')
    axes[0].grid(axis='y', alpha=0.3)
    for i, v in enumerate(sequel_data['count']):
        # Offset in screen points so the label sits just above the bar at any scale
        axes[0].annotate(str(int(v)), xy=(i, v), xytext=(0, 4), textcoords='offset points',
                         ha='center', va='bottom', fontweight='bold')

    # Average First Week Income
    mean_millions = sequel_data['mean'] / 1e6
    axes[1].bar(sequel_data.index, mean_millions, color=bar_colors,
                edgecolor='black', alpha=0.7)
    axes[1].set_title('Average First Week Income', fontsize=14, fontweight='bold')
    axes[1].set_ylabel('First Week Income (Millions $)')
    axes[1].grid(axis='y', alpha=0.3)
    for i, v in enumerate(mean_millions):
        axes[1].annotate(f'${v:.1f}M', xy=(i, v), xytext=(0, 4), textcoords='offset points',
                         ha='center', va='bottom', fontweight='bold')

    plt.tight_layout()
    plt.show()

    # The ratio only makes sense when both groups exist; word it so that it
    # reads correctly whether sequels earn more or less than originals.
    if {'Original', 'Sequel'} <= set(sequel_data.index):
        ratio = sequel_data.loc['Sequel', 'mean'] / sequel_data.loc['Original', 'mean']
        print(f"\nSequels earn {ratio:.2f}x the first-week income of originals on average")
    else:
        print("\nOnly one of the sequel/original groups is present - no comparison possible")

In [None]:
# Summer vs Non-summer releases
if 'is_summer' in df.columns:
    summer_data = df.groupby('is_summer')['first_week'].agg(['mean', 'median', 'count'])
    # Label each group from its own flag value. Assigning a fixed two-item list
    # raises ValueError whenever only one category is present in the data.
    # assumes is_summer is a 0/1 (or boolean) flag — TODO confirm
    summer_data.index = ['Summer' if flag else 'Non-Summer' for flag in summer_data.index]
    palette = {'Non-Summer': 'lightblue', 'Summer': 'orange'}
    bar_colors = [palette[label] for label in summer_data.index]

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Count
    axes[0].bar(summer_data.index, summer_data['count'], color=bar_colors,
                edgecolor='black', alpha=0.7)
    axes[0].set_title('Summer vs Non-Summer Releases', fontsize=14, fontweight='bold')
    axes[0].set_ylabel('Number of Movies')
    axes[0].grid(axis='y', alpha=0.3)
    for i, v in enumerate(summer_data['count']):
        # Offset in screen points so the label sits just above the bar at any scale
        axes[0].annotate(str(int(v)), xy=(i, v), xytext=(0, 4), textcoords='offset points',
                         ha='center', va='bottom', fontweight='bold')

    # Average First Week Income - only if we have both categories. groupby never
    # yields a zero-count group, so the number of groups is the thing to check.
    if len(summer_data) == 2:
        mean_millions = summer_data['mean'] / 1e6
        axes[1].bar(summer_data.index, mean_millions, color=bar_colors,
                    edgecolor='black', alpha=0.7)
        axes[1].set_title('Average First Week Income', fontsize=14, fontweight='bold')
        axes[1].set_ylabel('First Week Income (Millions $)')
        axes[1].grid(axis='y', alpha=0.3)
        for i, v in enumerate(mean_millions):
            axes[1].annotate(f'${v:.1f}M', xy=(i, v), xytext=(0, 4), textcoords='offset points',
                             ha='center', va='bottom', fontweight='bold')
    else:
        # Nothing to compare: hide the unused panel instead of showing empty axes
        axes[1].set_visible(False)

    plt.tight_layout()
    plt.show()

## 9. Outlier Detection

In [None]:
# Outlier detection using IQR method
outlier_features = ['budget', 'revenue', 'first_week', 'opening_weekend', 'imdb_votes']

print("=" * 80)
print("OUTLIER DETECTION (IQR Method)")
print("=" * 80)
print(f"\n{'Feature':<25} {'Outliers':>10} {'Percentage':>12}")
print("-" * 80)

# Maps feature name -> {'count': number of outliers, 'percentage': share of all rows}
outlier_summary = {}
for col in outlier_features:
    if col in df.columns:
        values = df[col]
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1
        # Tukey fences: anything further than 1.5 * IQR outside the quartiles.
        # Missing values compare False on both sides, so they are never counted.
        lower_fence = Q1 - 1.5 * IQR
        upper_fence = Q3 + 1.5 * IQR
        outliers = ((values < lower_fence) | (values > upper_fence)).sum()
        pct = (outliers / len(df)) * 100
        outlier_summary[col] = {'count': outliers, 'percentage': pct}
        print(f"{col:<25} {outliers:>10} {pct:>11.2f}%")

print("\n⚠️ Note: High outlier percentages indicate skewed distributions")

In [None]:
# Box plots for outlier visualization
# Lay out only the features that exist in df, so the grid has no holes and the
# clean-up below never deletes a populated panel.
present_features = [name for name in outlier_features if name in df.columns]
n_grid_cols = 3
n_grid_rows = max(1, -(-len(present_features) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(18, 5 * n_grid_rows), squeeze=False)
panels = axes.flatten()

for ax, col in zip(panels, present_features):
    # Every feature except the vote count is a dollar amount, shown in millions
    in_dollars = col != 'imdb_votes'
    values = df[col].dropna()  # NaN would make matplotlib draw an empty box
    if in_dollars:
        values = values / 1e6

    ax.boxplot(values, patch_artist=True,
               boxprops=dict(facecolor='lightblue', alpha=0.7),
               medianprops=dict(color='red', linewidth=2),
               flierprops=dict(marker='o', markerfacecolor='red', markersize=4, alpha=0.5))

    title = col.replace('_', ' ').title()
    ax.set_title(f'Box Plot: {title}', fontsize=12, fontweight='bold')
    ax.set_ylabel(f"{title} ({'Millions $' if in_dollars else 'Votes'})")
    ax.grid(alpha=0.3)

# Remove whichever subplots were left unused
for unused_ax in panels[len(present_features):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

## 10. Distribution Shapes (Skewness & Kurtosis)

In [None]:
# Calculate skewness and kurtosis
key_features = ['budget', 'revenue', 'first_week', 'imdb_rating', 'imdb_votes']

print("=" * 80)
print("DISTRIBUTION ANALYSIS - SKEWNESS AND KURTOSIS")
print("=" * 80)
print(f"\n{'Feature':<25} {'Skewness':>12} {'Kurtosis':>12} {'Distribution'}")
print("-" * 80)

for col in key_features:
    # Constant columns are skipped: their skewness and kurtosis are undefined
    if col in df.columns and df[col].std() > 0:
        values = df[col].dropna()
        skew = stats.skew(values)
        # stats.kurtosis defaults to Fisher's definition (excess kurtosis),
        # under which a normal distribution scores 0, not 3.
        kurt = stats.kurtosis(values)

        # Interpret distribution shape from skewness alone. Low skew shows
        # symmetry, not normality, so the label says only that.
        if abs(skew) < 0.5:
            dist = "✅ Approx. symmetric"
        elif skew > 1:
            dist = "⚠️ Highly Right-skewed"
        elif skew > 0:
            dist = "⚠️ Right-skewed"
        elif skew < -1:
            dist = "⚠️ Highly Left-skewed"
        else:
            dist = "⚠️ Left-skewed"

        print(f"{col:<25} {skew:>12.4f} {kurt:>12.4f} {dist}")

print("\n💡 Interpretation:")
print("  • Skewness close to 0: Approximately symmetric distribution")
print("  • Skewness > 1: Highly right-skewed (log transformation recommended)")
print("  • Kurtosis shown is excess kurtosis (normal = 0); > 0: Heavy tails (more outliers than normal distribution)")

## 11. Data Quality Recommendations

In [None]:
# Generate recommendations
# Relies on `outlier_summary` (outlier-detection cell) and `target_correlations`
# (correlation cell) having been computed earlier in the notebook.
print("=" * 80)
print("DATA QUALITY RECOMMENDATIONS FOR MODEL TRAINING")
print("=" * 80)

recommendations = []

# Check skewness
for col in ['budget', 'revenue', 'first_week', 'imdb_votes']:
    if col in df.columns:
        skew = stats.skew(df[col].dropna())
        if abs(skew) > 1:
            recommendations.append(f"⚠️  '{col}' is highly skewed ({skew:.2f}). Apply log transformation: log(x + 1)")

# Check outliers - reuse the percentages already computed by the IQR cell
# instead of repeating the quartile arithmetic here
for col, summary in outlier_summary.items():
    pct = summary['percentage']
    if pct > 5:
        recommendations.append(f"⚠️  '{col}' has {pct:.1f}% outliers. Consider RobustScaler or keep for ensemble models.")

# Check scale differences
# NOTE(review): this item is emitted unconditionally and the "1-5B" range is
# hard-coded rather than measured from df — confirm it matches the data.
recommendations.append("⚠️  Features have vastly different scales (1-5B). Use StandardScaler or RobustScaler.")

# Check correlation (data leakage)
# The target always correlates 1.0 with itself, so drop it before deciding.
# Testing the unfiltered list for "more than two" entries hid the case where
# exactly one other feature crosses the threshold.
high_corr_features = target_correlations[target_correlations > 0.95].index.tolist()
leakage_features = [f for f in high_corr_features if f != 'first_week']
if leakage_features:
    recommendations.append(f"🚨 DATA LEAKAGE: Remove these features: {', '.join(leakage_features)}")

# Print recommendations
print("\n📋 Action Items:\n")
for i, rec in enumerate(recommendations, 1):
    print(f"{i}. {rec}")

print("\n" + "=" * 80)
print("RECOMMENDED FEATURE ENGINEERING STEPS")
print("=" * 80)
print("""
1. Remove Data Leakage:
   ✗ Drop: opening_weekend, average_per_theater, total_gross, revenue
   ✓ Keep: budget, ratings, social media, temporal features

2. Apply Log Transformations:
   • log(budget + 1)
   • log(first_week + 1)  [TARGET]
   • log(imdb_votes + 1)
   • log(ticket_presales + 1)

3. Feature Scaling:
   • Use RobustScaler (handles outliers better)
   • Or StandardScaler for tree-based models

4. Handle Outliers:
   • Keep outliers (blockbusters are real data!)
   • Use robust models: Random Forest, XGBoost, CatBoost

5. Feature Selection:
   • Focus on features with correlation > 0.3
   • Consider feature importance from tree models
""")

## 12. Key Insights Summary

In [None]:
def _describe_correlation(r):
    """Return a short verbal label (strength + direction) for a correlation coefficient.

    The 0.3 / 0.7 cut-offs are rule-of-thumb thresholds; the label is derived
    from the measured value rather than asserted up front.
    """
    if pd.isna(r):
        return "undefined"
    magnitude = abs(r)
    if magnitude >= 0.7:
        strength = "strong"
    elif magnitude >= 0.3:
        strength = "moderate"
    else:
        strength = "weak"
    direction = "positive" if r >= 0 else "negative"
    return f"{strength} {direction}"


print("=" * 80)
print("KEY INSIGHTS FROM DATA ANALYSIS")
print("=" * 80)

insights = []

# ROI (revenue / budget). Rows with a zero or missing budget are left out:
# dividing by zero would put inf into the median.
has_budget = df['budget'] > 0
roi = (df.loc[has_budget, 'revenue'] / df.loc[has_budget, 'budget']).median()
if pd.notna(roi):
    # A median below 1 means more than half of the movies earned less than they cost
    roi_note = "most movies don't break even!" if roi < 1 else "most movies at least break even"
    insights.append(f"💰 Median ROI: {roi:.2f}x ({roi_note})")

# Rating
high_rated = (df['imdb_rating'] >= 8.0).sum()
insights.append(f"⭐ {high_rated} movies ({high_rated/len(df)*100:.1f}%) rated 8.0+")

# Popularity
popular = (df['imdb_votes'] >= 500000).sum()
insights.append(f"🎬 {popular} blockbusters ({popular/len(df)*100:.1f}%) have 500K+ votes")

# Sequels
if 'is_sequel' in df.columns and df['is_sequel'].sum() > 0:
    sequel_avg = df[df['is_sequel'] == 1]['first_week'].mean()
    original_avg = df[df['is_sequel'] == 0]['first_week'].mean()
    # Skip the ratio when there are no originals (or their mean is zero)
    if pd.notna(original_avg) and original_avg > 0:
        ratio = sequel_avg / original_avg
        insights.append(f"🎭 Sequels earn {ratio:.2f}x {'more' if ratio > 1 else 'less'} in first week")

# Budget correlation
budget_corr = df['budget'].corr(df['first_week'])
insights.append(f"💵 Budget correlation with first week: {budget_corr:.3f} ({_describe_correlation(budget_corr)})")

# Social media impact
twitter_corr = df['twitter_mentions'].corr(df['first_week'])
insights.append(f"📱 Twitter mentions correlation: {twitter_corr:.3f} ({_describe_correlation(twitter_corr)})")

# Target variance: a coefficient of variation above 100% means the standard
# deviation exceeds the mean
cv = (df['first_week'].std() / df['first_week'].mean()) * 100
cv_note = "highly variable!" if cv > 100 else "moderately variable"
insights.append(f"📊 First week income has {cv:.1f}% coefficient of variation ({cv_note})")

# Print insights
print("\n🔍 Insights:\n")
for insight in insights:
    print(f"  {insight}")

print("\n" + "=" * 80)
print("✅ ANALYSIS COMPLETE!")
print("=" * 80)
print("\nDataset: data/raw/imdb_movies_large.csv")
print(f"Movies: {len(df):,}")
print(f"Features: {len(df.columns)}")
print(f"Target: first_week (${df['first_week'].mean():,.0f} average)")
print("\n💡 Data is ready for feature engineering and model training!")
print("=" * 80)

## 13. Next Steps

### Recommended Actions:

1. **Feature Engineering**:
   - Remove data leakage features (opening_weekend, average_per_theater, total_gross, revenue)
   - Apply log transformations to skewed features
   - Scale features using RobustScaler

2. **Model Training**:
   - Use ensemble methods (Random Forest, XGBoost, CatBoost)
   - Implement stacking with meta-learner
   - Cross-validation for robust evaluation

3. **Model Evaluation**:
   - RMSE, MAE, R² metrics
   - Feature importance analysis
   - Residual analysis

---

**Ready to proceed with model training! 🚀**