In [1]:
import sys
import os
sys.path.append('../')

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Import custom modules
from src.utils.data_loader import InsuranceDataProcessor
from src.analysis.eda_analyzer import EDAAnalyzer
from src.visualization.eda_plots import EDAVisualizer

# Pandas display configuration: wide frames, generous row/column limits and
# three-decimal float rendering. Kept as a table so a reader can tune it in one place.
for option_name, option_value in (
    ('display.max_columns', 100),
    ('display.max_rows', 100),
    ('display.width', 1000),
    ('display.float_format', '{:.3f}'.format),
):
    pd.set_option(option_name, option_value)

# Plot styling: dark-grid matplotlib theme first, then the seaborn "husl" palette on top.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably need a recent matplotlib — confirm the pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

In [8]:
# SECTION 1: DATA LOADING AND VALIDATION
# Builds the project data processor, loads the raw frame into `df`, and prints a quick
# structural overview (shape, columns, dtype counts, head/tail preview).
banner = "=" * 80
print(banner)
print("SECTION 1: DATA LOADING AND VALIDATION")
print(banner)

# The config path is given relative to the notebook's own directory.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError on
# 'data\\raw\\insurance_data.csv' — presumably the data path inside the config is
# resolved against the working directory rather than the config file; confirm.
processor = InsuranceDataProcessor(config_path="../config/config.yaml")

# Read the raw dataset
df = processor.load_data()

# Shape and column inventory
column_names = df.columns.tolist()
print(f"Dataset Shape: {df.shape}")
print(f"\nColumns ({len(df.columns)}):")
print(column_names)

# How many columns there are of each dtype
print("\nData Types Summary:")
print(df.dtypes.value_counts())

# Preview both ends of the frame
for heading, preview in (("\nFirst 5 rows:", df.head()), ("\nLast 5 rows:", df.tail())):
    print(heading)
    display(preview)

2025-12-06 13:51:03,167 - src.utils.data_loader - INFO - Loading data from data\raw\insurance_data.csv
2025-12-06 13:51:03,171 - src.utils.data_loader - ERROR - Error loading data: [Errno 2] No such file or directory: 'data\\raw\\insurance_data.csv'


SECTION 1: DATA LOADING AND VALIDATION


FileNotFoundError: [Errno 2] No such file or directory: 'data\\raw\\insurance_data.csv'

In [None]:
# SECTION 2: DATA QUALITY ASSESSMENT
# Summarises dtype and null counts for every column of `df` and charts the columns
# with the largest share of missing values.
print("=" * 80)
print("SECTION 2: DATA QUALITY ASSESSMENT")
print("=" * 80)

# Validate data structure
validation_results = processor.validate_data_structure()

# Per-column null counts are reused several times below, so compute them once.
null_counts = df.isnull().sum()

# Display data types
print("\nData Types:")
dtype_summary = pd.DataFrame({
    'Column': df.columns,
    'DataType': df.dtypes.astype(str),
    'Non-Null Count': df.count(),
    'Null Count': null_counts,
    'Null Percentage': (null_counts / len(df) * 100).round(2)
})

display(dtype_summary.sort_values('Null Percentage', ascending=False).head(20))

# Missing values analysis
print("\nMissing Values Analysis:")
# Select on the raw count, not the rounded percentage: on a large frame a column with
# a handful of nulls rounds to 0.00% and would otherwise be reported as complete.
missing_summary = dtype_summary[dtype_summary['Null Count'] > 0].sort_values('Null Count', ascending=False)

if len(missing_summary) > 0:
    total_missing = null_counts.sum()
    total_cells = df.shape[0] * df.shape[1]  # non-zero here: at least one null cell exists
    print(f"Total columns with missing values: {len(missing_summary)}")
    print(f"Total missing values: {total_missing}")
    print(f"Overall missing percentage: {(total_missing / total_cells * 100):.2f}%")

    # Plot missing values. DataFrame.plot opens its own figure unless an Axes is passed,
    # so create the figure explicitly and hand its Axes over; otherwise a blank
    # 12x6 figure is shown and the chart ignores the requested size.
    fig, ax = plt.subplots(figsize=(12, 6))
    missing_summary.head(20).plot(kind='bar', x='Column', y='Null Percentage', legend=False, ax=ax)
    ax.set_title('Top 20 Columns with Missing Values', fontsize=14)
    ax.set_xlabel('Columns')
    ax.set_ylabel('Missing Percentage (%)')
    plt.xticks(rotation=45, ha='right')
    fig.tight_layout()
    plt.show()
else:
    print("No missing values found in the dataset!")

In [None]:
# SECTION 3: DATA PREPROCESSING
# Runs the processor's preprocessing step, reports size/memory/duplicate statistics
# for the result, drops exact duplicate rows, and asks the processor to persist its output.
banner = "=" * 80
print(banner)
print("SECTION 3: DATA PREPROCESSING")
print(banner)

# Preprocess data
df_processed = processor.preprocess_data()

original_shape = processor.metadata['original_shape']
print(f"Original shape: {original_shape}")
print(f"Processed shape: {df_processed.shape}")

# Memory footprint of the processed frame (deep=True also measures object columns)
print("\nProcessed Data Info:")
memory_mb = df_processed.memory_usage(deep=True).sum() / 1024**2
print(f"Memory usage: {memory_mb:.2f} MB")

# Exact duplicate rows
duplicate_count = df_processed.duplicated().sum()
duplicate_pct = duplicate_count/len(df_processed)*100
print(f"\nDuplicate rows: {duplicate_count} ({duplicate_pct:.2f}%)")

if duplicate_count > 0:
    print("Removing duplicates...")
    df_processed = df_processed.drop_duplicates()
    print(f"New shape after removing duplicates: {df_processed.shape}")

# Persist processed data.
# NOTE(review): save_processed_data() is called without arguments, so it presumably
# writes the processor's own copy — the de-duplicated frame above may not be what is saved; confirm.
processor.save_processed_data()
print("\nProcessed data saved successfully!")

In [None]:
# SECTION 4: DESCRIPTIVE STATISTICS
# Creates the EDA analyzer on the processed frame, shows descriptive statistics for the
# key monetary features, and assembles a small table of headline business metrics.
banner = "=" * 80
print(banner)
print("SECTION 4: DESCRIPTIVE STATISTICS")
print(banner)

# Initialize EDA Analyzer
analyzer = EDAAnalyzer(df_processed)

# Compute descriptive statistics
desc_stats = analyzer.compute_descriptive_statistics()

print("Descriptive Statistics for Key Numerical Features:")
key_features = ['TotalPremium', 'TotalClaims', 'LossRatio', 'CustomValueEstimate',
                'SumInsured', 'CalculatedPremiumPerTerm']
stat_columns = ['feature', 'mean', 'std', 'min', '25%', 'median', '75%', 'max',
                'skewness', 'kurtosis', 'missing_pct']

key_stats = desc_stats[desc_stats['feature'].isin(key_features)]
display(key_stats[stat_columns])

# Headline metrics. PolicyID and TransactionMonth are optional columns, so each
# has a fallback: row count for the policy total, 'N/A' for the date range.
if 'PolicyID' in df_processed.columns:
    policy_total = df_processed['PolicyID'].nunique()
else:
    policy_total = len(df_processed)

if 'TransactionMonth' in df_processed.columns:
    transaction_months = df_processed['TransactionMonth']
    date_range_text = f"{transaction_months.min().date()} to {transaction_months.max().date()}"
else:
    date_range_text = 'N/A'

# NOTE(review): the "per Policy" averages below are means over rows of the frame —
# presumably one row per policy-month rather than per policy; confirm the labels.
metric_rows = [
    ('Total Premium', f"${df_processed['TotalPremium'].sum():,.0f}"),
    ('Total Claims', f"${df_processed['TotalClaims'].sum():,.0f}"),
    ('Average Loss Ratio', f"{df_processed['LossRatio'].mean():.3f}"),
    ('Average Premium per Policy', f"${df_processed['TotalPremium'].mean():,.0f}"),
    ('Average Claim per Policy', f"${df_processed['TotalClaims'].mean():,.0f}"),
    ('Total Policies', f"{policy_total:,.0f}"),
    ('Date Range', date_range_text),
]
summary_table = pd.DataFrame(metric_rows, columns=['Metric', 'Value'])

print("\nKey Business Metrics Summary:")
display(summary_table)

In [None]:
# SECTION 5: UNIVARIATE ANALYSIS
# Creates the visualizer, plots data-quality and per-column distribution charts, and
# prints mean/median/coefficient-of-variation for the three headline metrics.
print("=" * 80)
print("SECTION 5: UNIVARIATE ANALYSIS")
print("=" * 80)

# Initialize visualizer
visualizer = EDAVisualizer(df_processed, save_dir="../reports/figures")

# Analyze distributions
distribution_results = analyzer.analyze_distributions()

# Plot data quality summary
missing_df = analyzer.analyze_missing_values()
visualizer.plot_data_quality_summary(missing_df)

# Plot numeric distributions
numeric_cols = analyzer.numeric_cols[:15]  # First 15 numeric columns
print(f"\nPlotting distributions for {len(numeric_cols)} numeric columns...")
visualizer.plot_numeric_distributions(numeric_cols)

# Plot categorical distributions
categorical_cols = analyzer.categorical_cols[:10]  # First 10 categorical columns
print(f"\nPlotting distributions for {len(categorical_cols)} categorical columns...")
visualizer.plot_categorical_distributions(categorical_cols)

# Display distribution insights
print("\nDistribution Insights:")
for col, stats in distribution_results.get('numeric_distributions', {}).items():
    if col in ['TotalPremium', 'TotalClaims', 'LossRatio']:
        print(f"\n{col}:")
        print(f"  Mean: {stats['mean']:.2f}, Median: {stats['median']:.2f}")
        # Test for "missing" explicitly: a plain truthiness check reports a genuine
        # CV of 0.0 as N/A and lets NaN through to be formatted as a number.
        cv = stats.get('coefficient_of_variation')
        cv_text = "N/A" if cv is None or pd.isna(cv) else f"{cv:.2f}"
        print(f"  Coefficient of Variation: {cv_text}")

In [None]:
# SECTION 6: BIVARIATE/MULTIVARIATE ANALYSIS
# Plots the analyzer's correlation matrix, lists highly correlated pairs, draws a
# scatter matrix of the key variables, and tabulates their pairwise correlations.
print("=" * 80)
print("SECTION 6: BIVARIATE/MULTIVARIATE ANALYSIS")
print("=" * 80)


def describe_correlation(corr):
    """Return a qualitative label for a correlation coefficient.

    NaN (e.g. one of the columns is constant) is reported as 'Undefined' and an
    exact zero as 'No Correlation' instead of falling through to a negative label.
    """
    if pd.isna(corr):
        return 'Undefined'
    if corr == 0:
        return 'No Correlation'
    if corr > 0.7:
        return 'Strong Positive'
    if corr > 0.3:
        return 'Moderate Positive'
    if corr > 0:
        return 'Weak Positive'
    if corr > -0.3:
        return 'Weak Negative'
    if corr > -0.7:
        return 'Moderate Negative'
    return 'Strong Negative'


# Correlation analysis
print("Analyzing correlations...")
correlation_matrix = analyzer.analyze_correlations(method='spearman')

# Plot correlation matrix
visualizer.plot_correlation_matrix(correlation_matrix)

# Display highly correlated pairs
if 'correlations' in analyzer.results and analyzer.results['correlations'].get('high_correlation_pairs') is not None:
    high_corr = analyzer.results['correlations']['high_correlation_pairs']
    if len(high_corr) > 0:
        print(f"\nFound {len(high_corr)} highly correlated pairs (|correlation| > 0.7):")
        display(high_corr.head(10))
    else:
        print("\nNo highly correlated pairs found (|correlation| > 0.7)")

# Scatter plot matrix for key variables. Only columns that actually exist are used,
# so a missing column no longer aborts the cell with a KeyError.
key_vars = ['TotalPremium', 'TotalClaims', 'LossRatio', 'CustomValueEstimate']
available_vars = [var for var in key_vars if var in df_processed.columns]
skipped_vars = [var for var in key_vars if var not in df_processed.columns]
if skipped_vars:
    print(f"\nSkipping key variables not present in the data: {skipped_vars}")

key_df = df_processed[available_vars].dropna()

if len(available_vars) >= 2 and len(key_df) > 0:
    print("\nCreating scatter plot matrix for key variables...")
    scatter_matrix = pd.plotting.scatter_matrix(key_df, figsize=(12, 12), diagonal='hist',
                                               alpha=0.5, grid=True)
    plt.suptitle('Scatter Plot Matrix of Key Variables', fontsize=16, y=1.02)
    plt.tight_layout()
    # savefig does not create directories, so make sure the target exists first.
    os.makedirs("../reports/figures", exist_ok=True)
    plt.savefig("../reports/figures/scatter_matrix.png", dpi=300, bbox_inches='tight')
    plt.show()

# Pairwise relationships analysis
# NOTE(review): DataFrame.corr() defaults to Pearson while the matrix above is
# Spearman — confirm the mix of methods is intended.
print("\nAnalyzing pairwise relationships...")
pair_relationships = []

for i, var1 in enumerate(available_vars):
    for var2 in available_vars[i+1:]:
        corr = df_processed[[var1, var2]].corr().iloc[0, 1]
        pair_relationships.append({
            'Variable 1': var1,
            'Variable 2': var2,
            'Correlation': corr,
            'Relationship': describe_correlation(corr)
        })

pair_df = pd.DataFrame(pair_relationships)
print("\nPairwise Relationships Summary:")
display(pair_df)

In [None]:
# SECTION 7: LOSS RATIO ANALYSIS
# Reports the mean loss ratio overall and per business dimension, plots the
# visualizer's loss-ratio charts, and collects headline insights into a table.
print("=" * 80)
print("SECTION 7: LOSS RATIO ANALYSIS")
print("=" * 80)

# Calculate overall loss ratio
# NOTE(review): this is the mean of the per-row LossRatio column, not
# sum(TotalClaims) / sum(TotalPremium) — confirm which definition the report should use.
overall_loss_ratio = df_processed['LossRatio'].mean()
print(f"\nOverall Portfolio Loss Ratio: {overall_loss_ratio:.3f}")


def _relative_to_portfolio(loss_ratio):
    """Describe a segment's loss ratio as a multiple of the portfolio average.

    Returns a fixed 'not available' text when the portfolio average is zero or NaN,
    where the multiple would be inf/NaN and therefore meaningless.
    """
    if pd.isna(overall_loss_ratio) or overall_loss_ratio == 0:
        return "Comparison to portfolio average not available"
    return f"Loss ratio is {loss_ratio/overall_loss_ratio:.1f}x the portfolio average"


# Analyze loss ratio by different dimensions
dimensions = ['Province', 'VehicleType', 'Gender', 'CoverType', 'Make']
dimension_results = analyzer.analyze_by_dimensions(dimensions)

# Display loss ratio by province
if 'Province' in dimension_results:
    print("\nLoss Ratio by Province (Top 10):")
    province_loss = dimension_results['Province'].sort_values('LossRatio_mean', ascending=False).head(10)
    display(province_loss[['Province', 'LossRatio_mean', 'TotalPremium_sum', 'TotalClaims_sum',
                          'Premium_Share', 'Claims_Share']])

# Display loss ratio by vehicle type
if 'VehicleType' in dimension_results:
    print("\nLoss Ratio by Vehicle Type:")
    vehicle_loss = dimension_results['VehicleType'].sort_values('LossRatio_mean', ascending=False)
    display(vehicle_loss[['VehicleType', 'LossRatio_mean', 'TotalPremium_sum', 'TotalClaims_sum']])

# Display loss ratio by gender
if 'Gender' in dimension_results:
    print("\nLoss Ratio by Gender:")
    gender_loss = dimension_results['Gender'].sort_values('LossRatio_mean', ascending=False)
    display(gender_loss[['Gender', 'LossRatio_mean', 'TotalPremium_sum', 'TotalClaims_sum']])

# Plot loss ratio analysis
visualizer.plot_loss_ratio_analysis()

# Create detailed loss ratio report
print("\n" + "=" * 80)
print("LOSS RATIO INSIGHTS SUMMARY")
print("=" * 80)

loss_ratio_insights = []

# Provincial insights
if 'Province' in dimension_results:
    province_stats = dimension_results['Province']
    highest_province = province_stats.loc[province_stats['LossRatio_mean'].idxmax()]
    lowest_province = province_stats.loc[province_stats['LossRatio_mean'].idxmin()]

    loss_ratio_insights.append({
        'Insight': f"Highest Loss Ratio Province: {highest_province['Province']}",
        'Value': f"{highest_province['LossRatio_mean']:.3f}",
        'Interpretation': _relative_to_portfolio(highest_province['LossRatio_mean'])
    })

    loss_ratio_insights.append({
        'Insight': f"Lowest Loss Ratio Province: {lowest_province['Province']}",
        'Value': f"{lowest_province['LossRatio_mean']:.3f}",
        'Interpretation': _relative_to_portfolio(lowest_province['LossRatio_mean'])
    })

# Vehicle type insights
if 'VehicleType' in dimension_results:
    vehicle_stats = dimension_results['VehicleType']
    highest_vehicle = vehicle_stats.loc[vehicle_stats['LossRatio_mean'].idxmax()]

    loss_ratio_insights.append({
        'Insight': f"Highest Risk Vehicle Type: {highest_vehicle['VehicleType']}",
        'Value': f"{highest_vehicle['LossRatio_mean']:.3f}",
        'Interpretation': "Consider adjusting premiums or coverage for this vehicle type"
    })

# Gender insights
if 'Gender' in dimension_results:
    gender_comparison = dimension_results['Gender']
    if len(gender_comparison) >= 2:
        gender_diff = gender_comparison['LossRatio_mean'].max() - gender_comparison['LossRatio_mean'].min()

        # Only a max-min spread is computed here; no hypothesis test is run, so the
        # interpretation must not claim statistical significance.
        loss_ratio_insights.append({
            'Insight': "Gender-based Loss Ratio Difference",
            'Value': f"{gender_diff:.3f}",
            'Interpretation': "Spread between highest and lowest gender loss ratio "
                              "(statistical significance not tested)"
        })

insights_df = pd.DataFrame(loss_ratio_insights)
display(insights_df)

In [None]:
# SECTION 8: OUTLIER DETECTION
# Runs IQR-based outlier detection through the analyzer, lists the features with the
# most outliers, and measures how much of each key monetary total the outliers carry.
banner = "=" * 80
print(banner)
print("SECTION 8: OUTLIER DETECTION")
print(banner)

# Detect outliers
outlier_results = analyzer.detect_outliers(method='iqr', threshold=1.5)

# Overall summary, when the analyzer provides one
if 'summary' in outlier_results:
    summary = outlier_results['summary']
    records_line = (f"Total records with outliers: {summary['total_outlier_records']:,} "
                    f"({summary['percentage_outlier_records']:.2f}%)")
    print("\nOutlier Detection Summary:")
    print(records_line)
    print(f"Features with outliers: {summary['features_with_outliers']}")

# Per-feature breakdown
if 'outliers_by_feature' not in outlier_results:
    print("\nOutlier detection results not available!")
else:
    by_feature = outlier_results['outliers_by_feature']

    # One record per feature that has at least one outlier
    outlier_features = [
        {
            'Feature': feature,
            'Outliers': stats['count'],
            'Percentage': stats['percentage'],
            'Lower Bound': stats.get('lower_bound', 'N/A'),
            'Upper Bound': stats.get('upper_bound', 'N/A')
        }
        for feature, stats in by_feature.items()
        if stats['count'] > 0
    ]

    if not outlier_features:
        print("\nNo outliers detected in any features!")
    else:
        outlier_df = pd.DataFrame(outlier_features).sort_values('Outliers', ascending=False)
        print("\nTop 10 Features with Most Outliers:")
        display(outlier_df.head(10))

        # Plot outlier analysis
        visualizer.plot_outlier_analysis(outlier_results)

        # Share of each key total that comes from outlier rows
        print("\nAnalyzing impact of outliers on key metrics...")

        for feature in ['TotalClaims', 'TotalPremium', 'CustomValueEstimate']:
            if feature not in by_feature:
                continue
            stats = by_feature[feature]
            if stats['count'] > 0:
                outlier_indices = stats['outlier_indices']
                outlier_sum = df_processed.loc[outlier_indices, feature].sum()
                total_sum = df_processed[feature].sum()
                outlier_share = outlier_sum/total_sum*100

                print(f"\n{feature}:")
                print(f"  Outliers contribute: ${outlier_sum:,.0f} "
                      f"({outlier_share:.1f}% of total)")
                print(f"  Number of outlier records: {stats['count']:,} "
                      f"({stats['percentage']:.1f}% of records)")

In [None]:
# SECTION 9: TEMPORAL TREND ANALYSIS
# Summarises the analyzer's monthly aggregates: period covered, fitted trend
# statistics, month-to-month volatility, and the peak months per metric.
print("=" * 80)
print("SECTION 9: TEMPORAL TREND ANALYSIS")
print("=" * 80)

# Analyze temporal trends
temporal_results = analyzer.analyze_temporal_trends()

# Require a non-empty monthly frame: index[0] / idxmax below fail on an empty one.
if 'monthly_data' in temporal_results and len(temporal_results['monthly_data']) > 0:
    monthly_data = temporal_results['monthly_data']

    print(f"\nTemporal Analysis Period: {monthly_data.index[0].date()} to {monthly_data.index[-1].date()}")
    print(f"Number of months analyzed: {len(monthly_data)}")

    # Display monthly summary
    print("\nMonthly Summary Statistics:")
    monthly_summary = monthly_data.describe()
    display(monthly_summary)

    # Calculate trends
    print("\nTrend Analysis:")
    for key in ['TotalPremium_trend', 'TotalClaims_trend', 'LossRatio_trend']:
        if key in temporal_results:
            trend = temporal_results[key]
            # A zero (or NaN) slope is neither rising nor falling, so report it as flat
            # instead of letting it fall into the "Decreasing" branch.
            if trend['slope'] > 0:
                direction = '📈 Increasing'
            elif trend['slope'] < 0:
                direction = '📉 Decreasing'
            else:
                direction = 'Flat'
            significance = 'Significant' if trend['p_value'] < 0.05 else 'Not Significant'

            print(f"\n{key.replace('_trend', '').replace('Total', '')}:")
            print(f"  Direction: {direction} (slope: {trend['slope']:.4f})")
            print(f"  R-squared: {trend['r_squared']:.3f}")
            print(f"  P-value: {trend['p_value']:.4f} ({significance})")
            print(f"  Percent Change: {trend['percent_change']:.1f}% over period")

    # Plot temporal trends
    visualizer.plot_temporal_trends(monthly_data)

    # Monthly volatility analysis: standard deviation and coefficient of variation
    # (std / mean) per metric, computed once for all three columns.
    print("\nMonthly Volatility Analysis:")
    metric_cols = ['TotalPremium', 'TotalClaims', 'LossRatio']
    monthly_std = monthly_data[metric_cols].std()
    monthly_mean = monthly_data[metric_cols].mean()
    volatility_stats = {
        'Metric': metric_cols,
        'Std_Dev': monthly_std.tolist(),
        'CV': (monthly_std / monthly_mean).tolist()
    }

    volatility_df = pd.DataFrame(volatility_stats)
    display(volatility_df)

    # Identify peak months
    print("\nPeak Months Analysis:")
    peak_months = {
        'Highest Premium Month': monthly_data['TotalPremium'].idxmax().strftime('%B %Y'),
        'Highest Claims Month': monthly_data['TotalClaims'].idxmax().strftime('%B %Y'),
        'Highest Loss Ratio Month': monthly_data['LossRatio'].idxmax().strftime('%B %Y'),
        'Lowest Loss Ratio Month': monthly_data['LossRatio'].idxmin().strftime('%B %Y')
    }

    for metric, month in peak_months.items():
        print(f"  {metric}: {month}")
else:
    print("Temporal analysis not available. Check if 'TransactionMonth' column exists.")

In [None]:
# SECTION 10: VEHICLE MAKE/MODEL ANALYSIS
# Aggregates claims, premium and loss ratio per make/model and per make, shows the
# highest- and lowest-risk groups, and saves two make-level charts.
print("=" * 80)
print("SECTION 10: VEHICLE MAKE/MODEL ANALYSIS")
print("=" * 80)

# Every column used below must be present; checking only a subset lets the cell
# crash with a KeyError halfway through the aggregation.
required_cols = ['Make', 'Model', 'TotalClaims', 'TotalPremium', 'LossRatio',
                 'CustomValueEstimate', 'PolicyID']
missing_cols = [col for col in required_cols if col not in df_processed.columns]

if not missing_cols:
    # savefig does not create directories, so make sure the target exists first.
    os.makedirs("../reports/figures", exist_ok=True)

    # Analyze vehicle makes and models
    vehicle_analysis = df_processed.groupby(['Make', 'Model']).agg({
        'TotalClaims': ['sum', 'mean', 'count'],
        'TotalPremium': ['sum', 'mean'],
        'LossRatio': ['mean', 'std'],
        'CustomValueEstimate': 'mean'
    }).round(2)

    # Flatten column names, e.g. ('TotalClaims', 'sum') -> 'TotalClaims_sum'
    vehicle_analysis.columns = ['_'.join(col).strip() for col in vehicle_analysis.columns.values]
    vehicle_analysis = vehicle_analysis.reset_index()

    # Filter for sufficient data
    vehicle_analysis = vehicle_analysis[vehicle_analysis['TotalClaims_count'] > 5]

    print(f"\nAnalyzed {len(vehicle_analysis)} unique make/model combinations "
          f"(with >5 policies each)")

    # Top 10 by total claims
    print("\nTop 10 Vehicle Makes/Models by Total Claims:")
    top_claims = vehicle_analysis.sort_values('TotalClaims_sum', ascending=False).head(10)
    display(top_claims[['Make', 'Model', 'TotalClaims_sum', 'TotalClaims_mean',
                       'LossRatio_mean', 'TotalClaims_count']])

    # Top 10 by loss ratio
    print("\nTop 10 Vehicle Makes/Models by Loss Ratio (Highest Risk):")
    top_risk = vehicle_analysis.sort_values('LossRatio_mean', ascending=False).head(10)
    display(top_risk[['Make', 'Model', 'LossRatio_mean', 'TotalClaims_sum',
                     'TotalPremium_sum', 'TotalClaims_count']])

    # Bottom 10 by loss ratio
    print("\nTop 10 Vehicle Makes/Models by Loss Ratio (Lowest Risk):")
    bottom_risk = vehicle_analysis.sort_values('LossRatio_mean', ascending=True).head(10)
    display(bottom_risk[['Make', 'Model', 'LossRatio_mean', 'TotalClaims_sum',
                        'TotalPremium_sum', 'TotalClaims_count']])

    # Make-level analysis. LossRatio is derived below as total claims / total premium,
    # so it is not aggregated here (a mean would just be overwritten).
    print("\nVehicle Make Analysis (Aggregated):")
    make_analysis = df_processed.groupby('Make').agg({
        'TotalClaims': 'sum',
        'TotalPremium': 'sum',
        'PolicyID': 'count'
    }).reset_index()

    # .copy() so the column assignments below write to an independent frame rather
    # than a filtered view (chained assignment).
    make_analysis = make_analysis[make_analysis['PolicyID'] > 10].copy()  # Filter for sufficient data
    # A zero premium total would give an infinite ratio that tops the "risky" list;
    # treat it as undefined (NaN sorts last) instead.
    make_analysis['LossRatio'] = make_analysis['TotalClaims'] / make_analysis['TotalPremium'].replace(0, np.nan)
    make_analysis['Premium_Share'] = (make_analysis['TotalPremium'] / make_analysis['TotalPremium'].sum()) * 100
    make_analysis['Claims_Share'] = (make_analysis['TotalClaims'] / make_analysis['TotalClaims'].sum()) * 100

    print(f"\nAnalyzed {len(make_analysis)} makes (with >10 policies each)")

    make_columns = ['Make', 'LossRatio', 'TotalClaims', 'TotalPremium', 'PolicyID',
                    'Claims_Share', 'Premium_Share']

    # Display top risky makes
    print("\nTop 10 Risky Makes by Loss Ratio:")
    display(make_analysis.sort_values('LossRatio', ascending=False).head(10)[make_columns])

    # Display safest makes
    print("\nTop 10 Safest Makes by Loss Ratio:")
    display(make_analysis.sort_values('LossRatio', ascending=True).head(10)[make_columns])

    # Create visualizations
    # 1. Bar chart of top 10 makes by claims
    plt.figure(figsize=(12, 6))
    top_makes = make_analysis.sort_values('TotalClaims', ascending=False).head(10)
    plt.barh(top_makes['Make'], top_makes['TotalClaims'], color='steelblue')
    plt.xlabel('Total Claims ($)')
    plt.title('Top 10 Vehicle Makes by Total Claims', fontsize=14)
    plt.gca().invert_yaxis()
    plt.grid(True, alpha=0.3, axis='x')
    plt.tight_layout()
    plt.savefig("../reports/figures/top_makes_claims.png", dpi=300, bbox_inches='tight')
    plt.show()

    # 2. Scatter plot: Premium vs Claims by Make.
    # Log axes cannot show zero or negative totals, so drop those makes explicitly
    # and say how many were left out instead of losing them silently.
    plottable_makes = make_analysis[(make_analysis['TotalPremium'] > 0) &
                                    (make_analysis['TotalClaims'] > 0)]
    excluded_makes = len(make_analysis) - len(plottable_makes)
    if excluded_makes > 0:
        print(f"\nExcluding {excluded_makes} makes with non-positive premium or claims from the log-scale plot")

    plt.figure(figsize=(10, 8))
    plt.scatter(plottable_makes['TotalPremium'], plottable_makes['TotalClaims'],
               s=plottable_makes['PolicyID']/10, alpha=0.6, c=plottable_makes['LossRatio'],
               cmap='RdYlGn_r')
    plt.colorbar(label='Loss Ratio')
    plt.xscale('log')
    plt.yscale('log')
    plt.xlabel('Total Premium (Log Scale)')
    plt.ylabel('Total Claims (Log Scale)')
    plt.title('Premium vs Claims by Vehicle Make', fontsize=14)
    plt.grid(True, alpha=0.3)

    # Annotate top makes
    for _, row in plottable_makes.nlargest(5, 'TotalClaims').iterrows():
        plt.annotate(row['Make'], (row['TotalPremium'], row['TotalClaims']),
                    fontsize=9, alpha=0.8)

    plt.tight_layout()
    plt.savefig("../reports/figures/premium_vs_claims_makes.png", dpi=300, bbox_inches='tight')
    plt.show()

else:
    print("Vehicle make/model analysis not available. Required columns missing.")
    print(f"  Missing columns: {missing_cols}")

In [None]:
# SECTION 11: CREATIVE VISUALIZATIONS
# Calls the visualizer's three composite figure builders in order, announcing each one.
print("=" * 80)
print("SECTION 11: CREATIVE VISUALIZATIONS")
print("=" * 80)

print("\nCreating Creative Visualization 1: Risk Heatmap by Province and Vehicle Type")
visualizer.create_creative_visualization_1()

print("\nCreating Creative Visualization 2: Interactive Risk Profile Dashboard")
visualizer.create_creative_visualization_2()

print("\nCreating Creative Visualization 3: Temporal Risk Evolution Dashboard")
visualizer.create_creative_visualization_3()

# The check mark was stored as mis-decoded bytes; restored to the intended character.
print("\n✅ All creative visualizations created and saved to reports/figures/")

In [None]:
# SECTION 12: COMPREHENSIVE SUMMARY REPORT
# Prints the analyzer's summary report section by section and writes it to JSON.
# Bullets and emoji in the messages were stored as mis-decoded bytes and are
# restored to the intended characters here.
import json

print("=" * 80)
print("SECTION 12: COMPREHENSIVE SUMMARY REPORT")
print("=" * 80)

# Generate comprehensive summary
summary_report = analyzer.generate_summary_report()

print("\n📊 COMPREHENSIVE EDA SUMMARY")
print("=" * 50)

# Dataset Overview
overview = summary_report['dataset_overview']
print("\n1. DATASET OVERVIEW:")
print(f"   • Shape: {overview['shape']}")
print(f"   • Memory Usage: {overview['memory_usage_mb']:.2f} MB")
if overview['date_range']['start']:
    print(f"   • Date Range: {overview['date_range']['start'].date()} "
          f"to {overview['date_range']['end'].date()}")
    print(f"   • Duration: {overview['date_range']['duration_days']} days")

# Data Quality
quality = summary_report['data_quality']
print("\n2. DATA QUALITY:")
print(f"   • Total Missing Values: {quality['total_missing_values']:,}")
print(f"   • Missing Percentage: {quality['missing_value_percentage']:.2f}%")
print(f"   • Duplicate Rows: {quality['duplicate_rows']:,} "
      f"({quality['duplicate_percentage']:.2f}%)")

# Key Metrics
print("\n3. KEY BUSINESS METRICS:")
for metric, value in summary_report['key_metrics'].items():
    if value is None:
        continue
    metric_name = metric.replace('_', ' ').title()
    if not isinstance(value, (int, float, np.number)):
        # Numeric format specs raise on strings/dates, so print such values as-is.
        print(f"   • {metric_name}: {value}")
    elif 'ratio' in metric or 'average' in metric:
        print(f"   • {metric_name}: {value:.3f}")
    else:
        # NOTE(review): every remaining metric is rendered as currency — confirm
        # key_metrics holds no plain counts that should not carry a "$".
        print(f"   • {metric_name}: ${value:,.0f}")

# Risk Insights
print("\n4. RISK INSIGHTS:")
if 'risk_insights' in summary_report:
    insights = summary_report['risk_insights']

    if 'top_risky_provinces' in insights:
        print("\n   Top 5 Risky Provinces:")
        for province in insights['top_risky_provinces'][:5]:
            print(f"     • {province['Province']}: Loss Ratio = {province['mean']:.3f} "
                  f"({province['count']} policies)")

    if 'top_risky_vehicles' in insights:
        print("\n   Top 5 Risky Vehicle Types:")
        for vehicle in insights['top_risky_vehicles'][:5]:
            print(f"     • {vehicle['VehicleType']}: Loss Ratio = {vehicle['mean']:.3f} "
                  f"({vehicle['count']} policies)")

    if 'highest_claim_makes_models' in insights:
        print("\n   Top 5 High-Claim Vehicle Makes/Models:")
        for vehicle in insights['highest_claim_makes_models'][:5]:
            print(f"     • {vehicle['Make']} {vehicle['Model']}: "
                  f"${vehicle['sum']:,.0f} total claims")

# Recommendations
print("\n5. KEY RECOMMENDATIONS:")
print("   • Review pricing strategy for high-loss-ratio provinces")
print("   • Investigate risk factors for top risky vehicle types")
print("   • Monitor temporal trends for seasonal patterns")
print("   • Consider data quality improvements for columns with high missing values")
print("   • Further investigate outlier cases in claims and premiums")

print("\n" + "=" * 50)
print("✅ EDA COMPLETED SUCCESSFULLY")
print("=" * 50)

# Save summary report to file. open() does not create directories, so ensure the
# target exists; default=str serialises values json cannot encode natively.
os.makedirs("../reports/docs", exist_ok=True)
with open("../reports/docs/eda_summary.json", "w", encoding="utf-8") as f:
    json.dump(summary_report, f, indent=2, default=str)

print("\n📁 Summary report saved to: ../reports/docs/eda_summary.json")