# 📈 Exploratory Data Analysis

**Project:** Predicting Paid Amount for Medical Claims  
**Stage:** EDA & Data Understanding  

---

## Overview

This notebook provides comprehensive exploratory data analysis:

1. **Univariate Analysis** - Distribution of individual features
2. **Bivariate Analysis** - Relationships between features and target
3. **Correlation Analysis** - Feature correlations
4. **Missing Value Analysis** - Patterns and handling strategies
5. **Outlier Detection** - Identifying and handling anomalies

In [None]:
# Imports
import sys
from pathlib import Path
import warnings
# Blanket filter: suppresses every warning for the rest of the session,
# which also hides deprecation and numeric warnings raised by later cells.
warnings.filterwarnings('ignore')

# Treat the parent of the current working directory as the project root and
# put it at the front of sys.path. This has to run before the `src.*` imports
# below.
# NOTE(review): assumes the notebook is started from a directory that sits
# directly under the project root — confirm.
project_root = Path.cwd().parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Project-local helpers; presumably resolved through the sys.path entry added
# above. DataLoader is imported here but not referenced in the visible cells.
from src.utils.logger import setup_logging, get_logger
from src.data.data_loader import DataLoader

# Setup: configure logging, plotting defaults and output directories.
setup_logging(log_level="INFO")
logger = get_logger(__name__)

# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*';
# older releases only know 'seaborn-whitegrid'. Pick whichever this install
# provides instead of failing on an unknown style name.
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
else:
    logger.warning("No seaborn whitegrid style available; keeping matplotlib's default style")
sns.set_palette("husl")

# Paths
INTERIM_DIR = project_root / "data" / "interim"
FIGURES_DIR = project_root / "reports" / "figures"
# Every later cell saves a figure here, so make sure the directory exists.
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

print("âœ“ Setup complete")

## 1. Load Data

In [None]:
# Read the sampled claims produced by the previous stage. When that file is
# missing, build a seeded synthetic frame with the same column names so the
# rest of the notebook can still run.
parquet_path = INTERIM_DIR / "sampled_claims.parquet"

if not parquet_path.exists():
    # Create demo data
    np.random.seed(42)
    n = 50000

    def _pick(options, **kwargs):
        """Draw n values from `options` using the global NumPy RNG."""
        return np.random.choice(options, n, **kwargs)

    def _amount(scale):
        """Draw n exponentially distributed amounts with the given scale."""
        return np.abs(np.random.exponential(scale, n))

    # The dict is evaluated top to bottom, so the random draws happen in
    # column order.
    df = pd.DataFrame({
        'CLAIM_ID_KEY': np.random.randint(1, 20000, n),
        'AGE': _pick(['25', '35', '45', '55', '65', '75', '90+']),
        'SEX': _pick(['M', 'F']),
        'AMT_BILLED': _amount(1000),
        'AMT_PAID': _amount(500),
        'AMT_DEDUCT': _amount(100),
        'AMT_COINS': _amount(50),
        'FORM_TYPE': _pick(['P', 'I', 'O']),
        'PRODUCT_TYPE': _pick(['HMO', 'PPO', 'POS']),
        'ICD_DIAG_01_PRIMARY': _pick(['Z00', 'J06', 'M54', 'I10', 'K21']),
        'CLIENT_LOS': _pick([0, 1, 2, 3, np.nan], p=[0.7, 0.1, 0.08, 0.07, 0.05]),
    })
    print(f"âœ“ Created demo data: {len(df):,} rows")
else:
    df = pd.read_parquet(parquet_path)
    print(f"âœ“ Loaded data: {len(df):,} rows, {len(df.columns)} columns")

df.head()

## 2. Data Overview

In [None]:
# Shape and dtype breakdown of the loaded frame
print("ðŸ“Š Dataset Shape:", df.shape)
print("\nðŸ“Š Data Types:")
dtype_counts = df.dtypes.value_counts()
print(dtype_counts)

# Names of the numeric columns; later cells reuse `numeric_cols`
numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
print(f"\nðŸ“Š Numeric Columns ({len(numeric_cols)}):")

# Descriptive statistics, rounded to two decimals for display
numeric_summary = df[numeric_cols].describe()
numeric_summary.round(2)

## 3. Target Variable Analysis (AMT_PAID)

In [None]:
# Target variable distribution: raw histogram, log-scale histogram and box
# plot of AMT_PAID, followed by summary statistics.

# Use one NaN-free copy of the target so all three panels and the printed
# statistics are based on the same observations.
paid = df['AMT_PAID'].dropna()
paid_mean = paid.mean()

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Raw distribution
axes[0].hist(paid, bins=50, edgecolor='black', alpha=0.7, color='#3498db')
axes[0].set_xlabel('Paid Amount ($)')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Distribution of Paid Amount')
axes[0].axvline(paid_mean, color='red', linestyle='--', label=f"Mean: ${paid_mean:,.0f}")
axes[0].legend()

# Log distribution. log1p is undefined for values <= -1 (NaN / -inf), which
# would break the histogram, so only non-negative amounts are transformed.
non_negative_paid = paid[paid >= 0]
log_paid = np.log1p(non_negative_paid)
n_excluded = len(paid) - len(non_negative_paid)
if n_excluded:
    print(f"  Note: {n_excluded:,} negative AMT_PAID values excluded from the log-scale histogram")
axes[1].hist(log_paid, bins=50, edgecolor='black', alpha=0.7, color='#2ecc71')
axes[1].set_xlabel('Log(Paid Amount + 1)')
axes[1].set_ylabel('Frequency')
axes[1].set_title('Log-Transformed Distribution')

# Box plot
axes[2].boxplot(paid, vert=True)
axes[2].set_ylabel('Paid Amount ($)')
axes[2].set_title('Box Plot of Paid Amount')

plt.tight_layout()
plt.savefig(FIGURES_DIR / 'target_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

print(f"\nðŸ“Š Target Statistics:")
print(f"  Mean: ${paid_mean:,.2f}")
print(f"  Median: ${paid.median():,.2f}")
print(f"  Std Dev: ${paid.std():,.2f}")
print(f"  Skewness: {paid.skew():.2f}")

## 4. Categorical Feature Analysis

In [None]:
# Collect the object-dtype columns; later cells reuse `cat_cols`
cat_cols = list(df.select_dtypes(include=['object']).columns)
print(f"ðŸ“Š Categorical Columns ({len(cat_cols)}): {cat_cols}")

# One horizontal bar chart per column, for at most the first four columns
n_cols = min(len(cat_cols), 4)
if n_cols > 0:
    # squeeze=False always returns a 2-D array of axes, so a single column
    # needs no special handling.
    fig, axes = plt.subplots(1, n_cols, figsize=(4*n_cols, 4), squeeze=False)
    axes = axes.ravel()

    # zip stops after n_cols axes, which limits the columns plotted
    for ax, col in zip(axes, cat_cols):
        top_values = df[col].value_counts().head(10)
        labels = top_values.index.astype(str)
        ax.barh(labels, top_values.values, color='#9b59b6', alpha=0.8)
        ax.set_xlabel('Count')
        ax.set_title(f'{col}')
        # Put the most frequent value at the top of the chart
        ax.invert_yaxis()

    plt.tight_layout()
    plt.savefig(FIGURES_DIR / 'categorical_distributions.png', dpi=150, bbox_inches='tight')
    plt.show()

## 5. Correlation Analysis

In [None]:
# Correlation matrix for numeric columns
numeric_df = df.select_dtypes(include=[np.number])
corr_matrix = numeric_df.corr()

# The heatmap shows only the strict lower triangle, so it needs at least two
# numeric columns; with fewer there is nothing to draw.
if corr_matrix.shape[0] >= 2:
    plt.figure(figsize=(10, 8))
    # Hide the upper triangle and the diagonal (duplicates / self-correlation)
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
    sns.heatmap(corr_matrix, mask=mask, annot=True, fmt='.2f', cmap='RdBu_r',
                center=0, square=True, linewidths=0.5)
    plt.title('Correlation Matrix', fontsize=14)
    plt.tight_layout()
    plt.savefig(FIGURES_DIR / 'correlation_matrix.png', dpi=150, bbox_inches='tight')
    plt.show()
else:
    print("  Correlation heatmap skipped: fewer than two numeric columns")

# Top correlations with target
if 'AMT_PAID' in corr_matrix.columns:
    target_corr = corr_matrix['AMT_PAID'].drop('AMT_PAID')
    # Rank features by correlation strength but report the signed value, so a
    # negative relationship is not shown as a positive one.
    ranking = target_corr.abs().sort_values(ascending=False).index
    target_corr = target_corr.loc[ranking]
    print("\nðŸ“Š Top Correlations with AMT_PAID:")
    for feat, corr in target_corr.head(10).items():
        print(f"  {feat}: {corr:.4f}")

## 6. Feature vs Target Relationships

In [None]:
# Relationship between AMT_BILLED and AMT_PAID: scatter plot of a sample with
# a least-squares trend line.
if 'AMT_BILLED' in df.columns and 'AMT_PAID' in df.columns:
    fig, ax = plt.subplots(figsize=(10, 6))

    # Keep only rows where both amounts are present; np.polyfit cannot
    # produce a usable fit when either column contains NaN.
    pairs = df[['AMT_BILLED', 'AMT_PAID']].dropna()

    # Sample for performance
    sample = pairs.sample(min(5000, len(pairs)), random_state=42)
    ax.scatter(sample['AMT_BILLED'], sample['AMT_PAID'], alpha=0.3, s=10, c='#3498db')

    # Add regression line. A degree-1 fit needs at least two points with
    # distinct x values; otherwise skip the trend line instead of failing.
    if len(sample) >= 2 and sample['AMT_BILLED'].nunique() > 1:
        z = np.polyfit(sample['AMT_BILLED'], sample['AMT_PAID'], 1)
        p = np.poly1d(z)
        x_line = np.linspace(sample['AMT_BILLED'].min(), sample['AMT_BILLED'].max(), 100)
        ax.plot(x_line, p(x_line), 'r--', lw=2, label=f'Trend: y={z[0]:.2f}x + {z[1]:.2f}')
        # Only the trend line carries a label, so the legend is drawn only
        # when the line exists.
        ax.legend()
    else:
        print("  Trend line skipped: not enough distinct AMT_BILLED values to fit")

    ax.set_xlabel('Billed Amount ($)', fontsize=12)
    ax.set_ylabel('Paid Amount ($)', fontsize=12)
    ax.set_title('Billed Amount vs Paid Amount', fontsize=14)

    plt.tight_layout()
    plt.savefig(FIGURES_DIR / 'billed_vs_paid.png', dpi=150, bbox_inches='tight')
    plt.show()

In [None]:
# Print the closing EDA summary. This cell only reports counts and the figure
# directory; it does not write any data.
banner = "="*60
summary_lines = [
    "\n" + banner,
    "ðŸ“Š EDA SUMMARY",
    banner,
    f"  Dataset: {len(df):,} rows, {len(df.columns)} columns",
    f"  Numeric features: {len(numeric_cols)}",
    f"  Categorical features: {len(cat_cols)}",
    f"  Figures saved to: {FIGURES_DIR}",
    f"\nâœ… EDA completed! Next: Run 03_feature_engineering.ipynb",
]
for line in summary_lines:
    print(line)