# SmartClaim: Exploratory Data Analysis

This notebook explores the synthetic insurance claims dataset to understand:
- Class distribution (fraud vs legitimate)
- Feature distributions and correlations
- Patterns that distinguish fraudulent claims

**Goal**: Build intuition before modeling


In [None]:
# Import libraries
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set style for better-looking plots
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)

# Make the project root importable so that `src.*` modules resolve.
# NOTE(review): this assumes the kernel's working directory sits two levels
# below the project root - confirm for the way the notebook is launched.
# The membership check keeps the cell idempotent: re-running it no longer
# stacks duplicate entries at the front of sys.path.
project_root = str(Path.cwd().parent.parent)
if project_root not in sys.path:
    sys.path.insert(0, project_root)

print("✓ Libraries imported successfully")


## 1. Load the Data

First, let's load the synthetic claims data and take a quick look.


In [None]:
# Read the synthetic claims table through the project's loader
from src.data.load_data import load_claims_data

claims_csv = "../../data/processed/synthetic_claims.csv"
df = load_claims_data(claims_csv)

# Report the frame's dimensions, then preview its first rows as the cell output
frame_shape = df.shape
print(f"\nDataset shape: {frame_shape}")
print("\nFirst few rows:")
df.head()


In [None]:
# Structural overview first (df.info() prints directly), then a divider,
# then summary statistics as the cell's displayed value
section_divider = "\n" + "=" * 50

print("Dataset Info:")
df.info()
print(section_divider)
print("Summary Statistics:")
df.describe()


## 2. Class Balance

How many fraudulent vs legitimate claims do we have? This is critical because imbalanced datasets require special handling.


In [None]:
# Class distribution
# value_counts() orders its rows by frequency, not by label. Reindexing to a
# fixed [0, 1] order keeps the 'Legitimate'/'Fraud' labels and colors attached
# to the right class even if fraud were the majority, and yields 0 instead of
# a KeyError when a class is absent.
# Assumes is_fraud is coded 0/1, as the comparisons elsewhere in this notebook do.
class_order = [0, 1]
class_labels = ['Legitimate', 'Fraud']
class_colors = ['#2ecc71', '#e74c3c']

fraud_counts = df['is_fraud'].value_counts().reindex(class_order, fill_value=0)
fraud_pcts = df['is_fraud'].value_counts(normalize=True).reindex(class_order, fill_value=0.0)

n_legit = int(fraud_counts.loc[0])
n_fraud = int(fraud_counts.loc[1])

print("Class Distribution:")
print(f"  Legitimate (0): {n_legit:,} ({fraud_pcts.loc[0]:.2%})")
print(f"  Fraud (1):      {n_fraud:,} ({fraud_pcts.loc[1]:.2%})")
if n_fraud:
    print(f"\n  Imbalance ratio: {n_legit / n_fraud:.2f}:1")
else:
    # Avoid a division by zero when the data contains no fraud rows
    print("\n  Imbalance ratio: undefined (no fraud cases in the data)")

# Visualize
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Count plot
axes[0].bar(class_labels, fraud_counts.values, color=class_colors)
axes[0].set_ylabel('Count', fontsize=12)
axes[0].set_title('Class Distribution (Counts)', fontsize=14, fontweight='bold')
axes[0].grid(axis='y', alpha=0.3)
# Place the count labels a small fraction of the tallest bar above each bar,
# so the gap scales with the dataset size instead of being a fixed 50 units
label_pad = 0.01 * fraud_counts.max()
for i, v in enumerate(fraud_counts.values):
    axes[0].text(i, v + label_pad, f'{v:,}', ha='center', fontweight='bold')

# Percentage plot
axes[1].pie(fraud_counts.values, labels=class_labels, autopct='%1.1f%%',
            colors=class_colors, startangle=90)
axes[1].set_title('Class Distribution (Percentage)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

print("\n💡 This is an imbalanced dataset - we'll need to handle this in modeling!")


## 3. Numeric Feature Distributions

Let's see how numeric features differ between fraud and legitimate claims.


In [None]:
# Numeric features
numeric_features = ['age', 'vehicle_age', 'claim_amount', 'num_prior_claims',
                    'policy_tenure_months', 'reported_delay_days']

# Split the frame once instead of re-filtering it for every feature
legit_claims = df[df['is_fraud'] == 0]
fraud_claims = df[df['is_fraud'] == 1]

fig, axes = plt.subplots(2, 3, figsize=(16, 10))
axes = axes.flatten()

# feature -> (legitimate mean, fraud mean); reused for the summary printed below
class_means = {}

for ax, feature in zip(axes, numeric_features):
    # Use ONE set of bin edges for both classes. Passing bins=30 to each hist()
    # call separately gives every class its own bin widths, which makes the
    # overlaid bar heights incomparable.
    bin_edges = np.histogram_bin_edges(df[feature].dropna(), bins=30)

    # Plot distributions for each class
    legit_claims[feature].hist(ax=ax, bins=bin_edges, alpha=0.6,
                               label='Legitimate', color='#2ecc71')
    fraud_claims[feature].hist(ax=ax, bins=bin_edges, alpha=0.6,
                               label='Fraud', color='#e74c3c')

    ax.set_xlabel(feature.replace('_', ' ').title(), fontsize=11)
    ax.set_ylabel('Frequency', fontsize=11)
    ax.legend()
    ax.grid(alpha=0.3)

    # Add mean lines (dashed, in the matching class color)
    legit_mean = legit_claims[feature].mean()
    fraud_mean = fraud_claims[feature].mean()
    ax.axvline(legit_mean, color='#2ecc71', linestyle='--', linewidth=2, alpha=0.8)
    ax.axvline(fraud_mean, color='#e74c3c', linestyle='--', linewidth=2, alpha=0.8)
    class_means[feature] = (legit_mean, fraud_mean)

plt.suptitle('Feature Distributions: Fraud vs Legitimate', fontsize=16, fontweight='bold', y=1.00)
plt.tight_layout()
plt.show()

# Derive the observations from the data instead of hard-coding conclusions,
# so the printed summary cannot drift from what the plots actually show.
print("\n🔍 Key Observations (mean per class):")
for feature, (legit_mean, fraud_mean) in class_means.items():
    if fraud_mean > legit_mean:
        direction = "higher for fraud"
    elif fraud_mean < legit_mean:
        direction = "lower for fraud"
    else:
        # Also reached when a mean is NaN (e.g. a class has no rows)
        direction = "no difference / not comparable"
    print(f"  - {feature}: fraud {fraud_mean:,.2f} vs legitimate {legit_mean:,.2f} ({direction})")


## 4. Correlation Analysis

Which features are correlated with fraud? Which features are correlated with each other?


In [None]:
# Pairwise correlations between the numeric features, the police-report flag
# and the fraud label
heatmap_columns = [*numeric_features, 'has_police_report', 'is_fraud']
numeric_df = df[heatmap_columns]
correlation_matrix = numeric_df.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
plt.title('Feature Correlation Heatmap', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

# Rank every column by its correlation with the label, strongest positive first;
# the label's own entry is skipped when printing
print("\n🔍 Correlations with Fraud (is_fraud):")
fraud_corr = correlation_matrix['is_fraud'].sort_values(ascending=False)
for column_name, coefficient in fraud_corr.items():
    if column_name == 'is_fraud':
        continue
    print(f"  {column_name:30s}: {coefficient:+.3f}")


## 5. Categorical Feature Analysis

How do fraud rates vary across different accident types and regions?


In [None]:
# Fraud rates by categorical features
def _plot_fraud_rate(ax, rate_stats, xlabel, title, color):
    """Draw one fraud-rate bar chart with a rate / sample-size label per bar.

    ax         -- matplotlib Axes to draw on
    rate_stats -- DataFrame indexed by category with 'mean' (fraud rate) and
                  'count' (number of claims) columns
    xlabel     -- x-axis label
    title      -- panel title
    color      -- bar color
    """
    rate_stats['mean'].plot(kind='bar', ax=ax, color=color, alpha=0.7)
    ax.set_ylabel('Fraud Rate', fontsize=12)
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_ylim(0, max(rate_stats['mean']) * 1.2)
    ax.grid(axis='y', alpha=0.3)
    ax.tick_params(axis='x', rotation=45)

    # Iterate the two columns directly rather than with iterrows(): iterrows()
    # upcasts each mixed float/int row to float, which printed "n=123.0".
    for pos, (rate, n_claims) in enumerate(zip(rate_stats['mean'], rate_stats['count'])):
        ax.text(pos, rate + 0.005, f"{rate:.1%}\n(n={int(n_claims)})",
                ha='center', fontsize=9)


fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Fraud rate by accident type
fraud_by_accident = df.groupby('accident_type')['is_fraud'].agg(['mean', 'count'])
_plot_fraud_rate(axes[0], fraud_by_accident, 'Accident Type',
                 'Fraud Rate by Accident Type', '#e74c3c')

# Fraud rate by region
fraud_by_region = df.groupby('region')['is_fraud'].agg(['mean', 'count'])
_plot_fraud_rate(axes[1], fraud_by_region, 'Region',
                 'Fraud Rate by Region', '#3498db')

plt.tight_layout()
plt.show()

print("\n🔍 Key Observations:")
print(f"  - Highest fraud rate accident type: {fraud_by_accident['mean'].idxmax()} ({fraud_by_accident['mean'].max():.1%})")
print(f"  - Lowest fraud rate accident type: {fraud_by_accident['mean'].idxmin()} ({fraud_by_accident['mean'].min():.1%})")


## 6. Police Report Impact

Does having a police report correlate with fraud likelihood?


In [None]:
# Police report analysis
# groupby sorts its keys, so row 0 is "no report" and row 1 is "has report"
# (assumes has_police_report is coded 0/1, as the .loc lookups below require).
police_fraud = df.groupby('has_police_report')['is_fraud'].agg(['mean', 'count'])

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(['No Police Report', 'Has Police Report'],
              police_fraud['mean'].values,
              color=['#e74c3c', '#2ecc71'], alpha=0.7, edgecolor='black', linewidth=2)
ax.set_ylabel('Fraud Rate', fontsize=12)
ax.set_title('Fraud Rate: With vs Without Police Report', fontsize=14, fontweight='bold')
ax.set_ylim(0, max(police_fraud['mean']) * 1.2)
ax.grid(axis='y', alpha=0.3)

# Add value labels.
# Iterate the columns directly rather than with iterrows(): iterrows() upcasts
# each mixed float/int row to float, which rendered counts as "1,234.0 claims".
for pos, (rate, n_claims) in enumerate(zip(police_fraud['mean'], police_fraud['count'])):
    ax.text(pos, rate + 0.01, f"{rate:.1%}\n({int(n_claims):,} claims)",
            ha='center', fontsize=11, fontweight='bold')

plt.tight_layout()
plt.show()

rate_without_report = police_fraud.loc[0, 'mean']
rate_with_report = police_fraud.loc[1, 'mean']

print("\n🔍 Key Insight:")
print(f"  - Claims WITHOUT police reports have {rate_without_report:.1%} fraud rate")
print(f"  - Claims WITH police reports have {rate_with_report:.1%} fraud rate")
# Only state the conclusion when the computed rates actually support it
if rate_with_report < rate_without_report:
    print("  - Police reports are a strong indicator of legitimacy!")
else:
    print("  - In this data, police reports do NOT coincide with a lower fraud rate")


## 7. Summary & Insights for Modeling

Based on this EDA, here's what we learned:

### Key Patterns
- **Class Imbalance**: ~12% fraud rate → need to handle in modeling (class weights, SMOTE, or appropriate metrics)
- **Strong Predictors**: 
  - Police report presence (strongest signal)
  - Claim amount (higher = more suspicious)
  - Reported delay (longer = more suspicious)
  - Number of prior claims (more = more suspicious)
  
### Modeling Strategy
1. **Handle imbalance**: Use `class_weight='balanced'` (scikit-learn estimators such as Logistic Regression) or `scale_pos_weight` (XGBoost)
2. **Metrics**: Focus on F1, Precision, Recall, and PR-AUC (not just accuracy)
3. **Features**: All features show some discriminative power
4. **Baseline**: Start with Logistic Regression, then try XGBoost for non-linear patterns

**Ready for modeling!** 🚀
