# Medical Insurance Cost Prediction - Data Exploration

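This notebook provides an initial exploration of the medical insurance dataset: it loads and inspects the raw data, explores how charges relate to the categorical and numeric features, summarizes the key findings, and assesses data quality (outliers and distribution shape).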

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Plot appearance: matplotlib's 'seaborn-v0_8' style sheet plus seaborn's "husl" palette
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# pandas display: set both the column-count cap and the display width to None
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

print("Libraries imported successfully!")

## 1. Load and Inspect the Data

In [None]:
# Read the raw insurance CSV (path is relative to the notebook's working directory)
data = pd.read_csv('../data/raw/insurance.csv')

# Report the frame's shape, column names and per-column dtypes in one go;
# sep="\n" reproduces the line breaks of separate print calls.
print(
    f"Dataset shape: {data.shape}",
    f"\nColumn names: {list(data.columns)}",
    f"\nData types:",
    data.dtypes,
    sep="\n",
)

In [None]:
# Preview the first ten records (rendered as the cell's rich output)
data.head(n=10)

In [None]:
# Basic statistics: pandas' default describe() summary of `data`,
# shown as the cell's output because it is the last expression.
data.describe()

In [None]:
# Count null entries column by column, then report the per-column
# counts followed by the grand total.
missing_values = data.isna().sum()
print(
    "Missing values per column:",
    missing_values,
    f"\nTotal missing values: {missing_values.sum()}",
    sep="\n",
)

## 2. Exploratory Data Analysis

In [None]:
# Interactive 50-bin histogram of the target variable (charges)
fig = px.histogram(
    data,
    x='charges',
    nbins=50,
    title='Distribution of Insurance Charges',
).update_layout(
    xaxis_title="Insurance Charges ($)",
    yaxis_title="Frequency",
)
fig.show()

In [None]:
# Bar charts of category frequencies for each categorical feature
categorical_cols = ['sex', 'smoker', 'region']

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Categorical Variables Analysis', fontsize=16)

# axes.flat walks the 2x2 grid row by row, so the three features land in
# the top-left, top-right and bottom-left panels, in that order.
for ax, col in zip(axes.flat, categorical_cols):
    label = col.title()
    data[col].value_counts().plot.bar(ax=ax)
    ax.set_title(f'Distribution of {label}')
    ax.set_xlabel(label)
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=45)

# Three features on a four-panel grid: drop the unused bottom-right panel
fig.delaxes(axes[1, 1])

plt.tight_layout()
plt.show()

In [None]:
# One box plot of charges per categorical feature, side by side.
# Relies on `categorical_cols` from the categorical-analysis cell.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
fig.suptitle('Insurance Charges by Categorical Variables', fontsize=16)

for ax, col in zip(axes, categorical_cols):
    sns.boxplot(data=data, x=col, y='charges', ax=ax)
    ax.set_title(f'Charges by {col.title()}')
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the numeric columns, drawn as a heatmap.
# `numeric_data` is reused by the data-quality cells further down.
numeric_data = data.select_dtypes(include='number')
correlation_matrix = numeric_data.corr()

heatmap_options = dict(
    annot=True,        # write the coefficient inside each cell
    cmap='coolwarm',
    center=0,          # centre the colormap on zero correlation
    square=True,
    linewidths=0.5,
)

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix of Numeric Variables')
plt.tight_layout()
plt.show()

In [None]:
# Each numeric feature plotted against charges, split by smoker status
numeric_cols = ['age', 'bmi', 'children']

fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Scatter Plots: Numeric Variables vs Charges', fontsize=16)

# axes.flat yields the panels row by row; zip stops after the third feature.
for ax, col in zip(axes.flat, numeric_cols):
    # One scatter series per smoker value, each with its own legend entry
    for smoker_status in data['smoker'].unique():
        subset = data.loc[data['smoker'] == smoker_status]
        ax.scatter(
            subset[col],
            subset['charges'],
            label=f'Smoker: {smoker_status}',
            alpha=0.6,
        )

    heading = col.title()
    ax.set_xlabel(heading)
    ax.set_ylabel('Charges')
    ax.set_title(f'{heading} vs Charges')
    ax.legend()

# Three features on a four-panel grid: drop the unused bottom-right panel
fig.delaxes(axes[1, 1])

plt.tight_layout()
plt.show()

## 3. Key Insights

In [None]:
# Summary statistics of charges within each smoker group
summary_functions = ['mean', 'median', 'std', 'min', 'max']
smoker_stats = data.groupby('smoker')['charges'].agg(summary_functions)

print("Insurance Charges Statistics by Smoker Status:")
print(smoker_stats)

# Ratio of mean charges: smokers ('yes') over non-smokers ('no').
# `ratio` is reused by the key-findings summary cell.
smoker_mean = smoker_stats.at['yes', 'mean']
non_smoker_mean = smoker_stats.at['no', 'mean']
ratio = smoker_mean / non_smoker_mean

print(f"\nSmokers pay {ratio:.2f}x more than non-smokers on average")

In [None]:
# Correlation of age and of BMI with charges (Series.corr defaults);
# both values are reused by the key-findings summary cell.
age_correlation, bmi_correlation = (
    data[feature].corr(data['charges']) for feature in ('age', 'bmi')
)
print(f"Correlation between age and charges: {age_correlation:.3f}")
print(f"Correlation between BMI and charges: {bmi_correlation:.3f}")

# Mean charges per region, most expensive region first
regional_stats = (
    data.groupby('region')['charges']
    .mean()
    .sort_values(ascending=False)
)
print("\nAverage charges by region:")
print(regional_stats)

In [None]:
# Summary of key findings
# Relies on values computed in earlier cells: `ratio`, `age_correlation`,
# `bmi_correlation` and `regional_stats`.
print("=" * 50)
print("KEY FINDINGS FROM DATA EXPLORATION")
print("=" * 50)

# Derive the missing-value statement from the data instead of asserting it,
# so the summary stays truthful if the input file changes.
total_missing = int(data.isnull().sum().sum())
if total_missing == 0:
    missing_summary = "No missing values found in the dataset"
else:
    missing_summary = f"{total_missing:,} missing values found in the dataset"

print(f"1. Dataset contains {len(data):,} records with {len(data.columns)} features")
print(f"2. {missing_summary}")
print(f"3. Average insurance cost: ${data['charges'].mean():,.2f}")
print(f"4. Smoking impact: Smokers pay {ratio:.1f}x more than non-smokers")
# NOTE(review): the "(moderate positive)" / "(weak positive)" qualifiers are
# fixed text, not derived from the coefficients — re-check them whenever the
# underlying data changes.
print(f"5. Age correlation with charges: {age_correlation:.3f} (moderate positive)")
print(f"6. BMI correlation with charges: {bmi_correlation:.3f} (weak positive)")
# regional_stats is sorted in descending order, so position 0 is the highest-cost region
print(f"7. Highest cost region: {regional_stats.index[0]} (${regional_stats.iloc[0]:,.2f})")
print(f"8. Age range: {data['age'].min()} - {data['age'].max()} years")
print(f"9. BMI range: {data['bmi'].min():.1f} - {data['bmi'].max():.1f}")
print(f"10. Children range: {data['children'].min()} - {data['children'].max()}")

## 4. Data Quality Assessment

In [None]:
# Check for outliers using IQR method
def detect_outliers(df: pd.DataFrame, column: str, factor: float = 1.5):
    """Count the outliers of one column using the IQR rule.

    A value is an outlier when it lies strictly below
    ``Q1 - factor * IQR`` or strictly above ``Q3 + factor * IQR``, where
    Q1 and Q3 are the 25th and 75th percentiles of the column and
    ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to inspect.
    column : str
        Name of the column to inspect.
    factor : float, default 1.5
        Multiple of the IQR that defines the fences. The default keeps the
        previous hard-coded behaviour.

    Returns
    -------
    tuple
        ``(outlier_count, lower_bound, upper_bound)`` with ``outlier_count``
        as a plain ``int``.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # Count straight from the boolean mask; no need to materialise the
    # filtered frame just to take its length.
    is_outlier = (values < lower_bound) | (values > upper_bound)
    return int(is_outlier.sum()), lower_bound, upper_bound

# Report the IQR-based outlier count and fences for every numeric column.
# Relies on `numeric_data` from the correlation-matrix cell.
print("Outlier Analysis:", "-" * 40, sep="\n")

for col in numeric_data.columns:
    n_outliers, low_fence, high_fence = detect_outliers(data, col)
    print(f"{col}: {n_outliers} outliers (bounds: {low_fence:.2f} - {high_fence:.2f})")

In [None]:
# Shape of each numeric column's distribution: skewness and kurtosis
# as computed by pandas. Relies on `numeric_data` from the
# correlation-matrix cell.
print("Data Distribution Analysis:", "-" * 40, sep="\n")

for col in numeric_data.columns:
    series = data[col]
    print(f"{col}: Skewness = {series.skew():.3f}, Kurtosis = {series.kurtosis():.3f}")

## 5. Next Steps

Based on this exploration, the next steps would be:

1. **Feature Engineering**: Create interaction terms, especially between smoking status and other variables
2. **Data Preprocessing**: Encode categorical variables and scale features if needed
3. **Model Development**: Try different regression algorithms
4. **Model Evaluation**: Compare performance using appropriate metrics
5. **Model Interpretation**: Understand feature importance and model behavior

The data appears to be clean and ready for modeling, with smoking status being the most significant factor affecting insurance costs.