In [2]:
# ===============================
# Task 1: EDA for Insurance Data
# ===============================

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Load dataset
data = pd.read_csv("../data/insurance_data.csv")

# 1. Data Overview
print("First 5 rows:")
print(data.head())
print("\nData Info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line after the report.
data.info()
print("\nDescriptive Statistics:")
print(data.describe())

# 2. Missing values: number of null entries in each column
print("\nMissing Values:")
missing_per_column = data.isnull().sum()
print(missing_per_column)

# 3. Cast columns to more suitable dtypes
# Columns that are absent from the dataset are skipped silently.
categorical_cols = ['Province', 'VehicleType', 'Gender']
for col in categorical_cols:
    if col not in data.columns:
        continue
    as_category = data[col].astype('category')
    data[col] = as_category

# Parse the policy start date into datetime values when the column exists.
date_field = 'PolicyStartDate'
if date_field in data.columns:
    parsed_dates = pd.to_datetime(data[date_field])
    data[date_field] = parsed_dates

# 4. Univariate Analysis
# savefig() does not create missing directories, so make sure the output
# folder exists before the first figure is written.
os.makedirs("../plots", exist_ok=True)

numerical_cols = ['TotalPremium', 'TotalClaims', 'CustomValueEstimate']

# Histogram (30 bins, KDE overlay) for every numerical column that is present.
for col in numerical_cols:
    if col in data.columns:
        plt.figure(figsize=(8,5))
        sns.histplot(data[col], bins=30, kde=True)
        plt.title(f"Distribution of {col}")
        # Save before show(): show() may release the figure, leaving nothing to save.
        plt.savefig(f"../plots/{col}_hist.png")
        plt.show()

# Frequency bar chart for every categorical column that is present.
for col in categorical_cols:
    if col in data.columns:
        plt.figure(figsize=(8,5))
        sns.countplot(x=col, data=data)
        plt.title(f"Count of {col}")
        plt.savefig(f"../plots/{col}_count.png")
        plt.show()

# 5. Bivariate Analysis
# Premium-vs-claims scatter coloured by vehicle type; drawn only when all
# three columns are available.
scatter_fields = {'TotalPremium', 'TotalClaims', 'VehicleType'}
if scatter_fields.issubset(data.columns):
    fig, ax = plt.subplots(figsize=(10,6))
    sns.scatterplot(data=data, x='TotalPremium', y='TotalClaims', hue='VehicleType', ax=ax)
    ax.set_title("Total Premium vs Total Claims by Vehicle Type")
    fig.savefig("../plots/premium_vs_claims_vehicle.png")
    plt.show()

# Loss ratio by province; requires premium, claims and province columns.
if 'TotalPremium' in data.columns and 'TotalClaims' in data.columns and 'Province' in data.columns:
    # Row-level ratio is kept on the frame for later use. A zero premium would
    # yield +/-inf (or NaN for 0/0), so zero premiums are treated as missing.
    data['LossRatio'] = data['TotalClaims'] / data['TotalPremium'].replace(0, np.nan)

    # A province's loss ratio is total claims over total premium. Averaging the
    # per-row ratios instead weights every policy equally regardless of its
    # size and is dominated by rows with tiny premiums.
    province_totals = data.groupby('Province', observed=True)[['TotalClaims', 'TotalPremium']].sum()
    province_loss = (
        province_totals['TotalClaims'] / province_totals['TotalPremium'].replace(0, np.nan)
    ).rename('LossRatio').reset_index()

    plt.figure(figsize=(10,6))
    sns.barplot(x='Province', y='LossRatio', data=province_loss)
    plt.title("Loss Ratio by Province")
    plt.savefig("../plots/lossratio_by_province.png")
    plt.show()

# Claim severity by gender: box plot of strictly positive claim amounts.
if {'TotalClaims', 'Gender'}.issubset(data.columns):
    claim_amounts = data['TotalClaims']
    # Rows whose claim amount is not strictly positive become NaN.
    data['ClaimSeverity'] = np.where(claim_amounts > 0, claim_amounts, np.nan)
    fig, ax = plt.subplots(figsize=(8,6))
    sns.boxplot(data=data, x='Gender', y='ClaimSeverity', ax=ax)
    ax.set_title("Claim Severity by Gender")
    fig.savefig("../plots/claimseverity_by_gender.png")
    plt.show()

# 6. Outlier Detection
# One horizontal box plot per numerical column; absent columns are skipped.
for col in numerical_cols:
    if col not in data.columns:
        continue
    fig, ax = plt.subplots(figsize=(8,5))
    sns.boxplot(x=data[col], ax=ax)
    ax.set_title(f"Outlier Detection: {col}")
    fig.savefig(f"../plots/{col}_boxplot.png")
    plt.show()

# 7. Trends Over Time
if 'PolicyStartDate' in data.columns and 'TotalPremium' in data.columns and 'TotalClaims' in data.columns:
    data['Month'] = data['PolicyStartDate'].dt.to_period('M')
    monthly_summary = data.groupby('Month').agg(
        TotalPremium=('TotalPremium','sum'),
        TotalClaims=('TotalClaims','sum')
    ).reset_index()

    plt.figure(figsize=(12,6))
    sns.lineplot(x='Month', y='TotalPremium', data=monthly_summary, label='TotalPremium')
    sns.lineplot(x='Month', y='TotalClaims', data=monthly_summary, label='TotalClaims')
    plt.xticks(rotation=45)
    plt.title("Monthly Premiums and C


SyntaxError: unterminated string literal (detected at line 97) (2119576553.py, line 97)