In [1]:
import pandas as pd
import numpy as np

# Load the cleaned claims CSV from the processed-data folder into `df`,
# the working frame that the feature cells below extend column by column.
cleaned_claims_path = "../data/processed/cleaned_claims.csv"
df = pd.read_csv(cleaned_claims_path)


Temporal Features

In [2]:
# Policy age at time of incident: whole days between the policy bind date
# and the incident date. Both columns are parsed to datetimes first.
incident_dates = pd.to_datetime(df['incident_date'])
bind_dates = pd.to_datetime(df['policy_bind_date'])
df['policy_age_days'] = (incident_dates - bind_dates).dt.days


Financial Risk Indicators

In [7]:
# Net capital impact: capital gains minus capital losses, row by row.
# NOTE(review): this assumes 'capital-loss' holds positive magnitudes; if the
# data stores losses as negative numbers the subtraction adds them - confirm.
df['net_capital'] = df['capital-gains'].sub(df['capital-loss'])


In [8]:
# Claim to Premium Ratio: total claim amount divided by the annual premium.
# A zero premium makes the division yield +/-inf, which distorts summary
# statistics and scaling; those results are converted to NaN so they can be
# handled together with any other missing values downstream.
df['claim_premium_ratio'] = (
    df['total_claim_amount']
    .div(df['policy_annual_premium'])
    .replace([np.inf, -np.inf], np.nan)
)


Customer Stability Indicators

In [9]:
# New-customer flag: 1 when 'months_as_customer' is below 24, otherwise 0.
df['is_new_customer'] = np.where(df['months_as_customer'].lt(24), 1, 0)


Incident Severity Encoding
- Create ordered severity levels:

In [10]:
# Severity labels listed from least to most severe; the 1-based position of
# each label in this list becomes its ordinal code.
severity_levels = ['Trivial Damage', 'Minor Damage', 'Major Damage', 'Total Loss']
severity_map = {label: rank for rank, label in enumerate(severity_levels, start=1)}

# Labels that are not keys of severity_map come out of .map() as NaN.
df['incident_severity_encoded'] = df['incident_severity'].map(severity_map)


Target Variable

In [11]:
# Binary target: 'Y' -> 1 and 'N' -> 0; any other value becomes NaN via .map().
fraud_label_to_flag = {'Y': 1, 'N': 0}
df['fraud_flag'] = df['fraud_reported'].map(fraud_label_to_flag)


Drop High-Cardinality / Leakage Columns

In [12]:
# Columns removed before encoding (high-cardinality / leakage, per the section
# heading).
drop_cols = [
    # Identifier and location fields
    'policy_number',
    'insured_zip',
    'incident_location',
    'incident_city',
    # Raw date columns (policy_age_days is derived from them earlier)
    'policy_bind_date',
    'incident_date',
    # Raw Y/N label; the numeric target lives in 'fraud_flag'
    'fraud_reported'
]

# Rebind instead of inplace=True so the frame is not mutated behind the name.
# A missing column still raises KeyError, so a typo in drop_cols cannot
# silently leave a column in the feature set.
df = df.drop(columns=drop_cols)


Encoding Categorical Variables

In [13]:
# One-hot encode every object-dtype column. drop_first=True omits the first
# level of each column so the indicators for a column are not redundant.
categorical_cols = df.select_dtypes(include='object').columns

# dtype=int pins the indicator columns to 0/1. The get_dummies default changed
# from uint8 to bool in pandas 2.0, so without it the saved file would contain
# either 0/1 or True/False depending on the installed pandas version.
df_encoded = pd.get_dummies(
    df,
    columns=categorical_cols,
    drop_first=True,
    dtype=int,
)


In [14]:
# Write the encoded feature table to the processed-data folder.
# index=False keeps the row index out of the CSV.
feature_output_path = "../data/processed/feature_engineered_claims.csv"
df_encoded.to_csv(feature_output_path, index=False)
