In [68]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler

In [69]:
# Load the data
df = pd.read_csv("car_insurance_claim.csv")
df.head()

Unnamed: 0,KIDSDRIV,AGE,HOMEKIDS,YOJ,INCOME,PARENT1,HOME_VAL,MSTATUS,GENDER,EDUCATION,...,CAR_TYPE,RED_CAR,OLDCLAIM,CLM_FREQ,REVOKED,MVR_PTS,CLM_AMT,CAR_AGE,CLAIM_FLAG,URBANICITY
0,0,60.0,0,11.0,"$67,349",No,$0,z_No,M,PhD,...,Minivan,yes,"$4,461",2,No,3,$0,18.0,0,Highly Urban/ Urban
1,0,43.0,0,11.0,"$91,449",No,"$257,252",z_No,M,z_High School,...,Minivan,yes,$0,0,No,0,$0,1.0,0,Highly Urban/ Urban
2,0,48.0,0,11.0,"$52,881",No,$0,z_No,M,Bachelors,...,Van,yes,$0,0,No,2,$0,10.0,0,Highly Urban/ Urban
3,0,35.0,1,10.0,"$16,039",No,"$124,191",Yes,z_F,z_High School,...,z_SUV,no,"$38,690",2,No,3,$0,10.0,0,Highly Urban/ Urban
4,0,51.0,0,14.0,,No,"$306,251",Yes,M,<High School,...,Minivan,yes,$0,0,No,0,$0,6.0,0,Highly Urban/ Urban


In [70]:
# Convert monetary values to numeric by removing "$" and ","
monetary_columns = ["INCOME", "HOME_VAL", "BLUEBOOK", "OLDCLAIM", "CLM_AMT"]
for col in monetary_columns:
    df[col] = df[col].replace({'\\$': '', ',': ''}, regex=True).astype(str)
    df[col] = pd.to_numeric(df[col], errors='coerce')

In [71]:
# Handle missing values: Fill numerical columns with median and categorical with mode
for col in df.select_dtypes(include=['number']).columns:
    df[col].fillna(df[col].median(), inplace=True)
for col in df.select_dtypes(include=['object']).columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [72]:
# Encode categorical variables
df = pd.get_dummies(df, drop_first=True)

In [73]:
df.head()

Unnamed: 0,KIDSDRIV,AGE,HOMEKIDS,YOJ,INCOME,HOME_VAL,TRAVTIME,BLUEBOOK,TIF,OLDCLAIM,...,OCCUPATION_z_Blue Collar,CAR_USE_Private,CAR_TYPE_Panel Truck,CAR_TYPE_Pickup,CAR_TYPE_Sports Car,CAR_TYPE_Van,CAR_TYPE_z_SUV,RED_CAR_yes,REVOKED_Yes,URBANICITY_z_Highly Rural/ Rural
0,0,60.0,0,11.0,67349.0,0.0,14,14230,11,4461,...,False,True,False,False,False,False,False,True,False,False
1,0,43.0,0,11.0,91449.0,257252.0,22,14940,1,0,...,True,False,False,False,False,False,False,True,False,False
2,0,48.0,0,11.0,52881.0,0.0,26,21970,1,0,...,False,True,False,False,False,True,False,True,False,False
3,0,35.0,1,10.0,16039.0,124191.0,5,4010,4,38690,...,False,True,False,False,False,False,True,False,False,False
4,0,51.0,0,14.0,53529.0,306251.0,32,15440,7,0,...,True,True,False,False,False,False,False,True,False,False


In [74]:
# Define target variables
X = df.drop(columns=["CLAIM_FLAG", "CLM_AMT"])  # Features
y_classification = df["CLAIM_FLAG"]  # Classification target
y_regression = df["CLM_AMT"]  # Regression target

In [75]:
# Data spliting
X_train, X_test, y_train_cls, y_test_cls, y_train_reg, y_test_reg = train_test_split(
    X, y_classification, y_regression, test_size=0.2, random_state=42)

In [76]:
# Scale the feature sets
scaler_cls = StandardScaler()
scaler_reg = StandardScaler()

# Scale classification data
X_train_cls_scaled = scaler_cls.fit_transform(X_train)
X_test_cls_scaled = scaler_cls.transform(X_test)

# Scale regression data
X_train_reg_scaled = scaler_reg.fit_transform(X_train)
X_test_reg_scaled = scaler_reg.transform(X_test)

In [77]:
# Train RandomForest models on scaled data
rf_cls = RandomForestClassifier(n_estimators=100, random_state=42)
rf_cls.fit(X_train_cls_scaled, y_train_cls)

rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(X_train_reg_scaled, y_train_reg)

In [78]:
# Predictions
y_pred_cls = rf_cls.predict(X_test_cls_scaled)
y_prob_cls = rf_cls.predict_proba(X_test_cls_scaled)[:, 1]
y_pred_reg = rf_reg.predict(X_test_reg_scaled)

print("Classification Predictions:", y_pred_cls)
print("Regression Predictions:", y_pred_reg)

Classification Predictions: [0 1 0 ... 0 0 0]
Regression Predictions: [ 908.53 2479.74 2543.19 ... 1653.15 1562.44  129.44]


In [79]:
# Classification Performance Metrics
classification_results = {
    "Accuracy": accuracy_score(y_test_cls, y_pred_cls),
    "AUC": roc_auc_score(y_test_cls, y_prob_cls),
    "Classification Report": classification_report(y_test_cls, y_pred_cls, output_dict=True)
}

print("\n=== Classification Results ===")
print(f"Accuracy: {classification_results['Accuracy']:.4f}")
print(f"AUC: {classification_results['AUC']:.4f}")

# Print Classification Report
report = classification_report(y_test_cls, y_pred_cls)
print("\nClassification Report:\n", report)

# Regression Performance Metrics
regression_results = {
    "MAE": mean_absolute_error(y_test_reg, y_pred_reg),
    "MSE": mean_squared_error(y_test_reg, y_pred_reg),
    "RMSE": np.sqrt(mean_squared_error(y_test_reg, y_pred_reg))
}

print("\n=== Regression Results ===")
print(f"Mean Absolute Error (MAE): {regression_results['MAE']:.2f}")
print(f"Mean Squared Error (MSE): {regression_results['MSE']:.2f}")
print(f"Root Mean Squared Error (RMSE): {regression_results['RMSE']:.2f}")


=== Classification Results ===
Accuracy: 0.7865
AUC: 0.8034

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.94      0.87      1506
           1       0.70      0.37      0.48       555

    accuracy                           0.79      2061
   macro avg       0.75      0.65      0.67      2061
weighted avg       0.77      0.79      0.76      2061


=== Regression Results ===
Mean Absolute Error (MAE): 1989.22
Mean Squared Error (MSE): 16366987.83
Root Mean Squared Error (RMSE): 4045.61


**Accounting Sensitive Data**

In [80]:
# Feature importance analysis
feature_importance_df = pd.DataFrame({
    "Feature": X.columns,
    "Importance_Classification": rf_cls.feature_importances_,
    "Importance_Regression": rf_reg.feature_importances_
}).sort_values(by=["Importance_Classification", "Importance_Regression"], ascending=False)

print("\nTop 10 Important Features:")
print(feature_importance_df.head(10))


Top 10 Important Features:
     Feature  Importance_Classification  Importance_Regression
4     INCOME                   0.085706               0.084920
7   BLUEBOOK                   0.085573               0.128545
1        AGE                   0.076475               0.084096
6   TRAVTIME                   0.074203               0.074999
5   HOME_VAL                   0.072641               0.072725
9   OLDCLAIM                   0.071505               0.097913
11   MVR_PTS                   0.053840               0.048773
12   CAR_AGE                   0.051429               0.043758
3        YOJ                   0.048491               0.046000
8        TIF                   0.045820               0.041805


In [81]:
# Remove less important features (based on importance)
selected_features = feature_importance_df["Feature"][:10]  # Keep top 10 features
X_train_cls_selected = X_train[selected_features].drop(columns=["INCOME","HOME_VAL"])
X_test_cls_selected = X_test[selected_features].drop(columns=["INCOME","HOME_VAL"])
X_train_reg_selected = X_train[selected_features].drop(columns=["INCOME","HOME_VAL"])
X_test_reg_selected = X_test[selected_features].drop(columns=["INCOME","HOME_VAL"])

In [82]:
# Scale the feature sets
scaler_cls_selected = StandardScaler()
scaler_reg_selected = StandardScaler()

# Scale classification data
X_train_cls_selected_scaled = scaler_cls_selected.fit_transform(X_train_cls_selected)
X_test_cls_selected_scaled = scaler_cls_selected.transform(X_test_cls_selected)

# Scale regression data
X_train_reg_selected_scaled = scaler_reg_selected.fit_transform(X_train_reg_selected)
X_test_reg_selected_scaled = scaler_reg_selected.transform(X_test_reg_selected)

In [83]:
# Train RandomForest models on scaled data
rf_cls_selected = RandomForestClassifier(n_estimators=100, random_state=42)
rf_cls_selected.fit(X_train_cls_selected_scaled, y_train_cls)

rf_reg_selected = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg_selected.fit(X_train_reg_selected_scaled, y_train_reg)

In [84]:
# Predictions
y_pred_cls_selected = rf_cls_selected.predict(X_test_cls_selected_scaled)
y_prob_cls_selected = rf_cls_selected.predict_proba(X_test_cls_selected_scaled)[:, 1]
y_pred_reg_selected = rf_reg_selected.predict(X_test_reg_selected_scaled)

print("Classification Predictions:", y_pred_cls_selected)
print("Regression Predictions:", y_pred_reg_selected)

Classification Predictions: [0 0 1 ... 0 0 0]
Regression Predictions: [2216.6  1709.14 2477.58 ...  358.68 1097.51 1180.29]


In [85]:
# Classification Performance Metrics
classification_results_selected = {
    "Accuracy": accuracy_score(y_test_cls, y_pred_cls_selected),
    "AUC": roc_auc_score(y_test_cls, y_prob_cls_selected),
    "Classification Report": classification_report(y_test_cls, y_pred_cls_selected, output_dict=True)
}

# Print Classification Report
print("\n=== Classification Results (Selected Features) ===")
print(f"Accuracy: {classification_results_selected['Accuracy']:.4f}")
print(f"AUC: {classification_results_selected['AUC']:.4f}")
print("\nClassification Report:\n", classification_report(y_test_cls, y_pred_cls_selected))

# Regression Performance Metrics
regression_results_selected = {
    "MAE": mean_absolute_error(y_test_reg, y_pred_reg_selected),
    "MSE": mean_squared_error(y_test_reg, y_pred_reg_selected),
    "RMSE": np.sqrt(mean_squared_error(y_test_reg, y_pred_reg_selected))
}

print("\n=== Regression Results (Selected Features) ===")
print(f"Mean Absolute Error (MAE): {regression_results_selected['MAE']:.2f}")
print(f"Mean Squared Error (MSE): {regression_results_selected['MSE']:.2f}")
print(f"Root Mean Squared Error (RMSE): {regression_results_selected['RMSE']:.2f}")


=== Classification Results (Selected Features) ===
Accuracy: 0.7472
AUC: 0.7117

Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.93      0.84      1506
           1       0.57      0.25      0.35       555

    accuracy                           0.75      2061
   macro avg       0.67      0.59      0.60      2061
weighted avg       0.72      0.75      0.71      2061


=== Regression Results (Selected Features) ===
Mean Absolute Error (MAE): 2185.76
Mean Squared Error (MSE): 17574703.15
Root Mean Squared Error (RMSE): 4192.22
