In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, precision_score, accuracy_score, mean_absolute_error, mean_squared_error
import numpy as np


In [2]:
df=pd.read_csv('insurance_claims.csv')

In [3]:
df.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported,_c39
0,328,48,521585,2014-10-17,OH,250/500,1000,1406.91,0,466132,...,YES,71610,6510,13020,52080,Saab,92x,2004,Y,
1,228,42,342868,2006-06-27,IN,250/500,2000,1197.22,5000000,468176,...,?,5070,780,780,3510,Mercedes,E400,2007,Y,
2,134,29,687698,2000-09-06,OH,100/300,2000,1413.14,5000000,430632,...,NO,34650,7700,3850,23100,Dodge,RAM,2007,N,
3,256,41,227811,1990-05-25,IL,250/500,2000,1415.74,6000000,608117,...,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y,
4,228,44,367455,2014-06-06,IL,500/1000,1000,1583.91,6000000,610706,...,NO,6500,1300,650,4550,Accura,RSX,2009,N,


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 40 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   months_as_customer           1000 non-null   int64  
 1   age                          1000 non-null   int64  
 2   policy_number                1000 non-null   int64  
 3   policy_bind_date             1000 non-null   object 
 4   policy_state                 1000 non-null   object 
 5   policy_csl                   1000 non-null   object 
 6   policy_deductable            1000 non-null   int64  
 7   policy_annual_premium        1000 non-null   float64
 8   umbrella_limit               1000 non-null   int64  
 9   insured_zip                  1000 non-null   int64  
 10  insured_sex                  1000 non-null   object 
 11  insured_education_level      1000 non-null   object 
 12  insured_occupation           1000 non-null   object 
 13  insured_hobbies    

In [6]:
df = df.drop(columns=['_c39'])


In [7]:
df['authorities_contacted'].fillna(value='Unknown', inplace=True)  # Replace missing values with 'Unknown'
df['policy_bind_date'] = pd.to_datetime(df['policy_bind_date'])
df['incident_date'] = pd.to_datetime(df['incident_date'])


In [8]:
label_encoder = LabelEncoder()
categorical_columns = df.select_dtypes(include=['object']).columns
for column in categorical_columns:
    df[column] = label_encoder.fit_transform(df[column])


In [9]:
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])


In [10]:
print(df.head())


   months_as_customer       age  policy_number policy_bind_date  policy_state  \
0            1.078140  0.990836      -0.095953       2014-10-17             2   
1            0.208995  0.334073      -0.791527       2006-06-27             1   
2           -0.608002 -1.088913       0.550566       2000-09-06             2   
3            0.452355  0.224613      -1.239334       1990-05-25             0   
4            0.208995  0.552994      -0.695834       2014-06-06             0   

   policy_csl  policy_deductable  policy_annual_premium  umbrella_limit  \
0           1          -0.222383               0.616705       -0.479476   
1           1           1.412784              -0.242521        1.697980   
2           0           1.412784               0.642233        1.697980   
3           1           1.412784               0.652886        2.133471   
4           2          -0.222383               1.341980        2.133471   

   insured_zip  ...  witnesses  police_report_available  total

In [11]:
df.to_csv("preprocessed_dataset.csv", index=False)


In [7]:
df.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported
0,1.07814,0.990836,-0.095953,2014-10-17,2,1,-0.222383,0.616705,-0.479476,-0.489529,...,0.461838,2,0.714257,-0.189283,1.165505,0.749653,10,1,-0.18344,1
1,0.208995,0.334073,-0.791527,2006-06-27,1,1,1.412784,-0.242521,1.69798,-0.461008,...,-1.3387,0,-1.807312,-1.363822,-1.372696,-1.823346,8,12,0.315491,1
2,-0.608002,-1.088913,0.550566,2000-09-06,2,0,1.412784,0.642233,1.69798,-0.984885,...,1.362107,1,-0.686362,0.054644,-0.736072,-0.785565,4,30,0.315491,0
3,0.452355,0.224613,-1.239334,1990-05-25,0,1,1.412784,0.652886,2.133471,1.491682,...,0.461838,1,0.403135,-0.22413,-0.219722,0.677607,3,34,1.479664,1
4,0.208995,0.552994,-0.695834,2014-06-06,0,2,-0.222383,1.34198,2.133471,1.527808,...,-0.438431,1,-1.753121,-1.257232,-1.399654,-1.768252,0,31,0.648112,0


In [8]:
df = pd.read_csv('preprocessed_dataset.csv')

X = df.drop(columns=['fraud_reported','policy_bind_date', 'incident_date'])
y = df['fraud_reported']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [9]:
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    auc_roc = roc_auc_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    
    return auc_roc, precision, accuracy, mae, mse, rmse, mape


In [13]:
import warnings
warnings.filterwarnings('ignore')

In [14]:
# Logistic Regression
logistic_model = LogisticRegression()
logistic_results = train_and_evaluate_model(logistic_model, X_train, y_train, X_test, y_test)

# Decision Tree
dt_model = DecisionTreeClassifier()
dt_results = train_and_evaluate_model(dt_model, X_train, y_train, X_test, y_test)

# Random Forest
rf_model = RandomForestClassifier()
rf_results = train_and_evaluate_model(rf_model, X_train, y_train, X_test, y_test)

# Gradient Boosting
gb_model = GradientBoostingClassifier()
gb_results = train_and_evaluate_model(gb_model, X_train, y_train, X_test, y_test)

# Support Vector Machines (SVM)
svm_model = SVC()
svm_results = train_and_evaluate_model(svm_model, X_train, y_train, X_test, y_test)

# k-Nearest Neighbors (k-NN)
knn_model = KNeighborsClassifier()
knn_results = train_and_evaluate_model(knn_model, X_train, y_train, X_test, y_test)


In [15]:
results_df = pd.DataFrame({
    'Model': ['Logistic Regression', 'Decision Tree', 'Random Forest', 'Gradient Boosting', 'SVM', 'k-NN'],
    'AUC-ROC': [logistic_results[0], dt_results[0], rf_results[0], gb_results[0], svm_results[0], knn_results[0]],
    'Precision': [logistic_results[1], dt_results[1], rf_results[1], gb_results[1], svm_results[1], knn_results[1]],
    'Accuracy': [logistic_results[2], dt_results[2], rf_results[2], gb_results[2], svm_results[2], knn_results[2]],
    'MAE': [logistic_results[3], dt_results[3], rf_results[3], gb_results[3], svm_results[3], knn_results[3]],
    'MSE': [logistic_results[4], dt_results[4], rf_results[4], gb_results[4], svm_results[4], knn_results[4]],
    'RMSE': [logistic_results[5], dt_results[5], rf_results[5], gb_results[5], svm_results[5], knn_results[5]],
    'MAPE': [logistic_results[6], dt_results[6], rf_results[6], gb_results[6], svm_results[6], knn_results[6]]
})


In [16]:
results_df

Unnamed: 0,Model,AUC-ROC,Precision,Accuracy,MAE,MSE,RMSE,MAPE
0,Logistic Regression,0.539185,0.4,0.7,0.3,0.3,0.547723,inf
1,Decision Tree,0.702508,0.574074,0.765,0.235,0.235,0.484768,inf
2,Random Forest,0.576803,0.52,0.73,0.27,0.27,0.519615,inf
3,Gradient Boosting,0.715047,0.592593,0.775,0.225,0.225,0.474342,inf
4,SVM,0.5,0.0,0.725,0.275,0.275,0.524404,100.0
5,k-NN,0.481505,0.111111,0.69,0.31,0.31,0.556776,inf
