In [2]:
## Importing required libraries
import matplotlib.pyplot as plt
import pandas as pd
import pylab as pl
import numpy as np
import seaborn as sns
import math
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
%matplotlib inline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.metrics import log_loss
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    accuracy_score,
    roc_curve, auc
)

In [3]:
## Load the Kenyan motor insurance CSV (2023-2024 file) into a DataFrame
csv_path = "/content/sample_data/Kenyan_Motor_Insurance_2023_2024.csv"
df = pd.read_csv(csv_path)

## Preview the first five rows
df.head(5)

Unnamed: 0,Year,Policy_ID,Customer_Age,Gender,Region,Vehicle_Type,Vehicle_Age,Vehicle_Value_KES,Vehicle_Engine_Capacity,Use_Purpose,Annual_Premium_KES,Claims_Frequency,Total_Claim_Amount_KES,No_Claim_Bonus_%,Accident_Cause,Policy_Term_Months,Previous_Claims_Count,Driver_Experience_Years,Third_Party_Only,Claims_Status
0,2023,P202300001,58,Male,Nakuru,PSV,19,3229084,2000,Business,203851,1,866500,0,Theft,6,0,38,Yes,1
1,2023,P202300002,43,Male,Thika,Private,3,1739911,2000,Taxi,64471,0,0,20,,6,2,22,No,0
2,2023,P202300003,40,Male,Kakamega,Private,0,2781931,1500,Business,146378,0,0,50,,12,0,22,Yes,0
3,2023,P202300004,46,Male,Nakuru,Private,9,1154811,2500,Personal,59810,0,0,30,,12,0,26,No,0
4,2023,P202300005,63,Female,Nakuru,Commercial,19,3452991,1000,Business,143266,0,0,30,,6,0,41,No,0


In [4]:
## Check the data shape as (number of rows, number of columns)
df.shape

(2000, 20)

In [5]:
## Check column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Year                     2000 non-null   int64 
 1   Policy_ID                2000 non-null   object
 2   Customer_Age             2000 non-null   int64 
 3   Gender                   2000 non-null   object
 4   Region                   2000 non-null   object
 5   Vehicle_Type             2000 non-null   object
 6   Vehicle_Age              2000 non-null   int64 
 7   Vehicle_Value_KES        2000 non-null   int64 
 8   Vehicle_Engine_Capacity  2000 non-null   int64 
 9   Use_Purpose              2000 non-null   object
 10  Annual_Premium_KES       2000 non-null   int64 
 11  Claims_Frequency         2000 non-null   int64 
 12  Total_Claim_Amount_KES   2000 non-null   int64 
 13  No_Claim_Bonus_%         2000 non-null   int64 
 14  Accident_Cause           662 non-null   

In [9]:
## Count missing values per column
df.isna().sum()

Unnamed: 0,0
Year,0
Policy_ID,0
Customer_Age,0
Gender,0
Region,0
Vehicle_Type,0
Vehicle_Age,0
Vehicle_Value_KES,0
Vehicle_Engine_Capacity,0
Use_Purpose,0


In [13]:
## Check for fully duplicated rows
## (False = row is unique so far, True = row repeats an earlier row)
df.duplicated().value_counts()

Unnamed: 0,count
False,2000


## Data Preprocessing & Feature Engineering

In [10]:
## Drop Accident_Cause: df.info() reports only 662 non-null values out of 2,000 rows,
## so the column is mostly missing.
## NOTE(review): 662 is also the number of Claims_Status == 1 rows, so the column is
## presumably only filled in once a claim exists (i.e. it would leak the target) - confirm.
## Rebinding `df` with errors='ignore' keeps this cell safe to re-run; the previous
## in-place drop raised KeyError on a second run because the column was already gone.
df = df.drop(columns=['Accident_Cause'], errors='ignore')

In [11]:
## Preview the data again after dropping Accident_Cause
df.head()

Unnamed: 0,Year,Policy_ID,Customer_Age,Gender,Region,Vehicle_Type,Vehicle_Age,Vehicle_Value_KES,Vehicle_Engine_Capacity,Use_Purpose,Annual_Premium_KES,Claims_Frequency,Total_Claim_Amount_KES,No_Claim_Bonus_%,Policy_Term_Months,Previous_Claims_Count,Driver_Experience_Years,Third_Party_Only,Claims_Status
0,2023,P202300001,58,Male,Nakuru,PSV,19,3229084,2000,Business,203851,1,866500,0,6,0,38,Yes,1
1,2023,P202300002,43,Male,Thika,Private,3,1739911,2000,Taxi,64471,0,0,20,6,2,22,No,0
2,2023,P202300003,40,Male,Kakamega,Private,0,2781931,1500,Business,146378,0,0,50,12,0,22,Yes,0
3,2023,P202300004,46,Male,Nakuru,Private,9,1154811,2500,Personal,59810,0,0,30,12,0,26,No,0
4,2023,P202300005,63,Female,Nakuru,Commercial,19,3452991,1000,Business,143266,0,0,30,6,0,41,No,0


In [14]:
## Drop Policy_ID: it is a per-policy identifier, so it carries no signal that
## generalises to new policies, and as an object column it would otherwise be
## one-hot encoded into one indicator column per policy.
## errors='ignore' plus rebinding keeps the cell safe to re-run (the previous in-place
## drop raised KeyError once the column was gone). The redundant axis=1 is removed
## because columns= already selects the axis.
df = df.drop(columns=['Policy_ID'], errors='ignore')

In [19]:
## Work on an independent (deep) copy so the encoding steps leave `df` untouched.
## Note: at this point `df` has already had Accident_Cause and Policy_ID dropped.
df_copy = df.copy(deep=True)

In [22]:
## One-hot encode the categorical (object dtype) columns and keep the rest as-is

# Split the column names by dtype: object columns get encoded, all others pass through
categorical_cols = df_copy.select_dtypes(include='object').columns
numerical_cols = df_copy.select_dtypes(exclude='object').columns

# Dense output so the result can go straight into a DataFrame;
# handle_unknown='ignore' makes unseen categories encode as all zeros at transform time
One_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_features = One_hot.fit_transform(df_copy[categorical_cols])

# Wrap the encoded array with readable column names, aligned to the original row index
encoded_df = pd.DataFrame(
    data=encoded_features,
    index=df_copy.index,
    columns=One_hot.get_feature_names_out(categorical_cols),
)

# Final frame: untouched non-object columns followed by the one-hot indicator columns
df_encoded = pd.concat([df_copy[numerical_cols], encoded_df], axis=1)

In [23]:
df_encoded.head()

Unnamed: 0,Year,Customer_Age,Vehicle_Age,Vehicle_Value_KES,Vehicle_Engine_Capacity,Annual_Premium_KES,Claims_Frequency,Total_Claim_Amount_KES,No_Claim_Bonus_%,Policy_Term_Months,Previous_Claims_Count,Driver_Experience_Years,Claims_Status,Gender_Female,Gender_Male,Region_Eldoret,Region_Kakamega,Region_Kisumu,Region_Meru,Region_Mombasa,Region_Nairobi,Region_Nakuru,Region_Thika,Vehicle_Type_Commercial,Vehicle_Type_Motorcycle,Vehicle_Type_PSV,Vehicle_Type_Private,Use_Purpose_Business,Use_Purpose_Personal,Use_Purpose_Taxi,Third_Party_Only_No,Third_Party_Only_Yes
0,2023,58,19,3229084,2000,203851,1,866500,0,6,0,38,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,2023,43,3,1739911,2000,64471,0,0,20,6,2,22,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
2,2023,40,0,2781931,1500,146378,0,0,50,12,0,22,0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
3,2023,46,9,1154811,2500,59810,0,0,30,12,0,26,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
4,2023,63,19,3452991,1000,143266,0,0,30,6,0,41,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [24]:
## Compare the No_Claim_Bonus_% distribution between the two Claims_Status classes
## (a feature whose values never overlap between classes would leak the target)
df_encoded.groupby('Claims_Status')['No_Claim_Bonus_%'].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Claims_Status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1338.0,27.660688,14.838395,10.0,12.5,30.0,50.0,50.0
1,662.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Target label: Claims_Status (0/1)
y = df_encoded['Claims_Status']

# Feature matrix: everything except the target and the columns likely to leak it
# (claim counts, claim amount and the no-claim bonus)
X = df_encoded.drop(
    columns=[
        'Claims_Status',
        'Claims_Frequency',
        'Previous_Claims_Count',
        'Total_Claim_Amount_KES',
        'No_Claim_Bonus_%',
    ]
)

In [26]:
#Split Data into Train and Test (80% / 20%, fixed seed for reproducibility).
#stratify=y keeps the Claims_Status class ratio (1338 non-claims vs 662 claims in
#this data) the same in both splits, so the test metrics are not skewed by an
#uneven draw of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print('Train_size:', X_train.shape, y_train.shape)
print('Test_size:', X_test.shape, y_test.shape)

Train_size: (1600, 27) (1600,)
Test_size: (400, 27) (400,)
