In [189]:
# Regular EDA libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# we want our plots to apear within notebooks
%matplotlib inline
pd.set_option('display.float_format', lambda x: '%.2f' % x)
sns.set_style("darkgrid")
plt.style.use('ggplot')
import scipy.stats as spstats

from sklearn.preprocessing import OrdinalEncoder

# Models from scikit-learn & XGboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import ADASYN, SMOTE, SMOTENC
import random
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, StandardScaler


# Model Evaluation libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (classification_report, confusion_matrix, f1_score,
                             precision_recall_curve, precision_score,
                             recall_score, roc_auc_score)
from sklearn.model_selection import (GridSearchCV, RepeatedStratifiedKFold,
                                     StratifiedKFold, cross_val_score,
                                     train_test_split)
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, make_scorer, accuracy_score

In [190]:
# Importing the data link
data_link = 'https://raw.githubusercontent.com/Wkimani/Auto-Insurance-Fraud-Detection-Using-ML/main/fraud_oracle.csv'

#loading the csv file
df = pd.read_csv(data_link)

#viewing the first 5 rows
df.head()

df_copy=df.copy()

### 5. Data Pre-Processing and Feature Engineering

Clean up column 'PolicyType'

As Policy Type and Base Policy consist of repetitive information, we will remove the policy information under Policy Type


In [191]:
df['PolicyType'].value_counts()


PolicyType
Sedan - Collision       5584
Sedan - Liability       4987
Sedan - All Perils      4087
Sport - Collision        348
Utility - All Perils     340
Utility - Collision       30
Sport - All Perils        22
Utility - Liability       21
Sport - Liability          1
Name: count, dtype: int64

In [192]:
# Create a function to remove the policy information
def ptconvert(data):
    data = data.split()
    for i in data:
        pt = data[0]
        return pt

In [193]:
df['PolicyType'] = df['PolicyType'].astype(str).apply(ptconvert)

In [194]:
df['PolicyType'].unique()

array(['Sport', 'Sedan', 'Utility'], dtype=object)

1. Make

In [195]:
to_str = ['WeekOfMonth', 'WeekOfMonthClaimed','RepNumber', 'Deductible', 'DriverRating', 'Year']


In [196]:
# For Loop to convert the date type to string
for i in to_str:
    df[i]= df[i].astype('str')

In [197]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15420 entries, 0 to 15419
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Month                 15420 non-null  object
 1   WeekOfMonth           15420 non-null  object
 2   DayOfWeek             15420 non-null  object
 3   Make                  15420 non-null  object
 4   AccidentArea          15420 non-null  object
 5   DayOfWeekClaimed      15420 non-null  object
 6   MonthClaimed          15420 non-null  object
 7   WeekOfMonthClaimed    15420 non-null  object
 8   Sex                   15420 non-null  object
 9   MaritalStatus         15420 non-null  object
 10  Age                   15420 non-null  int64 
 11  Fault                 15420 non-null  object
 12  PolicyType            15420 non-null  object
 13  VehicleCategory       15420 non-null  object
 14  VehiclePrice          15420 non-null  object
 15  FraudFound_P          15420 non-null

#### Log Transformation for Age


In [198]:
# Final check to see if all the features are in integer or float datatypes
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15420 entries, 0 to 15419
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Month                 15420 non-null  object
 1   WeekOfMonth           15420 non-null  object
 2   DayOfWeek             15420 non-null  object
 3   Make                  15420 non-null  object
 4   AccidentArea          15420 non-null  object
 5   DayOfWeekClaimed      15420 non-null  object
 6   MonthClaimed          15420 non-null  object
 7   WeekOfMonthClaimed    15420 non-null  object
 8   Sex                   15420 non-null  object
 9   MaritalStatus         15420 non-null  object
 10  Age                   15420 non-null  int64 
 11  Fault                 15420 non-null  object
 12  PolicyType            15420 non-null  object
 13  VehicleCategory       15420 non-null  object
 14  VehiclePrice          15420 non-null  object
 15  FraudFound_P          15420 non-null

#### Label Encoding

In [199]:
# Columns to convert
columns_to_convert = ['RepNumber', 'Deductible']

for col in columns_to_convert:
    # Convert to numeric, replacing invalid values with NaN
    df[col] = pd.to_numeric(df[col], errors='coerce')


In [200]:
df.columns

Index(['Month', 'WeekOfMonth', 'DayOfWeek', 'Make', 'AccidentArea',
       'DayOfWeekClaimed', 'MonthClaimed', 'WeekOfMonthClaimed', 'Sex',
       'MaritalStatus', 'Age', 'Fault', 'PolicyType', 'VehicleCategory',
       'VehiclePrice', 'FraudFound_P', 'PolicyNumber', 'RepNumber',
       'Deductible', 'DriverRating', 'Days_Policy_Accident',
       'Days_Policy_Claim', 'PastNumberOfClaims', 'AgeOfVehicle',
       'AgeOfPolicyHolder', 'PoliceReportFiled', 'WitnessPresent', 'AgentType',
       'NumberOfSuppliments', 'AddressChange_Claim', 'NumberOfCars', 'Year',
       'BasePolicy'],
      dtype='object')

In [201]:
from sklearn.preprocessing import LabelEncoder

# Keep a copy of the original DataFrame with the categorical columns
df_categorical = df[['AccidentArea', 'Sex', 'MaritalStatus', 'Fault', 'PolicyType', 'VehicleCategory', 'Make','DriverRating',
                     'Days_Policy_Accident', 'Days_Policy_Claim', 'PoliceReportFiled', 'WitnessPresent','VehiclePrice',
                     'AgentType', 'AddressChange_Claim', 'PastNumberOfClaims','NumberOfSuppliments','NumberOfCars', 'Year', 'BasePolicy', 'Month', 'AgeOfVehicle','AgeOfPolicyHolder']].copy()

# Initialize LabelEncoder
label_encoder = LabelEncoder()


# Apply LabelEncoder to each categorical column
for column in df_categorical.columns:
    df_categorical[column] = label_encoder.fit_transform(df_categorical[column])
# No duplicates - unique identifier , which is not nessasery for analyis --> drop the column
df.drop('PolicyNumber', axis = 1, inplace = True)
# Concatenate the label encoded DataFrame with the original numerical columns
df_final = pd.concat([df_categorical, df.select_dtypes(exclude=['object'])], axis=1)

# Final check to see if all the features are in integer or float datatypes after encoding
print(df_final.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15420 entries, 0 to 15419
Data columns (total 27 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   AccidentArea          15420 non-null  int32
 1   Sex                   15420 non-null  int32
 2   MaritalStatus         15420 non-null  int32
 3   Fault                 15420 non-null  int32
 4   PolicyType            15420 non-null  int32
 5   VehicleCategory       15420 non-null  int32
 6   Make                  15420 non-null  int32
 7   DriverRating          15420 non-null  int32
 8   Days_Policy_Accident  15420 non-null  int32
 9   Days_Policy_Claim     15420 non-null  int32
 10  PoliceReportFiled     15420 non-null  int32
 11  WitnessPresent        15420 non-null  int32
 12  VehiclePrice          15420 non-null  int32
 13  AgentType             15420 non-null  int32
 14  AddressChange_Claim   15420 non-null  int32
 15  PastNumberOfClaims    15420 non-null  int32
 16  Numb

In [202]:
df_final.to_csv('ML_preprocessed.csv')

### 6. Model Training

In this section, we will train the dataset using several classification algorithms:

Logistic Regression,
K-Nearest Neighbour,
Random Forest,
XGBoost

The best model that suits the business problem based on the metrics - Recall, ROC AUC and Precision-Recall Curve will be selected for hyperparameter tuning.

We will adjust the class weights to deal with this set of imbalanced dataset.

In [203]:
# Define X and y dataset for Machine Learning
X = df_final.drop(['FraudFound_P','Age'], axis=1)
y = df_final['FraudFound_P']

In [204]:
#Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Apply SMOTE to balance the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [205]:
import pandas as pd

# Assuming 'df_train' is your preprocessed training data DataFrame and 'df_test' is your preprocessed test data DataFrame

# Check column names in the preprocessed training data
print("Column names in the preprocessed training data:")
print(X_train.columns)

# Check column names in the preprocessed test data
print("Column names in the preprocessed test data:")
print(X_test.columns)

# Get sets of column names for training and test data
train_columns = set(X_train.columns)
test_columns = set(X_test.columns)

# Check for any differences
difference = train_columns.symmetric_difference(test_columns)

if len(difference) == 0:
    print("The column names in the preprocessed training data and the preprocessed test data are identical.")
else:
    print("There are differences in column names between the preprocessed training data and the preprocessed test data:")
    print("Columns present in the preprocessed training data but not in the preprocessed test data:", train_columns - test_columns)
    print("Columns present in the preprocessed test data but not in the preprocessed training data:", test_columns - train_columns)


Column names in the preprocessed training data:
Index(['AccidentArea', 'Sex', 'MaritalStatus', 'Fault', 'PolicyType',
       'VehicleCategory', 'Make', 'DriverRating', 'Days_Policy_Accident',
       'Days_Policy_Claim', 'PoliceReportFiled', 'WitnessPresent',
       'VehiclePrice', 'AgentType', 'AddressChange_Claim',
       'PastNumberOfClaims', 'NumberOfSuppliments', 'NumberOfCars', 'Year',
       'BasePolicy', 'Month', 'AgeOfVehicle', 'AgeOfPolicyHolder', 'RepNumber',
       'Deductible'],
      dtype='object')
Column names in the preprocessed test data:
Index(['AccidentArea', 'Sex', 'MaritalStatus', 'Fault', 'PolicyType',
       'VehicleCategory', 'Make', 'DriverRating', 'Days_Policy_Accident',
       'Days_Policy_Claim', 'PoliceReportFiled', 'WitnessPresent',
       'VehiclePrice', 'AgentType', 'AddressChange_Claim',
       'PastNumberOfClaims', 'NumberOfSuppliments', 'NumberOfCars', 'Year',
       'BasePolicy', 'Month', 'AgeOfVehicle', 'AgeOfPolicyHolder', 'RepNumber',
       'Dedu

In [206]:
# Train classifiers
clf = LogisticRegression()
clf.fit(X_train_resampled, y_train_resampled)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [207]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.78      0.86      2887
           1       0.13      0.47      0.20       197

    accuracy                           0.76      3084
   macro avg       0.54      0.62      0.53      3084
weighted avg       0.90      0.76      0.82      3084



In [208]:
X.columns

Index(['AccidentArea', 'Sex', 'MaritalStatus', 'Fault', 'PolicyType',
       'VehicleCategory', 'Make', 'DriverRating', 'Days_Policy_Accident',
       'Days_Policy_Claim', 'PoliceReportFiled', 'WitnessPresent',
       'VehiclePrice', 'AgentType', 'AddressChange_Claim',
       'PastNumberOfClaims', 'NumberOfSuppliments', 'NumberOfCars', 'Year',
       'BasePolicy', 'Month', 'AgeOfVehicle', 'AgeOfPolicyHolder', 'RepNumber',
       'Deductible'],
      dtype='object')

In [210]:
import pickle

# Save the trained model as .sav
with open("fraud_model.sav", "wb") as file:
    pickle.dump(clf, file)


#### Finally , save the classifier model for future deployment.

In [186]:
from sklearn.preprocessing import LabelEncoder
import pickle

# Initialize a dictionary to store the encoders
encoders = {}

# Example columns for demonstration; replace with your actual DataFrame and columns
df_categorical = df[['AccidentArea', 'Sex', 'MaritalStatus', 'Fault', 'PolicyType', 'VehicleCategory', 'Make','DriverRating',
                     'Days_Policy_Accident', 'Days_Policy_Claim', 'PoliceReportFiled', 'WitnessPresent','VehiclePrice',
                     'AgentType', 'AddressChange_Claim', 'PastNumberOfClaims','NumberOfSuppliments','NumberOfCars', 'Year',
                     'BasePolicy', 'Month', 'AgeOfVehicle','AgeOfPolicyHolder']].copy()

# Apply LabelEncoder to each categorical column and save the encoder
for column in df_categorical.columns:
    le = LabelEncoder()
    df_categorical[column] = le.fit_transform(df_categorical[column])
    encoders[column] = le

# Save the encoders to a file
filename = 'encoder.sav'
pickle.dump(encoders, open(filename, 'wb'))

print("Encoders saved successfully in 'encoder.sav'!")


Encoders saved successfully in 'encoder.sav'!


Optimized Recall Threshold: 0.000


['optimized_model.pkl']

In [218]:
import joblib
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import ADASYN
from sklearn.metrics import precision_recall_curve

# Load and split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply ADASYN for balancing data
adasyn = ADASYN(random_state=42)
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train, y_train)

# Standardize numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Train Logistic Regression Model
clf = LogisticRegression(solver='liblinear', class_weight='balanced', random_state=42)
clf.fit(X_train_scaled, y_train_resampled)

# Compute Precision-Recall Curve for Threshold Optimization
y_scores = clf.predict_proba(X_test_scaled)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_scores)

# Compute Precision-Recall Curve
y_scores = clf.predict_proba(X_test_scaled)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_scores)

# Find optimal threshold balancing recall and precision
optimal_idx = np.argmax(recall - precision)  # Prioritizing recall
# Set a minimum threshold to prevent too many similar values
optimal_threshold = max(0.40, min(0.70, thresholds[optimal_idx]))

print(f"Optimized Recall Threshold: {optimal_threshold:.2f}")

# Store the threshold inside the model
clf.optimal_threshold = optimal_threshold
import pickle

# Save the trained model
with open("optimized_model.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)  # Save the model

# Save the scaler separately
with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)  # Save the scaler


Optimized Recall Threshold: 0.40
