In [19]:
# Dependencies: pandas, numpy, scikit-learn
# !pip install pandas
# !pip install numpy
# !pip install scikit-learn

In [20]:
import pandas as pd
import numpy as np

from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from sklearn.impute import KNNImputer

In [21]:
# get train data into a DataFrame indexed by claim_number
dataset = pd.read_csv("train_2023.csv", index_col="claim_number")
dataset.head()

Unnamed: 0_level_0,age_of_driver,gender,marital_status,safty_rating,annual_income,high_education_ind,address_change_ind,living_status,zip_code,claim_date,...,liab_prct,channel,policy_report_filed_ind,claim_est_payout,age_of_vehicle,vehicle_category,vehicle_price,vehicle_color,vehicle_weight,fraud
claim_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,50,F,1.0,66,39117,1,0,Own,50051,1/2/2016,...,18,Broker,0,5464.903723,3.0,Large,16786.28845,blue,34183.43639,0
1,47,M,1.0,78,38498,1,0,Own,50012,12/28/2015,...,81,Broker,0,5448.155974,4.0,Large,20793.29881,black,14528.37973,0
2,28,M,0.0,76,33343,0,1,Rent,20158,2/26/2016,...,94,Broker,1,1858.971699,4.0,Compact,6729.47649,blue,7182.832636,0
3,36,M,1.0,56,35832,1,0,Own,50054,9/20/2015,...,95,Phone,0,4040.932844,5.0,Compact,24914.26785,gray,47891.78879,0
4,60,F,1.0,79,40948,1,1,Rent,80010,4/14/2015,...,53,Phone,1,4876.606138,6.0,Medium,17392.41997,black,7546.494014,0


In [22]:
# output amount of rows that have missing values
print("Number of rows with missing values: ", dataset.isnull().sum().sum())
print("There are multiple ways to deal with this.")
print("KNN Imputer seemed to make the most sense.")

Number of rows with missing values:  177
There are multiple ways to deal with this.
KNN Imputer seemed to make the most sense.


In [23]:
dataset['claim_date'] = pd.to_datetime(dataset['claim_date'])
# print lowest and highest claim_date, working with a year of data
# this is important to note when we are extracting date features
print(dataset['claim_date'].min())
print(dataset['claim_date'].max())

2015-01-01 00:00:00
2016-12-31 00:00:00


In [36]:
def data_preprocessing_1(df):
    '''Preprocess the data that the model can use.'''

    # replace gender F with 0 and M with 1 in the DataFrame
    df['gender'] = np.where(df['gender'] == 'F', 0, 1)

    # replace living_status Rent with 0 and O with 1 in the DataFrame
    df['living_status'] = np.where(df['living_status'] == 'Rent', 0, 1)

    # add some relevant date features
    df['claim_date'] = pd.to_datetime(df['claim_date'])
    df['claim_day_of_week'] = df['claim_date'].dt.dayofweek
    df['month'] = df['claim_date'].dt.month
    df['day_of_month'] = df['claim_date'].dt.day
    df['day_of_year'] = df['claim_date'].dt.dayofyear
    df.drop('claim_date',axis=1, inplace=True)

    # replace accident_site with 0, 0.5, 1
    df['accident_site'] = np.where(df['accident_site'] == 'Local', 0,
                                        np.where(df['accident_site'] == 'Parking Lot', 0.5, 1))

    # replace channel with 0 if it is 'Broker'
    df['channel'] = np.where(df['channel'] == 'Broker', 0, 1)

    # replace vehicle_category ['Large', 'Compact', 'Medium'] with 0, 0.5, 1
    df['vehicle_category'] = np.where(df['vehicle_category'] == 'Large', 0,
                                        np.where(df['vehicle_category'] == 'Compact', 0.5, 1))

    # replace vehicle_color ['blue', 'black', 'gray', 'silver', 'red', 'white', 'other'] with 0, 1, 2, 3, 4, 5, 6
    df['vehicle_color'] = np.where(df['vehicle_color'] == 'blue', 0,
                                        np.where(df['vehicle_color'] == 'black', 1/6,
                                        np.where(df['vehicle_color'] == 'gray', 2/6,
                                        np.where(df['vehicle_color'] == 'silver', 3/6,
                                        np.where(df['vehicle_color'] == 'red', 4/6,
                                        np.where(df['vehicle_color'] == 'white', 5/6, 1))))))

    return df

In [25]:
# preprocess the data and check result
dataset = data_preprocessing_1(dataset)
dataset

Unnamed: 0_level_0,age_of_driver,gender,marital_status,safty_rating,annual_income,high_education_ind,address_change_ind,living_status,zip_code,claim_day_of_week,...,claim_est_payout,age_of_vehicle,vehicle_category,vehicle_price,vehicle_color,vehicle_weight,fraud,month,day_of_month,day_of_year
claim_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,50,0,1.0,66,39117,1,0,1,50051,5,...,5464.903723,3.0,0.0,16786.288450,0.000000,34183.436390,0,1,2,2
1,47,1,1.0,78,38498,1,0,1,50012,0,...,5448.155974,4.0,0.0,20793.298810,0.166667,14528.379730,0,12,28,362
2,28,1,0.0,76,33343,0,1,0,20158,4,...,1858.971699,4.0,0.5,6729.476490,0.000000,7182.832636,0,2,26,57
3,36,1,1.0,56,35832,1,0,1,50054,6,...,4040.932844,5.0,0.5,24914.267850,0.333333,47891.788790,0,9,20,263
4,60,0,1.0,79,40948,1,1,0,80010,1,...,4876.606138,6.0,1.0,17392.419970,0.166667,7546.494014,0,4,14,104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18995,53,1,0.0,81,39711,1,1,0,85070,0,...,9528.853233,7.0,0.5,8285.507871,0.166667,19578.388760,0,2,16,47
18996,39,0,1.0,76,36644,0,1,0,80020,1,...,3895.307825,4.0,1.0,11536.180460,0.500000,17599.182810,0,4,28,118
18997,46,1,1.0,91,38291,0,0,0,15019,3,...,2851.046390,6.0,0.5,13601.011440,0.333333,20711.012560,0,4,16,106
18998,40,1,0.0,87,36895,1,1,1,20142,4,...,3593.339941,5.0,0.5,38314.191020,0.666667,33472.728910,0,11,20,324


In [26]:
# ouput the number of frauds and non-frauds. Reveals a class imbalance.
print(dataset['fraud'].value_counts())

0    16062
1     2938
Name: fraud, dtype: int64


In [27]:
# set features to be all columns except fraud
features = dataset.columns.drop('fraud')

train_data, val_dataset = train_test_split(dataset, test_size=0.1, shuffle=True, random_state=1) #0.1 works best



# Initialize KNNImputer with desired number of neighbors
knn_imputer_t = KNNImputer(n_neighbors=100)

# store columns before losing it with imputer
columns = train_data.columns

# Apply KNN imputation to fill in missing values
train_d  = knn_imputer_t.fit_transform(train_data.dropna())
train_data = knn_imputer_t.transform(train_data)    
train_data = pd.DataFrame(train_data,columns=columns)



# separate fraud and non-fraud cases
fraud = train_data[train_data['fraud'] == 1]
non_fraud = train_data[train_data['fraud'] == 0]

# calculate fraction of fraud cases in training set to use for downsampling
# this can be used to downsample non-fraud cases to match the number of fraud cases
# ultimately did not settle with this number from various experiments, although
# it might be worth looking into again
fraud_ratio = len(fraud) / len(non_fraud)

# downsample non-fraud cases for training set
non_fraud = non_fraud.sample(frac=0.2, random_state=1) # 0.2 works ~best, close to actual ratio

# concatenate fraud and downsampled non-fraud cases for training set
train_dataset = pd.concat([fraud, non_fraud])

# y is the fraud column
y_train = train_dataset['fraud']

# X is all columns except fraud
X_train = train_dataset[features]

# X_test for validation set
x_val = val_dataset[features]
y_val = val_dataset['fraud']

In [28]:
# ouput the number of frauds and non-frauds, now more balanced
print(y_train.value_counts())

0.0    2891
1.0    2646
Name: fraud, dtype: int64


In [29]:
# check X_train before training
X_train.head()

Unnamed: 0,age_of_driver,gender,marital_status,safty_rating,annual_income,high_education_ind,address_change_ind,living_status,zip_code,claim_day_of_week,...,policy_report_filed_ind,claim_est_payout,age_of_vehicle,vehicle_category,vehicle_price,vehicle_color,vehicle_weight,month,day_of_month,day_of_year
5,31.0,0.0,0.0,80.0,34350.0,1.0,0.0,0.0,80044.0,0.0,...,0.0,2357.290454,1.0,0.0,15115.07301,0.166667,28088.89158,1.0,26.0,26.0
18,24.0,0.0,1.0,40.0,31770.0,1.0,1.0,0.0,50014.0,2.0,...,1.0,5506.981487,4.0,0.5,17717.84202,0.166667,19870.61261,11.0,25.0,329.0
21,40.0,0.0,1.0,84.0,36889.0,0.0,0.0,0.0,85055.0,0.0,...,1.0,3507.332992,4.0,0.0,10974.30868,0.0,8388.157059,2.0,16.0,47.0
25,44.0,1.0,1.0,76.0,37848.0,1.0,1.0,1.0,80021.0,5.0,...,1.0,10016.14999,3.0,0.0,24303.34104,0.166667,7489.957099,2.0,21.0,52.0
28,46.0,0.0,0.0,59.0,38302.0,1.0,0.0,0.0,20138.0,4.0,...,1.0,1892.754879,7.0,0.5,60300.1246,0.666667,22191.46218,11.0,18.0,323.0


In [30]:
# KNN Imputing for validation data
# Initialize KNNImputer with desired number of neighbors
knn_imputer = KNNImputer(n_neighbors=100)

# store columns before losing it with imputer
columns = x_val.columns

# Apply KNN imputation to fill in missing values
train_d  = knn_imputer.fit_transform(X_train)
x_val = knn_imputer.transform(x_val)    
x_val = pd.DataFrame(x_val,columns=columns)


In [31]:
# ouput the number of frauds and non-frauds
print(val_dataset['fraud'].value_counts())

0    1608
1     292
Name: fraud, dtype: int64


In [32]:
# Establish base model. Hyperparameters were tuned both with keras tuner (when working with TFDF) and manually.
base_model = RandomForestClassifier(n_estimators=450, max_depth=9, random_state=1)

# Bagging. Significant increase in F1 score. Found best results with ~30 estimators.
bagging_model = BaggingClassifier(base_model, n_estimators=30, random_state=42)
bagging_model.fit(X_train, y_train)
bagging_predictions = bagging_model.predict(x_val)
bagging_f1_score = f1_score(y_val, bagging_predictions)

# # Boosting (did not help increase F1 score)
# boosting_model = AdaBoostClassifier(base_model, n_estimators=10, random_state=42)
# boosting_model.fit(X_train, y_train)
# boosting_predictions = boosting_model.predict(new)
# boosting_f1_score = f1_score(y_val, boosting_predictions)

print("Bagging F1 score:", bagging_f1_score)
# print("Boosting F1 score:", boosting_f1_score)

Bagging F1 score: 0.3722943722943723


In [37]:
# get test data into a DataFrame indexed by claim_number
# the format of the output file should be:
# claim_number	fraud
# 19000	0
# 19001	0
# 19002	0
# 19003	1

# import test_2023.csv
test = pd.read_csv('test_2023.csv')

# do data preprocessing on test data
test = data_preprocessing_1(test)



# Apply KNN imputation to fill in missing values
test_x = test[features]

# store columns before losing it with imputer
columns = test_x.columns

test_x = knn_imputer.transform(test_x)    
test_x = pd.DataFrame(test_x,columns=columns)



bagging_predictions_test = bagging_model.predict(test_x)
test['fraud'] = bagging_predictions_test.astype(int)


# output the prediction to a csv file, replacing 'prediction' with 'fraud'
test[['claim_number', 'fraud']].to_csv('predictions.csv', index=False)

[1. 0. 0. ... 1. 1. 1.]


In [44]:
feature_importances = np.mean([tree.feature_importances_ for tree in bagging_model.estimators_], axis=0)
importance_mapping = dict(zip(x_val.columns, feature_importances))
sorted_importances = sorted(importance_mapping.items(), key=lambda x: x[1], reverse=True)
for feature, importance in sorted_importances:
    print(f"{feature}: {importance:.5f}")

annual_income: 0.09335
claim_est_payout: 0.08133
vehicle_price: 0.07033
safty_rating: 0.06816
vehicle_weight: 0.06787
age_of_driver: 0.06696
zip_code: 0.06124
day_of_year: 0.05906
liab_prct: 0.05880
day_of_month: 0.04842
age_of_vehicle: 0.04386
past_num_of_claims: 0.04038
high_education_ind: 0.03778
month: 0.02847
claim_day_of_week: 0.02790
vehicle_color: 0.02756
witness_present_ind: 0.02321
address_change_ind: 0.01752
marital_status: 0.01513
vehicle_category: 0.01433
accident_site: 0.01394
gender: 0.00975
living_status: 0.00876
channel: 0.00822
policy_report_filed_ind: 0.00768
