## Notebook Setup

In [438]:
# Import Libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, roc_auc_score

import sys
sys.path.append('../01_src/features')
from custom_encoder import CustomLabelEncoder

In [439]:
# Read Data
vcl_data = pd.read_csv('../02_data/02_processed/fraud_oracle_processed.csv')
vcl_data.head()

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy,FraudFound_P
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,...,26 to 30,No,No,External,0,03: 1 year,3 to 4,1994,Liability,0
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,...,31 to 35,Yes,No,External,0,01: No change,1,1994,Collision,0
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,Male,Married,...,41 to 50,No,No,External,0,01: No change,1,1994,Collision,0
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,Male,Married,...,51 to 65,Yes,No,External,> 5,01: No change,1,1994,Liability,0
4,Jan,5,Monday,Honda,Urban,Tuesday,Feb,2,Female,Single,...,31 to 35,No,No,External,0,01: No change,1,1994,Collision,0


In [440]:
X = vcl_data[[*vcl_data.drop(columns = 'FraudFound_P').columns]]
y = vcl_data['FraudFound_P']

## Feature Engineering

### Label Encoding

In [441]:
vcl_data_enc = vcl_data.copy()

In [442]:
# Custom Encoding

label_mapping = {
    'Month': {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
              'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12
              },
    'DayOfWeek': {'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4,
                  'Friday': 5, 'Saturday': 6, 'Sunday': 7
                  },
    
    'DayOfWeekClaimed': {'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4,
                         'Friday': 5, 'Saturday': 6, 'Sunday': 7
                         },

    'MonthClaimed': {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
                     'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12
                     },
}

date_cols = ['Month', 'DayOfWeek', 'DayOfWeekClaimed', 'MonthClaimed']

for col in date_cols:
    try:
        enc = CustomLabelEncoder(label_mapping[col])
        vcl_data_enc[col] = enc.fit_transform(vcl_data[col])
    except:
        print(col)

In [443]:
# Label Encoder

to_encode = [*vcl_data_enc.select_dtypes('object').columns]

enc = LabelEncoder()

for col in to_encode:
    vcl_data_enc[col] = enc.fit_transform(vcl_data[col])

In [444]:
vcl_data_enc.head()

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy,FraudFound_P
0,12,5,3,6,1,2,1,1,0,2,...,3,0,0,0,0,2,2,1994,2,0
1,1,3,3,6,1,1,1,4,1,2,...,4,1,0,0,0,0,0,1994,1,0
2,10,5,5,6,1,4,11,2,1,1,...,6,0,0,0,0,0,0,1994,1,0
3,6,2,6,17,0,5,7,1,1,1,...,7,1,0,0,3,0,0,1994,2,0
4,1,5,1,6,1,2,2,2,0,2,...,4,0,0,0,0,0,0,1994,1,0


### Standard Scaler

In [445]:
# Standard Scaler
vcl_data_scl = vcl_data_enc.copy()

scl= StandardScaler()

for column in [[*vcl_data_scl.columns]]:
    vcl_data_scl[column] = scl.fit_transform(vcl_data_enc[column])

vcl_data_scl.head()

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy,FraudFound_P
0,1.614832,1.71747,-0.408247,-0.778914,0.339912,-0.569958,-1.545641,-1.345531,-2.317647,1.466379,...,-1.356083,-0.168969,-0.075329,-0.126009,-0.941044,1.952488,4.957764,-1.078734,1.231756,-0.252335
1,-1.552442,0.164156,-0.408247,-0.778914,0.339912,-1.257987,-1.545641,1.037235,0.431472,1.466379,...,-0.639417,5.918252,-0.075329,-0.126009,-0.941044,-0.26956,-0.257748,-1.078734,-0.046508,-0.252335
2,1.038964,1.71747,0.602229,-0.778914,0.339912,0.806099,1.340317,-0.551276,0.431472,-0.638452,...,0.793914,-0.168969,-0.075329,-0.126009,-0.941044,-0.26956,-0.257748,-1.078734,-0.046508,-0.252335
3,-0.112772,-0.612501,1.107467,1.303309,-2.941934,1.494127,0.185934,-1.345531,0.431472,-0.638452,...,1.51058,5.918252,-0.075329,-0.126009,1.460737,-0.26956,-0.257748,-1.078734,1.231756,-0.252335
4,-1.552442,1.71747,-1.418723,-0.778914,0.339912,-0.569958,-1.257045,-0.551276,-2.317647,1.466379,...,-0.639417,-0.168969,-0.075329,-0.126009,-0.941044,-0.26956,-0.257748,-1.078734,-0.046508,-0.252335


### Train Test Split

In [446]:
X_train, X_test, y_train, y_test = train_test_split(vcl_data_scl[X.columns.values], y, test_size = .3, random_state=42)

In [447]:
X_train.head()

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
9234,-0.976574,0.940813,0.602229,-0.778914,0.339912,1.494127,-0.968449,1.037235,0.431472,-0.638452,...,-0.433285,-0.639417,-0.168969,-0.075329,-0.126009,0.660143,-0.26956,-0.257748,0.166158,-0.046508
11819,-1.552442,1.71747,1.107467,-1.536086,0.339912,-1.257987,-1.545641,1.83149,0.431472,-0.638452,...,0.996379,0.077249,-0.168969,-0.075329,-0.126009,1.460737,-0.26956,-0.257748,1.411049,-1.324772
5830,0.175162,0.940813,-1.418723,0.546137,0.339912,0.11807,0.474529,-1.345531,0.431472,1.466379,...,0.281547,0.077249,-0.168969,-0.075329,-0.126009,-0.140451,-0.26956,-0.257748,-1.078734,-1.324772
9020,-0.68864,-1.389158,-1.418723,-0.211035,0.339912,0.806099,-0.679854,-1.345531,0.431472,-0.638452,...,-0.433285,-0.639417,-0.168969,-0.075329,-0.126009,0.660143,-0.26956,-0.257748,0.166158,-0.046508
12395,0.75103,0.164156,-0.408247,-1.536086,-2.941934,-0.569958,0.763125,0.24298,0.431472,-0.638452,...,0.996379,1.51058,-0.168969,-0.075329,-0.126009,-0.941044,-0.26956,-0.257748,1.411049,-0.046508


### Feature Selection

In [448]:
sk = SelectKBest(mutual_info_classif, k = 5)
feature_selected = sk.fit(X_train, y_train).get_support()
features = [*X_train.columns[feature_selected]]
features

['Fault', 'PolicyType', 'VehicleCategory', 'AgentType', 'BasePolicy']

### Modelling

#### LocalOUtlierFactor

In [449]:
from sklearn.neighbors import LocalOutlierFactor

In [450]:
lof = LocalOutlierFactor(novelty = True, contamination = .05)
lof.fit(X_train[features])
y_pred = lof.predict(X_test[features])



In [451]:
y_pred[y_pred == 1] = 0
y_pred[y_pred == -1] = 1

In [452]:
np.unique(y_pred, return_counts=True)

(array([0, 1]), array([4616,   10], dtype=int64))

In [453]:
print(accuracy_score(y_test, y_pred))
print(roc_auc_score(y_test, y_pred))

0.9360138348465197
0.4988479262672811


#### OneClassSVM

In [454]:
from sklearn.svm import OneClassSVM

In [479]:
clf = OneClassSVM()
clf.fit(X_train[features])

y_scores = clf.score_samples(X_test[features])

threshold = np.quantile(y_scores, 0.05)

y_pred = y_scores <= threshold

In [480]:
y_pred[y_pred == False] = 0
y_pred[y_pred == True] = 1

In [481]:
np.unique(y_pred, return_counts=True)

(array([False,  True]), array([4374,  252], dtype=int64))

In [482]:
print(accuracy_score(y_test, y_pred))
print(roc_auc_score(y_test, y_pred))

0.8906182447038478
0.5007830878798621
