# Install Packsges

In [94]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

In [95]:
# Import Data
df = pd.read_csv('/Users/samlai/Documents/Vehicle_insurance_claim_project/data/raw/fraud_oracle.csv')

In [96]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15420 entries, 0 to 15419
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Month                 15420 non-null  object
 1   WeekOfMonth           15420 non-null  int64 
 2   DayOfWeek             15420 non-null  object
 3   Make                  15420 non-null  object
 4   AccidentArea          15420 non-null  object
 5   DayOfWeekClaimed      15420 non-null  object
 6   MonthClaimed          15420 non-null  object
 7   WeekOfMonthClaimed    15420 non-null  int64 
 8   Sex                   15420 non-null  object
 9   MaritalStatus         15420 non-null  object
 10  Age                   15420 non-null  int64 
 11  Fault                 15420 non-null  object
 12  PolicyType            15420 non-null  object
 13  VehicleCategory       15420 non-null  object
 14  VehiclePrice          15420 non-null  object
 15  FraudFound_P          15420 non-null

In [97]:
df_cleaned = df.copy()

# Data Cleaning

## Drop useless column and change data type

In [98]:
# Drop ID Column
df_cleaned = df_cleaned.drop(columns = ['PolicyNumber'])

In [99]:
# RepNumber should be categorical
# WeekOfMonth and WeekOfMonthClaimed should be categorical
# Deductible should be categorical
# Year should be categorical
# DriverRating should be categorical
df_cleaned['RepNumber'] = df_cleaned['RepNumber'].astype('category')
df_cleaned['WeekOfMonth'] = df_cleaned['WeekOfMonth'].astype('category')
df_cleaned['WeekOfMonthClaimed'] = df_cleaned['WeekOfMonthClaimed'].astype('category')
df_cleaned['Deductible'] = df_cleaned['Deductible'].astype('category')
df_cleaned['Year'] = df_cleaned['Year'].astype('category')
df_cleaned['DriverRating'] = df_cleaned['DriverRating'].astype('category')

## Encode Categorical Data

### Encode Binary Variables (AccidentArea, Sex, Fault, PoliceReportFiled, WitnessPresent, AgentType)

In [100]:
from sklearn.preprocessing import LabelEncoder

binary_columns = [col for col in df_cleaned.columns if len(df_cleaned[col].unique()) == 2 and
                  col != 'FraudFound_P']

le = LabelEncoder()
for col in binary_columns:
    df_cleaned[col] = le.fit_transform(df_cleaned[col])

### Encode Ordinal Categorical Variables

將有順序的類別變數Encode

In [101]:
for col in ['DriverRating', 'PastNumberOfClaims', 'AgeOfVehicle', 'NumberOfSuppliments']:
    print(f'Unique Values in {col}: {df_cleaned[col].unique()}')
    print('-'*100)


Unique Values in DriverRating: [1, 4, 3, 2]
Categories (4, int64): [1, 2, 3, 4]
----------------------------------------------------------------------------------------------------
Unique Values in PastNumberOfClaims: ['none' '1' '2 to 4' 'more than 4']
----------------------------------------------------------------------------------------------------
Unique Values in AgeOfVehicle: ['3 years' '6 years' '7 years' 'more than 7' '5 years' 'new' '4 years'
 '2 years']
----------------------------------------------------------------------------------------------------
Unique Values in NumberOfSuppliments: ['none' 'more than 5' '3 to 5' '1 to 2']
----------------------------------------------------------------------------------------------------


In [102]:
# DriverRating is already numerical, don't need to encode
PastNumberClaims_label = {'none': 0, '1': 1, '2 to 4': 2, 'more than 4': 3}
AgeOfVehicle_label = {'new': 0, '2 years': 1, '3 years': 2, '4 years': 3, '5 years': 4,
                      '6 years': 5, '7 years': 6, 'more than 7': 7}
NumberOfSuppliments_label = {'none': 0, '1 to 2': 1, '3 to 5': 2, 'more than 5': 3}

df_cleaned['PastNumberOfClaims'] = df_cleaned['PastNumberOfClaims'].map(PastNumberClaims_label)
df_cleaned['AgeOfVehicle'] = df_cleaned['AgeOfVehicle'].map(AgeOfVehicle_label)
df_cleaned['NumberOfSuppliments'] = df_cleaned['NumberOfSuppliments'].map(NumberOfSuppliments_label)

### Encode Nominal Categorical Variables

EDA中發現VehiclePrice, Deductible, Age of Policy Holder, Number of Cars, AddressChangeClaim與與詐欺率之間不是簡單的線性關係，因此用One-Hot encoding較適合（可以獨立學習每個類別）

In [103]:
# Encode Nominal Variables
df_cleaned_v2 = pd.get_dummies(df_cleaned, drop_first = True)
df_cleaned_v2.shape

(15420, 139)

### Drop similar features first, then encode Nominal Categorical Variables

In [104]:
df_cleaned_v3 = df_cleaned.drop(columns = ['DayOfWeekClaimed', 'MonthClaimed', 'WeekOfMonthClaimed'])

In [105]:
df_cleaned_v3 = pd.get_dummies(df_cleaned_v3, drop_first = True)
df_cleaned_v3.shape

(15420, 116)

### Impute strange values in Age column

In [106]:
df_cleaned_v4 = df_cleaned_v3.copy()

In [107]:
len(df_cleaned_v4[df_cleaned_v4['Age'] == 0])

320

In [108]:
df_cleaned_v4['Age'] = df_cleaned_v4['Age'].apply(lambda x: np.nan if x == 0 else x)

# Initialize Random Forest Imputer
from sklearn.ensemble import RandomForestRegressor
rf_imputer = RandomForestRegressor(n_estimators=100, random_state=42)

# Get features for imputation (exclude Age column)
features = df_cleaned_v4.drop('Age', axis=1).columns
X = df_cleaned_v4[features]
y = df_cleaned_v4['Age']

# Fit imputer on non-null values
mask_not_null = ~df_cleaned_v4['Age'].isna()
rf_imputer.fit(X[mask_not_null], y[mask_not_null])

# Predict missing values
mask_null = df_cleaned_v4['Age'].isna()
df_cleaned_v4.loc[mask_null, 'Age'] = rf_imputer.predict(X[mask_null])

### Use Variance Threshold to select features

In [111]:
df_cleaned_v5 = df_cleaned_v4.copy()

In [113]:
from sklearn.feature_selection import VarianceThreshold

X = df_cleaned_v5.drop(columns = 'FraudFound_P')

selector = VarianceThreshold(threshold = 0.01)
X_selected = selector.fit_transform(X)
X_selected.shape

(15420, 86)

In [120]:
# Get selected features
selected_features = X.columns[selector.get_support()]

# create a dataframe only with selected features
df_cleaned_v5 = df_cleaned_v5[selected_features]

# Add back target variable
df_cleaned_v5['FraudFound_P'] = df_cleaned_v4['FraudFound_P']

### Use SMOTE first, then use Variancethreshold

In [124]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
X = df_cleaned_v5.drop(columns = 'FraudFound_P')
y = df_cleaned_v5['FraudFound_P']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
y_train_smote.value_counts()

FraudFound_P
0    11598
1    11598
Name: count, dtype: int64

In [125]:
from sklearn.feature_selection import VarianceThreshold

selector = VarianceThreshold(threshold = 0.01)
X_selected = selector.fit_transform(X_train_smote)
X_selected.shape

(23196, 84)

In [126]:
# Get selected features
selected_features = X_train_smote.columns[selector.get_support()]

# create a dataframe only with selected features
X_train_smote_selected = X_train_smote[selected_features]
X_test_selected = X_test[selected_features]

# Add back y_train_smote and y_test
df_train_v6 = X_train_smote_selected.copy()
df_train_v6['FraudFound_P'] = y_train_smote

df_test_v6 = X_test_selected.copy()
df_test_v6['FraudFound_P'] = y_test

# Export Datasets

In [84]:
# df_cleaned_v2 --> without any feature engineering, without dropping any columns
df_cleaned_v2.to_csv('../data/processed/data_cleaned_v2.csv', index = False)

In [89]:
#df_cleaned_v3 --> without any feature engineering, but drop similar features
df_cleaned_v3.to_csv('../data/processed/data_cleaned_v3.csv', index = False)

In [110]:
#df_cleaned_v4 --> without any feature engineering, but drop similar features and correct strange values in Age
df_cleaned_v4.to_csv('../data/processed/data_cleaned_v4.csv', index = False)

In [123]:
#df_cleaned_v5 --> without any feature engineering, from v4 but selected features by Variancethreshold
df_cleaned_v5.to_csv('../data/processed/data_cleaned_v5.csv', index = False)

In [129]:
# df_train_v6, df_test_v6 --> use smote first then Variancethreshold
df_train_v6.to_csv('../data/processed/df_train_v6.csv', index = False)
df_test_v6.to_csv('../data/processed/df_test_v6.csv', index = False)