In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler

### Importing dataset

In [2]:
df = pd.read_csv(r'C:\Users\sidde\OneDrive\Documents\Dissertation\Final\Data_source\Insurance data.csv')


df.head()

Unnamed: 0,claim_number,age_of_driver,gender,marital_status,safty_rating,annual_income,high_education_ind,address_change_ind,living_status,zip_code,...,liab_prct,channel,policy_report_filed_ind,claim_est_payout,age_of_vehicle,vehicle_category,vehicle_price,vehicle_color,vehicle_weight,fraud
0,1,46,M,1.0,85,38301,1,1,Rent,80006,...,74,Broker,0,7530.940993,9.0,Compact,12885.45235,white,16161.33381,0
1,3,21,F,0.0,75,30445,0,1,Rent,15021,...,79,Online,0,2966.024895,4.0,Large,29429.45218,white,28691.96422,0
2,4,49,F,0.0,87,38923,0,1,Own,20158,...,0,Broker,0,6283.888333,3.0,Compact,21701.18195,white,22090.94758,1
3,5,58,F,1.0,58,40605,1,0,Own,15024,...,99,Broker,1,6169.747994,4.0,Medium,13198.27344,other,38329.58106,1
4,6,38,M,1.0,95,36380,1,0,Rent,50034,...,7,Broker,0,4541.38715,7.0,Medium,38060.21122,gray,25876.56319,0


### EDA


#### 1. Handling missing values



In [3]:
df.isna().sum()

claim_number                 0
age_of_driver                0
gender                       0
marital_status               5
safty_rating                 0
annual_income                0
high_education_ind           0
address_change_ind           0
living_status                0
zip_code                     0
claim_date                   0
claim_day_of_week            0
accident_site                0
past_num_of_claims           0
witness_present_ind        132
liab_prct                    0
channel                      0
policy_report_filed_ind      0
claim_est_payout            17
age_of_vehicle               8
vehicle_category             0
vehicle_price                0
vehicle_color                0
vehicle_weight               0
fraud                        0
dtype: int64

In [4]:
# Replace NaN values with 0's for marital_status,witness_present_ind as it has binary values
df['marital_status'].fillna(0, inplace=True)
df['witness_present_ind'].fillna(0, inplace=True)

# Replace NaN values with mean values for claim_est_payout,age_of_vehicle as it has continuous values
df['claim_est_payout'].fillna(df['claim_est_payout'].median(), inplace=True)
df['age_of_vehicle'].fillna(df['age_of_vehicle'].median(), inplace=True)



#### Cleaning independent variables

In [5]:
median_age = df['age_of_driver'].median()
# Replace ages greater than 100 with the median age
df['age_of_driver'] = np.where(df['age_of_driver'] > 100, median_age, df['age_of_driver'])


median_income = df['annual_income'].median()
# Replace ages greater than 100 with the median age
df['annual_income'] = np.where(df['annual_income'] < 0, median_income, df['annual_income'])

In [6]:
df['annual_income']

0        38301.0
1        30445.0
2        38923.0
3        40605.0
4        36380.0
          ...   
17993    42338.0
17994    35579.0
17995    32953.0
17996    39519.0
17997    41126.0
Name: annual_income, Length: 17998, dtype: float64

#### 2. Cleaning target variable

In [7]:
# Checking the target variable
df["fraud"].value_counts()


fraud
 0    15179
 1     2816
-1        3
Name: count, dtype: int64

In [8]:
# Target variable fraud has -1,0,1 values where it can have only 0's and 1's. So dropping the outliers
df = df[df["fraud"] != -1]

#### 3. Handling temporal values


In [9]:
# Convert the 'Date' column to a datetime object
df['claim_date'] = pd.to_datetime(df['claim_date'])
# Extract the year and create a new 'Year' column
df['Claim_Year'] = df['claim_date'].dt.year

#### 4.Visualizing the target variable

In [None]:
ax = df['fraud'].value_counts().plot.bar(color='skyblue', edgecolor='black')
# Adding data labels to the bars
for p in ax.patches:
    ax.annotate(str(p.get_height()), (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')
# Display the plot
plt.title('Fraud Distribution')  
plt.show()

In [None]:
age_bins = [18, 30, 40, 50, 60, 70, 100, 120]  # Define your age bins
df['age_group'] = pd.cut(df['age_of_driver'], bins=age_bins, right=True, labels=[f'{age_bins[i-1]}-{age_bins[i]}' for i in range(1, len(age_bins))])

# Count the number of fraud cases in each age group
fraud_counts = df.groupby('age_group')['fraud'].sum()

# Bar chart
fraud_counts.plot(kind='bar', color='red')
plt.title('Fraud Cases Distribution by Age Group')
plt.xlabel('Age Group')
plt.ylabel('Number of Fraud Cases')

# Display the plot
plt.show()

# Drop the 'age_group' column if you no longer need it
df.drop('age_group', axis=1, inplace=True)

In [None]:
age_bins = [3,6,9,12,15,18,21]  # Define your age bins
df['Vechicle_age_group'] = pd.cut(df['age_of_vehicle'], bins=age_bins, right=True, labels=[f'{age_bins[i-1]}-{age_bins[i]}' for i in range(1, len(age_bins))])

# Count the number of fraud cases in each age group
fraud_counts = df.groupby('Vechicle_age_group')['fraud'].sum()

# Bar chart
fraud_counts.plot(kind='bar', color='blue')
plt.title('Fraud Cases Distribution by Vehicle Age Group')
plt.xlabel('Vehicle Age Group')
plt.ylabel('Number of Fraud Cases')

# Display the plot
plt.show()

# Drop the 'age_group' column if you no longer need it
df.drop('Vechicle_age_group', axis=1, inplace=True)

#### 5. Visualizing insured's discrete/continuous data

In [None]:
# Create a histogram or density plot
for col in ["age_of_driver", "safty_rating"]:
    sns.histplot(data=df, x=col, hue='fraud', bins=50, kde=True)
    plt.title(f'{col}')
    plt.show()

#### 6. Visualizing insured's categorical data

In [None]:
columns_of_interest = ["gender", "marital_status", "high_education_ind", "address_change_ind", "living_status"]

for col2 in columns_of_interest:
    sns.set_style('whitegrid')
    plt.figure(figsize=(8, 5))  # Adjust the figure size as needed
    
    # Create count plot
    ax = sns.countplot(x=col2, hue='fraud', data=df)
    
    # Add data labels to the bars
    for p in ax.patches:
        ax.annotate(f'{p.get_height()}', (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha='center', va='center', xytext=(0, 10), textcoords='offset points')
    
    plt.title(f'{col2}')
    plt.show()

*Marital Status* and *high_education_ind* has almost same claim count pattern. In Feature engineering, we shall drop either one if required.

#### 7. Year vs Claims

In [None]:
ax = sns.countplot(x='Claim_Year', hue='fraud', data=df, palette='RdYlGn')

# Add data labels to the bars
for p in ax.patches:
    height = p.get_height()
    ax.annotate(f'{height}', (p.get_x() + p.get_width() / 2., height),
                ha='center', va='center', xytext=(0, 10), textcoords='offset points')

# Display the count plot
plt.title('Claim_Year by Claims')
plt.show()

In 2016, the fraud count seemed to be increased in count compared to 2015

#### 8. Accident site vs Claims

In [None]:
site_counts = df['accident_site'].value_counts()

# Create a pie chart
fig, ax = plt.subplots()
ax.pie(site_counts, labels=site_counts.index, autopct='%1.1f%%', startangle=90, colors=['skyblue', 'lightcoral', 'lightgreen'])

# Add data labels to the wedges
for i, (label, count) in enumerate(zip(site_counts.index, site_counts)):
    angle = sum(site_counts[:i]) + count / 2
    x = 0.5 * (1.2 * (angle > 180) - 1)
    y = 1.2 * angle % 180
    ax.annotate(f'{count}', (x, y), xytext=(1.35 * x, 1.35 * y), ha='center', va='center')

# Set aspect ratio to be equal, ensuring the pie chart is circular
ax.axis('equal')

# Display the pie chart
plt.title('Accident Site Distribution')
plt.show()

In [None]:
custom_palette = ['skyblue', 'lightcoral']
sns.countplot(x='accident_site', hue='fraud', data=df, palette=custom_palette)

Accident site LOCAL has the higesht number of claims and highest fraud reported

In [None]:
custom_palette = ['skyblue', 'lightcoral']
sns.countplot(x='channel', hue='fraud', data=df, palette=custom_palette)

In [None]:

# Violin plot for 'accident_site' and 'claim_est_payout'
sns.violinplot(x='accident_site', y='claim_est_payout', hue='fraud', data=df, split=True)
plt.title('Accident Site vs. Claim Est Payout')
plt.show()


#### 9. Claim Estimation Payout for Fraud and Non-Fraud Cases 


In [None]:
# Box plot of claim_est_payout
plt.figure(figsize=(10, 6))
ax = sns.boxplot(x='fraud', y='claim_est_payout', data=df)
plt.title('Boxplot of Claim Estimation Payout for Fraud and Non-Fraud Cases')

# Adding data labels
for i, box in enumerate(ax.artists):
    box_value = df[df['fraud'] == i]['claim_est_payout']
    ax.text(i, box_value.median(), f'Median: {box_value.median():.2f}', verticalalignment='center', 
            fontdict={'color': 'white', 'weight': 'bold'})

plt.show()

#### 10. Living status vs Claim Est Payout

In [None]:
# Violin plot for 'living_status' and 'claim_est_payout'
sns.violinplot(x='living_status', y='claim_est_payout', hue='fraud', data=df, split=True)
plt.title('Living Status vs. Claim Est Payout')
plt.show()

In [None]:
# Relationship between 'age_of_driver' and 'claim_est_payout'
sns.scatterplot(x='age_of_driver', y='claim_est_payout', hue='fraud', data=df)
plt.title('Age of Driver vs. Claim Est Payout')
plt.show()

# Relationship between 'annual_income' and 'claim_est_payout'
sns.scatterplot(x='annual_income', y='claim_est_payout', hue='fraud', data=df)
plt.title('Annual Income vs. Claim Est Payout')
plt.show()



### Feature Engineering

In [10]:
#Zip code & claim_date is dropped as its not useful for classification
df = df.drop('claim_date', axis=1)
#Instead of claim_date, Claim_Year will be suitable.
df = df.drop('zip_code', axis=1)

df = df.drop('claim_number', axis=1)
#Claim Number is a unique column, hence removed.

#### DEEP LEARNING

In [11]:
numerical_features=[feature for feature in df.columns if df[feature].dtype!='O']

In [12]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# Extract numerical columns (you might need to adapt this based on your DataFrame structure)
numerical_features = df.select_dtypes(include=['float64', 'int64']).columns

# Apply Min-Max scaling to the numerical columns
df[numerical_features] = scaler.fit_transform(df[numerical_features])

In [13]:
categorical_features=[feature for feature in df.columns if df[feature].dtype=='O']

for feature in categorical_features[:]:
    print(feature,":",len(df[feature].unique()),'labels')

gender : 2 labels
living_status : 2 labels
claim_day_of_week : 7 labels
accident_site : 3 labels
channel : 3 labels
vehicle_category : 3 labels
vehicle_color : 7 labels


In [14]:
dictionary={'Monday':1,'Tuesday':2,'Wednesday':3,'Thursday':4,'Friday':5,'Saturday':6,'Sunday':7}

#Encodes all the day names to assigned numbers
df['claim_day_of_week']=df['claim_day_of_week'].map(dictionary)

In [15]:
Map = df['vehicle_color'].value_counts().to_dict()

#Mapping the respective value counts to the colors
df['vehicle_color']=df['vehicle_color'].map(Map)


In [16]:
# Create dummy columns
accident_site_dummies = pd.get_dummies(df['accident_site'], drop_first=True)
channel_dummies = pd.get_dummies(df['channel'], drop_first=True)
vehicle_category_dummies = pd.get_dummies(df['vehicle_category'], drop_first=True)

# Concatenate the dummy columns with the original DataFrame
df = pd.concat([df, accident_site_dummies], axis=1)
df = pd.concat([df, channel_dummies], axis=1)
df = pd.concat([df, vehicle_category_dummies], axis=1)

# Drop the original 'accident_site' column
df = df.drop('accident_site', axis=1)
df = df.drop('channel', axis=1)
df = df.drop('vehicle_category', axis=1)

In [17]:
df['gender'] = df['gender'].map({'M': 1, 'F': 0})
df['living_status'] = df['living_status'].map({'Rent': 1, 'Own': 0})


In [18]:
from sklearn.model_selection import train_test_split

# Assume 'df' is your DataFrame with features and target variable
# X should contain the features (independent variables), and y should contain the target variable (dependent variable)

# Assuming 'fraud' is the target variable, and the rest are features
X = df.drop('fraud', axis=1)
y = df['fraud']

# Split the data into training and testing sets
# Adjust the 'test_size' parameter as needed (e.g., test_size=0.3 for an 70/30 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Display the shapes of the resulting sets
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)


Training set shape: (12596, 25) (12596,)
Testing set shape: (5399, 25) (5399,)


In [75]:
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import ADASYN

# Standardize the features (optional but recommended for neural networks)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Use SMOTE for handling imbalanced data
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)


# Build the neural network model
model = Sequential()
model.add(Dense(32, input_dim=X_resampled.shape[1], activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(128, activation='relu'))  # Third hidden layer
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model on the resampled data
model.fit(X_resampled, y_resampled, epochs=120, batch_size=256, validation_data=(X_test, y_test))

# Make predictions on the test set
y_pred = (model.predict(X_test) > 0.5).astype(int)

# Evaluate the model



Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120
Epoch 25/120
Epoch 26/120
Epoch 27/120
Epoch 28/120
Epoch 29/120
Epoch 30/120
Epoch 31/120
Epoch 32/120
Epoch 33/120
Epoch 34/120
Epoch 35/120
Epoch 36/120
Epoch 37/120
Epoch 38/120
Epoch 39/120
Epoch 40/120
Epoch 41/120
Epoch 42/120
Epoch 43/120
Epoch 44/120
Epoch 45/120
Epoch 46/120
Epoch 47/120
Epoch 48/120
Epoch 49/120
Epoch 50/120
Epoch 51/120
Epoch 52/120
Epoch 53/120
Epoch 54/120
Epoch 55/120
Epoch 56/120
Epoch 57/120
Epoch 58/120
Epoch 59/120
Epoch 60/120
Epoch 61/120
Epoch 62/120
Epoch 63/120
Epoch 64/120
Epoch 65/120
Epoch 66/120
Epoch 67/120
Epoch 68/120
Epoch 69/120
Epoch 70/120
Epoch 71/120
Epoch 72/120
Epoch 73/120
Epoch 74/120
Epoch 75/120
Epoch 76/120
Epoch 77/120
Epoch 78

In [76]:
print("Confusion Matrix new1:")
print(confusion_matrix(y_test, y_pred))#3 layers 120 epoh bz 256

Confusion Matrix new1:
[[3783  805]
 [ 568  243]]


In [74]:
print("Confusion Matrix new1:")
print(confusion_matrix(y_test, y_pred))#3 layers 10 epoh bz 8

Confusion Matrix new1:
[[3276 1312]
 [ 454  357]]


In [72]:
print("Confusion Matrix new1:")
print(confusion_matrix(y_test, y_pred))#3 layers 120 epoh bz 36

Confusion Matrix new1:
[[3663  925]
 [ 558  253]]


In [70]:
print("Confusion Matrix new1:")
print(confusion_matrix(y_test, y_pred))#3 layers 120 epoh bz 128

Confusion Matrix new1:
[[3732  856]
 [ 573  238]]


In [64]:
print("Confusion Matrix new1:")
print(confusion_matrix(y_test, y_pred))#3 layers 100 epoh bz 256

Confusion Matrix new1:
[[3752  836]
 [ 569  242]]


In [62]:
print("Confusion Matrix new1:")
print(confusion_matrix(y_test, y_pred))#3 layers 90 epoh bz 256

Confusion Matrix new1:
[[3699  889]
 [ 580  231]]


In [60]:
print("Confusion Matrix new1:")
print(confusion_matrix(y_test, y_pred))#3 layers 70 epoh bz 256

Confusion Matrix new1:
[[3794  794]
 [ 580  231]]


In [56]:
print("Confusion Matrix new1:")
print(confusion_matrix(y_test, y_pred))#3 layers 50 epoh bs 256

Confusion Matrix new1:
[[3903  685]
 [ 607  204]]


In [52]:
print("Confusion Matrix new1:")
print(confusion_matrix(y_test, y_pred))#3 layers 1000 epch bs 256

Confusion Matrix new1:
[[3804  784]
 [ 571  240]]
