In [1]:
import pandas as pd
from keras.layers import Dense
from keras.models import Sequential

In [2]:
df = pd.read_csv('featuure_engineered.csv') 
df.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,insured_sex,...,policy_bind_date_year,policy_bind_date_cosine_month,policy_bind_date_sine_month,policy_bind_date_cosine_day,policy_bind_date_sine_day,incident_date_year,incident_date_cosine_month,incident_date_sine_month,incident_date_cosine_day,incident_date_sine_day
0,328,48,521585,OH,250/500,1000,1406.91,0,466132,MALE,...,2014,0.5,-0.8660254,-0.8660254,0.5,2015,0.866025,0.5,0.8660254,0.5
1,228,42,342868,IN,250/500,2000,1197.22,5000000,468176,MALE,...,2006,-1.0,1.224647e-16,5.510911e-16,1.0,2015,0.866025,0.5,-4.286264e-16,-1.0
2,134,29,687698,OH,100/300,2000,1413.14,5000000,430632,FEMALE,...,2000,-1.83697e-16,-1.0,-1.0,1.224647e-16,2015,0.5,0.866025,0.5,-0.866025
3,256,41,227811,IL,250/500,2000,1415.74,6000000,608117,FEMALE,...,1990,-0.8660254,0.5,0.8660254,0.5,2015,0.866025,0.5,0.5,-0.866025
4,228,44,367455,IL,500/1000,1000,1583.91,6000000,610706,MALE,...,2014,-1.0,1.224647e-16,-1.0,1.224647e-16,2015,0.5,0.866025,-0.8660254,0.5


In [3]:
from utils.modelling import get_preprocessor

In [4]:
preprocessor = get_preprocessor(df, 'fraud_reported')

## Spliting the data

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

In [21]:
X = df.drop('fraud_reported', axis=1)
target = df['fraud_reported']

# encode y to 0 and 1
lb = LabelBinarizer()
y = lb.fit_transform(target)

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=42,
                                                    test_size=.2)

In [None]:
# Check the labels representation
lb.classes_

array(['N', 'Y'], dtype='<U1')

`0 represents N`

`1 represents Y`

In [22]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                  random_state=42,
                                                  test_size=.2)

## Preprocessing

In [24]:
# preprocessing features
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)
X_test_preprocessed = preprocessor.transform(X_test)

In [25]:
# Getting number of features
n_features = preprocessor.get_feature_names_out().size
n_features

143

# Baseline network

In [56]:
base_model = Sequential()

# hidden layer
base_model.add(Dense(100, activation='relu', input_shape=(n_features,)))

# output layer
base_model.add(Dense(1, activation='sigmoid'))

# compiling model
base_model.compile(optimizer='SGD',
                   loss='mse',
                   metrics=['accuracy'])

# fitting the model
base_model.fit(X_train_preprocessed,
               y_train,
               epochs=100,
               batch_size=32,
               validation_data=(X_val_preprocessed, y_val))

Epoch 1/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.5658 - loss: 0.2488 - val_accuracy: 0.6375 - val_loss: 0.2319
Epoch 2/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6566 - loss: 0.2237 - val_accuracy: 0.6562 - val_loss: 0.2203
Epoch 3/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7069 - loss: 0.2081 - val_accuracy: 0.7000 - val_loss: 0.2130
Epoch 4/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7215 - loss: 0.2017 - val_accuracy: 0.7125 - val_loss: 0.2085
Epoch 5/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7576 - loss: 0.1870 - val_accuracy: 0.7063 - val_loss: 0.2056
Epoch 6/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7637 - loss: 0.1869 - val_accuracy: 0.7188 - val_loss: 0.2037
Epoch 7/100
[1m20/20[0m [32m━━━

<keras.src.callbacks.history.History at 0x1ef67739f10>

In [57]:
base_model.evaluate(X_val_preprocessed, y_val)

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6708 - loss: 0.2240 


[0.1858249455690384, 0.731249988079071]

The baseline model has  low accuracy of about 73%

### 2. Adam optimizer

In [54]:
adam_model = Sequential()

# hidden layer
adam_model.add(Dense(100, activation='relu', input_shape=(n_features,)))

# output layer
adam_model.add(Dense(1, activation='sigmoid'))

# compiling model
adam_model.compile(optimizer='adam',
                   loss='binary_crossentropy',
                   metrics=['accuracy'])

# fitting the model
adam_model.fit(X_train_preprocessed,
               y_train,
               epochs=100,
               batch_size=32,
               validation_data=(X_val_preprocessed, y_val))

Epoch 1/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 39ms/step - accuracy: 0.6331 - loss: 0.6512 - val_accuracy: 0.7250 - val_loss: 0.5629
Epoch 2/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7782 - loss: 0.4956 - val_accuracy: 0.7250 - val_loss: 0.5547
Epoch 3/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7651 - loss: 0.4773 - val_accuracy: 0.7250 - val_loss: 0.5345
Epoch 4/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7762 - loss: 0.4289 - val_accuracy: 0.7375 - val_loss: 0.5262
Epoch 5/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7763 - loss: 0.4041 - val_accuracy: 0.7437 - val_loss: 0.5178
Epoch 6/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7992 - loss: 0.3735 - val_accuracy: 0.7437 - val_loss: 0.5113
Epoch 7/100
[1m20/20[0m [32m━━

<keras.src.callbacks.history.History at 0x1ef69c2a1c0>

In [55]:
adam_model.evaluate(X_val_preprocessed, y_val)

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7531 - loss: 0.9912 


[0.8261071443557739, 0.7749999761581421]

some improvement is evident

### 3. adding more layers

In [76]:
deep_model = Sequential()

# first hidden layer
deep_model.add(Dense(64, activation='relu', input_shape=(n_features,)))
# other hidden layer
deep_model.add(Dense(32, activation='relu'))
deep_model.add(Dense(16, activation='relu'))
deep_model.add(Dense(8, activation='relu'))

# output layer
deep_model.add(Dense(1, activation='sigmoid'))

# compiling model
deep_model.compile(optimizer='adam',
                   loss='mse',
                   metrics=['accuracy'])

# fitting the model
deep_model_history = deep_model.fit(X_train_preprocessed,
                                    y_train,
                                    epochs=100,
                                    batch_size=32,
                                    validation_data=(X_val_preprocessed, y_val))

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.6301 - loss: 0.2347 - val_accuracy: 0.7250 - val_loss: 0.2013
Epoch 2/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7699 - loss: 0.1783 - val_accuracy: 0.7250 - val_loss: 0.1963
Epoch 3/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7623 - loss: 0.1689 - val_accuracy: 0.7250 - val_loss: 0.1918
Epoch 4/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7523 - loss: 0.1650 - val_accuracy: 0.7250 - val_loss: 0.1869
Epoch 5/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7552 - loss: 0.1539 - val_accuracy: 0.7250 - val_loss: 0.1838
Epoch 6/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8091 - loss: 0.1120 - val_accuracy: 0.7375 - val_loss: 0.1790
Epoch 7/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━

In [None]:
deep_model.evaluate(X_val_preprocessed, y_val)

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7003 - loss: 0.2549 


[0.22670325636863708, 0.731249988079071]

In [86]:
from sklearn.metrics import recall_score, precision_score

In [85]:
def get_preds(array):
    """gets the predictions eitherr 0 or 1 given the array
    of probabilities from the NN predictions
    """
    return [0 if x < 0.5 else 1 for x in array]

In [87]:
base_preds = get_preds(base_model.predict(X_test_preprocessed))
adam_preds = get_preds(adam_model.predict(X_test_preprocessed))
deep_preds = get_preds(deep_model.predict(X_test_preprocessed))

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


In [88]:
base_recall = recall_score(y_test, base_preds)
base_precision = precision_score(y_test, base_preds)
print('Base model Neural Network Recall:', base_recall)
print('Base model NN Precison:', base_precision)

Base model Neural Network Recall: 0.01818181818181818
Base model NN Precison: 0.3333333333333333


In [89]:
adam_recall = recall_score(y_test, base_preds)
adam_precision = precision_score(y_test, adam_preds)
print('adam model Neural Network Recall:', adam_recall)
print('adam model NN Precison:', adam_precision)

adam model Neural Network Recall: 0.01818181818181818
adam model NN Precison: 0.3793103448275862


In [91]:
deep_recall = recall_score(y_test, base_preds)
deep_precision = precision_score(y_test, deep_preds)
print('Deep model Neural Network Recall:', deep_recall)
print('Deep model NN Precison:', deep_precision)

Deep model Neural Network Recall: 0.01818181818181818
Deep model NN Precison: 0.3448275862068966


Deep learning fails to give a better performing model. This may be due to the small dataset