In [None]:
import time
import numpy as np
import pandas as pd

In [None]:
%%time
# Read the training and test tables from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# One-line sanity summary, displayed as a tuple in this order:
# train shape, test shape, any NaN in train, any NaN in test,
# number of duplicated rows in train, number of duplicated rows in test.
(
    train.shape,
    test.shape,
    train.isnull().values.any(),
    test.isnull().values.any(),
    train.duplicated().sum(),
    test.duplicated().sum(),
)

In [None]:
# Peek at three randomly drawn training rows.
train.sample(n=3)

In [None]:
# Peek at three randomly drawn test rows.
test.sample(n=3)

In [None]:
# Share of rows per Response value (fractions, not counts).
train['Response'].value_counts(normalize=True)

In [None]:
def _three_decimals(value):
    """Format a float with exactly three digits after the decimal point."""
    return '%.3f' % value


# Render floats with three decimals in every DataFrame display from here on.
pd.set_option('display.float_format', _three_decimals)

# Summary statistics of the numeric training columns.
train.describe()

In [None]:
# Use the `id` column as the row index so it is not treated as a feature.
train = train.set_index('id')

In [None]:
# Remove the Driving_License column from the training frame.
train = train.drop(columns=['Driving_License'])

In [None]:
# Per-column overview of the training frame: dtype and number of distinct
# values (NaN, if present, counts as a value because `.unique()` keeps it).
feature_info = {
    'feature': list(train.columns),
    'dtype': [train[name].dtype for name in train.columns],
    'unique_val': [len(train[name].unique()) for name in train.columns],
}

# One row per column, highest-cardinality columns first.
features = (
    pd.DataFrame(feature_info)
    .set_index('feature')
    .sort_values(by='unique_val', ascending=False)
)
features

In [None]:
# Seems to be nominal; highly problematic due to 54 unique values
# Region_Code and Policy_Sales_Channel both seem to be nominal and are highly
# problematic because of their cardinality (54 and 152 unique values), so
# both are removed.
train = train.drop(columns=['Region_Code', 'Policy_Sales_Channel'])

# Previously_Insured is a nominal feature: cast it to str so that the
# dtype-based column selection further down treats it as categorical.
train['Previously_Insured'] = train['Previously_Insured'].astype(str)

In [None]:
# rebalanced dataset sampling
# Rebalanced dataset sampling: keep every Response == 1 row and draw the same
# number of Response == 0 rows (fixed seed, so the draw is repeatable).
train_1 = train.loc[train['Response'] == 1]
train_0 = train.loc[train['Response'] == 0].sample(n=len(train_1), random_state=666)

# Stack the sampled zeros on top of the ones, then confirm the class shares.
df_train = pd.concat([train_0, train_1])
df_train['Response'].value_counts(normalize=True)

In [None]:
# (rows, columns) of the rebalanced frame assembled in the previous cell.
df_train.shape

In [None]:
from sklearn.model_selection import train_test_split
# Separate the label from the predictors.
X = df_train.drop(columns='Response')
y = df_train['Response']

# train/test split: 19% hold-out, stratified on the label, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.19,
    random_state=0,
    stratify=y,
)

# From here on df_train contains only the training part (features + label).
df_train = pd.concat([X_train, y_train], axis=1)

In [None]:
# (rows, columns) after df_train was rebuilt from X_train and y_train only.
df_train.shape

In [None]:
# Column dtypes of the training features; the cells below pick columns by dtype.
X_train.dtypes

In [None]:
from scipy.stats import mannwhitneyu, chi2_contingency

# Univariate screening of every feature against Response on the training part
# of the split. A feature is reported when its p-value is above 0.05.

# numeric features testing: two-sided Mann-Whitney U test between the
# Response == 0 and Response == 1 groups
for feature in X_train.select_dtypes(exclude='object').columns:
    group_0 = df_train[df_train['Response']==0][feature]
    group_1 = df_train[df_train['Response']==1][feature]
    stat, p = mannwhitneyu(group_0, group_1, alternative='two-sided')
    if p > 0.05:
        print('Feature {} is statistically insignificant'.format(feature))

# nominal features testing: chi-squared test on the feature x Response
# contingency table
for feature in X_train.select_dtypes(include='object').columns:
    contingency_table = pd.crosstab(df_train[feature], df_train['Response'])
    chi2_stat, p_val, dof, expected = chi2_contingency(contingency_table)
    # Compare this test's own p-value (`p_val`), not the `p` left over from
    # the last iteration of the numeric loop above.
    if p_val > 0.05:
        print('Feature {} is statistically insignificant'.format(feature))

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Column groups are chosen by dtype of X_train.
float_cols = X_train.select_dtypes(include='float64').columns
int_cols = X_train.select_dtypes(include='int64').columns
object_cols = X_train.select_dtypes(include='object').columns

# float64 columns -> standardised, int64 columns -> min-max scaled,
# object columns -> one-hot encoded with the first level of each dropped.
prep = make_column_transformer(
    (StandardScaler(), float_cols),
    (MinMaxScaler(), int_cols),
    (OneHotEncoder(drop='first'), object_cols),
)

# Fit the transformer on the training features and keep the result as a frame
# (the columns get positional integer names).
X_train_transformed = pd.DataFrame(prep.fit_transform(X_train))
X_train_transformed

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [None]:
# Neural network architecture
# Neural network architecture: four ReLU hidden layers of decreasing width,
# each followed by 40% dropout, and a single sigmoid output unit.
model = Sequential()

# Define the input shape here.
# NOTE(review): the width is hard-coded to 8 and presumably has to equal the
# number of columns in X_train_transformed - confirm if the features change.
model.add(Input(shape=(8,)))

for hidden_units in (512, 256, 128, 64):
    model.add(Dense(hidden_units, activation='relu'))
    model.add(Dropout(0.4))

model.add(Dense(1, activation='sigmoid'))

In [None]:
%%time
# compilation: Adam optimiser, binary cross-entropy loss, AUC as the metric
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['AUC'],
)

# callbacks, both watching the validation loss
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    min_lr=1e-7,
)
# NOTE(review): both callbacks use patience=5, so the learning-rate reduction
# may only fire around the epoch at which early stopping ends the run -
# confirm this is intended.
training_callbacks = [early_stopping, reduce_lr]

# training; Keras holds out 10% of the supplied rows for validation
history = model.fit(
    x=X_train_transformed,
    y=y_train,
    batch_size=1024,
    epochs=25,
    verbose=1,
    callbacks=training_callbacks,
    validation_split=0.1,
)

In [None]:
%%time
# Score the hold-out split. The preprocessing must only be *applied* here:
# `prep` was fitted on X_train, and re-fitting it on X_test would scale and
# encode the hold-out rows with their own statistics instead of the ones the
# network was trained on.
y_pred = model.predict(pd.DataFrame(prep.transform(X_test)), verbose=0)

In [None]:
from sklearn.metrics import confusion_matrix

# Convert predicted probabilities to class labels with a 0.5 cut-off and
# tabulate them against the hold-out labels.
cm = confusion_matrix(y_test, np.where(y_pred > 0.5, 1, 0))

# Rows are actual classes, columns are predicted classes, so the flattened
# 2x2 matrix reads TN, FP, FN, TP.
TN, FP, FN, TP = cm.ravel()

print('Sensitivity')
print('When the actual value is positive, how often is the prediction correct?')
print('sensitivity: {}\n'.format(round(TP/float(TP + FN), 3)))
print('Specificity')
print('When the actual value is negative, how often is the prediction correct?')
print('specificity: {}'.format(round(TN/float(TN + FP), 3)))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Define labels for the confusion matrix (not passed to the plot below)
labels = ['True Negative', 'False Positive', 'False Negative', 'True Positive']

# Tick captions for the two axes of the heatmap
predicted_ticks = ['Predicted 0', 'Predicted 1']
actual_ticks = ['Actual 0', 'Actual 1']

# Plot confusion matrix as a heatmap with the raw counts written in the cells
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=predicted_ticks,
    yticklabels=actual_ticks,
)

# Title and axis captions
plt.title('Confusion Matrix', fontsize=24)
plt.xlabel('Predicted Label', fontsize=20)
plt.ylabel('True Label', fontsize=20)

# Enlarge the tick captions on both axes
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)

plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc
# Compute the ROC curve and the area under it (AUC) on the hold-out split
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)

# Position of the last threshold that is still above 0.5; it is used to mark
# the operating point of the default 0.5 cut-off on the curve.
default_threshold_index = np.flatnonzero(thresholds > 0.5)[-1]

plt.figure()

# ROC curve, chance diagonal, and the point at the default threshold of 0.5
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')  # Diagonal line
plt.plot(fpr[default_threshold_index], tpr[default_threshold_index], 'ro', label='Threshold = 0.5')

# Axes limits, captions and legend
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")

plt.show()

In [None]:
# Print the layer-by-layer summary of the network.
model.summary()

In [None]:
# Re-read test.csv (it was already loaded in the first data cell) so that the
# preparation cell below starts from the raw file.
test = pd.read_csv('test.csv')

In [None]:
# Give the test frame the same preparation as the training frame: index by
# `id`, drop the three columns that were removed from train, and cast
# Previously_Insured to str. The cast was missing here, which left the column
# with a different dtype than the one the preprocessing was fitted on.
test = (
    test
    .set_index('id')
    .drop(columns=['Driving_License', 'Region_Code', 'Policy_Sales_Channel'])
    .assign(Previously_Insured=lambda frame: frame['Previously_Insured'].astype(str))
)

In [None]:
%%time
# Score the competition test set with the preprocessing fitted on X_train.
# `transform` (not `fit_transform`) keeps the scaling and the one-hot levels
# identical to what the network saw during training.
# Previously_Insured is cast to str to mirror the training frame; the cast is
# a no-op when the column already holds strings. The encoder raises on an
# unseen level, so a mismatch between train and test values fails loudly.
test_features = test.assign(Previously_Insured=test['Previously_Insured'].astype(str))
y_hat = model.predict(pd.DataFrame(prep.transform(test_features)), verbose=0)

In [None]:
# Submission table: the test ids (moved back from the index into a column)
# next to the predicted Response values.
test_ids = test.reset_index()[['id']]
submission = test_ids.assign(Response=y_hat)
submission

In [None]:
# Write the submission table to submission.csv without the DataFrame index.
submission.to_csv('submission.csv', index=False)

In [None]:
import pickle
# Save the preprocessing pipeline (the fitted column transformer) with pickle.
# Only unpickle this file from a trusted location.
with open('preprocessing_pipeline.pkl', 'wb') as pipeline_file:
    pickle.dump(prep, pipeline_file)

# Save the model in the recommended .keras format
model.save('trained_model.keras')

# Load the model back from disk; `model` now refers to the reloaded copy.
model = tf.keras.models.load_model('trained_model.keras')