In [None]:
import warnings
warnings.filterwarnings('ignore')

import math
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 200)
from pathlib import Path

from plotnine import *
import os
# suppress tf informational and warning messages
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, Sequential
from tensorflow.keras import backend as K

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn import preprocessing
from sklearn import metrics

import category_encoders as ce


%matplotlib inline
%config InlineBackend.figure_format='retina'

# One seed value, handed to both the Keras seeding helper and NumPy's global
# generator; the intent is that a fresh-kernel run repeats the same draws.
RANDOM_STATE = 2112
keras.utils.set_random_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

In [1]:
INPUT_DIR = Path('/kaggle/input/playground-series-s4e7')
TARGET_NAME = 'Response'

# Prefer CSVs sitting next to the notebook (the historical behaviour of this
# cell); fall back to the Kaggle input directory when they are not there.
DATA_DIR = Path('.') if Path('./train.csv').exists() else INPUT_DIR

train_data = pd.read_csv(DATA_DIR / 'train.csv')
test_data = pd.read_csv(DATA_DIR / 'test.csv')


def fe(df):
    """Feature-engineering hook; currently returns the frame unchanged."""
    return df


# Columns that are replaced by integer codes from the ordinal encoder.
ordinal_features = [
    'Gender', 'Driving_License', 'Previously_Insured',
    'Vehicle_Age', 'Vehicle_Damage',
    'Age', 'Vintage', 'Annual_Premium']

# Encode train and test together so both share a single code book.
# The test rows have no target column, so the concat leaves it NaN for them.
all_data = fe(pd.concat([test_data, train_data], ignore_index=True))

oe = preprocessing.OrdinalEncoder()
all_data[ordinal_features] = (
    oe.fit_transform(all_data[ordinal_features]).astype('int'))

# Vocabulary size per encoded column: codes run 0..max, hence max + 1.
# NOTE: 'Policy_Sales_Channel' and 'Region_Code' are not ordinal-encoded here,
# so this mapping has no entry for them at this point.
cat_features_card = {f: 1 + all_data[f].max() for f in ordinal_features}


# treat the 0-1 features as continuous
# everything else will go into an embedding layer.
cont_features = [
    'Gender', 'Driving_License', 'Previously_Insured', 'Vehicle_Damage']
cat_features = [
    'Vehicle_Age', 'Policy_Sales_Channel',
    'Region_Code', 'Age', 'Vintage', 'Annual_Premium']
features = cont_features + cat_features

# Rows with a missing target are the test rows. Both frames get a clean
# 0..n-1 index so that positional fold indices and labels always agree, and
# so that later column assignments act on independent frames.
is_test = all_data[TARGET_NAME].isna()
train_data = all_data.loc[~is_test].reset_index(drop=True)
test_data = (
    all_data.loc[is_test]
    .drop(columns=[TARGET_NAME])
    .reset_index(drop=True))

assert len(train_data) + len(test_data) == len(all_data), "train/test split lost rows"

# tidy up
all_data = None
import gc
gc.collect()

Collecting plotnine
  Downloading plotnine-0.12.4-py3-none-any.whl.metadata (8.9 kB)
Collecting mizani<0.10.0,>0.9.0 (from plotnine)
  Downloading mizani-0.9.3-py3-none-any.whl.metadata (4.6 kB)
Collecting backports.zoneinfo (from mizani<0.10.0,>0.9.0->plotnine)
  Using cached backports.zoneinfo-0.2.1-cp38-cp38-macosx_11_0_arm64.whl
Downloading plotnine-0.12.4-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m0m
[?25hDownloading mizani-0.9.3-py3-none-any.whl (73 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.7/73.7 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: backports.zoneinfo, mizani, plotnine
Successfully installed backports.zoneinfo-0.2.1 mizani-0.9.3 plotnine-0.12.4


In [None]:
def Region_code_preprocessing(df):
    """Map a single region code to one of three buckets.

    ``df`` is one scalar value (the function is applied element-wise).
    Returns 0 for values below 26, 1 for values in [26, 29), and 2 for
    everything else (including values that compare False to both bounds,
    such as NaN).
    """
    # Guard clauses, lowest bucket first.
    if df < 26:
        return 0
    if df < 29:
        return 1
    return 2
def policy_code_preprocsssing(df):
    """Map a single policy sales channel code to one of three buckets.

    ``df`` is one scalar value (the function is applied element-wise).
    Returns 0 for values below 124, 1 for values below 152, otherwise 2.
    """
    # Walk the upper bounds in ascending order; the first one the value
    # falls under decides the bucket.
    for bucket, upper_bound in enumerate((124, 152)):
        if df < upper_bound:
            return bucket
    return 2

# Bin the two code columns into three buckets each. The mapping has to be
# applied to BOTH frames: binning only the training rows would train the model
# on bucket ids and then score the test rows on raw codes.
# NOTE: this cell is not idempotent - re-running it bins the bucket ids again.
for frame in (train_data, test_data):
    frame['Region_Code'] = frame['Region_Code'].apply(Region_code_preprocessing)
    frame['Policy_Sales_Channel'] = frame['Policy_Sales_Channel'].apply(policy_code_preprocsssing)

# build_model looks up cat_features_card[f] for every embedded feature, and
# both binned columns are embedded; each now takes the values 0, 1, 2.
cat_features_card['Region_Code'] = 3
cat_features_card['Policy_Sales_Channel'] = 3


In [None]:
def build_model(cat_features, cont_features):
    """Assemble the embedding + dense binary classifier.

    Parameters
    ----------
    cat_features : sequence of str
        Names of the columns fed through embedding layers. Every name must be
        a key of the module-level ``cat_features_card`` mapping, which supplies
        the embedding vocabulary size.
    cont_features : sequence
        Continuous columns; only the length is used, to size the single
        continuous input.

    Returns
    -------
    keras.Model
        Un-compiled model. Its inputs are one ``(1,)`` input per categorical
        feature (named ``cat0``, ``cat1``, ...) followed by the continuous
        input; its output is a single sigmoid unit.
    """
    # Inputs: one scalar slot per categorical column, one vector for the rest.
    cat_inputs = []
    for position in range(len(cat_features)):
        cat_inputs.append(layers.Input(shape=(1,), name=f'cat{position}'))
    cont_inputs = layers.Input(shape=(len(cont_features),))

    # Each categorical input gets its own embedding -> dropout -> flatten chain.
    embedded_columns = []
    for cat_input, feature in zip(cat_inputs, cat_features):
        vocab_size = int(cat_features_card[feature])
        # Width heuristic based on the fastai library, capped at 64.
        embed_width = int(min(64, round(1.6 * vocab_size ** .56)))
        embedded = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_width)(cat_input)
        embedded = layers.SpatialDropout1D(.3)(embedded)
        embedded_columns.append(layers.Flatten()(embedded))

    merged = layers.Concatenate()(embedded_columns + [cont_inputs])
    merged_bn = layers.BatchNormalization()(merged)

    hidden = layers.Dense(256, activation='mish')(merged_bn)
    hidden = layers.BatchNormalization()(hidden)
    for width in (128,):
        # Skip connection: every further block also sees the normalised inputs.
        skip = layers.Concatenate()([hidden, merged_bn])
        hidden = layers.Dense(units=width, activation='mish')(skip)
        hidden = layers.Dropout(.3)(hidden)
        hidden = layers.BatchNormalization()(hidden)

    # Single-probability output head.
    prediction = layers.Dense(1, activation='sigmoid')(hidden)
    return keras.Model([*cat_inputs, cont_inputs], prediction)

# Training schedule. For now: a handful of epochs at one fixed learning rate
# and no callbacks. A cosine-decay schedule is planned for later and would be
# added through `callbacks`.
callbacks = []
epochs = 4

In [None]:
def fold_logloss(y, preds):
    """Return the log loss of predictions ``preds`` against labels ``y``."""
    score = metrics.log_loss(y_true=y, y_pred=preds)
    return score

def fold_auc(y, preds):
    """Return the ROC AUC of scores ``preds`` against labels ``y``."""
    score = metrics.roc_auc_score(y_true=y, y_score=preds)
    return score

# Column positions (within `features`) used to slice the model inputs:
# each categorical column is fed on its own, so it gets a one-element list;
# the continuous columns are fed together as a single block.
cat_idxs = [[features.index(name)] for name in cat_features]
cont_idxs = [features.index(name) for name in cont_features]

feature_idxs = [*cat_idxs, cont_idxs]

def to_nn_feed(df):
    """Turn a frame into the list of arrays the model takes as input.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in the module-level ``features`` list.

    Returns
    -------
    list of numpy.ndarray
        One array per entry of the module-level ``feature_idxs``: a
        single-column slice for each categorical feature, followed by one
        slice holding all continuous features.
    """
    # Select columns with the same list the positions in `feature_idxs` were
    # computed from, so column order and positions can never drift apart.
    X = df[features].values
    # housekeeping: the categoricals are fed column by column,
    # and the continuous features in one lump.
    return [X[:, f_idx] for f_idx in feature_idxs]

def fit_fold(tr, vl, ts):
    """Train a fresh model on one fold and score the validation and test rows.

    Parameters
    ----------
    tr : pandas.DataFrame
        Training rows of the fold, including the ``TARGET_NAME`` column.
    vl : pandas.DataFrame
        Validation rows of the fold, including the ``TARGET_NAME`` column.
    ts : pandas.DataFrame
        Rows to predict for; only the feature columns are read.

    Returns
    -------
    tuple
        ``(vl_pred, ts_pred, vl_metric)``: flattened predictions for ``vl``
        and ``ts``, and the ROC AUC of ``vl_pred`` on the validation target.

    Notes
    -----
    Batch size, epoch count and callbacks come from the module-level
    ``BS``, ``epochs`` and ``callbacks`` at call time.
    """
    model = build_model(cat_features, cont_features)
    model.compile(
        optimizer=keras.optimizers.AdamW(learning_rate=1E-4),
        loss='binary_crossentropy',
        # Explicit metric object rather than the string alias, so the metric
        # does not depend on how a given Keras version resolves 'auc'.
        metrics=[keras.metrics.AUC(name='auc')])

    # Built once and reused for both validation during fit and prediction.
    vl_feed = to_nn_feed(vl)

    model.fit(
          to_nn_feed(tr), tr[TARGET_NAME],
          validation_data=(vl_feed, vl[TARGET_NAME]),
          batch_size=BS,
          epochs=epochs,
          callbacks=callbacks,
          verbose=0
    )

    vl_pred = model.predict(vl_feed, verbose=0, batch_size=BS).flatten()
    ts_pred = model.predict(to_nn_feed(ts), verbose=0, batch_size=BS).flatten()

    vl_metric = fold_auc(vl[TARGET_NAME], vl_pred)
    return vl_pred, ts_pred, vl_metric

In [None]:
N_FOLDS = 5
BS = 3000
feats = features

# Out-of-fold predictions for the training rows and the fold-averaged
# predictions for the test rows.
vl_preds = np.zeros(len(train_data))
ts_preds = np.zeros(len(test_data))
vl_metrics = []

keras.utils.set_random_seed(RANDOM_STATE)
k_fold = StratifiedKFold(n_splits=N_FOLDS, random_state=RANDOM_STATE, shuffle=True)
for tr_idx, vl_idx in k_fold.split(train_data, train_data[TARGET_NAME]):
    # split() yields row POSITIONS, so select with iloc; label-based .loc is
    # only right when the index happens to be exactly 0..n-1.
    tr = train_data.iloc[tr_idx]
    vl = train_data.iloc[vl_idx]

    vl_pred, ts_pred, vl_metric = fit_fold(tr, vl, test_data)

    print(f'  -- fold auc {vl_metric:2.6f}')
    vl_metrics.append(vl_metric)
    # Every training row lands in exactly one validation fold.
    vl_preds[vl_idx] = vl_pred
    ts_preds += ts_pred / N_FOLDS

# overall metric: AUC of the out-of-fold predictions over all training rows
vl_metric = fold_auc(
    train_data[TARGET_NAME], vl_preds)
print(f'  ----------- {vl_metric:2.6f}')