In [1]:
import tensorflow as tf
from tensorflow import keras
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import random
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import os
import tempfile

In [2]:
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 7214092670133180505
 xla_global_id: -1,
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 1734816564
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 4071252182451656217
 physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 3050 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6"
 xla_global_id: 416903419]

In [3]:
def my_seed_everywhere(seed: int = 42):
    random.seed(seed) # random
    np.random.seed(seed) # np
    os.environ["PYTHONHASHSEED"] = str(seed) # os
    tf.random.set_seed(seed) # tensorflow

my_seed = 42
my_seed_everywhere(my_seed)

In [4]:
train = pd.read_csv("./playground-series-s4e7/train.csv")
target = pd.read_csv("./playground-series-s4e7/test.csv")

In [5]:
neg, pos = np.bincount(train['Response'])
total = neg + pos
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))

Examples:
    Total: 11504798
    Positive: 1415059 (12.30% of total)



In [6]:
category_columns = train.select_dtypes(include="object").columns
category_columns = category_columns.tolist()

def label_encoding(df):
    # Instance of LabelEncoder
    label_encoders = {col: LabelEncoder() for col in category_columns}

    # Label Encoding
    for col in category_columns:
        df[col] = label_encoders[col].fit_transform(df[col])

label_encoding(train)

In [7]:
category_columns = target.select_dtypes(include="object").columns
category_columns = category_columns.tolist()

def label_encoding(df):
    # Instance of LabelEncoder
    label_encoders = {col: LabelEncoder() for col in category_columns}

    # Label Encoding
    for col in category_columns:
        df[col] = label_encoders[col].fit_transform(df[col])

label_encoding(target)

In [8]:
train = train.drop(columns=['id'])
target = target.drop(columns=['id'])

In [9]:
target

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,0,20,1,47.0,0,1,0,2630.0,160.0,228
1,1,47,1,28.0,0,0,1,37483.0,124.0,123
2,1,47,1,43.0,0,0,1,2630.0,26.0,271
3,0,22,1,47.0,1,1,0,24502.0,152.0,115
4,1,51,1,19.0,0,0,0,34115.0,124.0,148
...,...,...,...,...,...,...,...,...,...,...
7669861,1,57,1,28.0,0,0,1,51661.0,124.0,109
7669862,1,28,1,50.0,1,1,0,25651.0,152.0,184
7669863,1,47,1,33.0,1,0,0,2630.0,138.0,63
7669864,1,30,1,28.0,0,1,1,38866.0,124.0,119


In [10]:
# Use a utility from sklearn to split and shuffle your dataset.
train_df, test_df = train_test_split(train, test_size=0.2)
train_df, val_df = train_test_split(train_df, test_size=0.2)

# Form np arrays of labels and features.
train_labels = np.array(train_df.pop('Response'))
bool_train_labels = train_labels != 0
val_labels = np.array(val_df.pop('Response'))
test_labels = np.array(test_df.pop('Response'))

train_features = np.array(train_df)
val_features = np.array(val_df)
test_features = np.array(test_df)

In [11]:
scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)

val_features = scaler.transform(val_features)
test_features = scaler.transform(test_features)
target_features = scaler.transform(target)

train_features = np.clip(train_features, -5, 5)
val_features = np.clip(val_features, -5, 5)
test_features = np.clip(test_features, -5, 5)


print('Training labels shape:', train_labels.shape)
print('Validation labels shape:', val_labels.shape)
print('Test labels shape:', test_labels.shape)

print('Training features shape:', train_features.shape)
print('Validation features shape:', val_features.shape)
print('Test features shape:', test_features.shape)
print('Target features shape:', target_features.shape)



Training labels shape: (7363070,)
Validation labels shape: (1840768,)
Test labels shape: (2300960,)
Training features shape: (7363070, 10)
Validation features shape: (1840768, 10)
Test features shape: (2300960, 10)
Target features shape: (7669866, 10)


In [12]:
import tensorflow as tf
from tensorflow import keras

# Define the metrics
METRICS = [
    keras.metrics.TruePositives(name='tp'),
    keras.metrics.FalsePositives(name='fp'),
    keras.metrics.TrueNegatives(name='tn'),
    keras.metrics.FalseNegatives(name='fn'),
    keras.metrics.BinaryAccuracy(name='accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    keras.metrics.AUC(name='auc'),
    keras.metrics.AUC(name='prc', curve='PR'),  # precision-recall curve
]

def make_model(metrics=METRICS, output_bias=None):
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)
    
    model = keras.Sequential([
        keras.layers.Dense(
            8, activation='relu',
            input_shape=(train_features.shape[-1],)
        ),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(
            1, activation='sigmoid',
            bias_initializer=output_bias
        ),
    ])

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=metrics
    )

    return model


In [13]:
EPOCHS = 100
BATCH_SIZE = 1024

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_prc', 
    verbose=1,
    patience=10,
    mode='max',
    restore_best_weights=True)

In [14]:
model = make_model()
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 8)                 88        
                                                                 
 dropout (Dropout)           (None, 8)                 0         
                                                                 
 dense_1 (Dense)             (None, 1)                 9         
                                                                 
Total params: 97
Trainable params: 97
Non-trainable params: 0
_________________________________________________________________


In [15]:
model.predict(train_features[:10])



array([[0.8200438 ],
       [0.7540257 ],
       [0.7575045 ],
       [0.98849887],
       [0.87092674],
       [0.8190312 ],
       [0.8504568 ],
       [0.86311513],
       [0.87460834],
       [0.9212649 ]], dtype=float32)

In [16]:
results = model.evaluate(train_features, train_labels, batch_size=BATCH_SIZE, verbose=0)
print("Loss: {:0.4f}".format(results[0]))

Loss: 1.9033


In [17]:
initial_bias = np.log([pos/neg])
initial_bias

array([-1.96434774])

In [18]:
model = make_model(output_bias=initial_bias)
model.predict(train_features[:10])



array([[0.31429777],
       [0.11521886],
       [0.05819667],
       [0.04672507],
       [0.10645718],
       [0.22233212],
       [0.29374936],
       [0.01632581],
       [0.1031905 ],
       [0.08407525]], dtype=float32)

In [19]:
results = model.evaluate(train_features, train_labels, batch_size=BATCH_SIZE, verbose=0)
print("Loss: {:0.4f}".format(results[0]))

Loss: 0.3618


In [20]:
initial_weights = os.path.join(tempfile.mkdtemp(), 'initial_weights')
model.save_weights(initial_weights)

In [21]:
weight_for_0 = (1 / neg) * (total / 2.0)
weight_for_1 = (1 / pos) * (total / 2.0)

class_weight = {0: weight_for_0, 1: weight_for_1}

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

Weight for class 0: 0.57
Weight for class 1: 4.07


In [22]:
with tf.device('/GPU:0'):
    model = make_model()
    model.load_weights(initial_weights)
    baseline_history = model.fit(
        train_features,
        train_labels,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        callbacks=[early_stopping],
        validation_data=(val_features, val_labels))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100

KeyboardInterrupt: 

In [24]:
y_pred_prob = model.predict(test_features).ravel()
roc_auc = roc_auc_score(test_labels, y_pred_prob)
print(f'ROC AUC: {roc_auc:.2f}')

ROC AUC: 0.85
