In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Directory holding the competition CSV files.
FILE_PATH = "/kaggle/input/tabular-playground-series-sep-2021/"

# Full paths of the training and test CSV files inside FILE_PATH.
train_file_path, test_file_path = (
    os.path.join(FILE_PATH, csv_name) for csv_name in ("train.csv", "test.csv")
)

In this notebook, we will use TensorFlow Decision Forests `GradientBoostedTreesModel` in order to create a classification model that can achieve good results.

We will use 2 preprocessing methods:
* On the training and validation dataframes directly
* Creating a Keras model that we will pass to the GradientBoostedTreesModel preprocessing parameter

Both methods achieved almost the same result on the validation set

So, let's start ... first let's install `tensorflow_decision_forests`

In [None]:
!pip install tensorflow_decision_forests

# Imports & Configuration

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_decision_forests as tfdf

from tensorflow import keras

import matplotlib.pyplot as plt

# Report the library versions this run is using.
print(f"TensorFlow Version: {tf.__version__}")
print(f"TensorFlow Decision Forests: {tfdf.__version__}")

In [None]:
# Single seed shared by NumPy and TensorFlow.
SEED = 1337
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Fraction of the training rows held out for validation.
VALID_RATIO = 0.1

# Data Loading

In [None]:
# Load the complete training CSV into memory and report its (rows, columns).
train_full_data = pd.read_csv(train_file_path)
print(f"Full train dataset shape is {train_full_data.shape}")

In [None]:
# Preview the first rows of the training dataframe.
train_full_data.head()

The data is composed of 120 columns all of which are numerical:
* 118 feature columns named `f1, f2, ... f118`
* label column named `claim`
* An `id` column that we will drop

In [None]:
# Name of the target column and of the 118 feature columns f1 ... f118.
label = 'claim'
features = ['f' + str(i) for i in range(1, 119)]

# The id column is not used for training.
train_full_data = train_full_data.drop(columns='id')

Let's check if we have missing data

In [None]:
# Number of missing values in each feature column.
train_full_data[features].isna().sum()

We can see that the data contains a lot of missing values: approximately 15,000 for each feature, which is around 1.5% of the rows.

In the approach that we will use in this notebook, we will keep the missing values but will add 3 additional features:
* `Number of missing values` in each sample. So for each sample out of the 957919, we will see how many values are missing across all features
* `Standard deviation` over axis=1 which gives us the standard deviation for each sample
* `Unbiased Variance` over axis=1 which gives us the variance for each sample

We will be implementing this preprocessing using 2 methods as I said before.
Let's start with the first one

In [None]:
def split_dataset(dataset, test_ratio=0.1, seed=None):
    """Randomly split a dataframe into two disjoint parts.

    Every row is sent to the second ("test") part when a uniform draw in
    [0, 1) is below ``test_ratio``, and to the first part otherwise, so the
    resulting sizes are only approximately ``test_ratio`` of the input.

    Args:
        dataset: Object supporting ``len()`` and boolean-mask indexing
            (e.g. a pandas DataFrame).
        test_ratio: Probability for each row to land in the second part.
        seed: Optional seed. When ``None`` (default) the draws come from
            NumPy's global RNG, exactly as before. When given, a private
            ``RandomState(seed)`` is used, so the split is reproducible and
            independent of how much the global RNG was consumed earlier.

    Returns:
        Tuple ``(first_part, second_part)``.
    """
    if seed is None:
        draws = np.random.rand(len(dataset))
    else:
        # RandomState(seed) yields the same stream as np.random.seed(seed)
        # followed by np.random.rand, without touching the global state.
        draws = np.random.RandomState(seed).rand(len(dataset))
    test_indices = draws < test_ratio
    return dataset[~test_indices], dataset[test_indices]

# First Approach: Preprocessing using pandas

## Preprocessing

Let's add the 3 additional features using pandas

In [None]:
# Select the original feature columns once and derive the three row-wise
# statistics from that selection.
feature_values = train_full_data[features]
train_full_data['nan'] = feature_values.isnull().sum(axis=1)  # missing values per row
train_full_data['std'] = feature_values.std(axis=1)           # row-wise standard deviation
train_full_data['var'] = feature_values.var(axis=1)           # row-wise variance
del feature_values

## Datasets

Split the dataframe into training and validation sets

In [None]:
# Hold out roughly VALID_RATIO of the rows for validation.
train_ds_pd, valid_ds_pd = split_dataset(train_full_data, test_ratio=VALID_RATIO)
print(f"{train_ds_pd.shape[0]} samples in training and {valid_ds_pd.shape[0]} in validation")

Create the training and validation datasets using TensorFlow Decision Forests `pd_dataframe_to_tf_dataset`

In [None]:
# Wrap both dataframes as TensorFlow datasets, using `label` as the target column.
train_ds, valid_ds = (
    tfdf.keras.pd_dataframe_to_tf_dataset(frame, label=label)
    for frame in (train_ds_pd, valid_ds_pd)
)

## GradientBoostedTreesModel Training

For hyperparameter tuning, we did the following:
* Tried the predefined hyperparameters which did not give good results especially `benchmark_rank1` which gave very bad results. `better_default` on the other hand gave acceptable results
* Used Keras tuner in order to search for hyperparameters that maximise AUC. If you want to check how to do this, check out the following [notebook](https://www.kaggle.com/ekaterinadranitsyna/kerastuner-tf-decision-forest?linkId=133421702) by Ekaterina Dranitsyna

**Note:** The idea of adding 3 new columns is also taken from the above-mentioned notebook by Ekaterina Dranitsyna.

Finally, the hyperparameters that gave me the best results were the ones below, which are `better_default` with `l1_regularization`.

In [None]:
# Hyperparameters of the first gradient boosted trees model.
model_1_hparams = {
    'growing_strategy': 'BEST_FIRST_GLOBAL',
    'l1_regularization': 0.8,
}
model_1 = tfdf.keras.GradientBoostedTreesModel(**model_1_hparams)

# Track AUC when the model is evaluated.
model_1.compile(metrics=[keras.metrics.AUC()])

You can now run the below cell and go prepare a cup of ☕ ... it will take around 30 minutes to finish.

In [None]:
%%time
# Train the first model on the training dataset; %%time reports how long the cell took.
model_1.fit(train_ds, verbose=0)

## Post Training Analysis

`model.summary()` shows us the overall structure of the model

In [None]:
# Print the summary of the trained first model.
model_1.summary()

We can access all this information using the model inspector

In [None]:
# Inspector object used by the following cells to query the trained first model.
inspector = model_1.make_inspector()

In [None]:
# Report how many trees the trained model holds.
print(f"Model contains {inspector.num_trees()} trees")

In [None]:
# Features reported by the inspector of the first model.
inspector.features()

In [None]:
# Variable importances reported by the inspector of the first model.
inspector.variable_importances()

### Training Logs

In [None]:
logs = inspector.training_logs()

# Extract each series once instead of rebuilding the x-axis for every curve.
num_trees = [log.num_trees for log in logs]
accuracies = [log.evaluation.accuracy for log in logs]
losses = [log.evaluation.loss for log in logs]

# Gradient boosted trees do not bag, so there is no out-of-bag estimate:
# their training logs are computed on the validation split that the learner
# holds out from the training data. The axis labels say so explicitly.
fig, (ax_accuracy, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))

ax_accuracy.plot(num_trees, accuracies)
ax_accuracy.set_xlabel('Number of trees')
ax_accuracy.set_ylabel('Accuracy (validation)')

ax_loss.plot(num_trees, losses)
ax_loss.set_xlabel('Number of trees')
ax_loss.set_ylabel('Logloss (validation)')

plt.show()

## Evaluation

Let's evaluate the model using the validation dataset

In [None]:
# Evaluate on the held-out validation dataset and print each metric on its own line.
evaluation = model_1.evaluate(valid_ds, return_dict=True)
for metric_name, metric_value in evaluation.items():
    print(f"{metric_name}: {metric_value}")

The model achieved `AUC = 0.8160` on the validation dataset

Let's now look at our second approach

# Second Approach: TensorFlow based preprocessing

In this second approach, we will add the 3 columns that we added using pandas but rather through TensorFlow / Keras preprocessing that we will pass to the model via the preprocessing parameter.

You will notice that this approach is more difficult and more complicated; however, it has the advantage of keeping the preprocessing within the model itself.

In [None]:
# Read the training CSV again so the dataframe no longer carries the
# 'nan', 'std' and 'var' columns added for the first approach.
train_full_data = pd.read_csv(train_file_path)

In [None]:
# Same setup as in the first approach: target name, the feature names
# f1 ... f118, and the training frame without its id column.
label = 'claim'
features = ['f{}'.format(i) for i in range(1, 119)]
train_full_data = train_full_data.drop(columns=['id'])

## Datasets

In [None]:
# Re-seed NumPy with the value used in the configuration cell before splitting.
# Without this, the split is drawn from an already-advanced RNG state, so the
# second model is validated on different rows than the first one and the two
# AUC values are not directly comparable. With it, the same rows are selected
# as in the first approach (assuming nothing else drew from NumPy's global RNG
# between the seeding cell and the first split).
np.random.seed(1337)
train_ds_pd, valid_ds_pd = split_dataset(train_full_data, test_ratio=VALID_RATIO)
print("{} samples in training and {} in validation".format(train_ds_pd.shape[0], valid_ds_pd.shape[0]))

In [None]:
# Convert the raw (not pre-engineered) dataframes to TensorFlow datasets;
# the extra features are added later by the preprocessing model.
train_ds, valid_ds = [
    tfdf.keras.pd_dataframe_to_tf_dataset(frame, label=label)
    for frame in (train_ds_pd, valid_ds_pd)
]

## Preprocessing

In [None]:
def reduce_mean_without_nan(input_tensor):
    """Return the mean along axis 1, ignoring NaN entries.

    The reduced axis is kept (``keepdims=True``), so the result has one
    column per row of ``input_tensor``.
    """
    row_mean = tf.experimental.numpy.nanmean(
        input_tensor,
        axis=1,
        keepdims=True,
    )
    return row_mean

def get_nan_std_var(input_tensor):
    """Build three row-wise features: NaN count, standard deviation, variance.

    NaN entries are excluded from the statistics. Both the variance and the
    standard deviation use the unbiased ``n - 1`` denominator (``n`` being the
    number of non-NaN values in the row), which is the default of pandas'
    ``DataFrame.var(axis=1)`` and ``DataFrame.std(axis=1)``. The previous
    version divided by ``n`` for the std only, so ``std`` was not the square
    root of ``var`` and did not match the pandas-based features.

    Args:
        input_tensor: 2-D floating point tensor; statistics are taken along
            axis 1.

    Returns:
        A tensor with three columns per row, in this order:
        ``[nan_count, std, var]``. Rows with fewer than two non-NaN values
        yield NaN for std and var (division by zero), like pandas does.
    """
    # Boolean mask of the missing entries.
    is_nan = tf.math.is_nan(input_tensor)

    # Number of NaN in each row, as float so it can be used in the divisions below.
    nan_number_per_sample = tf.cast(
        tf.math.reduce_sum(
            tf.where(is_nan, [1], [0]),
            axis=1,
            keepdims=True
        ),
        tf.float32
    )

    # Number of usable (non-NaN) values in each row.
    valid_number_per_sample = input_tensor.shape[1] - nan_number_per_sample

    # Row mean computed over the non-NaN values only.
    mean_excluding_nan = keras.layers.Lambda(reduce_mean_without_nan)(input_tensor)

    # Replacing each NaN by its row mean makes it contribute exactly 0 to the
    # sum of squared deviations below.
    input_tensor_with_nan_replaced_by_mean = tf.where(
        is_nan,
        mean_excluding_nan,
        input_tensor
    )

    squared_distance_from_mean = tf.math.reduce_sum(
        tf.math.square(
            input_tensor_with_nan_replaced_by_mean - mean_excluding_nan,
        ),
        axis=1,
        keepdims=True
    )

    # Unbiased variance (ddof=1) ...
    var = tf.math.divide(
        squared_distance_from_mean,
        valid_number_per_sample - 1,
    )

    # ... and the matching standard deviation, so that std == sqrt(var).
    std = tf.math.sqrt(var)

    # Each piece has one column; put them side by side.
    return tf.concat([nan_number_per_sample, std, var], axis=1)

In [None]:
def build_preprocessing_model(features):
    """Create the Keras model that appends the engineered features.

    Args:
        features: Iterable of feature names; one scalar input named after
            each feature is created.

    Returns:
        A ``keras.Model`` taking one input per feature and returning the
        concatenated inputs followed by the columns produced by
        ``get_nan_std_var``.
    """
    # One single-value input per feature, named after the feature.
    input_layers = [
        keras.layers.Input(shape=(1,), name=feature_name)
        for feature_name in features
    ]

    # Put all inputs side by side in a single tensor.
    inputs = keras.layers.concatenate(input_layers, name="inputs")

    # Three additional features computed for each sample:
    # - how many NaN there are in the sample
    # - std across the features of the sample
    # - var across the features of the sample
    additional_features = get_nan_std_var(inputs)

    outputs = keras.layers.concatenate([inputs, additional_features])
    return keras.Model(input_layers, outputs)


preprocessing_model = build_preprocessing_model(features)
preprocessing_model.summary()

## GradientBoostedTreesModel Training

Let's first define the features that we will be using 

In [None]:
# Build one FeatureUsage per input feature.
# NOTE(review): in the cells visible here this list is never passed to a
# model - confirm whether it is still needed.
tfdf_features = []

for feature_name in features:
    print(f"Creating FeatureUsage for {feature_name}")
    tfdf_features += [tfdf.keras.FeatureUsage(name=feature_name)]

In [None]:
# Hyperparameters of the second model; the Keras preprocessing model is
# handed to the learner through the `preprocessing` argument.
model_2_hparams = {
    'growing_strategy': 'BEST_FIRST_GLOBAL',
    'l1_regularization': 0.6,
    'preprocessing': preprocessing_model,
}
model_2 = tfdf.keras.GradientBoostedTreesModel(**model_2_hparams)

# Track AUC when the model is evaluated.
model_2.compile(metrics=[keras.metrics.AUC()])

In [None]:
%%time
# Train the second model on the training dataset; %%time reports how long the cell took.
model_2.fit(train_ds, verbose=0)

## Post Training Analysis

In [None]:
# Print the summary of the trained second model.
model_2.summary()

In [None]:
# Rebind `inspector` to the second model; the following cells query this one.
inspector = model_2.make_inspector()

In [None]:
# Report how many trees the second model holds.
print(f"Model contains {inspector.num_trees()} trees")

In [None]:
# Features reported by the inspector of the second model.
inspector.features()

In [None]:
# Variable importances reported by the inspector of the second model.
inspector.variable_importances()

### Training Logs

In [None]:
logs = inspector.training_logs()

# Pull the three series out of the logs in one place.
tree_counts = [entry.num_trees for entry in logs]
accuracy_values = [entry.evaluation.accuracy for entry in logs]
loss_values = [entry.evaluation.loss for entry in logs]

# Gradient boosted trees have no out-of-bag samples; these logs are computed
# on the validation split the learner holds out from the training data, so
# the axes are labelled accordingly.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

axes[0].plot(tree_counts, accuracy_values)
axes[0].set_xlabel('Number of trees')
axes[0].set_ylabel('Accuracy (validation)')

axes[1].plot(tree_counts, loss_values)
axes[1].set_xlabel('Number of trees')
axes[1].set_ylabel('Logloss (validation)')

plt.show()

### Tensorboard

In [None]:
# Export the second model's inspector data to ./model_2_logs for TensorBoard.
inspector.export_to_tensorboard("./model_2_logs")

In [None]:
%tensorboard --logdir "./model_2_logs"

## Evaluation

In [None]:
# Evaluate the second model on the validation dataset and print each metric.
evaluation = model_2.evaluate(valid_ds, return_dict=True)
for metric_name, metric_value in evaluation.items():
    print(f"{metric_name}: {metric_value}")

This model achieved a lower `AUC = 0.8111` on the validation dataset than the previous one which achieved `0.8160`

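As you noticed, this model creates the additional features differently. While the pandas-based method counts the NaN values and calculates std and var once over the whole training dataframe, this model computes them inside its preprocessing step, batch by batch. Note, however, that all three features are row-wise statistics, so their values do not depend on which batch a row falls into; the two runs also differ in `l1_regularization` (0.8 vs 0.6), which may account for part of the gap.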

# Test Set Prediction
We will use the 2nd approach for our prediction

In [None]:
# Model used for the test-set predictions below.
model = model_2

In [None]:
# Load the test CSV.
test_data = pd.read_csv(test_file_path)
# pop() removes the id column from test_data and keeps it for the submission file.
ids = test_data.pop('id')

In [None]:
# Only the pandas-based model needs the engineered columns in the dataframe;
# model_2 is built with its own preprocessing model. `is` is used because the
# question is whether `model` is that very object, not whether it compares equal.
if model is model_1:
    test_feature_values = test_data[features]
    test_data['nan'] = test_feature_values.isnull().sum(axis=1)
    test_data['std'] = test_feature_values.std(axis=1)
    test_data['var'] = test_feature_values.var(axis=1)
    del test_feature_values

In [None]:
# Wrap the test dataframe as a TensorFlow dataset (no label argument is passed).
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_data)

In [None]:
# Run the selected model on the test dataset.
preds = model.predict(test_ds)

In [None]:
# Drop the size-1 dimensions of the predictions so they fit in one column.
claim_scores = preds.squeeze()

# Submission frame: one row per test id with its predicted claim score.
output = pd.DataFrame({'id': ids, 'claim': claim_scores})

output.head()

In [None]:
# Write the predictions to a CSV file in the current directory, without the index column.
output_filename = "test_prediction_output.csv"
output.to_csv(output_filename, index=False)

# References


*   [KerasTuner + TF Decision Forest](https://www.kaggle.com/ekaterinadranitsyna/kerastuner-tf-decision-forest?linkId=133421702) by [Ekaterina Dranitsyna](https://www.kaggle.com/ekaterinadranitsyna)
*   [TensorFlow Decision Forests tutorials](https://www.tensorflow.org/decision_forests/tutorials) which are a set of 3 very interesting (beginner, intermediate and advanced levels) tutorials.
*   The [TensorFlow Forum](https://discuss.tensorflow.org/) where one can get in touch with the TensorFlow community. Check it out if you haven't yet.
