# Deep Neural Decision Forests

Even though deep learning has attained tremendous success on data domains such as images, audio, and text,
gradient-boosted decision trees (GBDT) still rule the domain of tabular data.

In this note we will discuss [Deep Neural Decision Forests](https://ieeexplore.ieee.org/document/7410529) for tabular deep learning.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers as L

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
import joblib


# Data

In [2]:
# Load the training table; it holds the `id` column, the input features and the
# `song_popularity` target (split apart in a later cell).
data = pd.read_csv('../input/song-popularity-prediction/train.csv')
# Print the (rows, columns) shape, then let the notebook render the first rows.
print(data.shape)
data.head()

(40000, 15)


Unnamed: 0,id,song_duration_ms,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,audio_mode,speechiness,tempo,time_signature,audio_valence,song_popularity
0,0,212990.0,0.642286,0.85652,0.707073,0.002001,10.0,,-5.619088,0,0.08257,158.386236,4,0.734642,0
1,1,,0.054866,0.733289,0.835545,0.000996,8.0,0.436428,-5.236965,1,0.127358,102.752988,3,0.711531,1
2,2,193213.0,,0.188387,0.783524,-0.002694,5.0,0.170499,-4.951759,0,0.052282,178.685791,3,0.425536,0
3,3,249893.0,0.48866,0.585234,0.552685,0.000608,0.0,0.094805,-7.893694,0,0.035618,128.71563,3,0.453597,0
4,4,165969.0,0.493017,,0.740982,0.002033,10.0,0.094891,-2.684095,0,0.050746,121.928157,4,0.741311,0


In [3]:
# Read the test table and keep every column except the `id` identifier as model input.
test = pd.read_csv('../input/song-popularity-prediction/test.csv')
X_test = test.drop(columns=['id'])

In [4]:
# Target column first, then the feature frame (identifier and target removed).
y = data['song_popularity']
X = data.drop(columns=['id', 'song_popularity'])

In [5]:
# Stratified 5-fold splitter; shuffling with a fixed random_state keeps the
# fold assignment identical from run to run.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Model

In [6]:
class NeuralDecisionTree(keras.Model):
    """A single differentiable (soft) decision tree.

    Every internal node routes a sample left/right with a sigmoid probability
    computed from a random subset of the input features; every leaf holds a
    learnable class distribution. The prediction is the average of the leaf
    distributions weighted by the probability of reaching each leaf.

    Args:
        depth: Number of routing levels; the tree has ``2 ** depth`` leaves.
        num_features: Width of the input feature vectors.
        used_features_rate: Fraction of the input features this tree sees.
            At least one feature is always kept.
        num_classes: Number of output classes.
    """

    def __init__(self, depth, num_features, used_features_rate, num_classes):
        super().__init__()
        self.depth = depth
        self.num_leaves = 2 ** depth
        self.num_classes = num_classes

        # int() truncates toward zero, so a small rate could select zero
        # features and leave the tree with nothing but biases to route on.
        # Always keep at least one feature.
        num_used_features = max(1, int(num_features * used_features_rate))
        one_hot = np.eye(num_features)
        sampled_feature_indicies = np.random.choice(
            np.arange(num_features), num_used_features, replace=False
        )
        # Rows of the identity matrix for the sampled features; multiplying the
        # input by its transpose selects exactly those feature columns.
        self.used_features_mask = one_hot[sampled_feature_indicies]

        # Per-leaf class logits. Registered through add_weight (rather than a
        # bare tf.Variable attribute) so that the weight is tracked as
        # trainable regardless of the Keras version in use.
        self.pi = self.add_weight(
            name="pi",
            shape=(self.num_leaves, self.num_classes),
            initializer="random_normal",
            dtype="float32",
            trainable=True,
        )

        # One sigmoid unit per node slot. Only slots 1 .. num_leaves - 1 are
        # read by call(); slot 0 is never used.
        self.decision_fn = L.Dense(
            units=self.num_leaves, activation="sigmoid", name="decision"
        )

    def call(self, features):
        """Return class probabilities of shape ``[batch_size, num_classes]``."""
        batch_size = tf.shape(features)[0]

        # Keep only the feature columns sampled for this tree.
        features = tf.matmul(
            features, self.used_features_mask, transpose_b=True
        )
        # Routing probability of every node: [batch_size, num_leaves, 1].
        decisions = tf.expand_dims(
            self.decision_fn(features), axis=2
        )
        # Pair each probability with its complement: [batch_size, num_leaves, 2].
        decisions = L.concatenate(
            [decisions, 1 - decisions], axis=2
        )

        # Probability of reaching the root is 1 for every sample.
        mu = tf.ones([batch_size, 1, 1])

        # Nodes are laid out level by level: level 0 uses slot [1, 2),
        # level 1 uses [2, 4), level 2 uses [4, 8), and so on.
        begin_idx = 1
        end_idx = 2

        for level in range(self.depth):
            # Flatten the nodes reached so far, duplicate each one for its two
            # children, and multiply by the branch probabilities of this level.
            mu = tf.reshape(mu, [batch_size, -1, 1])
            mu = tf.tile(mu, (1, 1, 2))
            level_decisions = decisions[
                :, begin_idx:end_idx, :
            ]
            mu = mu * level_decisions
            begin_idx = end_idx
            end_idx = begin_idx + 2 ** (level + 1)

        # Leaf-reaching probabilities: [batch_size, num_leaves].
        mu = tf.reshape(mu, [batch_size, self.num_leaves])
        # Turn the leaf logits into class distributions and mix them by mu.
        probabilities = keras.activations.softmax(self.pi)
        outputs = tf.matmul(mu, probabilities)
        return outputs
    
class NeuralDecisionForest(keras.Model):
    """Ensemble of ``NeuralDecisionTree`` models whose outputs are averaged.

    Args:
        num_trees: Number of trees in the ensemble.
        depth: Depth passed to every tree.
        num_features: Width of the input feature vectors.
        used_features_rate: Fraction of features passed to every tree.
        num_classes: Number of output classes.
    """

    def __init__(self, num_trees, depth, num_features, used_features_rate, num_classes):
        super().__init__()
        self.num_classes = num_classes
        # Each tree is built with the same hyper-parameters; the trees differ
        # through their own random feature subset and initial weights.
        self.ensemble = [
            NeuralDecisionTree(depth, num_features, used_features_rate, num_classes)
            for _ in range(num_trees)
        ]

    def call(self, inputs):
        """Return the mean of the per-tree outputs, shape ``[batch_size, num_classes]``."""
        n_rows = tf.shape(inputs)[0]
        total = tf.zeros([n_rows, self.num_classes])

        for member in self.ensemble:
            total = total + member(inputs)

        return total / len(self.ensemble)

In [7]:
class model_config:
    """Column groups used when preprocessing the feature frame."""

    # Continuous-valued columns.
    NUMERIC_FEATURE_NAMES = [
        'song_duration_ms',
        'acousticness',
        'danceability',
        'energy',
        'instrumentalness',
        'liveness',
        'loudness',
        'speechiness',
        'tempo',
        'audio_valence',
    ]
    # Discrete-coded columns.
    CATEGORICAL_FEATURE_NAMES = [
        'key',
        'audio_mode',
        'time_signature',
    ]

# Upper bound on the number of training epochs passed to `fit`.
MAX_EPOCHS = 250


def get_callbacks():
    """Build a fresh list of Keras callbacks for one training run.

    A new list is created on every call so that callback state is not shared
    between separate `fit` calls.
    """
    early_stopping = keras.callbacks.EarlyStopping(
        min_delta=1e-4, patience=10, verbose=1, restore_best_weights=True
    )
    reduce_lr = keras.callbacks.ReduceLROnPlateau(patience=3, verbose=1)
    return [early_stopping, reduce_lr]

# Training

In [8]:
# Per-fold test-set predictions of the single tree and of the forest.
preds_tree = []
preds_forest = []


def _fit_preprocessor(train_frame):
    """Fit the preprocessing on one training fold and return a transform function.

    Numeric columns are median-imputed and then standardized; categorical
    columns are imputed with their most frequent value and passed through as
    numeric codes. All statistics come from ``train_frame`` only, so nothing
    from the validation or test rows leaks into the fit.

    Returns:
        A function mapping a DataFrame to a 2-D array whose columns are the
        numeric features followed by the categorical features.
    """
    numeric_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        # Without scaling, columns such as `song_duration_ms` (hundreds of
        # thousands) saturate the sigmoid routing units of the trees.
        ('scaler', StandardScaler()),
    ]).fit(train_frame[model_config.NUMERIC_FEATURE_NAMES])
    mode_imputer = SimpleImputer(strategy='most_frequent').fit(
        train_frame[model_config.CATEGORICAL_FEATURE_NAMES]
    )

    def transform(frame):
        return np.hstack((
            numeric_pipeline.transform(frame[model_config.NUMERIC_FEATURE_NAMES]),
            mode_imputer.transform(frame[model_config.CATEGORICAL_FEATURE_NAMES]),
        ))

    return transform


for fold, (train_index, valid_index) in enumerate(skf.split(X, y)):
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    # Fit on the training fold, then apply the same transform everywhere.
    preprocess = _fit_preprocessor(X_train)
    X_train = preprocess(X_train)
    X_valid = preprocess(X_valid)
    X_test_ = preprocess(X_test)

    # Use the width of the matrix actually fed to the models.
    num_features = X_train.shape[1]

    # Single soft decision tree.
    neural_decsion_tree = NeuralDecisionTree(
        depth=10, num_features=num_features, used_features_rate=0.8, num_classes=2
    )
    neural_decsion_tree.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=['accuracy'])
    neural_decsion_tree.fit(
        X_train, y_train, validation_data=(X_valid, y_valid), callbacks=get_callbacks(), epochs=MAX_EPOCHS
    )
    preds_tree.append(neural_decsion_tree.predict(X_test_))

    # Forest of ten trees with the same per-tree settings.
    neural_decsion_forest = NeuralDecisionForest(
        num_trees=10, depth=10, num_features=num_features, used_features_rate=0.8, num_classes=2
    )
    neural_decsion_forest.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=['accuracy'])
    neural_decsion_forest.fit(
        X_train, y_train, validation_data=(X_valid, y_valid), callbacks=get_callbacks(), epochs=MAX_EPOCHS
    )
    preds_forest.append(neural_decsion_forest.predict(X_test_))


User settings:

   KMP_AFFINITY=granularity=fine,verbose,compact,1,0
   KMP_BLOCKTIME=0
   KMP_DUPLICATE_LIB_OK=True
   KMP_INIT_AT_FORK=FALSE
   KMP_SETTINGS=1

Effective settings:

   KMP_ABORT_DELAY=0
   KMP_ADAPTIVE_LOCK_PROPS='1,1024'
   KMP_ALIGN_ALLOC=64
   KMP_ALL_THREADPRIVATE=128
   KMP_ATOMIC_MODE=2
   KMP_BLOCKTIME=0
   KMP_CPUINFO_FILE: value is not defined
   KMP_DETERMINISTIC_REDUCTION=false
   KMP_DEVICE_THREAD_LIMIT=2147483647
   KMP_DISP_NUM_BUFFERS=7
   KMP_DUPLICATE_LIB_OK=true
   KMP_ENABLE_TASK_THROTTLING=true
   KMP_FORCE_REDUCTION: value is not defined
   KMP_FOREIGN_THREADS_THREADPRIVATE=true
   KMP_FORKJOIN_BARRIER='2,2'
   KMP_FORKJOIN_BARRIER_PATTERN='hyper,hyper'
   KMP_GTID_MODE=3
   KMP_HANDLE_SIGNALS=false
   KMP_HOT_TEAMS_MAX_LEVEL=1
   KMP_HOT_TEAMS_MODE=0
   KMP_INIT_AT_FORK=true
   KMP_LIBRARY=throughput
   KMP_LOCK_KIND=queuing
   KMP_MALLOC_POOL_INCR=1M
   KMP_NUM_LOCKS_IN_BLOCK=1
   KMP_PLAIN_BARRIER='2,2'
   KMP_PLAIN_BARRIER_PATTERN='hyper,hype

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250

Epoch 00004: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 5/250
Epoch 6/250
Epoch 7/250

Epoch 00007: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 8/250
Epoch 9/250
Epoch 10/250

Epoch 00010: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
Epoch 11/250
Restoring model weights from the end of the best epoch.
Epoch 00011: early stopping
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250

Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250

Epoch 00015: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 16/250
Epoch 17/250
Epoch 18/250

Epoch 00018: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
Epoch 19/250
Epoch 20/250
Epoch 21/250

Epoch 00021: ReduceLROnPl

# Submissions

In [9]:
# Start from the sample submission and fill in the fold-averaged probability
# of class 1, writing one file per model. The tree file is written first, then
# the same frame is overwritten with the forest predictions.
submissions = pd.read_csv('../input/song-popularity-prediction/sample_submission.csv')
for csv_name, fold_preds in (('preds_tree.csv', preds_tree), ('preds_forest.csv', preds_forest)):
    positive_class = [fold_pred[:, 1] for fold_pred in fold_preds]
    submissions['song_popularity'] = np.mean(positive_class, axis=0)
    submissions.to_csv(csv_name, index=False)