# **Random Forest (ML-CUP22)**

In [None]:
# Install packages
!pip install tensorflow_decision_forests
!pip install dtreeviz
!pip install keras-tuner -U -qq

In [None]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from statistics import mean, stdev

# Decision Forest
import tensorflow as tf
import tensorflow_decision_forests as tfdf

# Visualize the decision forest
import dtreeviz

# keras_tuner for GridSearch
import keras_tuner as kt

In [None]:
# Mount Google Drive at /content/drive so the data files stored on Drive can be read.
# NOTE(review): this cell relies on the `google.colab` package, so it presumably only
# works when the notebook runs on Colab — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

**Definition of Functions**

In [None]:
## Definition of Mean Euclidean Error (MEE): metric used for performance evaluation of the model
def MEE(y_true, y_pred):
  """Mean Euclidean Error between targets and predictions.

  The Euclidean norm of `y_true - y_pred` is taken along axis 1 and the
  resulting per-row values are averaged.

  Args:
    y_true: ground-truth values (must have an axis 1, since the norm is taken along it).
    y_pred: predicted values, laid out like `y_true`.

  Returns:
    The mean of the per-row Euclidean norms, as a tensor.
  """
  residuals = y_true - y_pred
  row_errors = tf.norm(residuals, ord='euclidean', axis=1)
  return tf.reduce_mean(row_errors)

---
## **Data Preparation**

In [None]:
# Load the ML-CUP training dataset from Drive.
# Column names are supplied explicitly: nine inputs (a1..a9) followed by the two targets (x, y).
# Anything after a '#' on a line is treated as a comment and skipped by the parser.
col_names = [f'a{i}' for i in range(1, 10)] + ['x', 'y']
path = '/content/drive/MyDrive/data/Data_CUP/ML-CUP22-TR.csv'

data = pd.read_csv(path, comment='#', names=col_names)
data.shape

In [None]:
# Preview the first 5 rows of the loaded frame
data.head(n=5)

In [None]:
# Per-column flag: True if the column contains at least one missing (NaN) value
data.isna().any()

In [None]:
# Step 1: hold out 15% of the rows as test set; the remaining 85% form the design set.
# Shuffling uses a fixed seed so the partition is the same on every run.
design, test = train_test_split(
    data,
    test_size=0.15,
    random_state=42,
    shuffle=True,
)
print(f'design shape: {design.shape}')
print(f'test shape: {test.shape}')

# Step 2: split the design set into train (80%) and validation (20%), same seed.
train, val = train_test_split(
    design,
    test_size=0.20,
    random_state=42,
    shuffle=True,
)
print(f'train shape: {train.shape}')
print(f'val shape: {val.shape}')

In [None]:
# Descriptive statistics (pandas `describe`) computed on the design set only;
# the held-out test rows are not part of `design`.
design.describe()

In [None]:
# For every partition, separate the inputs (all columns except the last two)
# from the two regression targets ('x' and 'y').

# Design set
X_design = design.iloc[:, :-2]
y_design = design[['x', 'y']]
print(f'X_design shape: {X_design.shape}')
print(f'y_design shape: {y_design.shape}')
print()

# Training set
X_train = train.iloc[:, :-2]
y_train = train[['x', 'y']]
print(f'X_train shape: {X_train.shape}')
print(f'y_train shape: {y_train.shape}')
print()

# Validation set
X_val = val.iloc[:, :-2]
y_val = val[['x', 'y']]
print(f'X_val shape: {X_val.shape}')
print(f'y_val shape: {y_val.shape}')
print()

# Test set
X_test = test.iloc[:, :-2]
y_test = test[['x', 'y']]
print(f'X_test shape: {X_test.shape}')
print(f'y_test shape: {y_test.shape}')

In [None]:
# RandomForestModel is fed tf.data.Dataset objects.
# For multitask training, the label part of each dataset (its second element)
# is a dictionary mapping label_key -> label_values.

# Design set
y_design_dict = {label: y_design[label].values for label in ('x', 'y')}
design_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_design, y_design_dict))
    .batch(512)
)

# Training set
y_train_dict = {label: y_train[label].values for label in ('x', 'y')}
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_train, y_train_dict))
    .batch(512)
)

# Validation set
y_val_dict = {label: y_val[label].values for label in ('x', 'y')}
val_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_val, y_val_dict))
    .batch(512)
)

---
## **Preliminary Experimental Phase**

In [None]:
# Multitask regression: one regression task per target label, so a single
# model is trained with multiple outputs ('x' and 'y').
mulitask = [
    tfdf.keras.MultiTaskItem(label=target, task=tfdf.keras.Task.REGRESSION)
    for target in ('x', 'y')
]

In [None]:
# Baseline Random Forest for the preliminary experiment (hand-picked settings)
model = tfdf.keras.RandomForestModel(
    multitask=mulitask,
    winner_take_all=False,
    max_depth=30,
    num_trees=300,
)

# Attach the evaluation metrics: the custom MEE plus mean squared error
model.compile(metrics=[MEE, 'mse'])

# Fit on the training split only
model.fit(train_dataset)

# Print the summary of the trained model
model.summary()

In [None]:
# Evaluate the compiled metrics on the validation set; results come back as {metric_name: value}
evaluation = model.evaluate(val_dataset, return_dict=True)
print()

# One line per metric, rounded to 4 decimals
for metric_name, metric_value in evaluation.items():
  print(f"{metric_name}: {metric_value:.4f}")

In [None]:
# Predict the validation inputs. The multitask model returns a mapping
# key -> prediction array; each array is flattened into one DataFrame column.
val_pred = pd.DataFrame(
    {label: values.ravel() for label, values in model.predict(X_val.values).items()}
)

In [None]:
# Mean Euclidean Error on the validation set, computed jointly over both targets.
# Both frames are converted to double-precision tensors before the comparison.
y_pred = tf.convert_to_tensor(val_pred, dtype=tf.double)
y_true = tf.convert_to_tensor(y_val, dtype=tf.double)

mee = MEE(y_true, y_pred)
print(mee)

---
## **GridSearch**

### **Coarse-grained GridSearch**

In [None]:
# Multitask regression: one regression task per target label ('x' and 'y')
mulitask = [
    tfdf.keras.MultiTaskItem(label=target, task=tfdf.keras.Task.REGRESSION)
    for target in ('x', 'y')
]

In [None]:
def build_model(hp):
  """Build a compiled multitask Random Forest for the coarse-grained grid search.

  Args:
    hp: hyper-parameter container handed in by Keras Tuner; its `Choice` and
      `Boolean` methods declare the search space and return the value to use
      for the current trial.

  Returns:
    A `tfdf.keras.RandomForestModel` compiled with the MEE metric.
  """
  # Hyper-parameters are declared in a fixed order (dict literals evaluate top to bottom).
  search_space = {
      "min_examples": hp.Choice("min_examples", [2, 5, 10]),
      "categorical_algorithm": hp.Choice("categorical_algorithm", ["CART", "RANDOM"]),
      "max_depth": hp.Choice("max_depth", [5, 15, 30]),
      "num_candidate_attributes_ratio": hp.Choice("num_candidate_attributes_ratio", [-1.0, 0.2, 0.5]),
      "num_trees": hp.Choice("num_trees", [100, 500, 1000]),
      "winner_take_all": hp.Boolean("winner_take_all"),
  }
  forest = tfdf.keras.RandomForestModel(**search_space, multitask=mulitask)

  # MEE is the metric the tuner reads back on the validation dataset
  forest.compile(metrics=[MEE])
  return forest

In [None]:
%%time
# Define Keras Tuner
keras_tuner = kt.GridSearch(
    build_model,
    # Minimizing the sum of all the objectives to minimize
    objective=[kt.Objective("val_x_MEE", direction="min"),
               kt.Objective("val_y_MEE", direction="min")],
    max_consecutive_failed_trials=1,
    overwrite=True
    )

# Tune the model
keras_tuner.search(train_dataset, validation_data=val_dataset)

In [None]:
# Print the tuner's summary of the coarse-grained search results
keras_tuner.results_summary()

### **Fine-grained GridSearch**

In [None]:
def build_model(hp):
  """Build a compiled multitask Random Forest for the fine-grained grid search.

  The attribute ratio is pinned to 0.2 via `hp.Fixed`; the remaining
  hyper-parameters are searched over narrower value lists.

  Args:
    hp: hyper-parameter container handed in by Keras Tuner; its `Choice` and
      `Fixed` methods declare the search space and return the value to use
      for the current trial.

  Returns:
    A `tfdf.keras.RandomForestModel` compiled with the MEE metric.
  """
  # Hyper-parameters are declared in a fixed order (dict literals evaluate top to bottom).
  search_space = {
      "min_examples": hp.Choice("min_examples", [1, 2, 3]),
      "max_depth": hp.Choice("max_depth", [15, 20, 25, 30]),
      "num_candidate_attributes_ratio": hp.Fixed("num_candidate_attributes_ratio", 0.2),
      "num_trees": hp.Choice("num_trees", [500, 750, 1000]),
  }
  forest = tfdf.keras.RandomForestModel(**search_space, multitask=mulitask)

  # MEE is the metric the tuner reads back on the validation dataset
  forest.compile(metrics=[MEE])
  return forest

In [None]:
%%time
# Define Keras Tuner
keras_tuner = kt.GridSearch(
    build_model,
    # we will minimize the sum of all the objectives to minimize
    objective=[kt.Objective("val_x_MEE", direction="min"),
               kt.Objective("val_y_MEE", direction="min")],
    max_consecutive_failed_trials=1,
    overwrite=True
    )

# Tune the model
keras_tuner.search(train_dataset, validation_data=val_dataset)

In [None]:
# Print the tuner's summary of the fine-grained search results
keras_tuner.results_summary()

In [None]:
# Retrieve the hyper-parameter sets of the 5 best trials and list them
top5_hps = keras_tuner.get_best_hyperparameters(5)

for rank, candidate in enumerate(top5_hps):
  print(f"model{rank}: {candidate.values}")

In [None]:
# Best model: entry 0 of the top-5 list is taken as the winner
# (presumably the tuner returns them best-first — confirm against the KerasTuner docs).
# `.values` is the plain name -> value mapping of that hyper-parameter set.
best_hps = top5_hps[0].values
print("Best hyper-parameters:", best_hps)

---
## **K-Fold Cross-Validation of the Best Model**

In [None]:
# 5-fold cross-validation of the best hyper-parameters on the design set.
#
# Every per-fold object uses a `fold_*` name on purpose: the notebook-level
# `train_dataset`, `X_val`, `y_val`, `model`, ... created by earlier cells are
# left untouched, so re-running an earlier cell (e.g. a grid search) after this
# one cannot silently pick up the data of the last fold.

# Validation MEE on the individual folds
MEE_per_fold = []

# Define the K-fold cross validator; the fixed seed makes the fold membership
# (and therefore the reported mean / stdev) reproducible across runs
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Run the 5-fold cross-validation
for fold_idx, (train_indices, val_indices) in enumerate(kfold.split(design)):
  print(f"Running fold {fold_idx+1}")

  # Inputs are all columns but the last two; targets are the last two ('x', 'y')
  fold_X_train = design.iloc[train_indices, :-2]
  fold_y_train = design.iloc[train_indices, -2:]
  fold_X_val = design.iloc[val_indices, :-2]
  fold_y_val = design.iloc[val_indices, -2:]

  # Multitask training expects the labels as a dictionary label_key -> label_values
  fold_y_train_dict = {label: fold_y_train[label].values for label in ('x', 'y')}
  fold_train_dataset = tf.data.Dataset.from_tensor_slices(
      (fold_X_train, fold_y_train_dict)
  ).batch(64)

  # Multitask regression (one regression task per target label)
  fold_tasks = [
      tfdf.keras.MultiTaskItem(label=t, task=tfdf.keras.Task.REGRESSION)
      for t in ['x', 'y']
  ]

  # Fresh model per fold, configured with the best hyper-parameters
  fold_model = tfdf.keras.RandomForestModel(**best_hps, multitask=fold_tasks)
  fold_model.compile(metrics=[MEE])
  fold_model.fit(fold_train_dataset, verbose=False)

  # Predict the held-out fold and compute the joint MEE over both targets
  fold_pred = fold_model.predict(fold_X_val.values, verbose=0)
  fold_pred = pd.DataFrame({k: v.ravel() for k, v in fold_pred.items()})
  fold_y_pred = tf.convert_to_tensor(fold_pred, dtype=tf.double)
  fold_y_true = tf.convert_to_tensor(fold_y_val, dtype=tf.double)
  fold_mee = float(MEE(fold_y_true, fold_y_pred))
  print(f"MEE: {fold_mee}")

  MEE_per_fold.append(fold_mee)


# Aggregate over the folds
print(f"Mean: {mean(MEE_per_fold)}")
print(f"Stdev: {stdev(MEE_per_fold)}")

---
## **Model Assessment**

In [None]:
# Re-instantiate the best model with explicitly written hyper-parameters.
# NOTE(review): the values are hard-coded here rather than read from `best_hps`;
# presumably they mirror the winning grid-search configuration — confirm they
# still match whenever the search is re-run.
best_model = tfdf.keras.RandomForestModel(
    min_examples=2,
    max_depth=20,
    num_candidate_attributes_ratio=0.2,
    num_trees=500,
    multitask=mulitask,
)

In [None]:
# Re-train the model on the whole design set (training and validation rows together)
best_model.fit(design_dataset)

In [None]:
# Print the summary of the re-trained best model
best_model.summary()

In [None]:
# MEE on the design set, i.e. on the same data the model was re-trained on.
# Each prediction array returned by the multitask model is flattened into one column.
design_pred = pd.DataFrame(
    {label: values.ravel()
     for label, values in best_model.predict(X_design.values, verbose=0).items()}
)
y_pred_design = tf.convert_to_tensor(design_pred, dtype=tf.double)
y_true = tf.convert_to_tensor(y_design, dtype=tf.double)

mee_design = float(MEE(y_true, y_pred_design))
print(f'Design MEE: {mee_design}')

In [None]:
# MEE on the held-out test set.
# Each prediction array returned by the multitask model is flattened into one column.
test_pred = pd.DataFrame(
    {label: values.ravel()
     for label, values in best_model.predict(X_test.values, verbose=0).items()}
)
y_pred_TS = tf.convert_to_tensor(test_pred, dtype=tf.double)
y_true = tf.convert_to_tensor(y_test, dtype=tf.double)

mee_test = float(MEE(y_true, y_pred_TS))
print(f"Test MEE: {mee_test}")

## **Prediction Plots**

In [None]:
# y prediction: true (x, y) targets in black, predicted y against the true x in green.
# Column 1 of `y_pred_TS` is assumed to hold the 'y' predictions — TODO confirm column order.
plt.figure(figsize=(8, 6))
sns.set_theme(style="darkgrid")
ax = sns.scatterplot(x=y_test['x'], y=y_test['y'], color='k', label='y_true')
sns.scatterplot(x=y_test['x'], y=y_pred_TS[:, 1], color='g', label='y_pred', ax=ax)
ax.set_title('Random Forest y prediction', fontsize=14)
ax.set_xlabel('x', fontsize=14)
ax.set_ylabel('y', fontsize=14)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
ax.legend()
plt.show()

In [None]:
# x prediction: true (x, y) targets in black, predicted x against the true y in green.
# Column 0 of `y_pred_TS` is assumed to hold the 'x' predictions — TODO confirm column order.
plt.figure(figsize=(8, 6))
sns.set_theme(style="darkgrid")
ax = sns.scatterplot(x=y_test['x'], y=y_test['y'], color='k', label='x_true')
sns.scatterplot(x=y_pred_TS[:, 0], y=y_test['y'], color='g', label='x_pred', ax=ax)
ax.set_title('Random Forest x prediction', fontsize=14)
ax.set_xlabel('x', fontsize=14)
ax.set_ylabel('y', fontsize=14)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
ax.legend(fontsize=14)
plt.show()