# Imports

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston, load_diabetes, make_regression
from sklearn.metrics import mean_absolute_error as mae_score
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor as SkBoosting
from catboost import Pool, CatBoostRegressor
import os, sys
import time

## Import JIT trees module

In [2]:
sys.path.append('..')  # as the module is created in the upper directory
import JITtrees

## Create a placeholder for the pandas data frame that will hold the results

In [3]:
# No results yet: evaluation_pipeline returns a fresh frame when it is given None
res_df = None

# Define pipeline
1. Split dataset into train and test
2. Fit model
3. Evaluate

In [4]:
# Default settings read by evaluation_pipeline; every key is looked up by name there.
default_options = dict(
    # forwarded to the JITtrees.Boosting constructor
    min_bins=16,
    max_bins=256,
    patience=4,
    # shared by all three models (JITtrees, scikit-learn, CatBoost)
    tree_count=200,
    tree_depth=4,
    learning_rate=0.2,
    # forwarded to JITtrees fit() only
    feature_fold_size=0.8,  # multiplied by the feature count and rounded up
    es_delta=1e-5,
    batch_part=0.8,
    use_jit=False,
    jit_type=0,
    random_batches=True,
)


def _timed_call(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` and return the elapsed wall-clock seconds.

    ``time.perf_counter`` is used rather than ``time.time`` because it is
    monotonic and has the highest available resolution, which matters for
    the sub-second fits measured in this notebook.
    """
    started = time.perf_counter()
    func(*args, **kwargs)
    return time.perf_counter() - started


def evaluation_pipeline(dataset_name, x_all, y_all, df, options=default_options):
    """Benchmark JITtrees boosting against scikit-learn and CatBoost on one dataset.

    The data is split 80/20 with a fixed seed, each of the three models is
    fitted on the train part (fit time is measured), and the mean absolute
    error on the test part is computed. Progress is printed along the way.

    Args:
        dataset_name: Label stored in the 'Dataset' column of the result row.
        x_all: 2-D feature array; ``shape[0]`` and ``shape[1]`` are recorded
            as 'Data size' and 'Data dim'.
        y_all: Target values aligned with the rows of ``x_all``.
        df: Results accumulated so far, or ``None`` for the first call.
        options: Mapping with the keys 'min_bins', 'max_bins', 'patience',
            'tree_count', 'tree_depth', 'feature_fold_size', 'learning_rate',
            'es_delta', 'batch_part', 'use_jit', 'jit_type' and
            'random_batches'. It is only read, never modified.

    Returns:
        A new ``pandas.DataFrame``: the one-row result frame when ``df`` is
        ``None``, otherwise ``df`` with the new row concatenated at the end
        (row labels are renumbered 0..n-1). ``df`` itself is not modified.
    """
    # split dataset into train and test (fixed seed so every call uses the same split)
    x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=0.2, random_state=12)
    print(f"Train length: {y_train.shape[0]}, test length: {y_test.shape[0]}")

    # fit model
    # 'feature_fold_size' is a fraction of the feature count; fit() receives it
    # as an integer, rounded up
    feature_fold = int(np.ceil(x_train.shape[1] * options['feature_fold_size']))
    model = JITtrees.Boosting(options['min_bins'], options['max_bins'], options['patience'])
    # NOTE(review): the test split is also handed to fit(), presumably as the
    # validation set for early stopping (see 'patience' / 'es_delta'). If so, the
    # JITtrees MAE below is measured on data the training loop has already seen,
    # unlike the two reference models - confirm against the JITtrees API.
    exec_time = _timed_call(
        model.fit, x_train, y_train, x_test, y_test, options['tree_count'],
        options['tree_depth'], feature_fold,
        options['learning_rate'], options['es_delta'],
        options['batch_part'], options['use_jit'], options['jit_type'], options['random_batches'])
    print(f"Fit time: {exec_time} seconds")

    # evaluate
    mae = mae_score(y_test, model.predict(x_test))

    # compare to Scikit-learn model
    sk_model = SkBoosting(learning_rate=options['learning_rate'],
                          max_iter=options['tree_count'],
                          max_depth=options['tree_depth'])
    sk_fit_time = _timed_call(sk_model.fit, x_train, y_train)
    sk_mae = mae_score(y_test, sk_model.predict(x_test))

    # compare to CatBoost model
    cb_model = CatBoostRegressor(iterations=options['tree_count'],
                                 learning_rate=options['learning_rate'],
                                 verbose=False,
                                 depth=options['tree_depth'])
    cb_fit_time = _timed_call(cb_model.fit, x_train, y_train, verbose=False)
    cb_mae = mae_score(y_test, cb_model.predict(x_test))

    print(f"MAE score: {mae}, Sklearn model MAE: {sk_mae}")
    print(f"CatBoost model MAE: {cb_mae}")
    print(f"Sklearn model fit time: {sk_fit_time} seconds")
    print(f"CatBoost model fit time: {cb_fit_time} seconds")

    # one-row frame holding the results of this call (column order is the dict order)
    cur_df = pd.DataFrame(data={
        'Dataset': dataset_name,
        'Data size': x_all.shape[0],
        'Data dim': x_all.shape[1],
        'MAE': mae,
        'MAE Sklearn': sk_mae,
        'MAE CatBoost': cb_mae,
        'Fit time': exec_time,
        'Fit time Sklearn': sk_fit_time,
        'Fit time CatBoost': cb_fit_time
    }, index=[0])

    if df is None:
        return cur_df
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    # ignore_index=True renumbers the rows so they no longer all carry label 0.
    return pd.concat([df, cur_df], ignore_index=True)

# Boston dataset
## Get dataset

In [5]:
# Load the Boston dataset as a (features, target) pair
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 -
# confirm the environment pins a version that still provides it
x_all, y_all = load_boston(return_X_y=True)
print(f"Dataset size: {len(y_all)}")
print(f"Feature count: {x_all.shape[1]}")

Dataset size: 506
Feature count: 13


## Evaluate

In [6]:
# Benchmark the three models on Boston with the default options and keep the result row
res_df = evaluation_pipeline(dataset_name='Boston', x_all=x_all, y_all=y_all, df=res_df)

Train length: 404, test length: 102
Fit time: 0.22744965553283691 seconds
MAE score: 2.10174363458832, Sklearn model MAE: 2.527354851153334
CatBoost model MAE: 2.2150299019712345
Sklearn model fit time: 0.4393606185913086 seconds
CatBoost model fit time: 1.1615710258483887 seconds


# Diabetes dataset
## Get dataset

In [7]:
# Load the Diabetes dataset as a (features, target) pair
x_all, y_all = load_diabetes(return_X_y=True)
print(f"Dataset size: {len(y_all)}")
print(f"Feature count: {x_all.shape[1]}")

Dataset size: 442
Feature count: 10


## Evaluate

In [8]:
# Benchmark the three models on Diabetes with the default options and keep the result row
res_df = evaluation_pipeline(dataset_name='Diabetes', x_all=x_all, y_all=y_all, df=res_df)

Train length: 353, test length: 89
Fit time: 0.04387927055358887 seconds
MAE score: 48.00637314495751, Sklearn model MAE: 53.389011650016585
CatBoost model MAE: 49.45795579850272
Sklearn model fit time: 0.4109036922454834 seconds
CatBoost model fit time: 0.3739969730377197 seconds


# Synthetic regression dataset generated with scikit-learn's `make_regression`
## Generate dataset

In [13]:
# Generate a synthetic regression problem; the fixed random_state keeps it reproducible
x_all, y_all = make_regression(
    n_samples=1000,
    n_features=200,
    n_informative=150,
    n_targets=1,
    bias=10.0,
    noise=3.0,
    shuffle=True,
    random_state=12,
)
print(f"Dataset size: {len(y_all)}")
print(f"Feature count: {x_all.shape[1]}")

Dataset size: 1000
Feature count: 200


## Evaluate

In [14]:
# Benchmark the three models on the generated data with the default options and keep the result row
res_df = evaluation_pipeline(dataset_name='Make regression', x_all=x_all, y_all=y_all, df=res_df)

Train length: 800, test length: 200
Fit time: 24.71819281578064 seconds
MAE score: 460.6287593804577, Sklearn model MAE: 461.3837396763176
CatBoost model MAE: 447.4196013195277
Sklearn model fit time: 2.2539684772491455 seconds
CatBoost model fit time: 2.1660587787628174 seconds


# Superconductivity dataset
### Link:
https://archive.ics.uci.edu/ml/datasets/Superconductivty+Data
## Get dataset

In [15]:
# location of the training CSV, relative to the notebook's working directory
data_dir = os.path.join('datasets', 'superconduct')
data_csv = 'train.csv'

# read the whole table into a data frame
all_data = pd.read_csv(os.path.join(data_dir, data_csv))

# separate the target column from the feature columns
label_name = 'critical_temp'
labels_df = all_data[label_name]  # target
features_df = all_data.drop(columns=label_name)  # features: every other column

# convert both parts to numpy arrays for the evaluation pipeline
y_all = labels_df.to_numpy()
x_all = features_df.to_numpy()

print(f"Dataset size: {len(y_all)}")
print(f"Feature count: {x_all.shape[1]}")

Dataset size: 21263
Feature count: 81


## Evaluate

In [16]:
# Settings for the Superconductivity run; same keys as the default options,
# with different values for min_bins, tree_count, tree_depth, es_delta and batch_part.
# NOTE(review): the recorded run of this cell reports a JITtrees MAE of about 1.5e+250
# while both reference models stay near 6, so the JITtrees fit appears to diverge
# with these settings - investigate before relying on this row.
supercond_options = dict(
    min_bins=64,
    max_bins=256,
    patience=4,
    tree_count=300,
    tree_depth=6,
    feature_fold_size=0.8,
    learning_rate=0.2,
    es_delta=1e-6,
    batch_part=0.5,
    use_jit=False,
    jit_type=0,
    random_batches=True,
)
res_df = evaluation_pipeline(dataset_name='Superconductivity', x_all=x_all, y_all=y_all,
                             df=res_df, options=supercond_options)

Train length: 17010, test length: 4253
Fit time: 34.77028179168701 seconds
MAE score: 1.5071374856171906e+250, Sklearn model MAE: 5.646929980989992
CatBoost model MAE: 6.205642193901763
Sklearn model fit time: 4.592872142791748 seconds
CatBoost model fit time: 5.426388740539551 seconds


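# Summary
**Note:** the hand-written table below does not match the outputs recorded above (for example, Boston MAE is 2.10 in the cell output but 2.35 here), and it omits the CatBoost and Superconductivity results. The `res_df` output at the end of this section holds the figures produced by the recorded run.
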
|Dataset|MAE|Sklearn MAE|fit time|Sklearn fit time|
|-------|---|-----------|--------|----------------|
| Boston | 2.35 | 2.53 | 0.25 | 0.39 |
| Diabetes | 48.11 | 53.39 | 0.18 | 0.43 |
| Generated regression (1000, 200) | 490.7 | 512.3 | 110.7 | 4.62 |
| Generated regression (1000, 100) | 279.1 | 282.8 | 21.2 | 1.27 |

In [17]:
# Display the accumulated results: one row per evaluation_pipeline call
res_df

Unnamed: 0,Dataset,Data size,Data dim,MAE,MAE Sklearn,MAE CatBoost,Fit time,Fit time Sklearn,Fit time CatBoost
0,Boston,506,13,2.101744,2.527355,2.21503,0.22745,0.439361,1.161571
0,Diabetes,442,10,48.00637,53.389012,49.457956,0.043879,0.410904,0.373997
0,Make regression,1000,100,259.3012,282.795242,255.629052,12.968372,1.297237,1.182057
0,Make regression,1000,200,460.6288,461.38374,447.419601,24.718193,2.253968,2.166059
0,Superconductivity,21263,81,1.507137e+250,5.64693,6.205642,34.770282,4.592872,5.426389
