# Imports

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston, load_diabetes, make_regression
from sklearn.metrics import mean_absolute_error as mae_score
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor as SkBoosting
from catboost import Pool, CatBoostRegressor
import os, sys
import time

## Import regbm module

In [2]:
sys.path.append('..')  # as the module is created in the upper directory
import regbm

## Initialize the results table (starts empty; each evaluation adds one row)

In [3]:
res_df = None  # no results yet; evaluation_pipeline returns a fresh frame when it is passed None

# Define pipeline
1. Split dataset into train and test
2. Fit model
3. Evaluate (MAE and fit time) and compare against the scikit-learn and CatBoost baselines

In [5]:
# Default keyword arguments for the regbm.Boosting constructor.
def_ctor_opts = dict(
    min_bins=16,
    max_bins=256,
    patience=4,
    no_early_stopping=False,
    thread_cnt=1,
)

# Default keyword arguments for Boosting.fit; tree_count, tree_depth and
# learning_rate are also reused to configure the baseline models.
def_fit_opts = dict(
    tree_count=500,
    tree_depth=3,
    feature_fold_size=1.0,
    learning_rate=0.15,
    regularization_param=0.15,
    early_stopping_delta=1e-5,
    batch_part=1.0,
    random_state=12,
    random_batches=False,
    random_hist_thresholds=True,
    remove_regularization_later=True,
)


def _fit_and_score(model, x_train, y_train, x_test, y_test, **fit_kwargs):
    """Fit a baseline model and return ``(test MAE, fit time in seconds)``.

    Only the ``fit`` call is timed; prediction and scoring are excluded.
    """
    start_time = time.perf_counter()
    model.fit(x_train, y_train, **fit_kwargs)
    fit_time = time.perf_counter() - start_time
    return mae_score(y_test, model.predict(x_test)), fit_time


def evaluation_pipeline(dataset_name, x_all, y_all, df, ctor_opts=def_ctor_opts, fit_opts=def_fit_opts):
    """Benchmark regbm against scikit-learn and CatBoost on one dataset.

    The data is split 80/20 with a fixed seed, each of the three models is
    fitted on the train part, and test MAE plus fit time are recorded.

    Parameters
    ----------
    dataset_name : str
        Label stored in the 'Dataset' column of the result row.
    x_all : 2-D array
        Feature matrix; its shape gives the 'Data size' and 'Data dim' columns.
    y_all : 1-D array
        Regression target.
    df : pandas.DataFrame or None
        Results collected so far; None starts a new table.
    ctor_opts : dict
        Keyword arguments forwarded to ``regbm.Boosting``.
    fit_opts : dict
        Keyword arguments forwarded to ``Boosting.fit``. Its 'tree_count',
        'tree_depth' and 'learning_rate' entries also configure the baselines.

    Returns
    -------
    pandas.DataFrame
        ``df`` with one new row for this dataset (or a one-row frame when
        ``df`` is None), indexed 0..n-1.
    """
    # split dataset into train and test
    x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=0.2, random_state=12)
    print(f"Train length: {y_train.shape[0]}, test length: {y_test.shape[0]}")

    # fit the regbm model
    # NOTE(review): the test split is also passed as the validation set. If regbm
    # uses it for early stopping, the regbm MAE is optimistically biased compared
    # to the baselines, which never see the test data - TODO confirm.
    model = regbm.Boosting(**ctor_opts)
    start_time = time.perf_counter()  # monotonic clock, unaffected by system clock changes
    history = model.fit(x_train=x_train, y_train=y_train, x_valid=x_test, y_valid=y_test, **fit_opts)
    exec_time = time.perf_counter() - start_time
    print(f"Fit time: {exec_time} seconds")
    print(f"Real tree number: {history.trees_number()}")

    # evaluate
    mae = mae_score(y_test, model.predict(x_test))

    # compare to Scikit-learn model
    sk_model = SkBoosting(learning_rate=fit_opts['learning_rate'],
                          max_iter=fit_opts['tree_count'],
                          max_depth=fit_opts['tree_depth'])
    sk_mae, sk_fit_time = _fit_and_score(sk_model, x_train, y_train, x_test, y_test)

    # compare to CatBoost model
    cb_model = CatBoostRegressor(iterations=fit_opts['tree_count'],
                                 learning_rate=fit_opts['learning_rate'],
                                 verbose=False,
                                 depth=fit_opts['tree_depth'])
    cb_mae, cb_fit_time = _fit_and_score(cb_model, x_train, y_train, x_test, y_test, verbose=False)

    print(f"MAE score: {mae}, Sklearn model MAE: {sk_mae}")
    print(f"CatBoost model MAE: {cb_mae}")
    print(f"Sklearn model fit time: {sk_fit_time} seconds")
    print(f"CatBoost model fit time: {cb_fit_time} seconds")

    # one-row frame with the results for this dataset
    cur_df = pd.DataFrame([{
        'Dataset': dataset_name,
        'Data size': x_all.shape[0],
        'Data dim': x_all.shape[1],
        'MAE': mae,
        'MAE Sklearn': sk_mae,
        'MAE CatBoost': cb_mae,
        'Fit time': exec_time,
        'Fit time Sklearn': sk_fit_time,
        'Fit time CatBoost': cb_fit_time,
    }])

    if df is None:
        return cur_df
    # DataFrame.append was removed in pandas 2.0; concat with ignore_index also
    # gives every row a distinct index instead of repeating 0.
    return pd.concat([df, cur_df], ignore_index=True)

# Boston dataset
## Get dataset

In [6]:
# Load the Boston data as a (features, target) pair of arrays.
# NOTE(review): load_boston was removed in scikit-learn 1.2, so this cell needs an
# older scikit-learn - confirm the version this notebook is pinned to.
x_all, y_all = load_boston(return_X_y=True)
print(
    f"Dataset size: {y_all.shape[0]}",
    f"Feature count: {x_all.shape[1]}",
    sep="\n",
)

Dataset size: 506
Feature count: 13


## Evaluate

In [7]:
# Benchmark on Boston with the default options and add the row to the results table.
res_df = evaluation_pipeline(dataset_name='Boston', x_all=x_all, y_all=y_all, df=res_df)

Train length: 404, test length: 102
Fit time: 0.06360411643981934 seconds
Real tree number: 32
MAE score: 2.3952834434792343, Sklearn model MAE: 2.5414756751278706
CatBoost model MAE: 2.23362872607141
Sklearn model fit time: 0.9142591953277588 seconds
CatBoost model fit time: 0.7542943954467773 seconds


# Diabetes dataset
## Get dataset

In [8]:
# Load the Diabetes data as a (features, target) pair of arrays.
x_all, y_all = load_diabetes(return_X_y=True)
print(
    f"Dataset size: {y_all.shape[0]}",
    f"Feature count: {x_all.shape[1]}",
    sep="\n",
)

Dataset size: 442
Feature count: 10


## Evaluate

In [9]:
# Benchmark on Diabetes with the default options and add the row to the results table.
res_df = evaluation_pipeline(dataset_name='Diabetes', x_all=x_all, y_all=y_all, df=res_df)

Train length: 353, test length: 89
Fit time: 0.13262701034545898 seconds
Real tree number: 69
MAE score: 47.64886493509642, Sklearn model MAE: 52.49889344154753
CatBoost model MAE: 51.33442751608146
Sklearn model fit time: 0.7265231609344482 seconds
CatBoost model fit time: 0.45449304580688477 seconds


# Synthetic dataset generated by scikit-learn's `make_regression`
## Generate dataset

In [9]:
# Synthetic regression problem with a fixed seed so the data is the same on every run.
x_all, y_all = make_regression(
    n_samples=1000,
    n_features=200,
    n_informative=150,
    n_targets=1,
    bias=10.0,
    noise=3.0,
    shuffle=True,
    random_state=12,
)
print(
    f"Dataset size: {y_all.shape[0]}",
    f"Feature count: {x_all.shape[1]}",
    sep="\n",
)

Dataset size: 1000
Feature count: 200


## Evaluate

In [10]:
# Benchmark on the synthetic data with the default options and add the row to the results table.
res_df = evaluation_pipeline(dataset_name='Make regression', x_all=x_all, y_all=y_all, df=res_df)

Train length: 800, test length: 200
Fit time: 23.627938747406006 seconds
Real tree number: 200
MAE score: 465.53869253219204, Sklearn model MAE: 461.3837396763176
CatBoost model MAE: 447.41960131952766
Sklearn model fit time: 2.1584689617156982 seconds
CatBoost model fit time: 2.165522336959839 seconds


# Superconductivity dataset
### Link:
https://archive.ics.uci.edu/ml/datasets/Superconductivty+Data
## Get dataset

In [11]:
# location of the CSV, relative to the notebook's working directory
data_dir = os.path.join('datasets', 'superconduct')
data_csv = 'train.csv'
all_data = pd.read_csv(os.path.join(data_dir, data_csv))

# separate the target column from the feature columns
label_name = 'critical_temp'
labels_df = all_data[label_name]  # target
features_df = all_data.drop(columns=label_name)  # features

# hand plain numpy arrays to the pipeline
x_all = features_df.to_numpy()
y_all = labels_df.to_numpy()

print(
    f"Dataset size: {y_all.shape[0]}",
    f"Feature count: {x_all.shape[1]}",
    sep="\n",
)

Dataset size: 21263
Feature count: 81


## Evaluate

In [12]:
# Constructor options for the Superconductivity run.
supercond_ctor = dict(
    min_bins=128,
    max_bins=256,
    patience=5,
    no_early_stopping=False,
    thread_cnt=1,
)

# Fit options for the Superconductivity run.
supercond_fit = dict(
    tree_count=2000,
    tree_depth=4,
    feature_fold_size=1.0,
    learning_rate=0.15,
    regularization_param=0.12,
    early_stopping_delta=1e-5,
    batch_part=0.8,
    random_state=12,
    random_batches=True,
    random_hist_thresholds=True,
    remove_regularization_later=True,
)

# Use the Superconductivity-specific constructor and fit options defined in this cell
# (the call previously passed an undefined name, `supercond_options`).
# NOTE(review): the saved output of this cell (MAE ~3e230 after 3 trees) looks
# diverged and may not correspond to these options - re-run to refresh it.
res_df = evaluation_pipeline('Superconductivity', x_all, y_all, res_df,
                             ctor_opts=supercond_ctor, fit_opts=supercond_fit)

Train length: 17010, test length: 4253
Fit time: 34.533090353012085 seconds
Real tree number: 3
MAE score: 3.343772801663413e+230, Sklearn model MAE: 5.812342773887075
CatBoost model MAE: 5.67554362319252
Sklearn model fit time: 4.07066535949707 seconds
CatBoost model fit time: 12.015799760818481 seconds


# Summary

In [10]:
# NOTE(review): the saved table below lists only the Boston and Diabetes rows;
# re-run all cells in order to refresh it with every dataset.
res_df

Unnamed: 0,Dataset,Data size,Data dim,MAE,MAE Sklearn,MAE CatBoost,Fit time,Fit time Sklearn,Fit time CatBoost
0,Boston,506,13,2.395283,2.541476,2.233629,0.063604,0.914259,0.754294
0,Diabetes,442,10,47.648865,52.498893,51.334428,0.132627,0.726523,0.454493
