# Boosting with Cybooster

This notebook demonstrates how to use the `cybooster` library to boost various scikit-learn-like estimators (having `fit` and `predict` methods is enough) on different datasets. It includes examples of regression, classification, and time series forecasting tasks. Note that `cybooster` only accepts regressors as base learners, no matter the task.

`cybooster` is a high-performance, generic gradient boosting library (any base learner can be used) designed for classification and regression tasks. It is built on Cython (that is, C) for speed and efficiency. This version will also be more GPU-friendly, thanks to JAX, making it suitable for large datasets.

In `cybooster`, each base learner is augmented with a randomized neural network (a generalization of [https://www.researchgate.net/publication/346059361_LSBoost_gradient_boosted_penalized_nonlinear_least_squares](https://www.researchgate.net/publication/346059361_LSBoost_gradient_boosted_penalized_nonlinear_least_squares) to any base learner), which allows the model to learn complex patterns in the data. The library supports both classification and regression tasks, making it versatile for various machine learning applications.

`cybooster` was born from `mlsauce`, which might be difficult to install on some systems (for now). By contrast, installing `cybooster` is straightforward.


In [None]:
!pip install cybooster --upgrade --no-cache-dir

# 1 - Simple example

In [None]:
from cybooster import BoosterClassifier, BoosterRegressor
from sklearn.datasets import load_iris, load_diabetes, load_breast_cancer, load_digits, load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error, root_mean_squared_error
from sklearn.linear_model import LinearRegression
from time import time


# Hyper-parameters shared by every booster in this cell.
booster_params = dict(n_estimators=100, learning_rate=0.1,
                      n_hidden_features=10, verbose=1, seed=42)

# Regression example: boost a linear model on the diabetes dataset and
# report the RMSE on a 20% hold-out set.
X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
regressor = BoosterRegressor(obj=LinearRegression(), **booster_params)
start = time()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
print(f"Elapsed: {time() - start} s")
rmse = root_mean_squared_error(y_test, y_pred)
print(f"RMSE for regression: {rmse:.4f}")

# Classification examples: the same pipeline (split, fit, predict, accuracy)
# is applied in turn to iris, wine, breast cancer and digits. The base learner
# is still a regressor (LinearRegression), wrapped in BoosterClassifier.
for load_dataset in (load_iris, load_wine, load_breast_cancer, load_digits):
    X, y = load_dataset(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    classifier = BoosterClassifier(obj=LinearRegression(), **booster_params)
    start = time()
    try:
        classifier.fit(X_train, y_train)
    except Exception as e:  # this is for Windows users
        # Retry with int32 labels when the first fit fails.
        y_train = y_train.astype('int32')
        classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    # The elapsed time covers fit (including a failed first attempt, if any) and predict.
    print(f"Elapsed: {time() - start} s")
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy for classification: {accuracy:.4f}")


# 2 - Loop on models

In [None]:
import pandas as pd
from sklearn.utils import all_estimators
from tqdm import tqdm
from sklearn.utils.multiclass import type_of_target
from sklearn.datasets import fetch_california_housing


# Get all scikit-learn regressors
estimators = all_estimators(type_filter='regressor')

results_regressors = []
results_classifiers = []

verbose = 0

# Estimators excluded from the benchmark.
# NOTE(review): presumably meta-estimators that need constructor arguments and
# ensembles/networks that are slow to boost -- confirm the rationale.
SKIPPED_ESTIMATORS = {
    'MultiOutputRegressor', 'MultiOutputClassifier', 'StackingRegressor', 'StackingClassifier',
    'VotingRegressor', 'VotingClassifier', 'TransformedTargetRegressor', 'RegressorChain',
    'GradientBoostingRegressor', 'HistGradientBoostingRegressor', 'RandomForestRegressor',
    'ExtraTreesRegressor', 'MLPRegressor',
}

# Load every dataset once: loading does not depend on the model being tested.
# A loading failure (e.g. the California housing download) now surfaces here
# instead of being silently swallowed inside the per-model loop.
X_housing, y_housing = fetch_california_housing(return_X_y=True)
regression_datasets = {
    "diabetes": load_diabetes(return_X_y=True),
    # only the first 1000 rows of the housing data are used
    "housing": (X_housing[:1000, :], y_housing[:1000]),
}
classification_datasets = {
    "iris": load_iris(return_X_y=True),
    "wine": load_wine(return_X_y=True),
    "breast_cancer": load_breast_cancer(return_X_y=True),
    "digits": load_digits(return_X_y=True),
}


def _make_booster(BoosterClass, RegressorClass):
    """Wrap a default-constructed base regressor in the given cybooster class."""
    return BoosterClass(obj=RegressorClass(), n_estimators=100, learning_rate=0.1,
                        n_hidden_features=10, verbose=verbose, seed=42)


def _benchmark_regressor(RegressorClass, X, y):
    """Fit a BoosterRegressor on an 80/20 split of (X, y) and return the test RMSE."""
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    regressor = _make_booster(BoosterRegressor, RegressorClass)
    start = time()
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    print(f"Elapsed: {time() - start} s")
    rmse = root_mean_squared_error(y_test, y_pred)
    print(f"RMSE for regression: {rmse:.4f}")
    return rmse


def _benchmark_classifier(RegressorClass, X, y):
    """Fit a BoosterClassifier on an 80/20 split of (X, y) and return the test accuracy."""
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    classifier = _make_booster(BoosterClassifier, RegressorClass)
    start = time()
    try:
        classifier.fit(X_train, y_train)
    except Exception:  # this is for Windows users
        # Retry once with int32 labels; a second failure propagates to the caller.
        classifier.fit(X_train, y_train.astype('int32'))
    y_pred = classifier.predict(X_test)
    print(f"Elapsed: {time() - start} s")
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy for classification: {accuracy:.4f}")
    return accuracy


for name, RegressorClass in tqdm(estimators):

    if name in SKIPPED_ESTIMATORS:
        continue

    print(f"\nRunning with {name}")
    try:
        # Regression examples
        for dataset_name, (X, y) in regression_datasets.items():
            rmse = _benchmark_regressor(RegressorClass, X, y)
            results_regressors.append([dataset_name, name, rmse])

        # Classification examples
        for dataset_name, (X, y) in classification_datasets.items():
            accuracy = _benchmark_classifier(RegressorClass, X, y)
            results_classifiers.append([dataset_name, name, accuracy])
    except Exception as e:
        # Best effort: a failing base learner skips its remaining datasets
        # (results recorded so far are kept), but the failure is reported
        # instead of being silently discarded.
        print(f"Skipping remaining runs for {name}: {type(e).__name__}: {e}")
        continue

In [None]:
# Collect the benchmark results into tables: regressors ranked by ascending
# RMSE (lowest first), classifiers ranked by descending accuracy (highest first).
df_results_regressors = pd.DataFrame(
    results_regressors, columns=['Dataset', 'Model', 'RMSE']
).sort_values(by='RMSE', ascending=True)

df_results_classifiers = pd.DataFrame(
    results_classifiers, columns=['Dataset', 'Model', 'Accuracy']
).sort_values(by='Accuracy', ascending=False)

# One ranked table per dataset (row order is inherited from the sorted frames).
df_results_regressors_diabetes = df_results_regressors.loc[df_results_regressors['Dataset'].eq('diabetes')]
df_results_regressors_housing = df_results_regressors.loc[df_results_regressors['Dataset'].eq('housing')]
df_results_classifiers_iris = df_results_classifiers.loc[df_results_classifiers['Dataset'].eq('iris')]
df_results_classifiers_wine = df_results_classifiers.loc[df_results_classifiers['Dataset'].eq('wine')]
df_results_classifiers_breast_cancer = df_results_classifiers.loc[df_results_classifiers['Dataset'].eq('breast_cancer')]
df_results_classifiers_digits = df_results_classifiers.loc[df_results_classifiers['Dataset'].eq('digits')]

print("Best regressors:")
for ranked_table in (df_results_regressors_diabetes, df_results_regressors_housing):
    display(ranked_table)

print("\nBest classifiers:")
for ranked_table in (df_results_classifiers_iris, df_results_classifiers_wine,
                     df_results_classifiers_breast_cancer, df_results_classifiers_digits):
    display(ranked_table)

# 3 - Time series forecasting using `nnetsauce.MTS`

In [None]:
!pip install nnetsauce

In [None]:
import nnetsauce as ns
import pandas as pd
from sklearn.utils import all_estimators
from tqdm import tqdm
from sklearn.utils.multiclass import type_of_target
from sklearn.datasets import fetch_california_housing


# Get all scikit-learn regressors
estimators = all_estimators(type_filter='regressor')

# NOTE(review): these two lists are reset here but never filled in this cell;
# this overwrites any results collected earlier -- confirm whether intended.
results_regressors = []
results_classifiers = []

verbose = 0

# Multivariate time series (ice cream vs heater), read from the Techtonique
# datasets repository.
url = (
    "https://raw.githubusercontent.com/Techtonique/"
    "datasets/main/time_series/multivariate/"
    "ice_cream_vs_heater.csv"
)
df_temp = pd.read_csv(url)
df_temp.index = pd.DatetimeIndex(df_temp['date'])
# Work on first-order differences of the series; rows containing NaN
# (such as the first row produced by diff) are dropped.
df_icecream = df_temp.drop(columns=['date']).diff().dropna()

# Estimators that are not run in this forecasting loop.
excluded_estimators = (
    'AdaBoostRegressor', 'MultiOutputRegressor', 'MultiOutputClassifier', 'StackingRegressor', 'StackingClassifier',
    'VotingRegressor', 'VotingClassifier', 'TransformedTargetRegressor', 'RegressorChain',
    'GradientBoostingRegressor', 'HistGradientBoostingRegressor', 'RandomForestRegressor',
    'ExtraTreesRegressor', 'MLPRegressor', 'TheilSenRegressor',
)

for name, RegressorClass in tqdm(estimators):

    if name in excluded_estimators:
        continue

    try:
        print(f"\nRunning with {name}")
        # Boosted base learner, used as the regression engine of nnetsauce's MTS
        mdl = BoosterRegressor(
            obj=RegressorClass(),
            n_estimators=100,
            learning_rate=0.1,
            n_hidden_features=10,
            verbose=verbose,
            seed=42,
        )
        regr = ns.MTS(
            obj=mdl,
            type_pi="scp2-kde",
            replications=250,
            lags=20,
            show_progress=False,
        )
        regr.fit(df_icecream)
        regr.predict(h=30)  # forecast with horizon h=30
        regr.plot("heater", type_plot="pi")
    except Exception as e:
        # Best effort: report the failure and move on to the next estimator.
        print(e)