# Boosting trees
> "Simple boosting trees in Python from scratch"

- comments: true
- badges: true
- categories: [ai]
- publishes: true

# Boosting trees

In [1]:
from sklearn.datasets import load_wine, load_breast_cancer
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

In [2]:
# Fetch the breast-cancer dataset as a (features, labels) pair of arrays.
raw = load_breast_cancer(return_X_y=True)

# Wrap each array in a DataFrame: X holds the features, y the class labels.
X, y = (pd.DataFrame(part) for part in raw)

In [34]:
# Boosting by hand: start from the mean label, then repeatedly fit a shallow
# regression tree to whatever error the current ensemble still makes.
N_TREES = 5      # number of boosting rounds
TREE_DEPTH = 1   # every weak learner is a decision stump
THRESHOLD = 0.5  # a predicted score above this is labelled class 1
SEED = 0         # fixes the tree's tie-breaking between equally good splits

# Baseline model: predict the mean of y for every sample.
initial_prediction_proba = y.mean()
initial_prediction_classes = round(initial_prediction_proba)
# Log-odds of the baseline; kept for reference, the loop below works on raw scores.
initial_prediction_logodds = np.log(initial_prediction_proba / (1-initial_prediction_proba))

print("Score with mean: ", f1_score(y, [initial_prediction_classes[0]]*len(y)))

# Flat view of the labels (y is a single-column DataFrame).
y_flat = y.values.reshape(-1)

# Running ensemble output, one value per sample, starting at the baseline.
# Keeping a running sum avoids re-predicting with every earlier tree each round.
predictions = np.full(len(y_flat), initial_prediction_proba.iloc[0])
proba_residuals = y_flat - predictions

trees = []
for _ in range(N_TREES):

  # Train a tree on the latest residuals
  stump = DecisionTreeRegressor(max_depth=TREE_DEPTH, random_state=SEED)
  stump.fit(X, proba_residuals)
  trees.append(stump)

  # Each tree tries to predict the remaining error, so its output is added
  # on top of what the ensemble already predicts.
  predictions = predictions + stump.predict(X).reshape(-1)

  # Get the new residuals. This is what we fit the next tree on
  proba_residuals = y_flat - predictions

# NOTE: this score is computed on the same data the trees were fitted on.
print("Score with boosting: ", f1_score(y, 1 * (predictions > THRESHOLD)))

Score with mean:  0.7710583153347732
Score with boosting:  0.947945205479452
