In [12]:
from sklearn import model_selection
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import load_diabetes
import matplotlib.pyplot as plt
import numpy as np

Для реализованной модели градиентного бустинга построить графики зависимости ошибки от количества деревьев в ансамбле и от максимальной глубины деревьев. Сделать выводы о зависимости ошибки от этих параметров.

In [2]:
# Load the diabetes regression dataset as a (features, target) pair.
X, y = load_diabetes(return_X_y=True)
# Show both shapes as the cell output.
X.shape, y.shape

((442, 10), (442,))

In [3]:
# Hold out 25% of the samples for testing. The seed is fixed so that the split,
# and therefore every error value computed below, is reproducible after a
# kernel restart.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [4]:
def gb_predict(X, trees_list, eta):
    """Predict with a gradient-boosting ensemble.

    The ensemble output is the sum of every tree's prediction scaled by the
    learning rate: ``sum(eta * tree.predict(X) for tree in trees_list)``.

    Args:
        X: 2-D array-like of samples, one row per sample.
        trees_list: fitted estimators exposing ``predict(X)``; may be empty.
        eta: learning rate applied to every tree's output.

    Returns:
        1-D float numpy array with one prediction per row of ``X``
        (all zeros when ``trees_list`` is empty).
    """
    X = np.asarray(X)
    predictions = np.zeros(X.shape[0])

    # One batched predict() call per tree; contributions are accumulated in
    # list order, starting from zero.
    for alg in trees_list:
        predictions += eta * np.asarray(alg.predict(X), dtype=float)

    return predictions

In [5]:
def mean_squared_error(y_real, prediction):
    """Return the mean squared error between targets and predictions.

    Args:
        y_real: array-like of true target values.
        prediction: array-like of predicted values, broadcastable against
            ``y_real``.

    Returns:
        The squared differences averaged over the first axis (a scalar for
        1-D inputs).
    """
    # Convert up front so plain lists and tuples work, and let numpy do the
    # reduction instead of the element-by-element builtin ``sum``.
    y_real = np.asarray(y_real, dtype=float)
    prediction = np.asarray(prediction, dtype=float)
    return np.mean((y_real - prediction) ** 2, axis=0)

In [6]:
def residual(y, z):
    """Return the residual between targets ``y`` and current predictions ``z``.

    The value is computed as the negation of ``z - y``, i.e. it equals
    ``y - z``. ``gb_fit`` uses it as the training target for each new tree.
    """
    error = z - y
    return -error

In [7]:
def gb_fit(n_trees, max_depth, X_train, X_test, y_train, y_test, eta):
    """Fit a gradient-boosting ensemble of regression trees.

    Each new tree is fit on the residual between ``y_train`` and the current
    ensemble prediction on ``X_train``; its output is added to the ensemble
    with weight ``eta``.

    Args:
        n_trees: number of boosting iterations (trees to fit).
        max_depth: depth limit passed to every ``DecisionTreeRegressor``.
        X_train, y_train: training samples and targets (numpy arrays).
        X_test, y_test: held-out samples and targets, used only for error
            tracking.
        eta: learning rate applied to every tree's prediction.

    Returns:
        ``(trees, train_errors, test_errors)``: the list of fitted trees and
        two lists of length ``n_trees`` in which element ``i`` is the MSE of
        the ensemble made of the first ``i + 1`` trees.
    """
    trees = []
    train_errors = []
    test_errors = []

    # Running ensemble outputs. They are updated incrementally and stay equal
    # to gb_predict(X, trees, eta), without re-predicting with every earlier
    # tree on each iteration.
    train_prediction = np.zeros(X_train.shape[0])
    test_prediction = np.zeros(X_test.shape[0])

    for _ in range(n_trees):
        tree = DecisionTreeRegressor(max_depth=max_depth, random_state=42)

        # With an empty ensemble the running prediction is zero, so the first
        # tree is fit on y_train itself.
        tree.fit(X_train, residual(y_train, train_prediction))
        trees.append(tree)

        train_prediction += eta * tree.predict(X_train)
        test_prediction += eta * tree.predict(X_test)

        # Errors are recorded AFTER the new tree joins the ensemble, so the
        # curves include the contribution of every fitted tree.
        train_errors.append(mean_squared_error(y_train, train_prediction))
        test_errors.append(mean_squared_error(y_test, test_prediction))

    return trees, train_errors, test_errors

In [8]:
def evaluate_alg(X_train, X_test, y_train, y_test, trees, eta, n_trees=None, max_depth=None):
    """Print the train and test MSE of a fitted boosting ensemble.

    Args:
        X_train, y_train: training samples and targets.
        X_test, y_test: held-out samples and targets.
        trees: list of fitted trees forming the ensemble.
        eta: learning rate the ensemble was built with.
        n_trees: ensemble size to report; defaults to ``len(trees)``.
        max_depth: depth limit to report; defaults to the ``max_depth``
            attribute of the first tree (``None`` when it is unavailable).
    """
    # Take the reported hyperparameters from the ensemble itself rather than
    # from notebook globals, which other cells may have rebound.
    if n_trees is None:
        n_trees = len(trees)
    if max_depth is None:
        max_depth = getattr(trees[0], 'max_depth', None) if trees else None

    for sample_name, X_part, y_part in (
        ('тренировочной', X_train, y_train),
        ('тестовой', X_test, y_test),
    ):
        error = mean_squared_error(y_part, gb_predict(X_part, trees, eta))
        # Adjacent f-strings instead of a backslash continuation, so source
        # indentation does not end up inside the printed message.
        print(f'Ошибка алгоритма из {n_trees} деревьев глубиной {max_depth} '
              f'с шагом {eta} на {sample_name} выборке: {error}')

In [None]:
%%time

# Number of boosting iterations (trees in the ensemble).
n_trees = 50

# Depth limit passed to every tree.
max_depth = 5

# Learning rate: weight applied to each tree's prediction.
eta = 0.1

# Fit the ensemble; gb_fit also returns the per-iteration train/test MSE lists.
trees, train_errors, test_errors = gb_fit(n_trees, max_depth, X_train, X_test, y_train, y_test, eta)

In [None]:
# Print the train and test MSE of the ensemble stored in `trees`.
evaluate_alg(X_train, X_test, y_train, y_test, trees, eta)

In [None]:
def get_error_plot(n_trees, train_err, test_err):
    """Draw the train and test MSE curves against the iteration index.

    Args:
        n_trees: number of iterations; points are drawn at x = 0 .. n_trees - 1
            and the x axis spans [0, n_trees].
        train_err: sequence of ``n_trees`` MSE values on the training set.
        test_err: sequence of ``n_trees`` MSE values on the test set.
    """
    curves = (
        (train_err, 'train error'),
        (test_err, 'test error'),
    )

    plt.xlabel('Iteration number')
    plt.ylabel('MSE')
    plt.xlim(0, n_trees)

    iterations = list(range(n_trees))
    for errors, label in curves:
        plt.plot(iterations, errors, label=label)

    plt.legend(loc='upper right')
    plt.show()

In [None]:
# NOTE(review): `time` is not used in the cells visible here.
import time
from tqdm import tqdm

# One entry per gb_fit call; each entry is that run's list of per-iteration MSE values.
train_errors = []
test_errors = []
eta = 0.1
# Hyperparameter grid to sweep.
max_depths = [1,3,5,30]
# NOTE(review): rebinds the global `n_trees` (an int in the earlier training cell) to a list.
n_trees = [1,3,5,10,30]

# Outer loop (with progress bar): ensemble size; inner loop: tree depth limit.
for n_tree in tqdm(n_trees):
    for max_depth in max_depths:
        trees, train_error, test_error = gb_fit(n_tree, max_depth, X_train, X_test, y_train, y_test, eta)
        # gb_fit returns whole error lists, so lists (not scalars) are collected here.
        train_errors.append(train_error)
        test_errors.append(test_error)

In [None]:
def list_merge(lstlst):
    """Flatten one level of nesting: concatenate the given iterables into a new list."""
    return [item for lst in lstlst for item in lst]

In [None]:
# Concatenate the per-run error lists into one flat sequence each for plotting.
train_errors_merged = list_merge(train_errors)
test_errors_merged = list_merge(test_errors)

In [None]:
# Plot the concatenated error sequences. The x axis is the position in the
# concatenation of the per-run error lists, not a hyperparameter value.
plt.plot(train_errors_merged, label='train errors')
plt.plot(test_errors_merged, label='test errors')
# No x label is set (a former `plt.xlabel('N')` call is disabled).
plt.ylabel('Error')
plt.legend(loc='upper right');

In [None]:
# NOTE(review): these imports repeat an earlier cell; `time` is not used in the cells visible here.
import time
from tqdm import tqdm

# Reset the collectors: one entry per gb_fit call, each a list of per-iteration MSE values.
train_errors = []
test_errors = []
eta = 0.1
# Same hyperparameter grid as the previous sweep.
max_depths = [1,3,5,30]
n_trees = [1,3,5,10,30]

# Same sweep with the loops swapped: outer loop (with progress bar) over tree
# depth limit, inner loop over ensemble size.
for max_depth in tqdm(max_depths):
    for n_tree in n_trees:
        trees, train_error, test_error = gb_fit(n_tree, max_depth, X_train, X_test, y_train, y_test, eta)
        # gb_fit returns whole error lists, so lists (not scalars) are collected here.
        train_errors.append(train_error)
        test_errors.append(test_error)

In [None]:
# Concatenate the per-run error lists into one flat sequence each for plotting.
train_errors_merged = list_merge(train_errors)
test_errors_merged = list_merge(test_errors)

In [None]:
# Plot the concatenated error sequences. The x axis is the position in the
# concatenation of the per-run error lists, not a hyperparameter value.
plt.plot(train_errors_merged, label='train errors')
plt.plot(test_errors_merged, label='test errors')
# No x label is set (a former `plt.xlabel('N')` call is disabled).
plt.ylabel('Error')
plt.legend(loc='upper right');

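При достижении определённого порога количества деревьев в ансамбле и глубины дерева начинается переобучение модели: ошибка на обучающей выборке продолжает снижаться, а ошибка на тестовой выборке перестаёт уменьшаться и начинает расти.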