In [38]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
from tqdm.notebook import trange, tqdm
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [24]:
def create_regression_dataset(n_features=150, n_samples=3000):
    X, y = make_regression(n_features=n_features, n_samples=n_samples)
    features = pd.DataFrame(X)
    for i in range(n_features):
        features = features.rename(columns={i:f'x{i}'})
    target = pd.DataFrame(y).rename(columns={0:'y_true'})
    df = pd.concat((features, target), axis=1)
    return df
df = create_regression_dataset()
feature_names = df.drop('y_true', axis=1).columns.tolist()
df

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x141,x142,x143,x144,x145,x146,x147,x148,x149,y_true
0,0.520022,0.526296,0.330135,-0.495069,-1.543022,-1.157556,0.713485,-0.753843,-0.042196,-0.640776,...,-1.161863,-0.707183,-0.836864,-2.285807,0.105416,0.470246,1.691534,0.700196,1.388067,186.695370
1,-0.045069,0.396401,-1.224466,-0.924438,0.283884,-0.507533,-0.336987,-2.518666,-0.170255,0.787259,...,-0.073507,1.554640,-0.869647,0.686178,0.542037,0.842261,-0.115568,0.445450,0.816829,-81.537075
2,0.884023,1.033987,0.262848,1.158599,0.554924,1.002516,0.225207,-0.910007,0.193731,0.439605,...,-1.915032,-0.885403,0.302551,0.137838,-0.206347,-0.995145,0.651125,-1.245912,0.046721,-22.345007
3,-0.394077,-0.892001,-0.053115,1.019966,0.788991,0.677271,1.037286,2.009356,-0.630339,1.714598,...,1.445044,0.309793,-1.560443,-0.095056,0.015458,-0.127816,-0.718520,-0.674763,-0.068461,-205.465900
4,-0.597951,-0.026114,0.654033,-0.195915,-0.043026,-1.111827,-1.206759,-0.826245,1.119364,0.304132,...,-0.686543,-0.777192,0.360564,0.170128,1.573750,-0.580461,2.470817,0.611631,0.744259,37.434623
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,-0.582839,1.007109,0.378854,-1.676764,0.936225,-0.425095,1.029678,-1.038520,0.625444,0.750064,...,0.123147,-1.023604,-0.601598,-1.018903,0.644635,-0.820875,1.245103,-1.352967,0.395663,70.910500
2996,-1.099996,-1.078055,-2.434939,0.971502,-0.245185,-3.195841,0.117417,0.117068,0.074351,-0.538186,...,0.559533,1.166425,-0.496831,-1.409971,-1.697588,-1.054659,-0.495898,0.692523,0.135959,127.174754
2997,-2.965513,-0.357274,0.836868,-0.276761,-0.452158,0.410543,-0.394848,0.307580,-0.075481,-0.899352,...,-0.878181,-1.035308,-2.373042,-0.671410,0.840647,0.397245,-0.332068,-1.445178,-0.642550,59.918312
2998,-2.632438,-0.136622,1.286956,0.659059,1.099575,-1.223162,-1.213365,-0.596027,-1.956609,1.063232,...,1.036287,0.479131,0.486716,-0.577870,-0.051621,0.906419,-0.704719,0.547574,-2.020050,-219.150621


In [25]:
kfold= KFold(n_splits=10,random_state=42,shuffle=True) #kfold cross validation, 10-subsets
X_train, X_test, y_train, y_test = train_test_split(df[feature_names], df['y_true'], test_size=0.3,random_state=17)

In [26]:
train = pd.concat((X_train, y_train), axis=1)

In [27]:
test = pd.concat((X_test, y_test), axis=1)

In [28]:
# LEARNING ON TRAIN DATA

In [29]:
list_of_trees = []
iterations = 1000
learning_rate = 0.1
for i in trange(iterations, desc='LEARNING PROGRESS'):
    if i == 0:
        train['y_pred_0'] = train['y_true'].mean()

    train[f'gradient_{i}'] = -(train['y_true'] - train[f'y_pred_{i}'])

    tree = DecisionTreeRegressor(max_depth=15)
    tree.fit(train[feature_names], train[f'gradient_{i}'])
    list_of_trees.append(tree)

    train[f'tree_{i+1}_pred'] = tree.predict(train[feature_names])

    train[f'y_pred_{i+1}'] = train[f'y_pred_{i}'] - learning_rate*train[f'tree_{i+1}_pred']


print('=======================================')
# checking metrics
mae = mean_absolute_error(train['y_true'], train['y_pred_0'])
mape = mean_absolute_percentage_error(train['y_true'], train['y_pred_0'])
print(f'initial MAE: {mae}')
print(f'initial MAPE: {round(mape,2)}%')
print('========= difference in errors ========')
mae = mean_absolute_error(train['y_true'], train[f'y_pred_{iterations}'])
mape = mean_absolute_percentage_error(train['y_true'], train[f'y_pred_{iterations}'])
print(f'final MAE: {mae}')
print(f'final MAPE: {round(mape,2)}%')

LEARNING PROGRESS:   0%|          | 0/1000 [00:00<?, ?it/s]

initial MAE: 120.0039621469355
initial MAPE: 1.06%
final MAE: 1.1775096226954228e-08
final MAPE: 0.0%


In [30]:
# VALIDATION ON TEST SET

In [31]:
for i in trange(iterations, desc='TEST VALIDATION PROGRESS'):
    if i == 0:
        test['y_pred_0'] = test['y_true'].mean()

    test[f'gradient_{i}'] = -(test['y_true'] - test[f'y_pred_{i}'])    

    test[f'tree_{i+1}_pred'] = list_of_trees[i].predict(test[feature_names])

    test[f'y_pred_{i+1}'] = test[f'y_pred_{i}'] - learning_rate*test[f'tree_{i+1}_pred']


print('=======================================')
# checking metrics
mae = mean_absolute_error(test['y_true'], test['y_pred_0'])
mape = mean_absolute_percentage_error(test['y_true'], test['y_pred_0'])
print(f'initial MAE: {mae}')
print(f'initial MAPE: {round(mape,2)}%')
print('========= difference in errors ========')
mae = mean_absolute_error(test['y_true'], test[f'y_pred_{iterations}'])
mape = mean_absolute_percentage_error(test['y_true'], test[f'y_pred_{iterations}'])
print(f'final MAE: {mae}')
print(f'final MAPE: {round(mape,2)}%')

TEST VALIDATION PROGRESS:   0%|          | 0/1000 [00:00<?, ?it/s]

initial MAE: 117.62654022761137
initial MAPE: 1.04%
final MAE: 81.84223497535976
final MAPE: 2.76%


In [32]:
# FITTING REAL GRADIENT BOOSTING MODEL

In [36]:
xg_reg = xgb.XGBRegressor(colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 15, alpha = 5, n_estimators = 1000)

xg_reg.fit(X_train, y_train)

preds = xg_reg.predict(X_test)


# checking metrics
mae = mean_absolute_error(test['y_true'], test['y_pred_0'])
mape = mean_absolute_percentage_error(test['y_true'], test['y_pred_0'])
print(f'initial MAE: {mae}')
print(f'initial MAPE: {round(mape,2)}%')
print('========= next step ========')
mae = mean_absolute_error(y_test, preds)
mape = mean_absolute_percentage_error(y_test, preds)
print(f'final MAE: {mae}')
print(f'final MAPE: {round(mape,2)}%')

initial MAE: 117.62654022761137
initial MAPE: 1.04%
final MAE: 87.18877898910397
final MAPE: 1.3%


In [43]:
pd.DataFrame({'col_name': xg_reg.feature_importances_}, index=xg_reg.feature_names_in_).sort_values(by='col_name', ascending=False)

Unnamed: 0,col_name
x82,0.049211
x53,0.038002
x59,0.037605
x57,0.028268
x109,0.024776
...,...
x8,0.001294
x131,0.001148
x67,0.000926
x69,0.000799
