In [2]:
import xgboost as xgb


# notes

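1. Is XGBoost an algorithm for regression (similar to lasso)? It can be used for regression, but it is a gradient-boosted decision-tree ensemble rather than a linear model like lasso.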
2. returns feature importance score
3. A loss function quantifies the difference between the actual (ground-truth) values and the model's predictions; training minimizes it. A metric measures how closely the model's predictions match the ground truth.

In [3]:
import seaborn as sns

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import warnings

from sklearn.metrics import mean_squared_error

In [4]:
# Fetch seaborn's "diamonds" example dataset and preview its first five rows.
diamonds = sns.load_dataset(name="diamonds")

diamonds.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [5]:
from sklearn.model_selection import train_test_split

# Features: every column except the target. Target: price, kept as a
# one-column DataFrame.
X = diamonds.drop(columns="price")
y = diamonds[["price"]]

In [6]:
# Collect the names of all non-numeric columns and cast each of them to
# pandas' categorical dtype in a single astype call.
cats = list(X.select_dtypes(exclude=np.number).columns)
print(cats)

X = X.astype({name: "category" for name in cats})

['cut', 'color', 'clarity']


In [7]:
# Hold out 25% of the rows (train_test_split's default size, spelled out here)
# using a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [8]:
# Wrap each split in an XGBoost DMatrix; enable_categorical=True is passed so
# the category-typed feature columns are accepted as-is.
dtrain_reg = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)

# regression using xgboost 

Set the parameters and train a baseline model for `n` boosting rounds.

In [9]:
# Squared-error objective for the regression task.
params = dict(objective="reg:squarederror")

# Number of boosting rounds for the baseline model.
n = 100

model = xgb.train(params, dtrain_reg, num_boost_round=n)

Predict

In [10]:
# Score the held-out DMatrix with the trained booster.
preds = model.predict(data=dtest_reg)

In [11]:
# Take the square root of the MSE explicitly: the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in later
# releases, so passing it raises a TypeError there. np.sqrt works on every version.
rmse = np.sqrt(mean_squared_error(y_test, preds))

print(f"RMSE of the base model: {rmse:.3f}")

RMSE of the base model: 552.861


# Finding the best n (number of boosting rounds)

use `evals=evals`: compute the evaluation metric (RMSE here) on each listed dataset at every boosting round
use `verbose_eval=50`: print the metric values every 50 rounds
use `early_stopping_rounds=50`: stop training if the validation metric has not improved during the last 50 rounds

In [12]:
# Datasets to score after each boosting round, each paired with the label used
# in the log. Per the xgboost docs, the last entry is the one early stopping watches.
# NOTE(review): the "validation" entry is the same held-out DMatrix used for the
# test predictions, so the early-stopping signal is not independent of the test set.
evals = [(dtrain_reg, "train"), (dtest_reg, "validation")]

model = xgb.train(
    params,
    dtrain_reg,
    num_boost_round=n,
    evals=evals,
    early_stopping_rounds=50,
    verbose_eval=50,
)

model

[0]	train-rmse:2874.49146	validation-rmse:2817.90814
[50]	train-rmse:438.68033	validation-rmse:554.13365
[99]	train-rmse:383.48826	validation-rmse:552.86131


<xgboost.core.Booster at 0x17361fd60>

In [15]:
# Inspect the class of the object returned by xgb.train.
type(model)

xgboost.core.Booster

In [16]:
# Show the attributes stored on the trained booster, returned as a dict.
model.attributes()

{'best_iteration': '79', 'best_score': '548.6484465072929'}