In [19]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import randint, uniform
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBRegressor


In [2]:
# Load seaborn's "diamonds" example dataset into a pandas DataFrame.
data = sns.load_dataset("diamonds")
# Bare last expression -> rich notebook display of the frame.
data

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [3]:
# Print column dtypes and non-null counts (checks for missing values and
# shows which columns are categorical).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    53940 non-null  float64 
 1   cut      53940 non-null  category
 2   color    53940 non-null  category
 3   clarity  53940 non-null  category
 4   depth    53940 non-null  float64 
 5   table    53940 non-null  float64 
 6   price    53940 non-null  int64   
 7   x        53940 non-null  float64 
 8   y        53940 non-null  float64 
 9   z        53940 non-null  float64 
dtypes: category(3), float64(6), int64(1)
memory usage: 3.0 MB


In [4]:
# Feature matrix: every column except the regression target.
X = data.drop(columns=["price"])
# Target vector: the diamond price.
y = data.loc[:, "price"]

In [5]:
# Hold out 25% of the rows for the final evaluation; the fixed seed makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [12]:
# Hyperparameter search space for RandomizedSearchCV.
#
# `randint` / `uniform` come from scipy.stats (imported in the imports cell at
# the top of the notebook). Their parameterisation is easy to misread, so the
# arguments are spelled out by keyword:
#   * randint(low, high)  -> integers in [low, high - 1]   (high is exclusive)
#   * uniform(loc, scale) -> floats in   [loc, loc + scale] (NOT [min, max])
param_dist = {
    # A single-element list means the value is fixed for every sampled candidate.
    "tree_method": ["hist"],

    "n_estimators": randint(low=200, high=1201),       # integers 200 .. 1200
    "learning_rate": uniform(loc=0.01, scale=0.09),    # floats   0.01 .. 0.10
    "max_depth": randint(low=3, high=10),              # integers 3 .. 9
    "min_child_weight": randint(low=1, high=8),        # integers 1 .. 7
    "subsample": uniform(loc=0.6, scale=0.4),          # floats   0.6 .. 1.0
    "colsample_bytree": uniform(loc=0.4, scale=0.6),   # floats   0.4 .. 1.0
    "gamma": uniform(loc=0, scale=0.4),                # floats   0.0 .. 0.4
    "reg_alpha": uniform(loc=0, scale=1.0),            # floats   0.0 .. 1.0
    "reg_lambda": uniform(loc=1, scale=1.0),           # floats   1.0 .. 2.0
}


In [13]:
# Base estimator. enable_categorical=True is passed so XGBoost accepts the
# pandas `category` columns (cut, color, clarity) without manual encoding.
model = XGBRegressor(enable_categorical=True)

# Randomised search: 200 sampled configurations, each scored by 5-fold
# cross-validated R². The fixed random_state makes the sampled candidates
# reproducible. NOTE: despite the name `grid`, this is a randomised search,
# not an exhaustive grid; the name is kept because later cells refer to it.
grid = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=200,
    cv=5,
    scoring="r2",
    n_jobs=1,
    random_state=42,
)

# Run the search; as the last expression, the fitted search object is displayed.
grid.fit(X_train, y_train)

0,1,2
,estimator,"XGBRegressor(...ree=None, ...)"
,param_distributions,"{'colsample_bytree': <scipy.stats....0023C7C87C250>, 'gamma': <scipy.stats....0023C7C87C350>, 'learning_rate': <scipy.stats....0023C7A36BDF0>, 'max_depth': <scipy.stats....0023C7A3AA2C0>, ...}"
,n_iter,200
,scoring,'r2'
,n_jobs,1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,np.float64(0.9677167257290141)
,device,
,early_stopping_rounds,
,enable_categorical,True


In [15]:
# Parameter combination that achieved the best cross-validated score in the search.
grid.best_params_


{'colsample_bytree': np.float64(0.9677167257290141),
 'gamma': np.float64(0.312518420188481),
 'learning_rate': np.float64(0.020211814181570323),
 'max_depth': 7,
 'min_child_weight': 2,
 'n_estimators': 385,
 'reg_alpha': np.float64(0.057558760016644284),
 'reg_lambda': np.float64(1.5495288823237354),
 'subsample': np.float64(0.7766122005493508),
 'tree_method': 'hist'}

In [16]:
# Mean cross-validated score (R², since the search uses scoring="r2") of the best parameter set.
grid.best_score_

np.float64(0.9821971893310547)

In [18]:
# Evaluate the winning configuration on the held-out test split.
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

# R² on unseen data; compare against the cross-validated score to spot overfitting.
test_r2 = r2_score(y_test, y_pred)
print(f"r2_score {test_r2}")

r2_score 0.9819327592849731


In [20]:
# Mean absolute error on the test split, in the same units as `price`.
mean_absolute_error(y_true=y_test, y_pred=y_pred)

270.0104675292969