In [1]:
import os

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import make_scorer

if str(os.getcwdb()[-3:]).split("'")[1] != 'src':
    os.chdir(os.path.dirname(os.getcwdb()))

from utils.modeling import *
from utils.functions import *


In [2]:
df_diamonds = pd.read_csv(r'data\processed\diamonds_training.csv', index_col='id')
df_predict = pd.read_csv(r'data\processed\diamonds_testing.csv', index_col='id')


In [3]:
Regression.add_models(['XGBRegressor'])

rmse = make_scorer(calculate_rmse, greater_is_better=False)

kfolds = KFold(n_splits=5, shuffle=True, random_state=43)


# Introducción

- Como se ha visto en "Feature_engineering", el mejor modelo a aplicar es "XGBRegressor"

- Si bien se le ha sacado bastante partido con varios parámetros, aún quedan muchas combinaciones por probar

- A continuación, se experimenta de forma extensiva con este modelo con tal de obtener los mejores resultados posibles

# Ronda 1: base
- Esta primera ronda se lleva a cabo solo para poder ir comparando las mejoras 

In [4]:
df_1 = df_diamonds.copy()

round_1 = Regression(df_diamonds, 'price')
X_train, X_test, y_train, y_test = round_1.split_dataframe()
round_1.prepare_models(params_list=[['XGBRegressor', 'random_state=43']])
round_1.apply_and_evaluate_kfolds()
round_1.create_dataframe()


-- Regression: using mean of 5 folds --
Starting XGBRegressor: random_state=43:
- XGBRegressor: random_state=43 done in 9.3 sec(s). Total time: 9.3


Unnamed: 0,XGBRegressor: random_state=43
mae,0.065596
mape,0.008468
mse,0.008247
r2_score,0.991976
rmse,0.090762


# Ronda 2: solo cambios

- Similar a la anterior, esta ronda sirve de punto de partida

In [40]:
df_cambios = df_diamonds.copy()

df_cambios = remove_all(df_cambios, zeros_only=True)

df_cambios = assign_values(df_cambios, outlier=False)

df_2 = df_cambios.copy()

round_2 = Regression(df_2, 'price')
X_train, X_test, y_train, y_test = round_2.split_dataframe()
round_2.prepare_models(params_list=[['XGBRegressor', 'random_state=43']])
round_2.apply_and_evaluate_kfolds()
round_2.create_dataframe()


-- Regression: using mean of 5 folds --
Starting XGBRegressor: random_state=43:
- XGBRegressor: random_state=43 done in 6.43 sec(s). Total time: 6.43


Unnamed: 0,XGBRegressor: random_state=43
mae,0.065675
mape,0.008483
mse,0.008221
r2_score,0.992033
rmse,0.090657


# Ronda 3: "eta"

- Como el "dataset" no es muy grande, se reduce el "learning rate" (por defecto, 3) todo lo posible
- Dado que durante el "feature engineering" ha salido un "eta" de 1, se mira alrededor de ese número

In [64]:
model = XGBRegressor(n_estimators=1000, random_state=43)

model.early_stopping_rounds = 300

model.fit(X_train, y_train, eval_set=[(X_test, y_test)]) 

model.best_ntree_limit


[0]	validation_0-rmse:5.15340
[1]	validation_0-rmse:3.61090
[2]	validation_0-rmse:2.53117
[3]	validation_0-rmse:1.77563
[4]	validation_0-rmse:1.24698
[5]	validation_0-rmse:0.87780
[6]	validation_0-rmse:0.62082
[7]	validation_0-rmse:0.44256
[8]	validation_0-rmse:0.32017
[9]	validation_0-rmse:0.23730
[10]	validation_0-rmse:0.18313
[11]	validation_0-rmse:0.14890
[12]	validation_0-rmse:0.12836
[13]	validation_0-rmse:0.11667
[14]	validation_0-rmse:0.10953
[15]	validation_0-rmse:0.10557
[16]	validation_0-rmse:0.10331
[17]	validation_0-rmse:0.10162
[18]	validation_0-rmse:0.10088
[19]	validation_0-rmse:0.10018
[20]	validation_0-rmse:0.09962
[21]	validation_0-rmse:0.09922
[22]	validation_0-rmse:0.09867
[23]	validation_0-rmse:0.09845
[24]	validation_0-rmse:0.09814
[25]	validation_0-rmse:0.09774
[26]	validation_0-rmse:0.09759
[27]	validation_0-rmse:0.09737
[28]	validation_0-rmse:0.09688
[29]	validation_0-rmse:0.09675
[30]	validation_0-rmse:0.09661
[31]	validation_0-rmse:0.09601
[32]	validation_0-

196

In [71]:
model = XGBRegressor()

params = {'random_state': [43],
          'n_estimators': [195],
          'eta': [0.1, 0.125, 0.15, 0.175, 0.2],
            }

grid = GridSearchCV(estimator=model,
                    param_grid=params,
                    scoring=rmse,
                    verbose=4,
                    cv=kfolds
                    )

grid.fit(X_train, y_train)

y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}')
print(f'Best score: {grid.best_score_}')


Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END eta=0.1, n_estimators=195, random_state=43;, score=-0.092 total time=   2.3s
[CV 2/5] END eta=0.1, n_estimators=195, random_state=43;, score=-0.091 total time=   1.8s
[CV 3/5] END eta=0.1, n_estimators=195, random_state=43;, score=-0.087 total time=   1.8s
[CV 4/5] END eta=0.1, n_estimators=195, random_state=43;, score=-0.087 total time=   2.0s
[CV 5/5] END eta=0.1, n_estimators=195, random_state=43;, score=-0.089 total time=   1.8s
[CV 1/5] END eta=0.125, n_estimators=195, random_state=43;, score=-0.091 total time=   2.3s
[CV 2/5] END eta=0.125, n_estimators=195, random_state=43;, score=-0.091 total time=   2.6s
[CV 3/5] END eta=0.125, n_estimators=195, random_state=43;, score=-0.087 total time=   2.5s
[CV 4/5] END eta=0.125, n_estimators=195, random_state=43;, score=-0.088 total time=   2.8s
[CV 5/5] END eta=0.125, n_estimators=195, random_state=43;, score=-0.088 total time=   1.9s
[CV 1/5] END eta=0.15, n_estim

In [80]:
model = XGBRegressor(n_estimators=1000, eta=0.15)

model.early_stopping_rounds = 300

model.fit(X_train, y_train, eval_set=[(X_test, y_test)]) 

model.best_ntree_limit


[0]	validation_0-rmse:6.25481
[1]	validation_0-rmse:5.31758
[2]	validation_0-rmse:4.52146
[3]	validation_0-rmse:3.84486
[4]	validation_0-rmse:3.26999
[5]	validation_0-rmse:2.78099
[6]	validation_0-rmse:2.36548
[7]	validation_0-rmse:2.01256
[8]	validation_0-rmse:1.71259
[9]	validation_0-rmse:1.45772
[10]	validation_0-rmse:1.24104
[11]	validation_0-rmse:1.05718
[12]	validation_0-rmse:0.90112
[13]	validation_0-rmse:0.76859
[14]	validation_0-rmse:0.65630
[15]	validation_0-rmse:0.56107
[16]	validation_0-rmse:0.48058
[17]	validation_0-rmse:0.41249
[18]	validation_0-rmse:0.35531
[19]	validation_0-rmse:0.30726
[20]	validation_0-rmse:0.26704
[21]	validation_0-rmse:0.23362
[22]	validation_0-rmse:0.20596
[23]	validation_0-rmse:0.18320
[24]	validation_0-rmse:0.16485
[25]	validation_0-rmse:0.14995
[26]	validation_0-rmse:0.13801
[27]	validation_0-rmse:0.12861
[28]	validation_0-rmse:0.12114
[29]	validation_0-rmse:0.11550
[30]	validation_0-rmse:0.11118
[31]	validation_0-rmse:0.10787
[32]	validation_0-

532

# Ronda 4: "constraints"

- Se comprueba el número de estimadores

- Se comprueba que el modelo mejora si se aplican "constraints" a "weight (carat)"

- Eso tiene sentido ya que como se ha visto en el EDA a) tiene una altísima correlación con la "target", y b) cuando un diamante pesa más su precio es superior, y las otras columnas de tamaño se ajustan a ese peso

In [76]:
model = XGBRegressor()

params = {'random_state': [43],
          'n_estimators': [531],
          'eta': [0.15],
          'monotone_constraints':[{"weight (carat)": 1}, {"lenght (millimeters)": 1}, {"width (millimeters)": 1}, {"depth (millimeters)": 1}],
            }

grid = GridSearchCV(estimator=model,
                    param_grid=params,
                    scoring=rmse,
                    verbose=4,
                    cv=kfolds
                    )

grid.fit(X_train, y_train)

y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}')
print(f'Best score: {grid.best_score_}')


Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END eta=0.15, monotone_constraints={'weight (carat)': 1}, n_estimators=531, random_state=43;, score=-0.089 total time=   9.8s
[CV 2/5] END eta=0.15, monotone_constraints={'weight (carat)': 1}, n_estimators=531, random_state=43;, score=-0.089 total time=   8.0s
[CV 3/5] END eta=0.15, monotone_constraints={'weight (carat)': 1}, n_estimators=531, random_state=43;, score=-0.086 total time=   8.4s
[CV 4/5] END eta=0.15, monotone_constraints={'weight (carat)': 1}, n_estimators=531, random_state=43;, score=-0.086 total time=   8.9s
[CV 5/5] END eta=0.15, monotone_constraints={'weight (carat)': 1}, n_estimators=531, random_state=43;, score=-0.087 total time=   7.9s
[CV 1/5] END eta=0.15, monotone_constraints={'lenght (millimeters)': 1}, n_estimators=531, random_state=43;, score=-0.090 total time=   9.1s
[CV 2/5] END eta=0.15, monotone_constraints={'lenght (millimeters)': 1}, n_estimators=531, random_state=43;, score=-0.089 to

In [81]:
model = XGBRegressor(n_estimators=1000, eta=0.15, monotone_constraints={"weight (carat)": 1})

model.early_stopping_rounds = 300

model.fit(X_train, y_train, eval_set=[(X_test, y_test)]) 

model.best_ntree_limit


[0]	validation_0-rmse:6.25481
[1]	validation_0-rmse:5.31758
[2]	validation_0-rmse:4.52150
[3]	validation_0-rmse:3.84463
[4]	validation_0-rmse:3.26964
[5]	validation_0-rmse:2.78173
[6]	validation_0-rmse:2.36649
[7]	validation_0-rmse:2.01309
[8]	validation_0-rmse:1.71300
[9]	validation_0-rmse:1.45817
[10]	validation_0-rmse:1.24123
[11]	validation_0-rmse:1.05734
[12]	validation_0-rmse:0.90110
[13]	validation_0-rmse:0.76855
[14]	validation_0-rmse:0.65585
[15]	validation_0-rmse:0.56116
[16]	validation_0-rmse:0.48050
[17]	validation_0-rmse:0.41302
[18]	validation_0-rmse:0.35586
[19]	validation_0-rmse:0.30790
[20]	validation_0-rmse:0.26781
[21]	validation_0-rmse:0.23454
[22]	validation_0-rmse:0.20698
[23]	validation_0-rmse:0.18427
[24]	validation_0-rmse:0.16564
[25]	validation_0-rmse:0.15081
[26]	validation_0-rmse:0.13914
[27]	validation_0-rmse:0.12979
[28]	validation_0-rmse:0.12263
[29]	validation_0-rmse:0.11697
[30]	validation_0-rmse:0.11259
[31]	validation_0-rmse:0.10920
[32]	validation_0-

338

# Ronda 4: "subsample" y "colsample_bytree"


In [82]:
model = XGBRegressor()

params = {'random_state': [43],
          'n_estimators': [337],
          'eta': [0.15],
          'monotone_constraints':[{"weight (carat)": 1}],
          'subsample': np.linspace(0.5, 1.0, num=11),
          'colsample_bytree': np.linspace(0.5, 1.0, num=11)
            }

grid = GridSearchCV(estimator=model,
                    param_grid=params,
                    scoring=rmse,
                    verbose=4,
                    cv=kfolds
                    )

grid.fit(X_train, y_train)

y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}')
print(f'Best score: {grid.best_score_}')


Fitting 5 folds for each of 121 candidates, totalling 605 fits
[CV 1/5] END colsample_bytree=0.5, eta=0.15, monotone_constraints={'weight (carat)': 1}, n_estimators=337, random_state=43, subsample=0.5;, score=-0.094 total time=   5.7s
[CV 2/5] END colsample_bytree=0.5, eta=0.15, monotone_constraints={'weight (carat)': 1}, n_estimators=337, random_state=43, subsample=0.5;, score=-0.091 total time=   6.2s
[CV 3/5] END colsample_bytree=0.5, eta=0.15, monotone_constraints={'weight (carat)': 1}, n_estimators=337, random_state=43, subsample=0.5;, score=-0.089 total time=   6.6s
[CV 4/5] END colsample_bytree=0.5, eta=0.15, monotone_constraints={'weight (carat)': 1}, n_estimators=337, random_state=43, subsample=0.5;, score=-0.088 total time=   7.9s
[CV 5/5] END colsample_bytree=0.5, eta=0.15, monotone_constraints={'weight (carat)': 1}, n_estimators=337, random_state=43, subsample=0.5;, score=-0.090 total time=   5.8s
[CV 1/5] END colsample_bytree=0.5, eta=0.15, monotone_constraints={'weight (c

In [None]:
model = XGBRegressor(n_estimators=1000, eta=0.15, monotone_constraints={"weight (carat)": 1}, subsample=None, colsample_bytree=None)

model.early_stopping_rounds = 300

model.fit(X_train, y_train, eval_set=[(X_test, y_test)]) 

model.best_ntree_limit


# Ronda 5: "max_depth" y "min_child_weight"


In [58]:
model = XGBRegressor()

params = {'random_state': [43],
          'n_estimators': [],
          'eta': [0.15],
          'monotone_constraints':[{"weight (carat)": 1}],
          'subsample': [],
          'colsample_bytree': [],
          'max_depth': range(1, 11),
          'min_child_weight': range(1, 11)
            }

grid = GridSearchCV(estimator=model,
                    param_grid=params,
                    scoring=rmse,
                    verbose=4,
                    cv=kfolds
                    )

grid.fit(X_train, y_train)

y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}')
print(f'Best score: {grid.best_score_}')


Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 1/5] END max_depth=1, min_child_weight=1, monotone_constraints={'weight (carat)': 1}, n_estimators=195, random_state=43;, score=-0.136 total time=   0.8s
[CV 2/5] END max_depth=1, min_child_weight=1, monotone_constraints={'weight (carat)': 1}, n_estimators=195, random_state=43;, score=-0.134 total time=   0.6s
[CV 3/5] END max_depth=1, min_child_weight=1, monotone_constraints={'weight (carat)': 1}, n_estimators=195, random_state=43;, score=-0.129 total time=   0.6s
[CV 4/5] END max_depth=1, min_child_weight=1, monotone_constraints={'weight (carat)': 1}, n_estimators=195, random_state=43;, score=-0.133 total time=   0.6s
[CV 5/5] END max_depth=1, min_child_weight=1, monotone_constraints={'weight (carat)': 1}, n_estimators=195, random_state=43;, score=-0.131 total time=   0.6s
[CV 1/5] END max_depth=1, min_child_weight=2, monotone_constraints={'weight (carat)': 1}, n_estimators=195, random_state=43;, score=-0.136 total ti

In [None]:
model = XGBRegressor(random_state=43, n_estimators=1000, eta=0.15, monotone_constraints={"weight (carat)": 1}, subsample=None, colsample_bytree=None, max_depth=None, min_child_weight=None)

model.early_stopping_rounds = 300

model.fit(X_train, y_train, eval_set=[(X_test, y_test)]) 

model.best_ntree_limit


# Ronda 6: "num_parallel_tree"

In [None]:
model = XGBRegressor()

params = {'random_state': [43],
          'n_estimators': [],
          'eta': [0.15],
          'monotone_constraints':[{"weight (carat)": 1}],
          'subsample': [],
          'colsample_bytree': [],
          'max_depth': [],
          'min_child_weight': [],
          'num_parallel_tree': range(1, 6)
            }

grid = GridSearchCV(estimator=model,
                    param_grid=params,
                    scoring=rmse,
                    verbose=4,
                    cv=kfolds
                    )

grid.fit(X_train, y_train)

y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}')
print(f'Best score: {grid.best_score_}')


In [None]:
model = XGBRegressor(random_state=43, n_estimators=1000, eta=0.15, monotone_constraints={"weight (carat)": 1}, subsample=None, colsample_bytree=None, max_depth=None, min_child_weight=None, num_parallel_tree=None)

model.early_stopping_rounds = 300

model.fit(X_train, y_train, eval_set=[(X_test, y_test)]) 

model.best_ntree_limit


# Ronda 7: hiperparámetros de regularización

In [None]:
model = XGBRegressor()

params = {'random_state': [43],
          'n_estimators': [],
          'eta': [0.15],
          'monotone_constraints':[{"weight (carat)": 1}],
          'subsample': [],
          'colsample_bytree': [],
          'max_depth': [],
          'min_child_weight': [],
          'num_parallel_tree': [],
          'max_delta_step': range(0, 6),
          'gamma': np.linspace(0, 0.5, num=6),
          'lambda': np.linspace(0, 1, num=11),
          'reg_alpha': np.linspace(0, 0.5, num=6)
            }

grid = GridSearchCV(estimator=model,
                    param_grid=params,
                    scoring=rmse,
                    verbose=4,
                    cv=kfolds
                    )

grid.fit(X_train, y_train)

y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}')
print(f'Best score: {grid.best_score_}')


In [None]:
model = XGBRegressor(random_state=43, 
                        n_estimators=1000, 
                        eta=0.15,
                        monotone_constraints={"weight (carat)": 1}, 
                        subsample=None, 
                        colsample_bytree=None, 
                        max_depth=None, 
                        min_child_weight=None, 
                        num_parallel_tree=None,
                        max_delta_step=None,
                        gamma=None,
                        reg_lambda=None,
                        reg_alpha=None
                    )

model.early_stopping_rounds = 300

model.fit(X_train, y_train, eval_set=[(X_test, y_test)]) 

model.best_ntree_limit
