In [1]:
import os

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import make_scorer

# Make sure the kernel runs from the directory named "src": if the current
# working directory has any other name, move up to its parent (presumably the
# notebook lives one level below "src" -- confirm against the repo layout).
# Comparing the last path component avoids parsing the repr of a bytes slice
# and no longer matches directories that merely end in "src" (e.g. "mysrc").
if os.path.basename(os.getcwd()) != 'src':
    os.chdir(os.path.dirname(os.getcwd()))

from utils.modeling import *
from utils.functions import *


In [2]:
# Build the paths with os.path.join so the notebook also runs on POSIX
# systems; the previous backslash-separated raw strings only resolved on
# Windows. Both frames are indexed by their 'id' column.
df_diamonds = pd.read_csv(os.path.join('data', 'processed', 'diamonds_training.csv'), index_col='id')
df_predict = pd.read_csv(os.path.join('data', 'processed', 'diamonds_testing.csv'), index_col='id')


In [3]:
# XGBRegressor is the only model registered with the Regression helper here.
Regression.add_models(['XGBRegressor'])

# Shuffled 5-fold splitter shared by every grid search below.
kfolds = KFold(random_state=43, shuffle=True, n_splits=5)

# RMSE scorer for GridSearchCV. greater_is_better=False is why the fold scores
# in the search logs appear as negative numbers.
rmse = make_scorer(score_func=calculate_rmse, greater_is_better=False)


# Introducción

- Como se ha visto en "Feature_engineering", el mejor modelo a aplicar es "XGBRegressor"

- Si bien se le ha sacado bastante partido con varios parámetros, aún quedan muchas combinaciones por probar

- A continuación, se experimenta de forma extensiva con este modelo con tal de obtener los mejores resultados posibles

# Ronda 1: base
- Esta primera ronda se lleva a cabo solo para poder ir comparando las mejoras 

In [4]:
# Round 1 baseline: XGBRegressor with default hyperparameters on the
# untouched training data.
df_1 = df_diamonds.copy()

# Hand the copy (not df_diamonds itself) to Regression, as round 2 does with
# df_2, so the shared raw frame cannot be altered by the helper.
round_1 = Regression(df_1, 'price')
X_train, X_test, y_train, y_test = round_1.split_dataframe()
round_1.prepare_models(params_list=[['XGBRegressor', 'random_state=43']])
round_1.apply_and_evaluate_kfolds()
round_1.create_dataframe()


-- Regression: using mean of 5 folds --
Starting XGBRegressor: random_state=43:
- XGBRegressor: random_state=43 done in 9.3 sec(s). Total time: 9.3


Unnamed: 0,XGBRegressor: random_state=43
mae,0.065596
mape,0.008468
mse,0.008247
r2_score,0.991976
rmse,0.090762


# Ronda 2: solo cambios

- Similar a la anterior, esta ronda sirve de punto de partida

In [5]:
# Round 2 baseline: same model as round 1, after applying the two cleaning
# steps (remove_all with zeros_only=True, then assign_values with
# outlier=False). df_cambios is reused by later rounds.
df_cambios = assign_values(
    remove_all(df_diamonds.copy(), zeros_only=True),
    outlier=False,
)

df_2 = df_cambios.copy()

round_2 = Regression(df_2, 'price')
X_train, X_test, y_train, y_test = round_2.split_dataframe()
round_2.prepare_models(params_list=[['XGBRegressor', 'random_state=43']])
round_2.apply_and_evaluate_kfolds()
round_2.create_dataframe()


-- Regression: using mean of 5 folds --
Starting XGBRegressor: random_state=43:
- XGBRegressor: random_state=43 done in 5.43 sec(s). Total time: 5.43


Unnamed: 0,XGBRegressor: random_state=43
mae,0.065675
mape,0.008483
mse,0.008221
r2_score,0.992033
rmse,0.090657


# Ronda 3: "estimators"

- Se utiliza "gbtree" porque ofrece buenos resultados y es relativamente rápido

- Como es una cifra que puede variar mucho, se buscan los "estimators" primero

- Se encuentra el número de "estimators" óptimo para los valores por defecto


In [10]:
# Find the number of boosting rounds for the default hyperparameters.
# n_estimators is deliberately oversized; early stopping ends training once
# the validation RMSE has not improved for 300 consecutive rounds.
model = XGBRegressor(n_estimators=1000, early_stopping_rounds=300)

# NOTE(review): the held-out split (X_test, y_test) doubles as the
# early-stopping validation set here -- confirm this is intended.
model.fit(X_train, y_train, eval_set=[(X_test, y_test)])

# best_iteration is the 0-based index of the best round in the log above, so
# the number of trees is best_iteration + 1. This is the value the deprecated
# `best_ntree_limit` attribute reported (196 in the recorded run).
# NOTE(review): the original note subtracted 1 and the later rounds hard-code
# n_estimators=195. Since n_estimators counts trees, 196 may be the value
# that actually reproduces the best round -- confirm.
model.best_iteration + 1


[0]	validation_0-rmse:5.15340
[1]	validation_0-rmse:3.61090
[2]	validation_0-rmse:2.53117
[3]	validation_0-rmse:1.77563
[4]	validation_0-rmse:1.24698
[5]	validation_0-rmse:0.87780
[6]	validation_0-rmse:0.62082
[7]	validation_0-rmse:0.44256
[8]	validation_0-rmse:0.32017
[9]	validation_0-rmse:0.23730
[10]	validation_0-rmse:0.18313
[11]	validation_0-rmse:0.14890
[12]	validation_0-rmse:0.12836
[13]	validation_0-rmse:0.11667
[14]	validation_0-rmse:0.10953
[15]	validation_0-rmse:0.10557
[16]	validation_0-rmse:0.10331
[17]	validation_0-rmse:0.10162
[18]	validation_0-rmse:0.10088
[19]	validation_0-rmse:0.10018
[20]	validation_0-rmse:0.09962
[21]	validation_0-rmse:0.09922
[22]	validation_0-rmse:0.09867
[23]	validation_0-rmse:0.09845
[24]	validation_0-rmse:0.09814
[25]	validation_0-rmse:0.09774
[26]	validation_0-rmse:0.09759
[27]	validation_0-rmse:0.09737
[28]	validation_0-rmse:0.09688
[29]	validation_0-rmse:0.09675
[30]	validation_0-rmse:0.09661
[31]	validation_0-rmse:0.09601
[32]	validation_0-

196

# Ronda 4: "max_depth" y "min_child_weight"

- Estos dos influyen mucho, así que se buscan inmediatamente después de los "estimators"

In [11]:
# Round 4: joint search over tree depth and minimum child weight
# (18 x 10 = 180 candidates), with everything else held constant.
model = XGBRegressor()

held_constant = {
    'random_state': [43],
    'booster': ['gbtree'],
    'n_estimators': [195],
}
searched = {
    'max_depth': range(3, 21),
    'min_child_weight': range(1, 11),
}
params = {**held_constant, **searched}

grid = GridSearchCV(model, params, scoring=rmse, cv=kfolds, verbose=4)
grid.fit(X_train, y_train)

# Predictions of the refitted best estimator on the held-out split.
y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}', f'Best score: {grid.best_score_}', sep='\n')


Fitting 5 folds for each of 180 candidates, totalling 900 fits
[CV 1/5] END booster=gbtree, max_depth=3, min_child_weight=1, n_estimators=195, random_state=43;, score=-0.098 total time=   2.5s
[CV 2/5] END booster=gbtree, max_depth=3, min_child_weight=1, n_estimators=195, random_state=43;, score=-0.094 total time=   2.3s
[CV 3/5] END booster=gbtree, max_depth=3, min_child_weight=1, n_estimators=195, random_state=43;, score=-0.093 total time=   2.1s
[CV 4/5] END booster=gbtree, max_depth=3, min_child_weight=1, n_estimators=195, random_state=43;, score=-0.092 total time=   1.7s
[CV 5/5] END booster=gbtree, max_depth=3, min_child_weight=1, n_estimators=195, random_state=43;, score=-0.092 total time=   1.7s
[CV 1/5] END booster=gbtree, max_depth=3, min_child_weight=2, n_estimators=195, random_state=43;, score=-0.098 total time=   2.1s
[CV 2/5] END booster=gbtree, max_depth=3, min_child_weight=2, n_estimators=195, random_state=43;, score=-0.094 total time=   2.8s
[CV 3/5] END booster=gbtree

# Ronda 5: "gamma"

- "Gamma", que es un tanto menos importante que los dos anteriores, se localiza a continuación

In [12]:
# Round 5: sweep gamma from 0 to 6 in 61 evenly spaced steps, keeping the
# depth / child-weight values chosen in the previous round.
model = XGBRegressor()

fixed_values = {
    'random_state': 43,
    'booster': 'gbtree',
    'n_estimators': 195,
    'max_depth': 6,
    'min_child_weight': 2,
}
# GridSearchCV expects a list of candidates per key, so wrap each fixed value.
params = {key: [value] for key, value in fixed_values.items()}
params['gamma'] = np.linspace(0, 6, num=61)

grid = GridSearchCV(model, params, scoring=rmse, cv=kfolds, verbose=4)
grid.fit(X_train, y_train)

# Predictions of the refitted best estimator on the held-out split.
y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}', f'Best score: {grid.best_score_}', sep='\n')


Fitting 5 folds for each of 61 candidates, totalling 305 fits
[CV 1/5] END booster=gbtree, gamma=0.0, max_depth=6, min_child_weight=2, n_estimators=195, random_state=43;, score=-0.092 total time=   2.7s
[CV 2/5] END booster=gbtree, gamma=0.0, max_depth=6, min_child_weight=2, n_estimators=195, random_state=43;, score=-0.091 total time=   2.1s
[CV 3/5] END booster=gbtree, gamma=0.0, max_depth=6, min_child_weight=2, n_estimators=195, random_state=43;, score=-0.088 total time=   2.3s
[CV 4/5] END booster=gbtree, gamma=0.0, max_depth=6, min_child_weight=2, n_estimators=195, random_state=43;, score=-0.088 total time=   2.4s
[CV 5/5] END booster=gbtree, gamma=0.0, max_depth=6, min_child_weight=2, n_estimators=195, random_state=43;, score=-0.090 total time=   2.5s
[CV 1/5] END booster=gbtree, gamma=0.1, max_depth=6, min_child_weight=2, n_estimators=195, random_state=43;, score=-0.097 total time=   1.8s
[CV 2/5] END booster=gbtree, gamma=0.1, max_depth=6, min_child_weight=2, n_estimators=195, r

# Ronda 6: "subsample" y "colsample_bytree"
- Quizá más relevantes que "gamma", pero con un valor por defecto que suele ser razonable, estos dos parámetros son los siguientes de la lista

In [13]:
# Round 6: joint search over row and column subsampling ratios
# (11 x 11 = 121 candidates between 0.5 and 1.0).
model = XGBRegressor()

fixed_values = {
    'random_state': 43,
    'booster': 'gbtree',
    'n_estimators': 195,
    'max_depth': 6,
    'min_child_weight': 2,
    'gamma': 0,
}
# GridSearchCV expects a list of candidates per key, so wrap each fixed value.
params = {key: [value] for key, value in fixed_values.items()}
params['subsample'] = np.linspace(0.5, 1.0, num=11)
params['colsample_bytree'] = np.linspace(0.5, 1.0, num=11)

grid = GridSearchCV(model, params, scoring=rmse, cv=kfolds, verbose=4)
grid.fit(X_train, y_train)

# Predictions of the refitted best estimator on the held-out split.
y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}', f'Best score: {grid.best_score_}', sep='\n')


Fitting 5 folds for each of 121 candidates, totalling 605 fits
[CV 1/5] END booster=gbtree, colsample_bytree=0.5, gamma=0, max_depth=6, min_child_weight=2, n_estimators=195, random_state=43, subsample=0.5;, score=-0.097 total time=   2.2s
[CV 2/5] END booster=gbtree, colsample_bytree=0.5, gamma=0, max_depth=6, min_child_weight=2, n_estimators=195, random_state=43, subsample=0.5;, score=-0.095 total time=   1.6s
[CV 3/5] END booster=gbtree, colsample_bytree=0.5, gamma=0, max_depth=6, min_child_weight=2, n_estimators=195, random_state=43, subsample=0.5;, score=-0.092 total time=   1.6s
[CV 4/5] END booster=gbtree, colsample_bytree=0.5, gamma=0, max_depth=6, min_child_weight=2, n_estimators=195, random_state=43, subsample=0.5;, score=-0.093 total time=   1.8s
[CV 5/5] END booster=gbtree, colsample_bytree=0.5, gamma=0, max_depth=6, min_child_weight=2, n_estimators=195, random_state=43, subsample=0.5;, score=-0.093 total time=   3.5s
[CV 1/5] END booster=gbtree, colsample_bytree=0.5, gamma=

# Ronda 7: "sampling"

- Con lo obtenido de los parámetros anteriores, toca elegir el "sampling_method". Como "subsample" es superior a 0.5, solo hay que probar "uniform" y "gradient_based"
- No hay diferencia, pero "gradient_based" es algo más rápido

In [14]:
# Round 7: compare the two sampling methods with the other values fixed.
# NOTE(review): subsample is fixed at 1 here, which presumably leaves nothing
# for the sampling method to act on -- confirm the comparison is meaningful.
model = XGBRegressor()

fixed_values = {
    'random_state': 43,
    'booster': 'gbtree',
    'n_estimators': 195,
    'max_depth': 6,
    'min_child_weight': 2,
    'gamma': 0,
    'subsample': 1,
    'colsample_bytree': 0.7,
}
# GridSearchCV expects a list of candidates per key, so wrap each fixed value.
params = {key: [value] for key, value in fixed_values.items()}
params['sampling_method'] = ['uniform', 'gradient_based']

grid = GridSearchCV(model, params, scoring=rmse, cv=kfolds, verbose=4)
grid.fit(X_train, y_train)

# Predictions of the refitted best estimator on the held-out split.
y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}', f'Best score: {grid.best_score_}', sep='\n')


Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5] END booster=gbtree, colsample_bytree=0.7, gamma=0, max_depth=6, min_child_weight=2, n_estimators=195, random_state=43, sampling_method=uniform, subsample=1;, score=-0.093 total time=   2.1s
[CV 2/5] END booster=gbtree, colsample_bytree=0.7, gamma=0, max_depth=6, min_child_weight=2, n_estimators=195, random_state=43, sampling_method=uniform, subsample=1;, score=-0.090 total time=   1.4s
[CV 3/5] END booster=gbtree, colsample_bytree=0.7, gamma=0, max_depth=6, min_child_weight=2, n_estimators=195, random_state=43, sampling_method=uniform, subsample=1;, score=-0.088 total time=   1.5s
[CV 4/5] END booster=gbtree, colsample_bytree=0.7, gamma=0, max_depth=6, min_child_weight=2, n_estimators=195, random_state=43, sampling_method=uniform, subsample=1;, score=-0.087 total time=   1.6s
[CV 5/5] END booster=gbtree, colsample_bytree=0.7, gamma=0, max_depth=6, min_child_weight=2, n_estimators=195, random_state=43, sampling_method=

# Ronda 8: parámetros adicionales

- Se prueban varios otros parámetros a priori de importancia reducida

In [19]:
# Round 8: joint search over four secondary parameters
# (4 x 4 x 6 x 6 = 576 candidates).
model = XGBRegressor()

fixed_values = {
    'random_state': 43,
    'booster': 'gbtree',
    'n_estimators': 195,
    'max_depth': 6,
    'min_child_weight': 2,
    'gamma': 0,
    'subsample': 1,
    'colsample_bytree': 0.7,
    'sampling_method': 'gradient_based',
}
# GridSearchCV expects a list of candidates per key, so wrap each fixed value.
params = {key: [value] for key, value in fixed_values.items()}
params.update({
    'scale_pos_weight': range(0, 4),
    'max_delta_step': range(0, 4),
    # presumably 'lambda' is the L2 counterpart of 'reg_alpha' -- TODO confirm
    'lambda': np.linspace(0, 0.5, num=6),
    'reg_alpha': np.linspace(0, 0.5, num=6),
})

grid = GridSearchCV(model, params, scoring=rmse, cv=kfolds, verbose=4)
grid.fit(X_train, y_train)

# Predictions of the refitted best estimator on the held-out split.
y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}', f'Best score: {grid.best_score_}', sep='\n')


Fitting 5 folds for each of 576 candidates, totalling 2880 fits
[CV 1/5] END booster=gbtree, colsample_bytree=0.7, gamma=0, lambda=0.0, max_delta_step=0, max_depth=6, min_child_weight=2, n_estimators=195, random_state=43, reg_alpha=0.0, sampling_method=gradient_based, scale_pos_weight=0, subsample=1;, score=-0.093 total time=   1.8s
[CV 2/5] END booster=gbtree, colsample_bytree=0.7, gamma=0, lambda=0.0, max_delta_step=0, max_depth=6, min_child_weight=2, n_estimators=195, random_state=43, reg_alpha=0.0, sampling_method=gradient_based, scale_pos_weight=0, subsample=1;, score=-0.091 total time=   1.5s
[CV 3/5] END booster=gbtree, colsample_bytree=0.7, gamma=0, lambda=0.0, max_delta_step=0, max_depth=6, min_child_weight=2, n_estimators=195, random_state=43, reg_alpha=0.0, sampling_method=gradient_based, scale_pos_weight=0, subsample=1;, score=-0.090 total time=   3.4s
[CV 4/5] END booster=gbtree, colsample_bytree=0.7, gamma=0, lambda=0.0, max_delta_step=0, max_depth=6, min_child_weight=2, 

# Ronda 9: "eta"

- Se prueba la mejor "eta"

In [21]:
# Round 9 (coarse pass): 10 evenly spaced eta values across [0, 1].
# The grid includes eta=0.0, whose folds score about -7.35 in the log.
model = XGBRegressor()

fixed_values = {
    'random_state': 43,
    'booster': 'gbtree',
    'n_estimators': 195,
    'max_depth': 6,
    'min_child_weight': 2,
    'gamma': 0,
    'subsample': 1,
    'colsample_bytree': 0.7,
    'sampling_method': 'gradient_based',
    'scale_pos_weight': 0,
    'max_delta_step': 0,
    'lambda': 0.3,
    'reg_alpha': 0.3,
}
# GridSearchCV expects a list of candidates per key, so wrap each fixed value.
params = {key: [value] for key, value in fixed_values.items()}
params['eta'] = np.linspace(0, 1, num=10)

grid = GridSearchCV(model, params, scoring=rmse, cv=kfolds, verbose=4)
grid.fit(X_train, y_train)

# Predictions of the refitted best estimator on the held-out split.
y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}', f'Best score: {grid.best_score_}', sep='\n')


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END booster=gbtree, colsample_bytree=0.7, eta=0.0, gamma=0, lambda=0.3, max_delta_step=0, max_depth=6, min_child_weight=2, n_estimators=195, random_state=43, reg_alpha=0.3, sampling_method=gradient_based, scale_pos_weight=0, subsample=1;, score=-7.353 total time=   1.8s
[CV 2/5] END booster=gbtree, colsample_bytree=0.7, eta=0.0, gamma=0, lambda=0.3, max_delta_step=0, max_depth=6, min_child_weight=2, n_estimators=195, random_state=43, reg_alpha=0.3, sampling_method=gradient_based, scale_pos_weight=0, subsample=1;, score=-7.367 total time=   1.4s
[CV 3/5] END booster=gbtree, colsample_bytree=0.7, eta=0.0, gamma=0, lambda=0.3, max_delta_step=0, max_depth=6, min_child_weight=2, n_estimators=195, random_state=43, reg_alpha=0.3, sampling_method=gradient_based, scale_pos_weight=0, subsample=1;, score=-7.335 total time=   1.5s
[CV 4/5] END booster=gbtree, colsample_bytree=0.7, eta=0.0, gamma=0, lambda=0.3, max_delta_step=0, 

In [22]:
# Round 9 (fine pass): 100 evenly spaced eta values inside [0.01, 0.2].
model = XGBRegressor()

fixed_values = {
    'random_state': 43,
    'booster': 'gbtree',
    'n_estimators': 195,
    'max_depth': 6,
    'min_child_weight': 2,
    'gamma': 0,
    'subsample': 1,
    'colsample_bytree': 0.7,
    'sampling_method': 'gradient_based',
    'scale_pos_weight': 0,
    'max_delta_step': 0,
    'lambda': 0.3,
    'reg_alpha': 0.3,
}
# GridSearchCV expects a list of candidates per key, so wrap each fixed value.
params = {key: [value] for key, value in fixed_values.items()}
params['eta'] = np.linspace(0.01, 0.2, num=100)

grid = GridSearchCV(model, params, scoring=rmse, cv=kfolds, verbose=4)
grid.fit(X_train, y_train)

# Predictions of the refitted best estimator on the held-out split.
y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}', f'Best score: {grid.best_score_}', sep='\n')


Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 1/5] END booster=gbtree, colsample_bytree=0.7, eta=0.01, gamma=0, lambda=0.3, max_delta_step=0, max_depth=6, min_child_weight=2, n_estimators=195, random_state=43, reg_alpha=0.3, sampling_method=gradient_based, scale_pos_weight=0, subsample=1;, score=-1.047 total time=   2.9s
[CV 2/5] END booster=gbtree, colsample_bytree=0.7, eta=0.01, gamma=0, lambda=0.3, max_delta_step=0, max_depth=6, min_child_weight=2, n_estimators=195, random_state=43, reg_alpha=0.3, sampling_method=gradient_based, scale_pos_weight=0, subsample=1;, score=-1.051 total time=   1.8s
[CV 3/5] END booster=gbtree, colsample_bytree=0.7, eta=0.01, gamma=0, lambda=0.3, max_delta_step=0, max_depth=6, min_child_weight=2, n_estimators=195, random_state=43, reg_alpha=0.3, sampling_method=gradient_based, scale_pos_weight=0, subsample=1;, score=-1.043 total time=   2.0s
[CV 4/5] END booster=gbtree, colsample_bytree=0.7, eta=0.01, gamma=0, lambda=0.3, max_delta_st

# Ronda 10: reajuste general

- Se supone que los valores óptimos obtenidos al probar los hiperparámetros por separado pueden haber oscilado ligeramente al aplicarse de manera consecutiva

- Por tanto, se prueban todos a la vez con un rango reducido para cada uno

In [6]:
# Round 10: re-tune all hyperparameters jointly over narrow ranges
# (5832 candidates, 29160 fits according to the log).
model = XGBRegressor()

# Parameters pinned to a single value.
held_constant = {
    'random_state': [43],
    'booster': ['gbtree'],
    'subsample': [1],
    'sampling_method': ['gradient_based'],
    'eta': [0.18272727272727274],
}
# Parameters explored around the values found in the earlier rounds.
# NOTE(review): max_delta_step searches [1, 2, 3] although round 9 fixed it
# at 0, so the previous setting is not among the candidates -- confirm.
searched = {
    'n_estimators': [194, 195, 196],
    'max_depth': [5, 6, 7],
    'min_child_weight': [2, 3, 4],
    'gamma': [0, 0.1],
    'colsample_bytree': [0.7, 0.8],
    'scale_pos_weight': [0, 1],
    'max_delta_step': [1, 2, 3],
    'lambda': [0.2, 0.3, 0.4],
    'reg_alpha': [0.2, 0.3, 0.4],
}
params = {**held_constant, **searched}

grid = GridSearchCV(model, params, scoring=rmse, cv=kfolds, verbose=4)
grid.fit(X_train, y_train)

# Predictions of the refitted best estimator on the held-out split.
y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}', f'Best score: {grid.best_score_}', sep='\n')


Fitting 5 folds for each of 5832 candidates, totalling 29160 fits
[CV 1/5] END booster=gbtree, colsample_bytree=0.7, eta=0.18272727272727274, gamma=0, lambda=0.2, max_delta_step=1, max_depth=5, min_child_weight=2, n_estimators=194, random_state=43, reg_alpha=0.2, sampling_method=gradient_based, scale_pos_weight=0, subsample=1;, score=-0.094 total time=   1.1s
[CV 2/5] END booster=gbtree, colsample_bytree=0.7, eta=0.18272727272727274, gamma=0, lambda=0.2, max_delta_step=1, max_depth=5, min_child_weight=2, n_estimators=194, random_state=43, reg_alpha=0.2, sampling_method=gradient_based, scale_pos_weight=0, subsample=1;, score=-0.091 total time=   1.4s
[CV 3/5] END booster=gbtree, colsample_bytree=0.7, eta=0.18272727272727274, gamma=0, lambda=0.2, max_delta_step=1, max_depth=5, min_child_weight=2, n_estimators=194, random_state=43, reg_alpha=0.2, sampling_method=gradient_based, scale_pos_weight=0, subsample=1;, score=-0.088 total time=   1.0s
[CV 4/5] END booster=gbtree, colsample_bytree=

# Ronda 11: sustitución

- Aunque la sustitución de valores de "depth (percentage)" por los resultados reales que da el cálculo a partir de las columnas correspondientes no ha dado buenos resultados durante el "feature engineering", quizá ahora mejore con los nuevos hiperparámetros

In [None]:
# Round 11: replace the recorded depth percentage with the value recomputed
# from the measurements (depth divided by the mean of length and width,
# times 100), then rerun the round-10 grid on the resulting frame.
df_11 = df_cambios.copy()

mean_length_width = (df_11['lenght (millimeters)'] + df_11['width (millimeters)']) / 2
df_11['depth (percentage)'] = (df_11['depth (millimeters)'] / mean_length_width) * 100

round_11 = Regression(df_11, 'price')
X_train, X_test, y_train, y_test = round_11.split_dataframe()

model = XGBRegressor()

# Parameters pinned to a single value.
held_constant = {
    'random_state': [43],
    'booster': ['gbtree'],
    'subsample': [1],
    'sampling_method': ['gradient_based'],
    'eta': [0.18272727272727274],
}
# Parameters explored over narrow ranges.
searched = {
    'n_estimators': [194, 195, 196],
    'max_depth': [5, 6, 7],
    'min_child_weight': [2, 3, 4],
    'gamma': [0, 0.1],
    'colsample_bytree': [0.7, 0.8],
    'scale_pos_weight': [0, 1],
    'max_delta_step': [1, 2, 3],
    'lambda': [0.2, 0.3, 0.4],
    'reg_alpha': [0.2, 0.3, 0.4],
}
params = {**held_constant, **searched}

grid = GridSearchCV(model, params, scoring=rmse, cv=kfolds, verbose=4)
grid.fit(X_train, y_train)

# Predictions of the refitted best estimator on the held-out split.
y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}', f'Best score: {grid.best_score_}', sep='\n')


# Ronda 12: sin borrado

- Asimismo, se prueba sin el borrado

In [None]:
# Round 12: apply only assign_values (no remove_all step) to the raw data and
# rerun the round-10 grid.
df_12 = assign_values(df_diamonds.copy(), outlier=False)

round_12 = Regression(df_12, 'price')
X_train, X_test, y_train, y_test = round_12.split_dataframe()

model = XGBRegressor()

# Parameters pinned to a single value.
held_constant = {
    'random_state': [43],
    'booster': ['gbtree'],
    'subsample': [1],
    'sampling_method': ['gradient_based'],
    'eta': [0.18272727272727274],
}
# Parameters explored over narrow ranges.
searched = {
    'n_estimators': [194, 195, 196],
    'max_depth': [5, 6, 7],
    'min_child_weight': [2, 3, 4],
    'gamma': [0, 0.1],
    'colsample_bytree': [0.7, 0.8],
    'scale_pos_weight': [0, 1],
    'max_delta_step': [1, 2, 3],
    'lambda': [0.2, 0.3, 0.4],
    'reg_alpha': [0.2, 0.3, 0.4],
}
params = {**held_constant, **searched}

grid = GridSearchCV(model, params, scoring=rmse, cv=kfolds, verbose=4)
grid.fit(X_train, y_train)

# Predictions of the refitted best estimator on the held-out split.
y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}', f'Best score: {grid.best_score_}', sep='\n')

# Ronda 13: sin asignación

- Igual que en el caso anterior, pero esta vez sin la asignación

In [None]:
# Round 13: apply only remove_all (no assign_values step) to the raw data and
# rerun the round-10 grid.
df_13 = remove_all(df_diamonds.copy(), zeros_only=True)

round_13 = Regression(df_13, 'price')
X_train, X_test, y_train, y_test = round_13.split_dataframe()

model = XGBRegressor()

# Parameters pinned to a single value.
held_constant = {
    'random_state': [43],
    'booster': ['gbtree'],
    'subsample': [1],
    'sampling_method': ['gradient_based'],
    'eta': [0.18272727272727274],
}
# Parameters explored over narrow ranges.
searched = {
    'n_estimators': [194, 195, 196],
    'max_depth': [5, 6, 7],
    'min_child_weight': [2, 3, 4],
    'gamma': [0, 0.1],
    'colsample_bytree': [0.7, 0.8],
    'scale_pos_weight': [0, 1],
    'max_delta_step': [1, 2, 3],
    'lambda': [0.2, 0.3, 0.4],
    'reg_alpha': [0.2, 0.3, 0.4],
}
params = {**held_constant, **searched}

grid = GridSearchCV(model, params, scoring=rmse, cv=kfolds, verbose=4)
grid.fit(X_train, y_train)

# Predictions of the refitted best estimator on the held-out split.
y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}', f'Best score: {grid.best_score_}', sep='\n')

# Ronda 14: asignación y borrado modelados con "ridge"

- Utilizando una técnica parecida a la usada en la fase de "Imputaciones 'ridge'" en el "feature engineering" para imputar los valores atípicos de "depth", sería posible "imputar" todos los valores 0 de las variables que tienen que ver con el tamaño de los diamantes a partir de una predicción. Ello serviría como alternativa a los cambios de borrado y asignación

In [None]:
# Round 14 pre-processing: replace every 0 in the three size columns with a
# Ridge prediction computed from the other two size columns and the weight.
df_14 = df_diamonds.copy()

cols = ['width (millimeters)', 'lenght (millimeters)', 'depth (millimeters)']
df = df_14[cols + ['weight (carat)']].copy()

# Rows where none of the three size columns is 0. These are the only rows
# used for fitting, and the set is the same for every target column.
complete_rows = (df[cols] != 0).all(axis=1)

# Iterate over the columns directly. The previous version popped from and
# re-inserted into `cols` while enumerating it, which only visited each
# column once by coincidence.
for col in cols:
    zero_rows = df[col] == 0
    if not zero_rows.any():
        # Nothing to impute; predicting on an empty frame would raise.
        continue

    df_train = df[complete_rows]
    df_test = df[zero_rows]
    X_train = df_train.drop(columns=col)
    y_train = df_train[col]
    x_test = df_test.drop(columns=col)
    y_test = df_test[col]

    ridge = Ridge()
    ridge.fit(X_train, y_train)
    y_pred = ridge.predict(x_test)

    # `df` is never modified, so each column is predicted from the original
    # values of the other columns (including any zeros they contain).
    df_14.loc[zero_rows, col] = y_pred


In [None]:
# Round 14: rerun the round-10 grid on the frame whose zero-valued size
# measurements were replaced by Ridge predictions.
round_14 = Regression(df_14, 'price')
X_train, X_test, y_train, y_test = round_14.split_dataframe()

model = XGBRegressor()

# Parameters pinned to a single value.
held_constant = {
    'random_state': [43],
    'booster': ['gbtree'],
    'subsample': [1],
    'sampling_method': ['gradient_based'],
    'eta': [0.18272727272727274],
}
# Parameters explored over narrow ranges.
searched = {
    'n_estimators': [194, 195, 196],
    'max_depth': [5, 6, 7],
    'min_child_weight': [2, 3, 4],
    'gamma': [0, 0.1],
    'colsample_bytree': [0.7, 0.8],
    'scale_pos_weight': [0, 1],
    'max_delta_step': [1, 2, 3],
    'lambda': [0.2, 0.3, 0.4],
    'reg_alpha': [0.2, 0.3, 0.4],
}
params = {**held_constant, **searched}

grid = GridSearchCV(model, params, scoring=rmse, cv=kfolds, verbose=4)
grid.fit(X_train, y_train)

# Predictions of the refitted best estimator on the held-out split.
y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}', f'Best score: {grid.best_score_}', sep='\n')


# Ronda 15: ajuste final