## <font color="orange">*1. Librerías*</font>

In [None]:
#!pip install lazypredict

In [None]:
#!pip install catboost

In [None]:
#!pip install optuna

In [51]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle

from lazypredict.Supervised import LazyRegressor

from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor, VotingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from sklearn.preprocessing  import MinMaxScaler

In [5]:
import optuna
import logging

# Silence Optuna's per-trial log output: only messages at ERROR level are emitted.
optuna.logging.set_verbosity(optuna.logging.ERROR)

In [6]:
# Mount Google Drive at /content/drive (Colab-only API); the pickled datasets
# read later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## <font color="orange">*2. Dataset*</font>

In [7]:
# Load the pre-processed training frame from Drive.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# that come from a trusted source.
train_cleaned = pd.read_pickle('/content/drive/MyDrive/train_cleaned.pkl')

In [8]:
# Column selection: the two scaled id columns, the mean-encoded label and the
# columns named 0..33 (column 33 is later used as the regression target).
train_columns = ['shop_id_scal', 'item_id_scal', 'label_mean_encoder', *range(34)]

# Training dataframe restricted to the selected columns
data = train_cleaned.loc[:, train_columns]

## <font color="orange">*3. LazyPredict*</font>


Se utiliza LazyRegressor como otra buena alternativa donde se pueden ver los diferentes modelos de predicción y sus métricas.

Se recorta la cantidad de datos para evitar la saturación de la RAM.

In [9]:
# Cut the data down to its first 1/30 of rows so LazyPredict fits in RAM.
# Despite its name, `midpoint` is the size of that leading slice, not the
# middle of the frame.
midpoint = data.shape[0] // 30

train = data.head(midpoint)

In [10]:
# Number of rows kept in the reduced training frame
len(train)

14136

In [11]:
# Features: every selected column except the one named 33.
X = train.iloc[:, (train.columns != 33)].values
# Target: column 33 as a 1-D array. Selecting it with a boolean column mask
# produced an (n, 1) column vector, which scikit-learn estimators only accept
# with a conversion warning.
y = train[33].values

# Shuffle rows of X and y together with a fixed seed.
X, y = shuffle(X, y, random_state=13)

In [12]:
# Hold out 25% of the reduced data for evaluation (seeded split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=13)

In [13]:
# Initialise LazyRegressor (quiet output, warnings ignored, no custom metric)
reg = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)
# Fit every candidate regressor on the training split and score it on the held-out split.
# NOTE(review): the second return value, bound to `predictions`, is displayed
# further down as the per-model metrics table rather than raw predictions —
# confirm against the lazypredict docs what `fit` returns by default.
models, predictions = reg.fit(X_train, X_test, y_train, y_test)

100%|██████████| 42/42 [02:19<00:00,  3.33s/it]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002885 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 730
[LightGBM] [Info] Number of data points in the train set: 10602, number of used features: 36
[LightGBM] [Info] Start training from score 0.003755





In [14]:
# Display the table returned by LazyRegressor (one row per model, with R², RMSE and fit time).
predictions

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HistGradientBoostingRegressor,0.26,0.27,0.02,0.59
GradientBoostingRegressor,0.26,0.27,0.02,1.85
LGBMRegressor,0.22,0.23,0.02,0.26
KNeighborsRegressor,0.16,0.17,0.02,0.29
TweedieRegressor,0.13,0.14,0.02,0.14
ElasticNetCV,0.12,0.13,0.02,0.19
LarsCV,0.12,0.13,0.02,0.19
LassoLarsCV,0.12,0.13,0.02,0.23
LassoCV,0.12,0.13,0.02,0.4
OrthogonalMatchingPursuitCV,0.09,0.1,0.02,0.05


## <font color="orange">*4. Modelos Ensamblados*</font>

Se utiliza todo el conjunto de datos.

Los ensambles combinan los siguientes modelos predictivos de las siguientes maneras:

HistGradientBoostingRegressor + XGBRegressor + CatBoostRegressor
  - Promedios Ponderados
  - VotingRegressor
  - MetaModelo con GradientBoostingRegressor

### Resultado:
  - según las métricas RMSE, el mejor modelo ensamblado es el Promedio Ponderado con pesos optimizados con Optuna, inclusive es mejor el modelo XGBoost del archivo XGBoost+Optuna_predict.ipynb

In [15]:
# Use the whole dataset this time.
# Features: every selected column except the one named 33.
X = data.iloc[:, (data.columns != 33)].values
# Target: column 33 as a 1-D array. The boolean-mask selection used before
# yielded an (n, 1) column vector, which scikit-learn estimators only accept
# with a conversion warning.
y = data[33].values

# Shuffle rows of X and y together with a fixed seed.
X, y = shuffle(X, y, random_state=13)

In [16]:
# Hold out 25% of the full data for evaluation (seeded split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=13)

### 4.1. Modelos HistGradientBoostingRegressor, XGBRegressor y CatBoostRegressor

In [17]:
# NOTE(review): this list is never passed to the estimator below, and X_train is
# a plain NumPy array without column names, so both id columns are treated as
# ordinary numeric features.
categorical_features = ['shop_id_scal', 'item_id_scal']

# Train HistGradientBoostingRegressor
hgbr_model = HistGradientBoostingRegressor(
    learning_rate=0.01,
    min_samples_leaf=60,
    max_iter=100,
    random_state=42,
)

# scikit-learn expects a 1-D target; np.ravel is a no-op for 1-D input and
# avoids the column-vector conversion warning for (n, 1) input.
hgbr_model.fit(X_train, np.ravel(y_train))

# HistGradientBoostingRegressor predictions on the held-out split
hgbr_predictions = hgbr_model.predict(X_test)

In [18]:
# Train XGBRegressor.
# `early_stopping_rounds` is configured on the estimator: passing it to `fit()`
# is deprecated and rejected by recent xgboost releases.
xgb_model = XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
    max_depth=5,
    random_state=13,
    early_stopping_rounds=50,
)

# NOTE(review): the held-out split doubles as the early-stopping set, so the
# RMSE reported on it later is optimistic.
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=False,
)

# The VotingRegressor cells clone this estimator and refit it WITHOUT an
# eval_set; a clone that still carried early stopping would fail there. Clearing
# the parameter keeps the clones identical to the ones the original code
# produced, while the fitted booster keeps its best iteration for prediction.
xgb_model.set_params(early_stopping_rounds=None)

# XGBRegressor predictions on the held-out split
xgb_predictions = xgb_model.predict(X_test)

In [19]:
# Train CatBoost
catboost_params = dict(depth=6, eval_metric='RMSE', random_seed=13)
catboost_model = CatBoostRegressor(**catboost_params)

# NOTE(review): the held-out split is also the early-stopping set, so the RMSE
# reported on it later is optimistic.
catboost_model.fit(X_train, y_train,
                   eval_set=(X_test, y_test),
                   early_stopping_rounds=50,
                   verbose=False)

# CatBoost predictions on the held-out split
catboost_predictions = catboost_model.predict(X_test)

In [20]:
# Evaluate each model on its own.
# RMSE is computed as sqrt(MSE): the `squared=False` flag of mean_squared_error
# is deprecated and removed in recent scikit-learn releases.
rmse_hgbr = np.sqrt(mean_squared_error(y_test, hgbr_predictions))
rmse_xgb = np.sqrt(mean_squared_error(y_test, xgb_predictions))
rmse_cat = np.sqrt(mean_squared_error(y_test, catboost_predictions))
print("RMSE de HistGradientBoostingRegressor:", rmse_hgbr)
print("RMSE de XGBRegressor:", rmse_xgb)
print("RMSE de CatBoost:", rmse_cat)

RMSE de HistGradientBoostingRegressor: 0.05810694116447235
RMSE de XGBRegressor: 0.05802786606145431
RMSE de CatBoost: 0.05417657502402026


### 4.2. Ensambles
 1. Promedios Ponderados
 2. VotingRegressor
 3. MetaModelo

#### 4.2.1. Promedios Ponderados

In [21]:
# Ensemble by weighted average of the three models' predictions
# (equal weights here, i.e. a plain mean).
weights = [1, 1, 1]
total_weight = sum(weights)
ensemble_predictions = ((weights[0] * hgbr_predictions) +
                        (weights[1] * xgb_predictions) +
                        (weights[2] * catboost_predictions)
                       ) / total_weight

# RMSE as sqrt(MSE): the `squared=False` flag of mean_squared_error is
# deprecated and removed in recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test, ensemble_predictions))
print("RMSE Promedios:", rmse)

RMSE Promedios: 0.0542839894303059


In [22]:
# Objective function minimised by Optuna
def objective(trial):
    """Return the hold-out RMSE of a weighted average of the three models.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the raw weights ``w1``, ``w2`` and ``w3``,
        each in [0.1, 2.0].

    Returns
    -------
    float
        RMSE between ``y_test`` and the weighted average of
        ``hgbr_predictions``, ``xgb_predictions`` and ``catboost_predictions``
        (all read from the notebook's global state).
    """
    # Search range for each raw weight
    w1 = trial.suggest_float('w1', 0.1, 2.0)
    w2 = trial.suggest_float('w2', 0.1, 2.0)
    w3 = trial.suggest_float('w3', 0.1, 2.0)

    # The weights are divided by their sum, so only their ratios matter.
    weights = [w1, w2, w3]
    total_weight = sum(weights)
    ensemble_predictions = ((weights[0] * hgbr_predictions) +
                            (weights[1] * xgb_predictions) +
                            (weights[2] * catboost_predictions)
                            ) / total_weight

    # RMSE as sqrt(MSE): the `squared=False` flag of mean_squared_error is
    # deprecated and removed in recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_test, ensemble_predictions))

    return rmse

# Create the Optuna study. The sampler is seeded so that the selected weights
# (and everything derived from them below) are reproducible across runs.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=13),
)

# Run the optimisation.
# NOTE(review): the weights are tuned on the same hold-out split that is used
# to report the RMSE, so the reported value is optimistic.
study.optimize(objective, n_trials=100)

# Best weights found and the RMSE they achieve
best_weights = study.best_params
best_rmse = study.best_value

print("Mejores pesos:", best_weights)
print("RMSE Ponderaciones optimizadas:", best_rmse)

Mejores pesos: {'w1': 0.4989865925060802, 'w2': 0.3643185502991042, 'w3': 1.9555637688751761}
RMSE Ponderaciones optimizadas: 0.05382678914722075


#### 4.2.2. VotingRegressor

In [23]:
# Build the voting ensemble (equal weights).
# Note: the first estimator is labelled 'rf' but is the HistGradientBoosting model.
ensemble = VotingRegressor(estimators=[
                                        ('rf', hgbr_model),
                                        ('xgb', xgb_model),
                                        ('cat', catboost_model)
                                      ], weights=[1, 1, 1], n_jobs=-1)

# Fit the ensemble. VotingRegressor refits clones of the three estimators on
# (X_train, y_train) with no eval_set, i.e. without the early stopping used when
# the models were trained individually. The target is passed 1-D to avoid
# scikit-learn's column-vector conversion warning.
ensemble.fit(X_train, np.ravel(y_train))

# Predict and evaluate. RMSE is sqrt(MSE): the `squared=False` flag of
# mean_squared_error is deprecated and removed in recent scikit-learn releases.
y_pred = ensemble.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE V-Regressor:", rmse)

RMSE V-Regressor: 0.054741157861567184


In [24]:
# Build the VotingRegressor with the weights found by Optuna.
# Note: the first estimator is labelled 'rf' but is the HistGradientBoosting model.
final_voting_regressor = VotingRegressor(
    estimators=[
        ('rf', hgbr_model),
        ('xgb', xgb_model),
        ('cat', catboost_model)
    ],
    weights=[best_weights['w1'], best_weights['w2'], best_weights['w3']],
    n_jobs=-1
)
# Fit the ensemble. The three estimators are cloned and refitted here without an
# eval_set, so the weights (tuned on the individually trained models) are
# applied to differently trained models. The target is passed 1-D to avoid
# scikit-learn's column-vector conversion warning.
final_voting_regressor.fit(X_train, np.ravel(y_train))

# Final predictions and evaluation. RMSE is sqrt(MSE): the `squared=False` flag
# of mean_squared_error is deprecated and removed in recent scikit-learn releases.
final_y_pred = final_voting_regressor.predict(X_test)
final_rmse = np.sqrt(mean_squared_error(y_test, final_y_pred))

print("RMSE V-Regressor optimizado:", final_rmse)

RMSE V-Regressor optimizado: 0.05453406341338416


#### 4.2.3. MetaModelo

In [25]:
# Meta-model inputs: each model's hold-out predictions scaled by its share of
# the Optuna weights (weights normalised to sum to 1), plus the true label.
weight_sum = sum(best_weights.values())

dict_val = {
    'hgbr_predictions': np.array(hgbr_predictions).reshape(-1) * (best_weights['w1'] / weight_sum),
    'xgb_predictions': np.array(xgb_predictions).reshape(-1) * (best_weights['w2'] / weight_sum),
    'catb_predictions': np.array(catboost_predictions).reshape(-1) * (best_weights['w3'] / weight_sum),
    'label': np.array(y_test).reshape(-1),
}

In [26]:
# One row per hold-out sample: the three weighted model predictions and the label.
val_data = pd.DataFrame(dict_val)

In [27]:
# Inspect only the rows whose true label is strictly positive.
val_data.loc[val_data['label'] > 0]

Unnamed: 0,hgbr_predictions,xgb_predictions,catb_predictions,label
0,0.00,0.00,-0.00,0.04
9,0.01,0.00,0.02,0.09
15,0.00,0.00,0.01,0.04
22,0.00,0.00,0.02,0.04
35,0.00,0.00,0.00,0.04
...,...,...,...,...
105977,0.01,0.03,0.26,0.75
105989,0.00,0.00,0.01,0.09
105992,0.00,0.00,0.01,0.09
106010,0.00,0.00,0.01,0.04


In [28]:
# Meta-model design matrix (the three weighted predictions) and target (label).
# NOTE(review): this rebinds X_train / y_train, so the base-model training data
# is no longer available under those names; re-running earlier cells after this
# one would train on the meta-features instead.
X_train = val_data.drop(columns='label').values
y_train = val_data[['label']].values

In [29]:
# Meta-model: GradientBoostingRegressor on the base models' predictions.
# A fixed random_state keeps the cross-validation result reproducible.
metamodel = GradientBoostingRegressor(random_state=13)

# 5-fold cross-validation. The target is flattened to 1-D because scikit-learn
# emits a conversion warning on every fold for an (n, 1) column vector.
cv_scores = cross_val_score(metamodel, X_train, np.ravel(y_train), cv=5,
                            scoring='neg_mean_squared_error', verbose=False)

# Scores are negative MSE: negate the mean and take the square root to get RMSE.
cv_rmse = np.sqrt(-cv_scores.mean())
print(f"Cross-validated RMSE Metamodelo: {cv_rmse}")

Cross-validated RMSE Metamodelo: 0.05776142478519193


## <font color="orange">*5. Set de prueba*</font>

Predicción del mejor modelo en el set de prueba "*test_pred*".

In [31]:
# Load the pre-processed test frame from Drive.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# that come from a trusted source.
test_pred = pd.read_pickle('/content/drive/MyDrive/test_pred.pkl')

In [32]:
# Preview the first rows of the test frame.
test_pred.head()

Unnamed: 0,ID,shop_id_scal,item_id_scal,label_mean_encoder,label_embedding,-1,0,1,2,3,...,23,24,25,26,27,28,29,30,31,32
0,0,8.47,22.72,0.4,"[-0.016664708, -0.0076228143, -0.024167625, -0...",0.0,0.0,0.0,0.0,0.0,...,0.18,0.0,0.0,0.0,0.13,0.23,0.21,0.69,0.06,0.0
1,1,8.47,24.0,0.13,"[-0.016664708, -0.0076228143, -0.024167625, -0...",0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,8.47,23.61,0.4,"[-0.016664708, -0.0076228143, -0.024167625, -0...",0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.4,0.45,0.0,0.23,0.18,0.04
3,3,8.47,23.6,0.4,"[-0.016664708, -0.0076228143, -0.024167625, -0...",0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.23,0.0,0.0
4,4,8.47,23.76,0.4,"[-0.016664708, -0.0076228143, -0.024167625, -0...",0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
# Select the test features explicitly, in the same order as the training
# features (every entry of train_columns except the target column 33).
# Dropping 'ID', 'label_embedding' and -1 and trusting the remaining column
# order would silently misalign features if the frame's layout ever changed;
# selecting by name fails loudly on a missing column instead.
feature_columns = [col for col in train_columns if col != 33]
X_test = test_pred[feature_columns].values

In [44]:
# Predictions of the three base models on the test-set features.
# NOTE(review): these rebind the names that previously held the hold-out
# predictions, so re-running the evaluation cells after this one would compare
# them against y_test of a different length.
hgbr_predictions = hgbr_model.predict(X_test)
xgb_predictions = xgb_model.predict(X_test)
catboost_predictions = catboost_model.predict(X_test)

In [50]:
# Weighted average of the three models using the Optuna-optimised weights
total_weight = sum(best_weights.values())

weighted_sum = best_weights['w1'] * hgbr_predictions
weighted_sum = weighted_sum + best_weights['w2'] * xgb_predictions
weighted_sum = weighted_sum + best_weights['w3'] * catboost_predictions

ensemble_predictions = weighted_sum / total_weight

Los valores deben ajustarse del 0 al 20 según lo especificado en el desafío de Kaggle.

In [52]:
# Scaler that maps its input linearly onto the range [0, 20].
# NOTE(review): min-max scaling sends the smallest prediction to 0 and the
# largest to 20 whatever their actual magnitude. If the challenge only requires
# values bounded to [0, 20], inverting the target's original scaling and then
# clipping may be what is intended — confirm against the challenge rules.
scaler = MinMaxScaler(feature_range=(0, 20))

# MinMaxScaler works on 2-D input, so turn the 1-D predictions into one column.
predictions_reshaped = ensemble_predictions.reshape(-1, 1)

# Fit the scaler on the predictions themselves, then rescale them.
predictions_scaled = scaler.fit(predictions_reshaped).transform(predictions_reshaped)