## <font color="orange">*1. Librerías*</font>

In [None]:
#!pip install xgboost

In [None]:
#!pip install optuna

In [None]:
import numpy as np
import pandas as pd

from gensim.models import Word2Vec

from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import xgboost as xgb

In [None]:
import optuna
import logging  # NOTE(review): not referenced in the visible cells — confirm it is still needed

# Restrict Optuna's own logger to ERROR level
optuna.logging.set_verbosity(optuna.logging.ERROR)

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the pickled datasets are read from
# /content/drive/MyDrive further down
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## <font color="orange">*2. Función de optimización*</font>

In [None]:
def objective(trial, X_train, y_train, X_test, y_test):
    '''
    Optuna objective: train an XGBoost regressor and score it on the
    evaluation split.

    Only ``min_child_weight`` is sampled from the trial; every other
    hyperparameter is fixed inside this function.

    Args:
        trial: Optuna trial used to suggest ``min_child_weight``.
        X_train: training features.
        y_train: training target.
        X_test: evaluation features.
        y_test: evaluation target.
    Return:
        rmse: root mean squared error of the predictions on
              (X_test, y_test), as a Python float.
    '''
    # Fixed parameters plus the single value being tuned
    params = {
              'objective': 'reg:squarederror',  # squared-error regression loss
              'eval_metric': 'rmse',  # metric reported for each entry of `evals`
              'max_depth': 5,
              'subsample': 1,
              'min_child_weight': trial.suggest_float('min_child_weight', 0.9, 1, log=True),
              'alpha': 0,
              'gamma': 0,
              'lambda':0
             }

    # Wrap both splits in DMatrix, the container xgb.train works with
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # xgboost applies early stopping to the LAST entry of `evals`, so the
    # held-out set has to come last; with the training set last, the
    # training loss would be monitored instead.
    evals = [(dtrain, 'train'), (dtest, 'eval')]

    # Train with early stopping on the evaluation split
    model = xgb.train(params, dtrain, num_boost_round=100, evals=evals,
                      early_stopping_rounds=20, verbose_eval=False)

    # Predict with the trees up to the best iteration only, so the score does
    # not depend on the xgboost version's default prediction range.
    predictions = model.predict(dtest, iteration_range=(0, model.best_iteration + 1))

    # mean_squared_error returns the MSE; take the square root to get the
    # RMSE that this function is documented to return.
    rmse = float(np.sqrt(mean_squared_error(y_test, predictions)))

    return rmse

## <font color="orange">*3. Dataset*</font>


In [None]:
# Load the training frame from Google Drive.
# NOTE(review): unpickling executes arbitrary code — only load files you produced yourself.
train_cleaned = pd.read_pickle('/content/drive/MyDrive/train_cleaned.pkl')

## <font color="orange">*4. División de los datos de entrenamiento*</font>

Entrenamiento y validación con Target/MeanEncoder y Embedding

In [None]:
# Both training frames share the scaled id columns and the integer-named
# columns 0..33; they differ only in which label encoding they carry.
id_columns = ['shop_id_scal', 'item_id_scal']
numbered_columns = [*range(34)]

train_1_columns = [*id_columns, 'label_mean_encoder', *numbered_columns]
train_2_columns = [*id_columns, 'label_embedding', *numbered_columns]

# One frame per encoding (1: mean encoder, 2: embedding)
train_1 = train_cleaned.loc[:, train_1_columns]
train_2 = train_cleaned.loc[:, train_2_columns]

In [None]:
# Preview the first rows of the mean-encoder frame
train_1.head()

Unnamed: 0,shop_id_scal,item_id_scal,label_mean_encoder,0,1,2,3,4,5,6,...,24,25,26,27,28,29,30,31,32,33
0,0.0,0.135324,0.11366,0.0,23.846154,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.139835,0.11366,0.0,8.461538,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.144346,0.11366,3.550296,7.692308,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.148857,0.11366,1.775148,2.307692,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.157878,0.11366,0.591716,10.769231,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Preview the first rows of the embedding frame
train_2.head()

Unnamed: 0,shop_id_scal,item_id_scal,label_embedding,0,1,2,3,4,5,6,...,24,25,26,27,28,29,30,31,32,33
0,0.0,0.135324,"[-0.016664708, -0.0076228143, -0.024167625, -0...",0.0,23.846154,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.139835,"[-0.016664708, -0.0076228143, -0.024167625, -0...",0.0,8.461538,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.144346,"[-0.016664708, -0.0076228143, -0.024167625, -0...",3.550296,7.692308,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.148857,"[-0.016664708, -0.0076228143, -0.024167625, -0...",1.775148,2.307692,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.157878,"[-0.016664708, -0.0076228143, -0.024167625, -0...",0.591716,10.769231,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## <font color="orange">*5. Modelos de predicción*</font>

### Objetivo:
  - Utilizar XGBoost como modelo de regresión:
    
    - comparar el desempeño entre las codificaciones Embedding y Target/MeanEncoder.
    - analizar el desempeño del modelo con los hiperparámetros optimizados con Optuna (apartado 2.).

### Resultado:
  - El modelo de mejor desempeño es el optimizado y utilizando la codificación Target/MeanEncoder.

In [None]:
# Column 33 is the regression target in both frames
TARGET_COLUMN = 33

# Mean-encoder dataset: every column except the target is a feature.
is_target_1 = train_1.columns == TARGET_COLUMN
X_1 = train_1.loc[:, ~is_target_1].values
y_1 = train_1.loc[:, is_target_1].values  # boolean column selection keeps this 2-D: (n_rows, 1)


# Embedding dataset: each 'label_embedding' cell holds a vector, so the
# vectors are stacked into a 2-D block and placed in front of the remaining
# feature columns to form one numeric matrix.
embeddings = np.vstack(train_2['label_embedding'].to_numpy())
other_features = train_2.drop(columns=['label_embedding', TARGET_COLUMN]).values
X_2 = np.hstack((embeddings, other_features))
y_2 = train_2[TARGET_COLUMN].values

In [None]:
# Randomly reorder the rows of each dataset, keeping features and target
# aligned. The same seed is used for both datasets.
# NOTE(review): this cell reassigns its own inputs, so re-running it shuffles
# the already-shuffled arrays again.
SHUFFLE_SEED = 13
X_1, y_1 = shuffle(X_1, y_1, random_state=SHUFFLE_SEED)
X_2, y_2 = shuffle(X_2, y_2, random_state=SHUFFLE_SEED)


In [None]:
# Hold out 25% of each dataset for validation, with the same seed for both
split_options = dict(test_size=0.25, random_state=13)
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_1, y_1, **split_options)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_2, y_2, **split_options)

### 5.1 Sin optimización de hiperparámetros

 - Conjuntos de evaluación (evals):

 `En xgboost.train, se usa una lista de tuplas evals para especificar los conjuntos de datos para la evaluación (en este caso, dtrain y dtest).`

 - Detención temprana (early_stopping_rounds):

 ` Configura early_stopping_rounds=20 en xgboost.train para detener el entrenamiento si el rendimiento en el conjunto de validación no mejora durante 20 iteraciones consecutivas, igual que en XGBRegressor.`

 - Número de iteraciones (num_boost_round):

 `Proporciona un número máximo de iteraciones (num_boost_round=100), pero el entrenamiento puede detenerse antes debido a la detención temprana.`

In [None]:
# Wrap the mean-encoder splits in DMatrix, the container xgb.train works with
dtrain_1 = xgb.DMatrix(X_train_1, label=y_train_1)
dtest_1 = xgb.DMatrix(X_test_1, label=y_test_1)


# Same for the embedding splits
dtrain_2 = xgb.DMatrix(X_train_2, label=y_train_2)
dtest_2 = xgb.DMatrix(X_test_2, label=y_test_2)


# Baseline (non-tuned) model parameters
params = {
        'objective': 'reg:squarederror',
        'eval_metric':'rmse',
        'max_depth':5,
        'subsample':1,
        'min_child_weight':1,
        'alpha': 0,
        'gamma': 0,
        'lambda':0
        }

# Evaluation sets. xgboost applies early stopping to the LAST entry of
# `evals`, so the validation set must come last; with the training set last,
# the training loss would be monitored instead.
evals_1 = [(dtrain_1, 'train'), (dtest_1, 'eval')]
evals_2 = [(dtrain_2, 'train'), (dtest_2, 'eval')]

# Train one baseline model per encoding, with early stopping on validation RMSE
model_1 = xgb.train(params, dtrain_1, num_boost_round=100, evals=evals_1, early_stopping_rounds=20, verbose_eval=False)
model_2 = xgb.train(params, dtrain_2, num_boost_round=100, evals=evals_2, early_stopping_rounds=20, verbose_eval=False)

In [None]:
# Predict on the validation set using the trees up to the best iteration
# found by early stopping, so the result does not depend on the xgboost
# version's default prediction range.
predictions_1 = model_1.predict(dtest_1, iteration_range=(0, model_1.best_iteration + 1))
predictions_2 = model_2.predict(dtest_2, iteration_range=(0, model_2.best_iteration + 1))

In [None]:
# Validation RMSE of the baseline models.
# The square root is taken explicitly instead of passing `squared=False`,
# which recent scikit-learn releases deprecate and then remove; arguments are
# passed in the documented (y_true, y_pred) order.
rmse_1 = np.sqrt(mean_squared_error(y_test_1, predictions_1))
rmse_2 = np.sqrt(mean_squared_error(y_test_2, predictions_2))
print(f'rmse MeanEncoder: {rmse_1}')
print(f'rmse Embedding: {rmse_2}')

rmse MeanEncoder: 0.0637410931553696
rmse Embedding: 0.06515164061922452


### 5.2. Optimización de hiperparámetros con Optuna.

Utilizando la función del apartado 2

In [None]:
#------------MeanEncoder-----------------------
# Create the Optuna study; the sampler is seeded so the search is reproducible
study_1 = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=13))

# Tune the hyperparameters on the mean-encoder dataset
study_1.optimize(lambda trial: objective(trial, X_train_1, y_train_1, X_test_1, y_test_1), n_trials=100)

# Best sampled values found by the study
best_params_1 = study_1.best_params

# `best_params` contains only the values suggested through the trial
# (min_child_weight). Passing it alone to xgb.train would silently fall back
# to xgboost defaults for everything else, so it is merged over the fixed
# parameters defined in `params`.
final_params_1 = {**params, **best_params_1}

# Train the final model; the validation set goes last in `evals` because
# xgboost applies early stopping to the last entry
evals_1 = [(dtrain_1, 'train'), (dtest_1, 'eval')]
Opt_model_1 = xgb.train(final_params_1, dtrain_1, num_boost_round=100, evals=evals_1, early_stopping_rounds=20, verbose_eval=False)

# Predict on the validation set with the trees up to the best iteration.
# Note: this overwrites the baseline `predictions_1`.
predictions_1 = Opt_model_1.predict(dtest_1, iteration_range=(0, Opt_model_1.best_iteration + 1))

In [None]:
#------------Embedding-----------------------
# Create the Optuna study; the sampler is seeded so the search is reproducible
study_2 = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=13))

# Tune the hyperparameters on the embedding dataset
study_2.optimize(lambda trial: objective(trial, X_train_2, y_train_2, X_test_2, y_test_2), n_trials=100)

# Best sampled values found by the study
best_params_2 = study_2.best_params

# `best_params` contains only the values suggested through the trial
# (min_child_weight). Passing it alone to xgb.train would silently fall back
# to xgboost defaults for everything else, so it is merged over the fixed
# parameters defined in `params`.
final_params_2 = {**params, **best_params_2}

# Train the final model for the second (embedding) dataset; the validation
# set goes last in `evals` because xgboost applies early stopping to the last entry
evals_2 = [(dtrain_2, 'train'), (dtest_2, 'eval')]
Opt_model_2 = xgb.train(final_params_2, dtrain_2, num_boost_round=100, evals=evals_2, early_stopping_rounds=20, verbose_eval=False)

# Predict on the validation set with the trees up to the best iteration.
# Note: this overwrites the baseline `predictions_2`.
predictions_2 = Opt_model_2.predict(dtest_2, iteration_range=(0, Opt_model_2.best_iteration + 1))

In [None]:
# Validation RMSE of the tuned models.
# The square root is taken explicitly instead of passing `squared=False`,
# which recent scikit-learn releases deprecate and then remove.
rmse_1_opt = np.sqrt(mean_squared_error(y_test_1, predictions_1))
rmse_2_opt = np.sqrt(mean_squared_error(y_test_2, predictions_2))

print(f'rmse MeanEncoder con Optuna: {rmse_1_opt}')
print(f'rmse Embedding con Optuna: {rmse_2_opt}')

rmse MeanEncoder con Optuna: 0.058789080626297924
rmse Embedding con Optuna: 0.05910046822804891


## <font color="orange">*6. Set de prueba*</font>

Predicción del mejor modelo en el set de prueba *test_pred*.

In [None]:
# Load the test frame from Google Drive.
# NOTE(review): unpickling executes arbitrary code — only load files you produced yourself.
test_pred = pd.read_pickle('/content/drive/MyDrive/test_pred.pkl')

In [None]:
# Build the feature matrix in the layout used for the embedding model:
# stacked embedding vectors first, then the remaining feature columns.
# NOTE(review): assumes the columns left after the drop match, in number and
# order, the non-embedding features the model was trained on — TODO confirm.
embeddings = np.vstack(test_pred['label_embedding'].to_numpy())
other_features = test_pred.drop(columns=['ID', 'label_embedding', 'label_mean_encoder', -1]).values
X = np.hstack((embeddings, other_features))

# NOTE(review): the results section names the Target/MeanEncoder model as the
# best one, but the prediction here uses the embedding model (Opt_model_2) —
# confirm which model is intended.
dtest =  xgb.DMatrix(X)
# Use the trees up to the best iteration found by early stopping, so the
# result does not depend on the xgboost version's default prediction range.
predictions = Opt_model_2.predict(dtest, iteration_range=(0, Opt_model_2.best_iteration + 1))

Los valores deben ajustarse al rango de 0 a 20 según lo especificado en el desafío de Kaggle.

In [None]:
# Initialize the scaler with the desired output range
scaler = MinMaxScaler(feature_range=(0, 20))

# Fit the scaler on the predictions themselves and transform them.
# NOTE(review): this linearly stretches the predictions so that their own
# minimum maps to 0 and their maximum to 20; it does not clip values to
# [0, 20]. If the challenge expects clipped values in the original target
# units, this needs the inverse of the target scaling followed by a clip —
# TODO confirm.
predictions_reshaped = predictions.reshape(-1, 1)  # MinMaxScaler is given a 2-D column here
predictions_scaled = scaler.fit_transform(predictions_reshaped)