# Export best XGBoost model to joblib

**Goal:** train the best model configuration (selected with K-Fold cross-validation) on the full dataset and save the fitted pipeline with joblib as a `.pkl` file for the Flask app.

**Output:** `best_xgb_model.pkl`


In [None]:
# Dependencies
# If you do not have xgboost installed, run:
# !pip install xgboost

import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
import joblib

In [None]:
# Read the raw dataset; the path is relative to the current working directory.
file_path = "../database.csv"
df = pd.read_csv(file_path)

# Preview the first five rows to confirm the file was parsed as expected.
df.head(n=5)

In [None]:
# Normalize column names
# Maps the raw CSV headers to short snake_case identifiers. Columns that are
# not listed here (e.g. "Estado", "Ciudad") keep their original names.
rename_map = {
    # Property size
    "Numero de cuartos": "num_cuartos",
    "Numero de baños": "num_banos",
    "Numero de huéspedes": "num_huespedes",
    # Price and occupancy
    "Precio por noche estimado (MXN)": "precio_noche_mxn",
    "Ocupación promedio (%)": "ocupacion_promedio",
    # Listing type and location
    "Tipo de alojamiento": "tipo_alojamiento",
    "Colonia/Municipio": "colonia_municipio",
}

# rename() returns a new DataFrame, so rebind df to the renamed copy.
df = df.rename(rename_map, axis="columns")

In [None]:
# Model inputs and prediction target
# The target is the nightly price column; every other name below is an input.
target = "precio_noche_mxn"

features = [
    # Categorical inputs
    "Estado",
    "Ciudad",
    "colonia_municipio",
    "tipo_alojamiento",
    # Numeric inputs
    "num_cuartos",
    "num_banos",
    "num_huespedes",
    "ocupacion_promedio",
]

# X is the feature DataFrame, y the target Series.
X, y = df[features], df[target]

In [None]:
# Preprocessing
# Columns that are one-hot encoded before reaching the model.
categorical_features = [
    "Estado",
    "Ciudad",
    "colonia_municipio",
    "tipo_alojamiento",
]
# Columns that are forwarded to the model unchanged.
numeric_features = [
    "num_cuartos",
    "num_banos",
    "num_huespedes",
    "ocupacion_promedio",
]

# handle_unknown="ignore" lets the encoder accept categories at predict time
# that it did not see during fit, instead of raising.
preprocessor = ColumnTransformer(
    [
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numeric_features),
    ]
)

In [None]:
# Best configuration (K-Fold)
# Adjust these values here if the best cross-validation result changes.
# Written as keyword arguments because the dict is later unpacked into the
# regressor constructor.
best_params = dict(
    n_estimators=800,
    learning_rate=0.1,
    max_depth=3,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42,
)

In [None]:
# Final training and export
# Fits the chosen configuration on all of X / y and saves the whole fitted
# pipeline, so the exported file contains both the preprocessing step and the
# regressor.

# Single source of truth for the output location: used by both the dump and
# the confirmation message so the two cannot drift apart.
model_path = "../best_xgb_model.pkl"

model = XGBRegressor(**best_params)

pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", model),
    ]
)

pipeline.fit(X, y)

# NOTE: loading this file requires scikit-learn and xgboost to be importable,
# because the pickled pipeline references classes from both libraries.
joblib.dump(pipeline, model_path)
print(f"Model exported to {model_path}")

## Flask note
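Load the exported pipeline with `joblib.load` and call `predict` on a DataFrame that has the same column names used for training (`Estado`, `Ciudad`, `colonia_municipio`, `tipo_alojamiento`, `num_cuartos`, `num_banos`, `num_huespedes`, `ocupacion_promedio`). The pipeline applies the preprocessing itself, so pass the raw values.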


In [None]:
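# Example (for Flask): load the exported pipeline and predict the price for one listing.
# Uncomment the lines below to run it.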
# model = joblib.load("../best_xgb_model.pkl")
# pred = model.predict(pd.DataFrame([{
#     "Estado": "Ciudad de México",
#     "Ciudad": "Mexico City",
#     "colonia_municipio": "Roma Norte",
#     "tipo_alojamiento": "Departamento",
#     "num_cuartos": 2,
#     "num_banos": 1.5,
#     "num_huespedes": 4,
#     "ocupacion_promedio": 70,
# }]))
# print(pred)