In [12]:
pip install xgboost


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [13]:
import numpy as np
import pandas as pd
import joblib
from google.cloud import storage
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# --- Configuration: Cloud Storage bucket name and object paths ---
BUCKET_NAME = "datasets-cardiovasculares"
FILE_PATH = "dataset_cardiovascular.csv"
MODEL_PATH = "models/xgboost_model.pkl"
SCALER_PATH = "models/scaler1.pkl"

# --- Fetch the raw CSV object from the bucket into the working directory ---
local_csv = "dataset.csv"
client = storage.Client()
bucket = client.bucket(BUCKET_NAME)
blob = bucket.blob(FILE_PATH)
blob.download_to_filename(local_csv)

# --- Load the downloaded file; the separator is stated explicitly as a comma ---
df = pd.read_csv(local_csv, sep=",")

# Preprocessing + train/test split.
#
# The train/test row assignment is decided FIRST so that the StandardScaler can
# be fit on training rows only. Fitting it on the whole dataset (as was done
# before) leaks test-set mean/variance into the training features and into the
# scaler that is later persisted for inference.
#
# Splitting an array of row positions with the same test_size / random_state
# selects the same rows a direct train_test_split(X, y, ...) call would, because
# the shuffle depends only on the number of rows and the seed.
row_positions = np.arange(len(df))
train_rows, test_rows = train_test_split(row_positions, test_size=0.2, random_state=58)

# One-hot encode 'gender', dropping the first category.
one_hot_encoder = OneHotEncoder(sparse_output=False, drop="first")
gender_encoded = one_hot_encoder.fit_transform(df[['gender']])
# The column list below reserves exactly one column ("gender_encoded"); fail
# early with a clear message if 'gender' does not have exactly two categories.
if gender_encoded.shape[1] != 1:
    raise ValueError(
        "Expected 'gender' to one-hot encode into exactly 1 column, got "
        f"{gender_encoded.shape[1]}"
    )

# Ordinal-encode 'cholesterol' and 'gluc' with the fixed category order 1 < 2 < 3.
# The same encoder object is re-fit for 'gluc'; this is harmless here because
# the category list is supplied explicitly rather than learned from the data.
ordinal_encoder = OrdinalEncoder(categories=[[1, 2, 3]])
cholesterol_encoded = ordinal_encoder.fit_transform(df[['cholesterol']])
gluc_encoded = ordinal_encoder.fit_transform(df[['gluc']])

# Standardise the numeric columns: fit on training rows only, then apply the
# learned statistics to every row.
scaler = StandardScaler()
numeric_features = ['age', 'ap_hi', 'ap_lo', 'imc']
scaler.fit(df.iloc[train_rows][numeric_features])
df_scaled = scaler.transform(df[numeric_features])

# Stack the transformed pieces with the pass-through columns (np.hstack yields
# a single float array, so every column, including 'cardio', becomes float).
df_final = np.hstack([
    df_scaled,
    gender_encoded,
    cholesterol_encoded,
    gluc_encoded,
    df[['smoke', 'alco', 'active', 'cardio']].values
])

# Wrap the stacked array in a DataFrame with explicit column names.
column_names = numeric_features + ["gender_encoded"] + ["cholesterol_encoded"] + ["gluc_encoded"] + ['smoke', 'alco', 'active', 'cardio']
df_transformed = pd.DataFrame(df_final, columns=column_names)

# Separate features from the target and apply the row assignment chosen above.
X = df_transformed.drop(columns=["cardio"])
y = df_transformed["cardio"]
X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

# Train the XGBoost classifier with the hyperparameters that were found
# (the search itself is not part of this cell).
xgb_params = dict(
    n_estimators=200,
    max_depth=5,
    gamma=0.2,
    learning_rate=0.1,
    colsample_bytree=1,
    eval_metric='logloss',
    random_state=58,
)
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)

# Each artifact: (object to persist, local filename, destination path in the bucket).
artifacts = [
    (xgb, "xgboost_model.pkl", MODEL_PATH),
    (scaler, "scaler1.pkl", SCALER_PATH),
]

# Serialise the model and the scaler to local files first.
for artifact, local_name, remote_path in artifacts:
    joblib.dump(artifact, local_name)

# Then upload each local file to its path in Cloud Storage.
for artifact, local_name, remote_path in artifacts:
    blob = bucket.blob(remote_path)
    blob.upload_from_filename(local_name)

print("Modelo guardado en Cloud Storage exitosamente.")


Modelo guardado en Cloud Storage exitosamente.


In [10]:
# Show the feature columns (in order) that the model was trained on.
training_columns = X_train.columns.tolist()
print(f"Columnas usadas para entrenar: {training_columns}")


Columnas usadas para entrenar: ['age', 'ap_hi', 'ap_lo', 'imc', 'gender_encoded', 'cholesterol_encoded', 'gluc_encoded', 'smoke', 'alco', 'active']
