In [None]:
import boto3
import sagemaker
import os
import sagemaker
import pandas as pd
from sagemaker import get_execution_role

from dotenv import load_dotenv

# Pull the variables defined in the local .env file into the process environment.
load_dotenv()

# Open a boto3 session for the profile named by AWS_PROFILE ("default" when unset).
aws_profile = os.environ.get("AWS_PROFILE", "default")
boto_session = boto3.Session(profile_name=aws_profile)

# Wrap the boto3 session in a SageMaker session and keep the region in `region`.
session = sagemaker.Session(boto_session=boto_session)
region = boto_session.region_name

print("✅ Connected to AWS with profile:", boto_session.profile_name)
print("Region:", region)


INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


✅ Connected to AWS with profile: aws_agora_presmanes
Region: eu-west-1


In [12]:
# Build the SageMaker session on top of the existing boto3 session.
session = sagemaker.Session(boto_session=boto_session)

# The execution role ARN comes from the environment; it is None when the variable is unset.
role = os.environ.get("AWS_SAGEMAKER_ROLE_ARN")
print("Using IAM role:", role)

Using IAM role: arn:aws:iam::296062581113:role/SageMakerExecutionRole


In [25]:
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput

# Use the AWS-provided XGBoost image (framework already installed in the container)
container = sagemaker.image_uris.retrieve("xgboost", region, version="1.5-1")

# Single source of truth for where this example lives, locally and under the
# default bucket, so the input and output locations cannot drift apart.
S3_PREFIX = "xgboost-hello-world"
TRAIN_CSV = "sample_train.csv"

# Build a tiny dataset with the label in the first column
df = pd.DataFrame({
    "label": [0, 1, 0, 1, 1, 0],
    "feature1": [0.1, 0.5, 0.2, 0.8, 0.3, 0.9],
    "feature2": [1.1, 0.7, 0.2, 1.0, 0.5, 0.3],
})

# Write without header and without index: the built-in XGBoost CSV input
# expects the target in the first column and no header row.
df.to_csv(TRAIN_CSV, index=False, header=False)

# Configure the estimator
estimator = Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=f"s3://{session.default_bucket()}/{S3_PREFIX}/output",
    sagemaker_session=session,
    hyperparameters={
        "objective": "binary:logistic",  # binary classification
        "num_round": 10                  # number of training rounds
    }
)

# Upload the sample dataset to S3 (default bucket, under the shared prefix)
s3_train_path = session.upload_data(TRAIN_CSV, key_prefix=f"{S3_PREFIX}/input")

print(f"📤 Uploaded dataset to: {s3_train_path}")

# Launch the training job
print("🚀 Launching training job in SageMaker...")
train_input = TrainingInput(
    s3_data=s3_train_path,
    content_type="text/csv"   # declare the channel content as CSV
)
estimator.fit(
    {"train": train_input}
)

print("✅ Training job completed!")
print(f"🎯 Model artifacts saved to: {estimator.model_data}")

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-10-06-17-44-58-424


📤 Uploaded dataset to: s3://sagemaker-eu-west-1-296062581113/xgboost-hello-world/input/sample_train.csv
🚀 Launching training job in SageMaker...
2025-10-06 17:44:58 Starting - Starting the training job...
2025-10-06 17:45:33 Downloading - Downloading input data...
2025-10-06 17:46:19 Downloading - Downloading the training image......
2025-10-06 17:47:35 Training - Training image download completed. Training in progress.
  from pandas import MultiIndex, Int64Index
[2025-10-06 17:47:30.226 ip-10-0-254-123.eu-west-1.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2025-10-06 17:47:30.253 ip-10-0-254-123.eu-west-1.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.
[2025-10-06:17:47:30:INFO] Imported framework sagemaker_xgboost_container.training
[2025-10-06:17:47:30:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.
Returning the value itself
[2025-10-06:17:47:30:INFO] No GPUs detected (normal if no gpus in