In [None]:
import sagemaker
from sagemaker.sklearn.estimator import SKLearn
from sagemaker import get_execution_role

In [None]:
import os

# Switch to the project checkout so the relative paths used by the cells
# below (data/..., src/...) are resolved against it.
project_root = "/home/sagemaker-user/mlops_project_grupo_2"
os.chdir(project_root)

In [None]:
# --- Notebook-wide configuration ---------------------------------------------

# MLflow tracking server ARN; passed to the training job further down through
# the MLFLOW_TRACKING_URI environment variable.
tracking_server_arn = "arn:aws:sagemaker:us-east-2:686410906112:mlflow-tracking-server/bank-attrition-server"

# Bucket name, edit as required.
# NOTE(review): no cell visible in this notebook references `bucket`; confirm
# whether the uploads below were meant to target it.
bucket = "my-batch-inference-data"

# AWS handles shared by the upload and training cells.
role = get_execution_role()
sagemaker_session = sagemaker.Session()

In [None]:
# Push the processed train/test CSVs to S3. No bucket is passed, so the
# session's default bucket is used. The values returned here are handed to
# fit() as the "train" and "test" input channels.
train_input = sagemaker_session.upload_data(
    path="data/processed/train_clean.csv",
    key_prefix="training/train",
)
test_input = sagemaker_session.upload_data(
    path="data/processed/test_clean.csv",
    key_prefix="training/test",
)

In [None]:
# Configure the scikit-learn training job. The whole project directory is
# uploaded as the code bundle and src/training/train.py is the entry point.
sklearn_estimator = SKLearn(
    entry_point="src/training/train.py",     # Path relative to source_dir
    source_dir=".",                          # Upload whole repo
    role=role,
    instance_type="ml.m5.large",
    framework_version="1.2-1",
    py_version="py3",
    hyperparameters={
        # Keys are given WITHOUT leading dashes: SageMaker prepends "--" when
        # it turns hyperparameters into command-line arguments for the entry
        # point, so "--train_path" would reach the script as "----train_path".
        "train_path": "/opt/ml/input/data/train/train_clean.csv",
        # Every channel passed to fit() is materialised under
        # /opt/ml/input/data/<channel_name>/, so the test CSV lives in the
        # "test" channel directory, not in "train".
        "test_path": "/opt/ml/input/data/test/test_clean.csv",
    },
    environment={
        "MLFLOW_TRACKING_URI": tracking_server_arn,
        "PYTHONPATH": "/opt/ml/code"  # because /opt/ml/code now contains src/
    },
    sagemaker_session=sagemaker_session,
    # Resolved against the current working directory (the project root after
    # the chdir cell), i.e. the same base as entry_point; the project folder
    # name must not be repeated here.
    dependencies=["src/training/requirements.txt"],
)

# Launch the job. The dictionary keys are the channel names and must match the
# /opt/ml/input/data/<channel_name>/ directories used in the hyperparameters.
sklearn_estimator.fit({
    "train": train_input,
    "test": test_input,
})