In [None]:
import sagemaker
import boto3

# Pin the AWS profile and region explicitly so every call made through this
# session uses the same credentials and endpoint.
sessions = boto3.Session(region_name="us-east-1", profile_name="default")
sagemaker_session = sagemaker.Session(boto_session=sessions)

# Destination in S3 (the bucket name is a placeholder).
bucket = "aws-sagemak09"
prefix = "dataset/TrainTestData"

# Push the local CSV files to S3; each call hands back the S3 URI of the upload.
train_s3 = sagemaker_session.upload_data(
    path="train/training_data.csv",
    bucket=bucket,
    key_prefix=f"{prefix}/train",
)
test_s3 = sagemaker_session.upload_data(
    path="test/testing_data.csv",
    bucket=bucket,
    key_prefix=f"{prefix}/test",
)

print("Train S3 URI:", train_s3)
print("Test S3 URI:", test_s3)

sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\PC\AppData\Local\sagemaker\sagemaker\config.yaml
Train S3 URI: s3://aws-sagemaker-bucket-12309/dataset/TrainTestData/train/training_data.csv
Test S3 URI: s3://aws-sagemaker-bucket-12309/dataset/TrainTestData/test/testing_data.csv


In [20]:
%%writefile script.py
import os
import argparse
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


def model_fn(model_dir):
    """Return the fitted model stored under ``model_dir`` for inference.

    Args:
        model_dir: Directory that contains the serialized ``model.joblib``
            file written by the training block of this script.

    Returns:
        The object deserialized by ``joblib.load``.
    """
    artifact_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(artifact_path)


def parse_args(argv=None):
    """Parse hyperparameters and SageMaker data/model locations.

    Args:
        argv: Optional list of argument strings. When ``None`` the process
            command line (``sys.argv[1:]``) is parsed.

    Returns:
        argparse.Namespace with ``n_estimators``, ``random_state``,
        ``max_depth``, ``output_data_dir``, ``model_dir``, ``train`` and
        ``test``. Unrecognised arguments are ignored rather than rejected.
    """
    parser = argparse.ArgumentParser()

    # Model hyperparameters
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=42)
    # The launching estimator sends a "max-depth" hyperparameter. Without this
    # declaration parse_known_args() silently drops it and the forest is
    # trained with unlimited depth. Both spellings are accepted; the default
    # of None keeps scikit-learn's own default when the flag is absent.
    parser.add_argument(
        "--max-depth", "--max_depth", dest="max_depth", type=int, default=None
    )

    # SageMaker specific arguments (fall back to local folders outside SageMaker)
    parser.add_argument(
        "--output-data-dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR", "./output")
    )
    parser.add_argument(
        "--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR", "./model")
    )
    parser.add_argument(
        "--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", "./train")
    )
    parser.add_argument(
        "--test", type=str, default=os.environ.get("SM_CHANNEL_TEST", "./test")
    )

    args, _ = parser.parse_known_args(argv)
    return args


if __name__ == "__main__":
    args = parse_args()

    # Fail fast with a clear message when an input channel is missing
    if not os.path.exists(args.train):
        raise ValueError(f"Training directory '{args.train}' does not exist.")
    if not os.path.exists(args.test):
        raise ValueError(f"Testing directory '{args.test}' does not exist.")

    # Load data
    train_df = pd.read_csv(os.path.join(args.train, "training_data.csv"))
    test_df = pd.read_csv(os.path.join(args.test, "testing_data.csv"))

    # The last column is used as the label, every other column as a feature
    X_train = train_df.iloc[:, :-1]
    y_train = train_df.iloc[:, -1]
    X_test = test_df.iloc[:, :-1]
    y_test = test_df.iloc[:, -1]

    # Train model
    print("\nMODEL TRAINING STARTED...\n")
    model = RandomForestClassifier(
        n_estimators=args.n_estimators,
        max_depth=args.max_depth,
        random_state=args.random_state
    )
    model.fit(X_train, y_train)

    # Ensure model directory exists
    os.makedirs(args.model_dir, exist_ok=True)

    # Save model under the file name that model_fn() loads
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print("Model saved at:", model_path)

    # Evaluate on the held-out test channel
    y_pred = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred)
    print("Test Accuracy **********:", test_acc)


Overwriting script.py


In [None]:
from sagemaker.sklearn.estimator import SKLearn

# Estimator for the scikit-learn framework container; script.py is the
# training entry point.
sklearn_estimator = SKLearn(
    entry_point='script.py',
    framework_version="0.23-1",
    instance_type="ml.m5.large",
    # Dummy
    role='arn:aws:iam::15le/ml-project-finals',
    base_job_name='dt-custom-sklearn',
    # Reuse the session that uploaded the data so the training job runs with
    # the same profile and region instead of an implicitly created default.
    sagemaker_session=sagemaker_session,
    # Each key is handed to script.py as a `--<key> <value>` argument, so the
    # script has to declare a matching argparse option for it to take effect.
    hyperparameters={
        'max-depth': 7,
        'n_estimators': 50
    },
    # Spot training: max_run caps the training time, max_wait caps the total
    # time including waiting for spot capacity (both in seconds).
    use_spot_instances=True,
    max_wait=7200,
    max_run=3600
)


In [None]:
# Map each input channel name to the S3 URI uploaded earlier, then launch the
# training job. script.py looks for these channels via SM_CHANNEL_TRAIN and
# SM_CHANNEL_TEST.
training_channels = {'train': train_s3, 'test': test_s3}
sklearn_estimator.fit(training_channels)


INFO:sagemaker:Creating training-job with name: dt-custom-sklearn-2025-10-04-15-57-31-836


2025-10-04 15:57:40 Starting - Starting the training job...
2025-10-04 15:58:16 Downloading - Downloading input data...
2025-10-04 15:58:42 Downloading - Downloading the training image...
2025-10-04 15:59:22 Training - Training image download completed. Training in progress..2025-10-04 15:59:26,730 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2025-10-04 15:59:26,734 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-10-04 15:59:26,781 sagemaker_sklearn_container.training INFO     Invoking user training script.
2025-10-04 15:59:26,952 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-10-04 15:59:26,965 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-10-04 15:59:26,978 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-10-04 15:59:26,986 sagemaker-training-toolkit INFO     Invoking user script
Trai

In [14]:
import boto3

# Block until the most recent training job has finished, streaming its logs.
sklearn_estimator.latest_training_job.wait(logs=True)

# Use the SageMaker client that belongs to the estimator's own session. A bare
# boto3.client("sagemaker") is built from the ambient default configuration,
# which may have no region set or may point at a different region or account
# than the one the job was created in.
sm_client = sklearn_estimator.sagemaker_session.sagemaker_client

# Look up where the trained model archive was written in S3.
artifact = sm_client.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name
)["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at:", artifact)


2025-10-04 15:59:45 Starting - Preparing the instances for training
2025-10-04 15:59:45 Downloading - Downloading the training image
2025-10-04 15:59:45 Training - Training image download completed. Training in progress.
2025-10-04 15:59:45 Uploading - Uploading generated training model
2025-10-04 15:59:45 Completed - Training job completed2025-10-04 15:59:26,730 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2025-10-04 15:59:26,734 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-10-04 15:59:26,781 sagemaker_sklearn_container.training INFO     Invoking user training script.
2025-10-04 15:59:26,952 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-10-04 15:59:26,965 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-10-04 15:59:26,978 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2025-10-04 15:59:26

In [None]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

# A UTC timestamp suffix gives every run a distinct model name.
created_at = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model_name = f"Custom-sklearn-model-{created_at}"

# Wrap the trained artifact in a deployable model. script.py is the entry
# point because it defines model_fn(), which loads the saved model.
model = SKLearnModel(
    model_data=artifact,
    name=model_name,
    entry_point="script.py",
    framework_version='0.23-1',
    # Dummy
    role="arn:aws:iam::723:role/e2e-mobe-sagemaker",
)

In [None]:
# Timestamped endpoint name. The hyphen before the timestamp keeps the prefix
# and the date readable, in the same way model_name is built.
endpoint = 'Custom-sklearn-models-' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

# Deploy the model behind a real-time endpoint served by one instance.
predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.c5.large",
    endpoint_name=endpoint,
)