In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import io
import boto3
import sagemaker
from sagemaker.session import Session
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
# --- Session and client configuration ---------------------------------------
# Bootstrap region used only to create the boto3 session below.
region = 'us-east-1'
boto_session = boto3.Session(region_name=region)

# Execution role and default SageMaker session for this notebook.
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()

# From here on `region` is whatever region the SageMaker session reports,
# which replaces the bootstrap value assigned above.
region = sagemaker_session.boto_region_name
default_bucket = sagemaker_session.default_bucket()

# Key prefix under which this notebook stores its objects; also read by
# upload_to_s3() when it builds object keys.
prefix = 'sagemaker-featurestore'

# S3 URI built from the default bucket and the prefix.
# Fix: the previous format string was 's3://*{}*/*{}*', which embedded literal
# '*' characters around the bucket and prefix and produced an unusable URI.
offline_feature_store_bucket = 's3://{}/{}'.format(default_bucket, prefix)

# Low-level clients, both created from `boto_session` with the resolved region.
sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)
featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)

# Session object wired to the two clients above (Feature Store runtime included).
feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime
)

In [19]:
# Local CSV files that the upload cell pushes to S3 (one per channel).
FILE_TRAIN = "train.csv"
FILE_VALIDATION = "validation.csv"

# NOTE(review): presumably the name of the label column; it is not referenced
# in the visible cells -- confirm against the data-preparation code.
TARGET_VAR = "class"

In [20]:
def write_to_s3(fobj, bucket, key):
    """Upload a file-like object to the S3 object ``key`` in ``bucket``.

    A fresh default ``boto3.Session`` is created on every call.

    Args:
        fobj: File-like object passed straight to ``upload_fileobj``.
        bucket: Name of the destination bucket (no ``s3://`` scheme).
        key: Object key inside the bucket.

    Returns:
        Whatever ``Object.upload_fileobj`` returns.
    """
    s3_resource = boto3.Session().resource('s3')
    destination = s3_resource.Bucket(bucket).Object(key)
    return destination.upload_fileobj(fobj)

def upload_to_s3(bucket, channel, filename):
    """Upload a local file to S3 under ``<prefix>/<channel>/<filename>``.

    The object key is built from the module-level ``prefix``, the channel
    name and ``filename`` exactly as given, so any directory part of
    ``filename`` ends up in the key.

    Args:
        bucket: Name of the destination bucket (no ``s3://`` scheme).
        channel: Channel folder name, e.g. ``'train'`` or ``'validation'``.
        filename: Path of the local file to upload; opened in binary mode.

    Returns:
        The ``s3://bucket/key`` URL of the uploaded object.

    Raises:
        OSError: If ``filename`` cannot be opened for reading.
    """
    key = prefix + '/' + channel + '/' + filename
    url = 's3://{}/{}'.format(bucket, key)
    # Fix: the file used to be opened without ever being closed. The context
    # manager releases the handle even when the upload raises.
    with open(filename, 'rb') as fobj:
        print('Writing to {}'.format(url))
        write_to_s3(fobj, bucket, key)
    return url

In [21]:
# Push the training and validation CSVs to the 'rust-detection' bucket.
# The generator is consumed in order by the unpacking, so the train file is
# uploaded first and the validation file second.
s3_train_loc, s3_valid_loc = (
    upload_to_s3(bucket='rust-detection', channel=channel_name, filename=local_file)
    for channel_name, local_file in (
        ('train', FILE_TRAIN),
        ('validation', FILE_VALIDATION),
    )
)

Writing to s3://rust-detection/sagemaker-featurestore/train/train.csv
Writing to s3://rust-detection/sagemaker-featurestore/validation/validation.csv


In [24]:
# Estimator that runs the XGBoost container as a SageMaker training job.
# Fix: reuse `role` and `sagemaker_session` from the setup cell instead of
# calling get_execution_role() and sagemaker.Session() a second time here.
# NOTE(review): the image URI is pinned to the us-east-1 registry and uses the
# mutable `latest` tag -- confirm it matches the region the session resolves to.
xgboost_estimator = sagemaker.estimator.Estimator(
    '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',  # Docker container image URI
    role=role,  # IAM role for SageMaker
    instance_count=1,  # Number of instances for training
    instance_type='ml.c5.2xlarge',  # Type of EC2 instance for training
    output_path='s3://rust-output/my-training-output/output',  # S3 location for model artifacts
    sagemaker_session=sagemaker_session,  # SageMaker session
)

# Set hyperparameters for XGBoost
xgboost_estimator.set_hyperparameters(
    max_depth=3,  # Maximum depth of the tree
    eta=0.2,  # Learning rate
    gamma=8,  # Minimum loss reduction required to make a further partition on a leaf node of the tree
    colsample_bytree=0.9,  # Fraction of features that are randomly chosen to grow trees
    subsample=0.8,  # Fraction of samples used for training
    num_round=100,  # Number of boosting rounds
    min_child_weight=10,  # Minimum sum of instance weight (hessian) needed in a child
    objective='multi:softmax',  # Objective function for multi-class classification
    num_class=2,  # Number of classes for multi-class classification
)

# Set the input data formatting and locations.
# Fix: `s3_train_loc` / `s3_valid_loc` are the URLs returned by upload_to_s3()
# in the upload cell. They used to be overwritten here with hard-coded copies
# of the same strings, which could silently drift from the real upload location.
s3_input_train = TrainingInput(s3_data=s3_train_loc, content_type='csv')
s3_input_validation = TrainingInput(s3_data=s3_valid_loc, content_type='csv')

# Train the XGBoost model
xgboost_estimator.fit({'train': s3_input_train, 'validation': s3_input_validation})


INFO:sagemaker:Creating training-job with name: xgboost-2023-09-18-19-03-18-471


2023-09-18 19:03:18 Starting - Starting the training job...
2023-09-18 19:03:43 Starting - Preparing the instances for training......
2023-09-18 19:04:39 Downloading - Downloading input data...
2023-09-18 19:05:14 Training - Downloading the training image...
2023-09-18 19:05:45 Uploading - Uploading generated training model[34mArguments: train[0m
[34m[2023-09-18:19:05:32:INFO] Running standalone xgboost training.[0m
[34m[2023-09-18:19:05:32:INFO] File size need to be processed in the node: 4.45mb. Available memory size in the node: 7990.82mb[0m
[34m[2023-09-18:19:05:32:INFO] Determined delimiter of CSV input is ','[0m
[34m[19:05:32] S3DistributionType set as FullyReplicated[0m
[34m[19:05:32] 283462x3 matrix with 850386 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2023-09-18:19:05:32:INFO] Determined delimiter of CSV input is ','[0m
[34m[19:05:32] S3DistributionType set as FullyReplicated[0m
[34m[19:05:32] 60741x3 matrix wit

In [29]:
# Deploy the trained estimator to an endpoint backed by two ml.m4.xlarge
# instances; the returned predictor is kept for later use.
xgboost_predictor = xgboost_estimator.deploy(
    initial_instance_count=2,
    instance_type='ml.m4.xlarge',
)

INFO:sagemaker:Creating model with name: xgboost-2023-09-18-19-27-51-893
INFO:sagemaker:Creating endpoint-config with name xgboost-2023-09-18-19-27-51-893
INFO:sagemaker:Creating endpoint with name xgboost-2023-09-18-19-27-51-893


---------!