In [77]:
import urllib.request
import boto3

import pandas as pd

In [78]:
!mkdir data

mkdir: cannot create directory ‘data’: File exists


In [79]:
s3 = boto3.client('s3')

In [80]:
bucket_name = 'sagemaker-some-random-notebook-to-build'
local_directory = 'data/'
dataset_path = f'{local_directory}/iris.data'

In [81]:
s3.download_file(bucket_name, 'iris/iris.data', dataset_path)

In [82]:
data = pd.read_csv(dataset_path, header=None)

In [83]:
data

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [84]:
data.columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']

In [85]:
data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [86]:
from sklearn.preprocessing import LabelEncoder

In [87]:
encoder = LabelEncoder()
data['Species'] = encoder.fit_transform(data['Species'])

In [88]:
data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [89]:
data.shape

(150, 5)

In [90]:
data = data[[data.columns[-1]] + list(data.columns[:-1])]

In [91]:
print(data.shape)
data.head()

(150, 5)


Unnamed: 0,Species,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,0,5.1,3.5,1.4,0.2
1,0,4.9,3.0,1.4,0.2
2,0,4.7,3.2,1.3,0.2
3,0,4.6,3.1,1.5,0.2
4,0,5.0,3.6,1.4,0.2


In [92]:
iris = data.sample(frac=1)

In [93]:
iris

Unnamed: 0,Species,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
80,1,5.5,2.4,3.8,1.1
126,2,6.2,2.8,4.8,1.8
107,2,7.3,2.9,6.3,1.8
6,0,4.6,3.4,1.4,0.3
146,2,6.3,2.5,5.0,1.9
...,...,...,...,...,...
109,2,7.2,3.6,6.1,2.5
147,2,6.5,3.0,5.2,2.0
52,1,6.9,3.1,4.9,1.5
136,2,6.3,3.4,5.6,2.4


In [94]:
from sklearn.model_selection import train_test_split

In [95]:
train, test = train_test_split(iris, test_size = 0.2)

In [96]:
print(train.shape)
print(test.shape)

(120, 5)
(30, 5)


In [97]:
# seems like not needed for the xgb boost model
train_X = train[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
train_y = train.Species

test_X = test[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
test_y = test.Species

In [98]:
train.to_csv('data.csv', header=False, index=False)
s3.upload_file('data.csv', bucket_name, 'data/train')

test.to_csv('data.csv', header=False, index=False)
s3.upload_file('data.csv', bucket_name, 'data/val')

In [99]:
# create model

In [100]:
import sagemaker
from sagemaker import image_uris
from sagemaker import get_execution_role


In [101]:
container = image_uris.retrieve('xgboost', region='us-east-1', version='latest')

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [102]:
role = get_execution_role()

In [103]:
output_key = 'model/xgb_model'
s3_output_location = f's3://sagemaker-some-random-notebook-to-build/output/{output_key}'

In [104]:
xgb_model = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    train_volume_size=5,
    output_path=s3_output_location,
    sagemaker_session=sagemaker.Session()
)

xgb_model.set_hyperparameters(
    max_depth=5,
    gamma=4,
    min_child_weight=6,
    silent=0,
    objective='multi:softmax',
    num_class=3,
    num_round=10
)


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [105]:
# train model

In [106]:
train_data_path = f's3://{bucket_name}/data/train'
val_data_path = f's3://{bucket_name}/data/val'

In [107]:
train_channel = sagemaker.session.s3_input(train_data_path, content_type='text/csv')
val_channel = sagemaker.session.s3_input(val_data_path, content_type='text/csv')

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [108]:
data_channels = {'train': train_channel, 'validation': val_channel}

In [109]:
xgb_model.fit(inputs=data_channels)

INFO:sagemaker:Creating training-job with name: xgboost-2024-09-27-15-15-31-031


2024-09-27 15:15:34 Starting - Starting the training job...
2024-09-27 15:15:58 Starting - Preparing the instances for training.........
2024-09-27 15:17:12 Downloading - Downloading input data...
2024-09-27 15:17:38 Downloading - Downloading the training image......
2024-09-27 15:18:59 Training - Training image download completed. Training in progress.
2024-09-27 15:18:59 Uploading - Uploading generated training model[34mArguments: train[0m
[34m[2024-09-27:15:18:54:INFO] Running standalone xgboost training.[0m
[34m[2024-09-27:15:18:54:INFO] File size need to be processed in the node: 0.0mb. Available memory size in the node: 8469.1mb[0m
[34m[2024-09-27:15:18:54:INFO] Determined delimiter of CSV input is ','[0m
[34m[15:18:54] S3DistributionType set as FullyReplicated[0m
[34m[15:18:54] 120x4 matrix with 480 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2024-09-27:15:18:54:INFO] Determined delimiter of CSV input is ','[0m
[34m[1

In [70]:
# deploy model

In [111]:
xgb_predictor = xgb_model.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

INFO:sagemaker:Creating model with name: xgboost-2024-09-27-15-28-12-384
INFO:sagemaker:Creating endpoint-config with name xgboost-2024-09-27-15-28-12-384
INFO:sagemaker:Creating endpoint with name xgboost-2024-09-27-15-28-12-384


------!

In [None]:
# INFO:sagemaker:Creating model with name: xgboost-2024-09-27-15-28-12-384
# INFO:sagemaker:Creating endpoint-config with name xgboost-2024-09-27-15-28-12-384
# INFO:sagemaker:Creating endpoint with name xgboost-2024-09-27-15-28-12-384