# Steps

In [1]:
# Data Preparation
# Move Data Into S3 Bucket
# Create Model
# Train Model
# Deploy Model

# Data Preparation

In [2]:
# Download data
import urllib.request
urllib.request.urlretrieve("https://archive.ics.uci.edu/static/public/53/iris.zip", "data.zip")

('data.zip', <http.client.HTTPMessage at 0x7fe472ba5450>)

In [3]:
# Unzip data
!mkdir data
!unzip data.zip -d data/

Archive:  data.zip
  inflating: data/Index              
  inflating: data/bezdekIris.data    
  inflating: data/iris.data          
  inflating: data/iris.names         


In [9]:
import pandas as pd

# Read data
data = pd.read_csv("data/iris.data", header=None)

In [10]:
# Convert to numerical values
data[4] = data[4].replace("Iris-setosa", 0)
data[4] = data[4].replace("Iris-virginica", 1)
data[4] = data[4].replace("Iris-versicolor", 2)

print(data)

       0    1    2    3  4
0    5.1  3.5  1.4  0.2  0
1    4.9  3.0  1.4  0.2  0
2    4.7  3.2  1.3  0.2  0
3    4.6  3.1  1.5  0.2  0
4    5.0  3.6  1.4  0.2  0
..   ...  ...  ...  ... ..
145  6.7  3.0  5.2  2.3  1
146  6.3  2.5  5.0  1.9  1
147  6.5  3.0  5.2  2.0  1
148  6.2  3.4  5.4  2.3  1
149  5.9  3.0  5.1  1.8  1

[150 rows x 5 columns]


  data[4] = data[4].replace("Iris-versicolor", 2)


In [11]:
# Shuffle
data = data.sample(frac=1).reset_index(drop=True)

print(data)

       0    1    2    3  4
0    5.7  3.0  4.2  1.2  2
1    6.3  2.3  4.4  1.3  2
2    5.1  2.5  3.0  1.1  2
3    5.1  3.8  1.6  0.2  0
4    6.3  2.5  5.0  1.9  1
..   ...  ...  ...  ... ..
145  6.7  3.3  5.7  2.1  1
146  5.4  3.9  1.7  0.4  0
147  4.9  3.1  1.5  0.1  0
148  7.2  3.0  5.8  1.6  1
149  5.0  3.5  1.3  0.3  0

[150 rows x 5 columns]


In [12]:
# Change label column index
data = data[[4, 0, 1, 2, 3]]

print(data)

     4    0    1    2    3
0    2  5.7  3.0  4.2  1.2
1    2  6.3  2.3  4.4  1.3
2    2  5.1  2.5  3.0  1.1
3    0  5.1  3.8  1.6  0.2
4    1  6.3  2.5  5.0  1.9
..  ..  ...  ...  ...  ...
145  1  6.7  3.3  5.7  2.1
146  0  5.4  3.9  1.7  0.4
147  0  4.9  3.1  1.5  0.1
148  1  7.2  3.0  5.8  1.6
149  0  5.0  3.5  1.3  0.3

[150 rows x 5 columns]


In [13]:
# Split (train, val sets)
train_data = data[:120]
val_data = data[120:]

# Move Data Into S3 Bucket

In [14]:
import boto3

bucket_name = "sagemaker--build-and-deploy-model-sagemaker"

# Tarin data
train_data.to_csv("data.csv", header=False, index=False)
key = "data/train/data"
url = "s3://{}/{}".format(bucket_name, key)
boto3.Session().resource("s3").Bucket(bucket_name).Object(key).upload_file("data.csv") 

# Validation data
val_data.to_csv("data.csv", header=False, index=False)
key = "data/val/data"
url = "s3://{}/{}".format(bucket_name, key)
boto3.Session().resource("s3").Bucket(bucket_name).Object(key).upload_file("data.csv") 

# Create Model

In [21]:
import sagemaker 
from sagemaker import get_execution_role

bucket_name = "sagemaker--build-and-deploy-model-sagemaker"
key = "model/xgb_model"

# Output location for model
s3_output_location = url = "s3://{}/{}".format(bucket_name, key)

# Fetch the correct XGBoost image URI (new method)
xgboost_container = sagemaker.image_uris.retrieve("xgboost", boto3.Session().region_name, version="1.5-1")

# Define the Estimator
xgb_model = sagemaker.estimator.Estimator(
    image_uri=xgboost_container,
    role=get_execution_role(),
    instance_count=1,
    instance_type="ml.m5.xlarge",
    volume_size=5, 
    output_path=s3_output_location,
    sagemaker_session=sagemaker.Session()
)

# Set hyperparameters
xgb_model.set_hyperparameters(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    verbosity=0,
    objective="multi:softmax",
    num_class=3,
    num_round=10
)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


# Train Model

In [22]:
from sagemaker.inputs import TrainingInput

train_data = "s3://{}/{}".format(bucket_name, "data/train")
val_data = "s3://{}/{}".format(bucket_name, "data/val")

train_channel = TrainingInput(train_data, content_type="text/csv")
val_channel = TrainingInput(val_data, content_type="text/csv")

data_channels = {"train": train_channel, "validation": val_channel}

xgb_model.fit(inputs=data_channels)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-10-08-09-33-57-104


2024-10-08 09:33:58 Starting - Starting the training job...
2024-10-08 09:34:32 Downloading - Downloading input data...
  from pandas import MultiIndex, Int64Index[0m
[34m[2024-10-08 09:35:45.045 ip-10-0-69-154.eu-north-1.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-10-08 09:35:45.066 ip-10-0-69-154.eu-north-1.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2024-10-08:09:35:45:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-10-08:09:35:45:INFO] Failed to parse hyperparameter objective value multi:softmax to Json.[0m
[34mReturning the value itself[0m
[34m[2024-10-08:09:35:45:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-10-08:09:35:45:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2024-10-08:09:35:45:INFO] Determined 0 GPU(s) available on the instance.[0m
[34m[2024-10-08:09:35:45:INFO] Determined delimiter of CSV input is ','[0m

# Deploy Model

In [23]:
xgb_predictor = xgb_model.deploy(initial_instance_count=1, instance_type="ml.m5.xlarge")

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-10-08-09-38-24-712
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2024-10-08-09-38-24-712
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2024-10-08-09-38-24-712


-----!