### 1. Create a S3 Bucket

In [1]:
import sagemaker
import boto3

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [2]:
bucket_name = 'yahoofinancestockpricesri' # <--- CHANGE THIS VARIABLE TO A UNIQUE NAME FOR YOUR BUCKET
my_region = boto3.session.Session().region_name # set the region of the instance
print(my_region)

us-east-2


In [3]:
s3 = boto3.resource('s3')

try:
    if my_region == 'us-east-2':
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': 'us-east-2'}
        )
    print('S3 bucket created successfully')
except Exception as e:
    print('s3 error', e)

s3 error An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.


### 2. Create train and validation csv

In [4]:
!pip install yfinance

import pandas as pd
from datetime import datetime
import yfinance as yf

# initialize parameters
start_date = datetime(2019, 1, 1)
end_date = datetime(2021, 1, 1)

# get the data
df_data = yf.download('AAPL', start = start_date, end = end_date)

df_data.reset_index(inplace=True)

df_data

Collecting yfinance
  Downloading yfinance-0.2.37-py2.py3-none-any.whl.metadata (11 kB)
Collecting multitasking>=0.0.7 (from yfinance)
  Downloading multitasking-0.0.11-py3-none-any.whl.metadata (5.5 kB)
Collecting lxml>=4.9.1 (from yfinance)
  Downloading lxml-5.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting appdirs>=1.4.4 (from yfinance)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting frozendict>=2.3.4 (from yfinance)
  Downloading frozendict-2.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (23 kB)
Collecting peewee>=3.16.2 (from yfinance)
  Downloading peewee-3.17.1.tar.gz (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone


[*********************100%%**********************]  1 of 1 completed


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2019-01-02,38.722500,39.712502,38.557499,39.480000,37.845043,148158800
1,2019-01-03,35.994999,36.430000,35.500000,35.547501,34.075397,365248800
2,2019-01-04,36.132500,37.137501,35.950001,37.064999,35.530064,234428400
3,2019-01-07,37.174999,37.207500,36.474998,36.982498,35.450977,219111200
4,2019-01-08,37.389999,37.955002,37.130001,37.687500,36.126778,164101200
...,...,...,...,...,...,...,...
500,2020-12-24,131.320007,133.460007,131.100006,131.970001,129.514450,54930100
501,2020-12-28,133.990005,137.339996,133.509995,136.690002,134.146667,124486200
502,2020-12-29,138.050003,138.789993,134.339996,134.869995,132.360504,121047300
503,2020-12-30,135.580002,135.990005,133.399994,133.720001,131.231903,96452100


### Extract, Load & Transform

In [5]:
df_data.drop(columns = (['Adj Close','Date']), axis=1, inplace=True)
df_data

Unnamed: 0,Open,High,Low,Close,Volume
0,38.722500,39.712502,38.557499,39.480000,148158800
1,35.994999,36.430000,35.500000,35.547501,365248800
2,36.132500,37.137501,35.950001,37.064999,234428400
3,37.174999,37.207500,36.474998,36.982498,219111200
4,37.389999,37.955002,37.130001,37.687500,164101200
...,...,...,...,...,...
500,131.320007,133.460007,131.100006,131.970001,54930100
501,133.990005,137.339996,133.509995,136.690002,124486200
502,138.050003,138.789993,134.339996,134.869995,121047300
503,135.580002,135.990005,133.399994,133.720001,96452100


In [6]:
df_data_features = df_data.iloc[:-1, :]
df_data_features

Unnamed: 0,Open,High,Low,Close,Volume
0,38.722500,39.712502,38.557499,39.480000,148158800
1,35.994999,36.430000,35.500000,35.547501,365248800
2,36.132500,37.137501,35.950001,37.064999,234428400
3,37.174999,37.207500,36.474998,36.982498,219111200
4,37.389999,37.955002,37.130001,37.687500,164101200
...,...,...,...,...,...
499,132.160004,132.429993,130.779999,130.960007,88223700
500,131.320007,133.460007,131.100006,131.970001,54930100
501,133.990005,137.339996,133.509995,136.690002,124486200
502,138.050003,138.789993,134.339996,134.869995,121047300


In [7]:
df_data_targets = df_data.iloc[1: , 0].rename('Target')
df_data_targets

1       35.994999
2       36.132500
3       37.174999
4       37.389999
5       37.822498
          ...    
500    131.320007
501    133.990005
502    138.050003
503    135.580002
504    134.080002
Name: Target, Length: 504, dtype: float64

In [8]:
df_data_features['Target'] = list(df_data_targets)

first_column = df_data_features.pop('Target')
df_data_features.insert(0, 'Target', first_column)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_data_features['Target'] = list(df_data_targets)


In [9]:
df_data_final = df_data_features
df_data_final

Unnamed: 0,Target,Open,High,Low,Close,Volume
0,35.994999,38.722500,39.712502,38.557499,39.480000,148158800
1,36.132500,35.994999,36.430000,35.500000,35.547501,365248800
2,37.174999,36.132500,37.137501,35.950001,37.064999,234428400
3,37.389999,37.174999,37.207500,36.474998,36.982498,219111200
4,37.822498,37.389999,37.955002,37.130001,37.687500,164101200
...,...,...,...,...,...,...
499,131.320007,132.160004,132.429993,130.779999,130.960007,88223700
500,133.990005,131.320007,133.460007,131.100006,131.970001,54930100
501,138.050003,133.990005,137.339996,133.509995,136.690002,124486200
502,135.580002,138.050003,138.789993,134.339996,134.869995,121047300


### Train Test Split

In [10]:
import numpy as np

df_randamized = df_data_final.sample(frac=1, random_state=123)
df_randamized

Unnamed: 0,Target,Open,High,Low,Close,Volume
429,115.230003,118.330002,118.830002,113.610001,115.540001,184642000
282,78.839996,81.184998,81.495003,80.712502,81.237503,80113600
383,97.264999,95.334999,95.980003,94.705002,95.919998,90257200
477,117.180000,118.639999,118.769997,117.290001,117.339996,73604300
428,118.330002,114.720001,115.930000,112.800003,115.360001,140150100
...,...,...,...,...,...,...
98,45.049999,44.950001,45.134998,44.452499,44.915001,146118800
476,118.639999,117.589996,119.059998,116.809998,118.639999,74113000
322,70.599998,70.000000,72.062500,69.512497,71.762497,194994800
382,95.334999,96.262497,96.317497,94.672501,95.752502,125642800


In [11]:
train_data, test_data = np.split(df_randamized, [int(0.8*len(df_randamized))])
print(train_data.shape, test_data.shape)

(403, 6) (101, 6)


  return bound(*args, **kwds)


### Set path and upload dataset to S3 bucket

In [12]:
import os

prefix = 'xgboost-as-a-built-in-algo'

train_csv_path = 's3://{}/{}/{}/{}'.format(bucket_name, prefix, 'train', 'train.csv')
test_csv_path = 's3://{}/{}/{}/{}'.format(bucket_name, prefix, 'test', 'test.csv')

print(train_csv_path)
print(test_csv_path)

s3://yahoofinancestockpricesri/xgboost-as-a-built-in-algo/train/train.csv
s3://yahoofinancestockpricesri/xgboost-as-a-built-in-algo/test/test.csv


Reference: https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html

In [13]:
train_data.to_csv(train_csv_path, index = False, header = False)
test_data.to_csv(test_csv_path, index = False, header = False)

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



In [14]:
import sagemaker
from sagemaker import image_uris
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput

### Find XgBoost image URI and builds an XGBoost container

In [15]:
xgboost_container = sagemaker.image_uris.retrieve("xgboost", boto3.Session().region_name, "1.7-1")

display(xgboost_container)

'257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-xgboost:1.7-1'

In [16]:
hyperparameters = {
    'max_depth':'5',
    'eta':'0.2',
    'gamma':'4',
    'min_child_weight':'6',
    'subsample':'0.7',
    'objective': 'reg:squarederror',
    'early_stopping_rounds':10,
    'num_round':1000
    }

### Set an output path where the trained model will be saved

In [17]:
# first {} is bucket name
# second {} is bucket prefix
# output folder
output_path = 's3://{}/{}/{}/'.format(bucket_name, prefix, 'output')
print(output_path)

s3://yahoofinancestockpricesri/xgboost-as-a-built-in-algo/output/


### Construct a sagemaker estimator that calls the xgboost container

In [18]:
# construct a SageMaker estimator that calls the xgboost-container
estimator = sagemaker.estimator.Estimator(image_uri=xgboost_container, 
                                          hyperparameters=hyperparameters,
                                          role=sagemaker.get_execution_role(),
                                          instance_count=1, 
                                          instance_type='ml.m5.2xlarge', 
                                          volume_size=5, # 5 GB 
                                          output_path=output_path,
                                          train_use_spot_instances = True,
                                          EnableManagedSpotTraining = True,
                                          max_run = 300,
                                          max_wait = 300
                                         )

train_use_spot_instances has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


### Define the data type and paths to the training and validation dataset

In [19]:
content_type = 'csv'
train_input = TrainingInput('s3://{}/{}/{}/'.format(bucket_name, prefix, 'train'), content_type=content_type)
test_input = TrainingInput('s3://{}/{}/{}/'.format(bucket_name, prefix, 'test'), content_type=content_type)

### Execute the XGBoost training job

In [20]:
# execute the XGBoost training job
estimator.fit({'train': train_input, 'validation': test_input})

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-04-07-18-16-12-381


2024-04-07 18:16:12 Starting - Starting the training job...
2024-04-07 18:16:27 Starting - Preparing the instances for training......
2024-04-07 18:17:19 Downloading - Downloading input data...
2024-04-07 18:17:45 Downloading - Downloading the training image...
2024-04-07 18:18:20 Training - Training image download completed. Training in progress...[34m[2024-04-07 18:18:43.664 ip-10-0-71-191.us-east-2.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-04-07 18:18:43.686 ip-10-0-71-191.us-east-2.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2024-04-07:18:18:44:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-04-07:18:18:44:INFO] Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34m[2024-04-07:18:18:44:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-04-07:18:18:44:INFO] Running XGBoost Sagemak

### Deploy trained xgb model as Endpoint

In [22]:
from sagemaker.serializers import CSVSerializer

xgb_predictor =  estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge', serializer=CSVSerializer())

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-04-07-18-40-20-625
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2024-04-07-18-40-20-625
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2024-04-07-18-40-20-625


------!

In [23]:
xgb_predictor.endpoint_name

'sagemaker-xgboost-2024-04-07-18-40-20-625'

### Make predictions using Endpoints

In [24]:
# initialize parameters
start_date = datetime(2021, 1, 4)
end_date = datetime(2021, 1, 5)

# get the data
df_data = yf.download('AAPL', start = start_date, end = end_date)
df_data.reset_index(inplace=True)
df_data

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2021-01-04,133.520004,133.610001,126.760002,129.410004,127.002121,143301900


In [25]:
df_data.drop(axis=1, columns=['Adj Close'], inplace=True)
df_data.drop(axis=1, columns=['Date'], inplace=True)

data_features_array = df_data.values
data_features_array

array([[1.33520004e+02, 1.33610001e+02, 1.26760002e+02, 1.29410004e+02,
        1.43301900e+08]])

### Serialize data

### inference - Serialize Input by Sagemaker Function

In [27]:
Y_pred_Fcn = xgb_predictor.predict(data_features_array).decode('utf-8')
print(Y_pred_Fcn, type(Y_pred_Fcn))

132.52374267578125
 <class 'str'>


In [28]:
# Incase if we did not use CSVSerializer we can serialize our input data as below

from sagemaker.serializers import CSVSerializer

serialized_Input_Fcn = CSVSerializer().serialize([[1.33520004e+02, 1.33610001e+02, 1.26760002e+02, 1.29410004e+02,
        1.43301900e+08]])

print(serialized_Input_Fcn, type(serialized_Input_Fcn))

Y_pred_Fcn = xgb_predictor.predict(serialized_Input_Fcn).decode('utf-8')
print(Y_pred_Fcn, type(Y_pred_Fcn))

133.520004,133.610001,126.760002,129.410004,143301900.0 <class 'str'>
132.52374267578125
 <class 'str'>


We can not use CSVSerializer in our Lambda enviroment we need to use other methods

### Inference - Serialized Input by built-in function (Lambda function friendly)

In [30]:
 Input = [[1.33520004e+02, 1.33610001e+02, 1.26760002e+02, 1.29410004e+02,1.43301900e+08],
          [1.33520004e+02, 1.33610001e+02, 1.26760002e+02, 1.29410004e+02,1.43301900e+08],
          [1.33520004e+02, 1.33610001e+02, 1.26760002e+02, 1.29410004e+02,1.43301900e+08]]
    
Serialized_Input = ','.join(map(str, Input[0]))

print(Serialized_Input, type(Serialized_Input))

Y_pred_Fcn = xgb_predictor.predict(Serialized_Input).decode('utf-8')
print(Y_pred_Fcn, type(Y_pred_Fcn))

133.520004,133.610001,126.760002,129.410004,143301900.0 <class 'str'>
132.52374267578125
 <class 'str'>


### Lambda function handler

Referene: https://docs.aws.amazon.com/lambda/latest/dg/python-handler.html

Referene: https://docs.aws.amazon.com/lambda/latest/dg/python-context.html

In [55]:
import boto3

ENDPOINT_NAME = 'sagemaker-xgboost-2024-04-07-18-40-20-625'
runtime = boto3.client('runtime.sagemaker')

def lambda_handler(event, context):
    inputs = event['data']
    
    result = []
    
    for input in inputs:
    
        serialized_input = ','.join(map(str, input))
        
        response = runtime.invoke_endpoint(EndpointName = ENDPOINT_NAME,
                                          ContentType = 'text/csv',
                                          Body=serialized_input)
        
        result.append(response['Body'].read().decode())
    
    return result

In [57]:
Input_json = { 'data': 
        [[1.33520004e+02, 1.33610001e+02, 1.26760002e+02, 1.29410004e+02,1.43301900e+08], 
         [1.33520004e+02, 1.33610001e+02, 1.26760002e+02, 1.29410004e+02,1.43301900e+08], 
         [1.33520004e+02, 1.33610001e+02, 1.26760002e+02, 1.29410004e+02,1.43301900e+08]
        ]
}

result = lambda_handler(Input_json, __)
result

['132.52374267578125\n', '132.52374267578125\n', '132.52374267578125\n']

### Create API Gateway

In [61]:
# importing requests library
import requests

# defining the api-endpoint
API_ENDPOINT = 'https://sgyf23n2s7.execute-api.us-east-2.amazonaws.com/xgbmodel'

# data to be sent to api
json = {"data":
        [[1.33520004e+02, 1.33610001e+02, 1.26760002e+02, 1.29410004e+02,1.43301900e+08], 
         [1.33520004e+02, 1.33610001e+02, 1.26760002e+02, 1.29410004e+02,1.43301900e+08], 
         [1.33520004e+02, 1.33610001e+02, 1.26760002e+02, 1.29410004e+02,1.43301900e+08]
        ]
}

# sending post request and saving response as response object
r = requests.post(url = API_ENDPOINT, json = json)

#### With the Lambda interation, the Lambda function output is returned as a 200 OK response.

Incase of errors and TTP status code reference: https://aws.amazon.com/blogs/compute/error-handling-patterns-in-amazon-api-gateway-and-aws-lambda/

In [62]:
print(f"Status Code: {r.status_code}, Response: {r.json()}")

Status Code: 200, Response: ['132.52374267578125\n', '132.52374267578125\n', '132.52374267578125\n']


### Close and Terminate

In [63]:
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker:Deleting endpoint with name: sagemaker-xgboost-2024-04-07-18-40-20-625


In [64]:
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()

[{'ResponseMetadata': {'RequestId': 'YFE5DEP08BQDQK7T',
   'HostId': 'AYlNWQOrwP3ZhOESEpK7DxrKZdZaB6izumlz68BUS7MeLZ+6gECwhCZ6lxED9jYJiOzcMWcxOdOdU/JbQFKTDA==',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'AYlNWQOrwP3ZhOESEpK7DxrKZdZaB6izumlz68BUS7MeLZ+6gECwhCZ6lxED9jYJiOzcMWcxOdOdU/JbQFKTDA==',
    'x-amz-request-id': 'YFE5DEP08BQDQK7T',
    'date': 'Sun, 07 Apr 2024 22:17:45 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2024-04-07-18-16-12-381/debug-output/index/000000000/000000000390_worker_0.json'},
   {'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2024-04-07-18-16-12-381/debug-output/events/000000000170/000000000170_worker_0.tfevents'},
   {'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2024-04-04-02-47-02-298/debug-output/index/000000000/000000000180_wor