In [1]:
import boto3,os, sagemaker
from sagemaker import get_execution_role
import numpy as np                                
import pandas as pd                                
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.session import s3_input,Session
import os
from sagemaker.predictor import csv_serializer

In [2]:
# --- Where the source CSV lives in S3 ---
bucket_name = "winddata"
data_key = "T1.csv"
data_location = "s3://{}/{}".format(bucket_name, data_key)

# --- AWS context reused by later cells ---
s3 = boto3.resource('s3')
# Region of the current boto3 session; later used to pick the training image.
my_region = boto3.session.Session().region_name
# Execution role of this notebook; later handed to the Estimator.
role = get_execution_role()

# --- Load the dataset: pandas is given the s3:// URL directly ---
model_data = pd.read_csv(data_location)

In [3]:
# Show the freshly loaded frame; the notebook renders a truncated head/tail view.
model_data

Unnamed: 0,Date/Time,LV ActivePower (kW),Wind Speed (m/s),Theoretical_Power_Curve (KWh),Wind Direction (°)
0,01 01 2018 00:00,380.047791,5.311336,416.328908,259.994904
1,01 01 2018 00:10,453.769196,5.672167,519.917511,268.641113
2,01 01 2018 00:20,306.376587,5.216037,390.900016,272.564789
3,01 01 2018 00:30,419.645905,5.659674,516.127569,271.258087
4,01 01 2018 00:40,380.650696,5.577941,491.702972,265.674286
...,...,...,...,...,...
50525,31 12 2018 23:10,2963.980957,11.404030,3397.190793,80.502724
50526,31 12 2018 23:20,1684.353027,7.332648,1173.055771,84.062599
50527,31 12 2018 23:30,2201.106934,8.435358,1788.284755,84.742500
50528,31 12 2018 23:40,2515.694092,9.421366,2418.382503,84.297913


In [4]:
# The CSV stores timestamps day-first (e.g. "31 12 2018 23:10"), so the layout is
# spelled out explicitly. Without it pandas has to guess, and ambiguous values
# such as "02 01 2018" are read month-first, silently swapping day and month.
model_data["Date/Time"] = pd.to_datetime(model_data["Date/Time"], format="%d %m %Y %H:%M")

In [5]:
# Promote the timestamp column to the index. The guard keeps the cell safe to
# re-run: once "Date/Time" is already the index it is no longer a column, and an
# unconditional set_index would raise KeyError.
if "Date/Time" in model_data.columns:
    model_data = model_data.set_index("Date/Time")

In [6]:
# Shuffle every row once with a fixed seed, then cut at the 70% mark.
# Positional slicing replaces np.split, which routes DataFrames through the
# deprecated DataFrame.swapaxes; the resulting rows and their order are identical.
shuffled_data = model_data.sample(frac=1, random_state=1729)
train_row_count = int(0.7 * len(model_data))
train_data = shuffled_data.iloc[:train_row_count]
test_data = shuffled_data.iloc[train_row_count:]
print(train_data.shape, test_data.shape)

(35371, 4) (15159, 4)


In [7]:
# Echo the execution role and region, one per line.
# Note: the role ARN embeds the AWS account id; consider clearing this output
# before sharing the notebook.
print(role, my_region, sep="\n")

arn:aws:iam::748050252648:role/service-role/AmazonSageMaker-ExecutionRole-20201012T200352
us-east-1


In [8]:
# Key prefix under which this notebook stores its files in the bucket.
prefix = 'sagemaker/DEMO-xgboost-dm'

# S3 destination later passed to the Estimator as `output_path`.
output_template = 's3://{}/{}/output'
output_path = output_template.format(bucket_name, prefix)
print(output_path)

s3://winddata/sagemaker/DEMO-xgboost-dm/output


In [9]:
# Re-display the frame: "Date/Time" is now the index, leaving four value columns.
model_data

Unnamed: 0_level_0,LV ActivePower (kW),Wind Speed (m/s),Theoretical_Power_Curve (KWh),Wind Direction (°)
Date/Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-01 00:00:00,380.047791,5.311336,416.328908,259.994904
2018-01-01 00:10:00,453.769196,5.672167,519.917511,268.641113
2018-01-01 00:20:00,306.376587,5.216037,390.900016,272.564789
2018-01-01 00:30:00,419.645905,5.659674,516.127569,271.258087
2018-01-01 00:40:00,380.650696,5.577941,491.702972,265.674286
...,...,...,...,...
2018-12-31 23:10:00,2963.980957,11.404030,3397.190793,80.502724
2018-12-31 23:20:00,1684.353027,7.332648,1173.055771,84.062599
2018-12-31 23:30:00,2201.106934,8.435358,1788.284755,84.742500
2018-12-31 23:40:00,2515.694092,9.421366,2418.382503,84.297913


In [10]:
# Write the training split as a headerless CSV with the target in the first
# column (the training log reports label_column=0) and the index dropped.
target_column = 'LV ActivePower (kW)'
train_labels = train_data[target_column]
train_features = train_data.drop([target_column], axis=1)
pd.concat([train_labels, train_features], axis=1).to_csv('Train.csv', index=False, header=False)


In [11]:
# Upload the training CSV. S3 object keys always use "/" as the separator, so the
# key is built explicitly instead of with os.path.join, whose separator depends
# on the operating system.
train_object_key = '{}/Train/Train.csv'.format(prefix)
boto3.Session().resource('s3').Bucket(bucket_name).Object(train_object_key).upload_file('Train.csv')

# Training channel definition. `s3_data` stops at ".../Train/Train", which is a
# prefix of the uploaded key ".../Train/Train.csv".
s3_input_Train = sagemaker.s3_input(s3_data="s3://{}/{}/Train/Train".format(bucket_name, prefix), content_type="csv")

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [12]:
# Write the held-out split as a headerless CSV with the target in the first column.
# This must be built from `test_data`; building it from `train_data` would make
# Test.csv a copy of the training set.
pd.concat([test_data['LV ActivePower (kW)'], test_data.drop(['LV ActivePower (kW)'], axis=1)], axis=1).to_csv('Test.csv', index=False, header=False)

# S3 object keys always use "/" as the separator, so the key is built explicitly
# instead of with the platform-dependent os.path.join.
test_object_key = '{}/Test/Test.csv'.format(prefix)
boto3.Session().resource('s3').Bucket(bucket_name).Object(test_object_key).upload_file('Test.csv')

# Channel definition for the held-out data. `s3_data` stops at ".../Test/Test",
# which is a prefix of the uploaded key ".../Test/Test.csv".
s3_input_Test = sagemaker.s3_input(s3_data="s3://{}/{}/Test/Test".format(bucket_name, prefix), content_type="csv")

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [13]:
# Display the full frame once more; the train/test CSVs above were written from
# the split frames, not from model_data itself.
model_data

Unnamed: 0_level_0,LV ActivePower (kW),Wind Speed (m/s),Theoretical_Power_Curve (KWh),Wind Direction (°)
Date/Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-01 00:00:00,380.047791,5.311336,416.328908,259.994904
2018-01-01 00:10:00,453.769196,5.672167,519.917511,268.641113
2018-01-01 00:20:00,306.376587,5.216037,390.900016,272.564789
2018-01-01 00:30:00,419.645905,5.659674,516.127569,271.258087
2018-01-01 00:40:00,380.650696,5.577941,491.702972,265.674286
...,...,...,...,...
2018-12-31 23:10:00,2963.980957,11.404030,3397.190793,80.502724
2018-12-31 23:20:00,1684.353027,7.332648,1173.055771,84.062599
2018-12-31 23:30:00,2201.106934,8.435358,1788.284755,84.742500
2018-12-31 23:40:00,2515.694092,9.421366,2418.382503,84.297913


In [14]:
# Region name -> ECR URI of the XGBoost training image for that region.
# Only the regions listed here can be looked up later via containers[my_region].
containers = dict([
    ('us-west-2', '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest'),
    ('us-east-1', '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest'),
    ('us-east-2', '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest'),
    ('eu-west-1', '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest'),
])

In [15]:
# Sanity check: show the training channel object created earlier.
s3_input_Train

<sagemaker.inputs.s3_input at 0x7f4bdf90de48>

In [16]:
# SageMaker session handed to the estimator below.
sess = sagemaker.Session()

# Training image for the notebook's region; a region missing from `containers`
# raises KeyError at this lookup.
training_image = containers[my_region]

# Estimator configured for a single ml.m5.large training instance, writing its
# output under `output_path`.
xgb = sagemaker.estimator.Estimator(
    training_image,
    role,
    train_instance_count=1,
    train_instance_type='ml.m5.large',
    output_path=output_path,
    sagemaker_session=sess,
)

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [17]:
# Hyperparameters forwarded to the XGBoost container: a regression objective
# and 100 rounds.
xgb_hyperparameters = {
    'objective': 'reg:linear',
    'num_round': 100,
}
xgb.set_hyperparameters(**xgb_hyperparameters)


In [18]:
# Start the training job. Only the 'train' channel is supplied, so the job runs
# without a validation set (the log reports that the validation path does not exist).
training_channels = {'train': s3_input_Train}
xgb.fit(training_channels)

2020-10-12 14:48:49 Starting - Starting the training job...
2020-10-12 14:48:53 Starting - Launching requested ML instances......
2020-10-12 14:50:15 Starting - Preparing the instances for training......
2020-10-12 14:51:08 Downloading - Downloading input data...
2020-10-12 14:51:45 Training - Downloading the training image..[34mArguments: train[0m
[34m[2020-10-12:14:52:00:INFO] Running standalone xgboost training.[0m
[34m[2020-10-12:14:52:00:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2020-10-12:14:52:00:INFO] File size need to be processed in the node: 2.14mb. Available memory size in the node: 167.29mb[0m
[34m[2020-10-12:14:52:00:INFO] Determined delimiter of CSV input is ','[0m
[34m[14:52:00] S3DistributionType set as FullyReplicated[0m
[34m[14:52:00] 35371x3 matrix with 106113 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[14:52:00] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 116 extra nod

In [19]:
# Deploy the trained model behind a single-instance endpoint and keep the
# returned predictor for inference calls.
xgb_predictor = xgb.deploy(
    instance_type='ml.m5.large',
    initial_instance_count=1,
)

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


-------------!

In [20]:
# Feature matrix for inference: every column except the target.
test_data_array = test_data.drop(['LV ActivePower (kW)'], axis=1).values

# Send the rows as CSV text.
xgb_predictor.content_type = 'text/csv'
xgb_predictor.serializer = csv_serializer

# The endpoint answers with bytes; decode them to a string of numbers.
predictions = xgb_predictor.predict(test_data_array).decode('utf-8')

# Parse the response token by token. Slicing with predictions[1:] would drop the
# first character of the first value (e.g. "380.0" -> "80.0"). Splitting and
# skipping empty tokens keeps every digit and still tolerates a stray leading or
# trailing separator, whether values are comma- or newline-delimited.
prediction_tokens = predictions.replace('\n', ',').split(',')
predictions_array = np.array([float(token) for token in prediction_tokens if token.strip()])
print(predictions_array.shape)

(15159,)


In [21]:
# Show the name of the endpoint backing the predictor.
# NOTE(review): no cleanup step is visible in this part of the notebook — confirm
# the endpoint is deleted once it is no longer needed.
xgb_predictor.endpoint

'xgboost-2020-10-12-14-48-48-978'