In [1]:
import boto3
import sagemaker
import json
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sagemaker import get_execution_role

In [2]:
sagemaker_session = sagemaker.Session()
s3_bucket = sagemaker.Session().default_bucket()  # replace with an existing bucket if needed
s3_prefix = 'Deep_AR'    # prefix used for all data stored within the bucket
role = sagemaker.get_execution_role()             # IAM role to use by SageMaker

In [3]:
image_name = sagemaker.amazon.amazon_estimator.get_image_uri(boto3.Session().region_name, "forecasting-deepar",'latest')

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: latest.


In [4]:
s3_data_path = "s3://{}/{}/data".format(s3_bucket, s3_prefix)
s3_output_path = "s3://{}/{}/output".format(s3_bucket, s3_prefix)

In [5]:
new_path = 'data/'
if new_path not in sys.path:
  sys.path.append(new_path)

In [6]:
resampled_df = pd.read_pickle(new_path + 'hourly_resampled_contracts_ohlc.pkl')

In [7]:
#flatten multilablbel cols
resampled_df.columns = [''.join(col).strip() for col in resampled_df.columns.values]
resampled_df.rename({'contractIdcontractId': 'contractID', 'qtyqty': 'qty'}, axis=1, inplace=True)
resampled_df = resampled_df.sort_values(by ='contractID')
d = resampled_df['contractID'].unique().tolist()
d = dict((d[i],i) for i in range(len(d)))
resampled_df['labels'] = resampled_df['contractID'].map(d).astype(int)
resampled_df.head()

Unnamed: 0_level_0,contractID,qty,pxopen,pxhigh,pxlow,pxclose,labels
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-03-01 11:00:00,11629792,7.3,-0.99,-0.99,-1.0,-1.0,0
2020-03-01 11:00:00,11629866,28.0,1.3,1.71,1.3,1.41,1
2020-03-01 12:00:00,11629866,1755.3,1.71,18.0,-5.57,-5.57,1
2020-03-01 11:00:00,11629920,20.7,18.5,18.5,18.5,18.5,2
2020-03-01 12:00:00,11629920,1005.3,18.11,19.0,16.21,17.2,2


In [8]:
#flatten multilablbel cols
resampled_df.columns = [''.join(col).strip() for col in resampled_df.columns.values]
resampled_df.rename({'contractIdcontractId': 'contractID', 'qtyqty': 'qty'}, axis=1, inplace=True)
resampled_df = resampled_df.sort_values(by ='contractID')
d = resampled_df['contractID'].unique().tolist()
d = dict((d[i],i) for i in range(len(d)))
resampled_df['labels'] = resampled_df['contractID'].map(d).astype(int)
resampled_df.head()

Unnamed: 0_level_0,contractID,qty,pxopen,pxhigh,pxlow,pxclose,labels
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-03-01 11:00:00,11629792,7.3,-0.99,-0.99,-1.0,-1.0,0
2020-03-01 11:00:00,11629866,28.0,1.3,1.71,1.3,1.41,1
2020-03-01 12:00:00,11629866,1755.3,1.71,18.0,-5.57,-5.57,1
2020-03-01 11:00:00,11629920,20.7,18.5,18.5,18.5,18.5,2
2020-03-01 12:00:00,11629920,1005.3,18.11,19.0,16.21,17.2,2


In [9]:
X = resampled_df['labels'].values
y = resampled_df[['pxopen','pxhigh','pxlow','pxclose']].values
X_train,X_valid,y_train,y_valid = train_test_split(X,y,test_size=0.25)

In [10]:
training_data = [
    {
        "start": str(resampled_df.index[i]),
         "target": list(y_train[i])
       ,"cat":  int(X_train[i])
    }
    for i in range(len(X_train))
]

print(len(training_data))

43340


In [11]:
test_data = [
    {
        "start": str(resampled_df.index[len(X_train)+i]),
        "target": list(y_valid[i])
        ,"cat": int(X_valid[i])
    }
    for i in range(len(X_valid))
]
print(len(test_data))

14447


In [12]:
def write_dicts_to_file(path, data):
    with open(path, 'wb') as fp:
        for d in data:
            fp.write(json.dumps(d).encode("utf-8"))
            fp.write("\n".encode('utf-8'))

In [13]:
%%time
write_dicts_to_file("train.json", training_data)
write_dicts_to_file("test.json", test_data)

CPU times: user 259 ms, sys: 3.73 ms, total: 263 ms
Wall time: 491 ms


In [14]:
s3 = boto3.resource('s3')
def copy_to_s3(local_file, s3_path, override=False):
    assert s3_path.startswith('s3://')
    split = s3_path.split('/')
    bucket = split[2]
    path = '/'.join(split[3:])
    buk = s3.Bucket(bucket)
    
    if len(list(buk.objects.filter(Prefix=path))) > 0:
        if not override:
            print('File s3://{}/{} already exists.\nSet override to upload anyway.\n'.format(s3_bucket, s3_path))
            return
        else:
            print('Overwriting existing file')
    with open(local_file, 'rb') as data:
        print('Uploading file to {}'.format(s3_path))
        buk.put_object(Key=path, Body=data)

In [15]:
%%time
copy_to_s3("train.json", s3_data_path + "/train/train.json")
copy_to_s3("test.json", s3_data_path + "/test/test.json")

File s3://sagemaker-us-east-2-313635455612/s3://sagemaker-us-east-2-313635455612/Deep_AR/data/train/train.json already exists.
Set override to upload anyway.

File s3://sagemaker-us-east-2-313635455612/s3://sagemaker-us-east-2-313635455612/Deep_AR/data/test/test.json already exists.
Set override to upload anyway.

CPU times: user 17.8 ms, sys: 7.64 ms, total: 25.4 ms
Wall time: 99.2 ms


In [16]:
estimator = sagemaker.estimator.Estimator(
    sagemaker_session=sagemaker_session,
    image_name=image_name,
    role=role,
    train_instance_count=1,
    train_instance_type='ml.c4.2xlarge',
   # base_job_name='deepar-electricity-demo',
    output_path=s3_output_path,
    image_uri=image_name
)



hyperparameters = {
    "time_freq": '1H',
    "epochs": "20",
    "early_stopping_patience": "40",
    "mini_batch_size": "64",
    "learning_rate": "5E-4",
    "context_length": 1,
    "prediction_length": 1
}

estimator.set_hyperparameters(**hyperparameters)


data_channels = {
    "train": "{}/train/".format(s3_data_path),
    "test": "{}/test/".format(s3_data_path)
}

estimator.fit(inputs=data_channels, wait=True)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


2020-11-02 12:30:24 Starting - Starting the training job...
2020-11-02 12:30:26 Starting - Launching requested ML instances......
2020-11-02 12:31:29 Starting - Preparing the instances for training......
2020-11-02 12:32:48 Downloading - Downloading input data
2020-11-02 12:32:48 Training - Downloading the training image..[34mArguments: train[0m
[34m[11/02/2020 12:33:05 INFO 140088046479168] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'num_dynamic_feat': u'auto', u'dropout_rate': u'0.10', u'mini_batch_size': u'128', u'test_quantiles': u'[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'num_eval_samples': u'100', u'learning_rate': u'0.001', u'num_cells': u'40', u'num_layers': u'2', u'embedding_dimension': u'10', u'_kvstore': u'auto', u'_num_kv_servers': u'auto', u'cardinality': u'auto', u'likelihood': u'student-t', u'early_stopping_patience': u''}[0m
[34