# SetUp

In [75]:
import os
import json
import sys
import time

import pandas as pd
import boto3

#importing forecast notebook utility from notebooks/common directory
sys.path.insert(0, os.path.abspath("common"))
import util

In [76]:
bucket_name = "household-power-consumption"
region = "us-east-1"


In [77]:
session = boto3.Session(region_name=region) 
forecast = session.client(service_name='forecast') 
forecastquery = session.client(service_name='forecastquery')

## Data Preparation 

In [78]:
df = pd.read_csv("data/household_global_activepower_consumption.csv", dtype = object)

In [79]:
df.head()

Unnamed: 0,timestamp,value,item-id
0,2006-12-16 17:24:00,4.216,client_1
1,2006-12-16 17:25:00,5.36,client_1
2,2006-12-16 17:26:00,5.374,client_1
3,2006-12-16 17:27:00,5.388,client_1
4,2006-12-16 17:28:00,3.666,client_1


In [80]:
df.tail()

Unnamed: 0,timestamp,value,item-id
2049275,2010-11-26 20:58:00,0.946,client_1
2049276,2010-11-26 20:59:00,0.944,client_1
2049277,2010-11-26 21:00:00,0.938,client_1
2049278,2010-11-26 21:01:00,0.934,client_1
2049279,2010-11-26 21:02:00,0.932,client_1


Here, we have data from 2006-12-16 to 2010-11-26 . For testing we will use the data from nov 2010 in different csv. We are also going to save the rest of the data in different csv file.

In [81]:
#storing data from dec 2006 to oct 2010 in one dataframe
training = df[((df['timestamp'] >= '2006-12-16') & (df['timestamp'] <= '2010-11-01'))]
training.shape
training.tail()

Unnamed: 0,timestamp,value,item-id
2012012,2010-10-31 23:55:00,2.46,client_1
2012013,2010-10-31 23:56:00,2.408,client_1
2012014,2010-10-31 23:57:00,1.364,client_1
2012015,2010-10-31 23:58:00,1.366,client_1
2012016,2010-10-31 23:59:00,1.366,client_1


In [82]:
remaining = df[(df['timestamp'] >= '2010-11-01')]
remaining.head()

Unnamed: 0,timestamp,value,item-id
2012017,2010-11-01 00:00:00,1.368,client_1
2012018,2010-11-01 00:01:00,1.368,client_1
2012019,2010-11-01 00:02:00,1.366,client_1
2012020,2010-11-01 00:03:00,1.3119999999999998,client_1
2012021,2010-11-01 00:04:00,1.268,client_1


In [83]:
remaining.shape

(37263, 3)

In [84]:
training.to_csv("data/item-demand-time-train.csv", header = False, index = False)
remaining.to_csv("data/item-demand-time-validation.csv", header = False, index = False)


In [85]:
key="activepower_data/item-demand-time-train.csv"

In [86]:
boto3.Session().resource('s3').Bucket(bucket_name).Object(key).upload_file("data/item-demand-time-train.csv")

## Creating the Dataset Group and Dataset

we are using custom domain with 3 attributes:- DateTime, Target value and item_id

In [47]:
DATASET_FREQUENCY = "H"
TIMESTAMP_FORMAT = "yyyy-MM-dd hh:mm:ss"

In [48]:
project = "active_power_forecast"
datasetName = project + '_ds'
datasetGroupName = project + '_dsg'
s3DataPath = "s3://"+ bucket_name + "/" + key

In [49]:
#Now save things
%store project

Stored 'project' (str)


## Create the Dataset Group 

In [50]:
create_dataset_group_response = forecast.create_dataset_group(DatasetGroupName = datasetGroupName,
                                                             Domain = "CUSTOM",
                                                             )
datasetGroupArn = create_dataset_group_response['DatasetGroupArn']


In [51]:
forecast.describe_dataset_group(DatasetGroupArn = datasetGroupArn)

{'DatasetGroupName': 'active_power_forecast_dsg',
 'DatasetGroupArn': 'arn:aws:forecast:us-east-1:879335107588:dataset-group/active_power_forecast_dsg',
 'DatasetArns': [],
 'Domain': 'CUSTOM',
 'Status': 'ACTIVE',
 'CreationTime': datetime.datetime(2020, 12, 12, 21, 41, 40, 616000, tzinfo=tzlocal()),
 'LastModificationTime': datetime.datetime(2020, 12, 12, 21, 41, 40, 616000, tzinfo=tzlocal()),
 'ResponseMetadata': {'RequestId': '3750174e-272b-458f-96a4-3c4935297bbb',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'application/x-amz-json-1.1',
   'date': 'Sat, 12 Dec 2020 15:56:50 GMT',
   'x-amzn-requestid': '3750174e-272b-458f-96a4-3c4935297bbb',
   'content-length': '273',
   'connection': 'keep-alive'},
  'RetryAttempts': 0}}

## Create the Schema

In [56]:
#specifying the schema of the dataset
schema = {
    'Attributes': [
        {
            'AttributeName': 'timestamp',
            'AttributeType': 'timestamp'
        },
        {
            'AttributeName': 'target_value',
            'AttributeType': 'float'
        },
        {
            'AttributeName': 'item_id',
            'AttributeType': 'string'
        }
    ]
}

## Create the dataset 

In [57]:
response = forecast.create_dataset(
    DatasetName = datasetName,
    Domain = 'CUSTOM',
    DatasetType = 'TARGET_TIME_SERIES',
    DataFrequency = DATASET_FREQUENCY,
    Schema = schema
)

In [None]:
datasetArn = response['DatasetArn']
forecast.describe_dataset(DatasetArn = datasetArn)

## Add dataset to dataset group

In [59]:
forecast.update_dataset_group(DatasetGroupArn = datasetGroupArn, DatasetArns = [datasetArn])

{'ResponseMetadata': {'RequestId': '99985c9a-bb9e-4709-bb74-6796440462ce',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'content-type': 'application/x-amz-json-1.1',
   'date': 'Sat, 12 Dec 2020 16:05:27 GMT',
   'x-amzn-requestid': '99985c9a-bb9e-4709-bb74-6796440462ce',
   'content-length': '2',
   'connection': 'keep-alive'},
  'RetryAttempts': 0}}

# Create IAM role for Forecast

In [73]:
role_name = "ForecastNotebookRole"
role_arn = util.get_or_create_iam_role(role_name = role_name)

Created arn:aws:iam::879335107588:role/ForecastNotebookRole
Attaching policies
Waiting for a minute to allow IAM role policy attachment to propagate
Done.


# Create data import jobs

In [87]:
datasetImportJobName = 'Import_job_target'
ds_import_job_response = forecast.create_dataset_import_job(
    DatasetImportJobName = datasetImportJobName,
    DatasetArn = datasetArn,
    DataSource = {
        'S3Config': {
            'Path': s3DataPath,
            'RoleArn': role_arn
        }
    },
    TimestampFormat = TIMESTAMP_FORMAT
)

In [89]:
ds_import_job_Arn = ds_import_job_response['DatasetImportJobArn']
print(ds_import_job_Arn)

arn:aws:forecast:us-east-1:879335107588:dataset-import-job/active_power_forecast_ds/Import_job_target


In [92]:
forecast.describe_dataset_import_job(DatasetImportJobArn = ds_import_job_Arn)

{'DatasetImportJobName': 'Import_job_target',
 'DatasetImportJobArn': 'arn:aws:forecast:us-east-1:879335107588:dataset-import-job/active_power_forecast_ds/Import_job_target',
 'DatasetArn': 'arn:aws:forecast:us-east-1:879335107588:dataset/active_power_forecast_ds',
 'TimestampFormat': 'yyyy-MM-dd hh:mm:ss',
 'DataSource': {'S3Config': {'Path': 's3://household-power-consumption/activepower_data/item-demand-time-train.csv',
   'RoleArn': 'arn:aws:iam::879335107588:role/ForecastNotebookRole'}},
 'FieldStatistics': {'item_id': {'Count': 2012017,
   'CountDistinct': 1,
   'CountNull': 0},
  'target_value': {'Count': 2012017,
   'CountDistinct': 4180,
   'CountNull': 0,
   'CountNan': 0,
   'Min': '0.076',
   'Max': '11.122',
   'Avg': 1.0896659789653864,
   'Stddev': 1.058404029777552},
  'timestamp': {'Count': 2012017,
   'CountDistinct': 2012017,
   'CountNull': 0,
   'Min': '2006-12-16T17:24:00Z',
   'Max': '2010-10-31T23:59:00Z'}},
 'DataSize': 0.07348200306296349,
 'Status': 'ACTIVE',


## Next Steps

In [93]:
%store datasetGroupArn
%store datasetArn
%store role_name
%store key
%store bucket_name
%store region
%store ds_import_job_Arn

Stored 'datasetGroupArn' (str)
Stored 'datasetArn' (str)
Stored 'role_name' (str)
Stored 'key' (str)
Stored 'bucket_name' (str)
Stored 'region' (str)
Stored 'ds_import_job_Arn' (str)
