# Time series analysis on AWS
*Chapter 9 - Creating a dataset and ingesting your data*

## Initializations
---

In [None]:
# Install the extra packages used further down: tqdm (progress bar around the
# export loop) and the kaggle command-line client (dataset download).
!pip install --quiet tqdm kaggle

### Imports

In [None]:
import os
import sys
import warnings
import zipfile
from urllib.request import urlretrieve

import matplotlib.colors as mpl_colors
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from matplotlib import gridspec
from sklearn.preprocessing import normalize
from tqdm import tqdm

### Parameters

In [None]:
RAW_DATA = os.path.join('..', 'Data', 'raw')
DATA = os.path.join('..', 'Data')
warnings.filterwarnings("ignore")
os.makedirs(RAW_DATA, exist_ok=True)

%matplotlib inline
plt.style.use('fivethirtyeight')
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']

plt.rcParams['figure.dpi'] = 300
plt.rcParams['lines.linewidth'] = 0.3
plt.rcParams['axes.titlesize'] = 6
plt.rcParams['axes.labelsize'] = 6
plt.rcParams['xtick.labelsize'] = 5
plt.rcParams['ytick.labelsize'] = 5
plt.rcParams['grid.linewidth'] = 0.2
plt.rcParams['legend.fontsize'] = 5

### Helper functions

In [None]:
def progress_report_hook(count, block_size, total_size):
    """Write a running "<n> MB downloaded" counter to standard output.

    The (count, block_size, total_size) signature looks like the
    ``reporthook`` callback accepted by ``urllib.request.urlretrieve`` --
    presumably that is where it is meant to be passed (no call is visible
    in this notebook).

    Args:
        count: Number of blocks transferred so far.
        block_size: Size of one block, in bytes.
        total_size: Total size of the download; accepted but not used.

    Returns:
        None. Output is only produced on every 500th block, and the line is
        rewritten in place thanks to the leading carriage return.
    """
    # Stay quiet between two reporting points.
    if count % 500 != 0:
        return

    megabytes = int(count * block_size // 1048576)  # 1048576 bytes = 1 MB
    sys.stdout.write("\r{} MB downloaded".format(megabytes))
    sys.stdout.flush()

### Downloading datasets

#### **Dataset 4:** Industrial pump data
To download this dataset from Kaggle, you will need a Kaggle account and an API token installed on your machine. You can follow [**this link**](https://www.kaggle.com/docs/api) to get started with the Kaggle API. Once generated, make sure your Kaggle token is stored in the `~/.kaggle/kaggle.json` file, or the next cell will raise an error. To get a Kaggle token, go to kaggle.com and create an account. Then navigate to **My account** and scroll down to the API section. There, click the **Create new API token** button:

<img src="../Assets/kaggle_api.png" />


In [None]:
FILE_NAME    = 'pump-sensor-data.zip'
ARCHIVE_PATH = os.path.join(RAW_DATA, FILE_NAME)
FILE_PATH    = os.path.join(DATA, 'pump', 'sensor.csv')
FILE_DIR     = os.path.dirname(FILE_PATH)

if not os.path.isfile(FILE_PATH):
    if not os.path.exists('/home/ec2-user/.kaggle/kaggle.json'):
        os.makedirs('/home/ec2-user/.kaggle/', exist_ok=True)
        raise Exception('The kaggle.json token was not found.\nCreating the /home/ec2-user/.kaggle/ directory: put your kaggle.json file there once you have generated it from the Kaggle website')
    else:
        print('The kaggle.json token file was found: making sure it is not readable by other users on this system.')
        !chmod 600 /home/ec2-user/.kaggle/kaggle.json

    os.makedirs(os.path.join(DATA, 'pump'), exist_ok=True)
    !kaggle datasets download -d nphantawee/pump-sensor-data -p $RAW_DATA

    print("\nExtracting data archive")
    zip_ref = zipfile.ZipFile(ARCHIVE_PATH, 'r')
    zip_ref.extractall(FILE_DIR + '/')
    zip_ref.close()
    
else:
    print("File found, skipping download")

## Dataset visualization
---

### **4.** Industrial pump data

In [None]:
# Load the pump sensor CSV file and index it by timestamp.
FILE_PATH = os.path.join(DATA, 'pump', 'sensor.csv')

# Numeric encoding of the status column: NORMAL becomes NaN (nothing gets
# shaded on the plots), BROKEN and RECOVERING are both flagged with 1.
STATUS_FLAGS = {'NORMAL': np.nan, 'BROKEN': 1, 'RECOVERING': 1}

pump_df = pd.read_csv(FILE_PATH, sep=',')
pump_df = pump_df.drop(columns=['Unnamed: 0'])
pump_df['timestamp'] = pd.to_datetime(pump_df['timestamp'], format='%Y-%m-%d %H:%M:%S')
pump_df = pump_df.set_index('timestamp')

# One explicit assignment instead of three chained `inplace=True` replaces on
# a column selection: the chained form warns on pandas 2.x and no longer
# updates the frame once Copy-on-Write is enabled. The explicit float dtype
# keeps the column usable for plotting and for the `== 1.0` test further down.
# NOTE: `map` turns any status missing from STATUS_FLAGS into NaN.
pump_df['machine_status'] = pump_df['machine_status'].map(STATUS_FLAGS).astype(float)

print('Shape:', pump_df.shape)
pump_df.head()

In [None]:
# Show the loaded frame as the cell output.
pump_df

In [None]:
# 5-day averages of the first ten columns.
file_structure_df = (
    pump_df
    .iloc[:, :10]
    .resample('5D')
    .mean()
)

In [None]:
# Slightly thicker lines and hatches for this overview figure.
plt.rcParams.update({'hatch.linewidth': 0.5, 'lines.linewidth': 0.5})

# Sensor signal on the primary axis.
fig, ax1 = plt.subplots(figsize=(5, 1))
plot1 = ax1.plot(pump_df['sensor_00'], label='Healthy pump')

# Status flag on a twin axis: the hatched band only appears where
# machine_status is not NaN.
ax2 = ax1.twinx()
plot2 = ax2.fill_between(
    pump_df.index,
    0.0,
    pump_df['machine_status'],
    label='Broken pump',
    color=colors[1],
    edgecolor='#000000',
    linewidth=0.0,
    alpha=0.5,
    hatch="//////",
)
ax2.grid(False)
ax2.set_yticks([])

# Single legend gathering the artists of both axes, placed under the plot.
labels = [artist.get_label() for artist in (plot1[0], plot2)]
plt.legend(
    handles=[plot1[0], plot2],
    labels=labels,
    loc='lower center',
    ncol=2,
    bbox_to_anchor=(0.5, -.4),
)
plt.title('Industrial pump sensor data')

plt.show()

In [None]:
# Rebuild the index as a regular 5-minute grid: same first timestamp and same
# number of rows. `end_date` is computed for reference only; it does not
# constrain the new index.
start_date = pump_df.index.min()
end_date = pump_df.index.max()
num_periods = len(pump_df)

new_index = pd.date_range(
    start=start_date,
    periods=num_periods,
    freq='5min',
    name='Timestamp',
)
pump_df.index = new_index

In [None]:
# Same overview figure as before, drawn again on the re-indexed data.
# Swap 'sensor_00' for another column (e.g. 'sensor_34') to inspect a
# different signal.
plt.rcParams.update({'hatch.linewidth': 0.5, 'lines.linewidth': 0.5})

fig, ax1 = plt.subplots(figsize=(5, 1))
plot1 = ax1.plot(pump_df['sensor_00'], label='sensor_00')

# Status flag on a twin axis: the hatched band only appears where
# machine_status is not NaN.
ax2 = ax1.twinx()
plot2 = ax2.fill_between(
    pump_df.index,
    0.0,
    pump_df['machine_status'],
    label='Broken pump',
    color=colors[1],
    edgecolor='#000000',
    linewidth=0.0,
    alpha=0.5,
    hatch="//////",
)
ax2.grid(False)
ax2.set_yticks([])

# Single legend gathering the artists of both axes, placed under the plot.
labels = [artist.get_label() for artist in (plot1[0], plot2)]
plt.legend(
    handles=[plot1[0], plot2],
    labels=labels,
    loc='lower center',
    ncol=2,
    bbox_to_anchor=(0.5, -.4),
)
plt.title('Industrial pump sensor data')

# To zoom on a given window, call plt.xlim(start, end) before plt.show(),
# e.g. with start = pd.to_datetime('2018-06-24 14:25') and
# end = pd.to_datetime('2018-07-06 09:40').
plt.show()

In [None]:
# One small line plot per column, to eyeball every signal.
plt.rcParams.update({
    'axes.titlesize': 4,
    'axes.labelsize': 4,
    'xtick.labelsize': 3,
    'ytick.labelsize': 3,
})

for f in pump_df.columns:
    fig, ax1 = plt.subplots(figsize=(2.5, 0.5))
    ax1.plot(pump_df[f])
    ax1.set_title(f)

    # Render and release each figure inside the loop: the figures still come
    # out in column order, but they no longer pile up (one open figure per
    # column) until the end of the cell.
    plt.show()
    plt.close(fig)

In [None]:
# Remove two sensors from the dataset (presumably judged unusable from the
# per-column plots above -- TODO confirm the rationale).
# errors='ignore' keeps the cell re-runnable: once the columns are gone, a
# plain drop would raise a KeyError on the second execution.
pump_df = pump_df.drop(columns=['sensor_50', 'sensor_15'], errors='ignore')

In [None]:
# Show the frame again as the cell output, now without the dropped columns.
pump_df

## Preparing the dataset for Lookout for Equipment
---
### Preparing time series data

In [None]:
# Output folders: one for the time series files, one for the label file.
TRAIN_DATA = os.path.join('..', 'Data', 'pump', 'train-data')
LABEL_DATA = os.path.join('..', 'Data', 'pump', 'label-data')

for directory in (TRAIN_DATA, LABEL_DATA):
    os.makedirs(directory, exist_ok=True)

# The index name ends up as the header of the first column of every CSV file
# exported from this frame.
pump_df.index = pump_df.index.rename('Timestamp')

In [None]:
# Every column except the status label is exported as a tag. The label is
# excluded by name rather than by position, so the export does not depend on
# 'machine_status' being the last column of the frame.
features = [column for column in pump_df.columns if column != 'machine_status']

# One folder per tag, each holding a single tag_data.csv file with the
# index and that tag's values.
for tag in tqdm(features):
    tag_dir = os.path.join(TRAIN_DATA, tag)
    os.makedirs(tag_dir, exist_ok=True)
    pump_df[[tag]].to_csv(os.path.join(tag_dir, 'tag_data.csv'))

### Preparing label data

In [None]:
# Keep only the status column (as a one-column frame) and list the distinct
# values it contains.
expanded_labels = pump_df.loc[:, ['machine_status']]
pd.unique(expanded_labels['machine_status'])

In [None]:
def extract_label_ranges(status, padding_hours=12):
    """Turn a per-timestamp status flag into padded (start, end) ranges.

    A sample counts as broken when its value equals 1.0. A range starts on
    the last healthy sample before a broken run (or on the very first sample
    when the series begins broken) and ends on the last broken sample of that
    run (including a run still in progress on the final sample).

    Args:
        status: Series of flags whose index holds the timestamps.
        padding_hours: Hours subtracted from each start and added to each end.

    Returns:
        DataFrame with one row per range and the columns 'start' and 'end'.
    """
    broken = (status == 1.0).to_numpy(dtype=bool)
    if broken.size == 0:
        return pd.DataFrame(columns=['start', 'end'])

    # State of the following sample; the step after the last sample is treated
    # as healthy so that a trailing failure still gets an end marker.
    next_broken = np.append(broken[1:], False)
    is_start = ~broken & next_broken
    is_end = broken & ~next_broken

    # A series that begins broken has no healthy sample before it: open the
    # range on the first sample. A start already detected there is kept.
    if broken[0]:
        is_start[0] = True

    # Starts and ends alternate, so pairing them in order yields the ranges.
    starts = status.index[is_start]
    ends = status.index[is_end]
    records = [
        {
            'start': start + relativedelta(hours=-padding_hours),
            'end': end + relativedelta(hours=+padding_hours),
        }
        for start, end in zip(starts, ends)
    ]

    # Built in one go from a list of records: DataFrame.append no longer
    # exists in pandas 2.x.
    return pd.DataFrame(records, columns=['start', 'end'])


labels_df = extract_label_ranges(expanded_labels['machine_status'])
labels_df

In [None]:
# Write the label ranges as a header-less, index-less CSV file with two
# timestamp columns (start, end).
labels_fname = os.path.join(LABEL_DATA, 'labels.csv')
TIMESTAMP_FORMAT = '%Y-%m-%dT%H:%M:%S.%f'

# Format a copy for export so that labels_df itself keeps real timestamps
# instead of being turned into strings.
labels_export = pd.DataFrame({
    column: pd.to_datetime(labels_df[column]).dt.strftime(TIMESTAMP_FORMAT)
    for column in ('start', 'end')
})
labels_export.to_csv(labels_fname, header=False, index=False)

## Creating schema
---

In [None]:
# Install the markdown package (presumably a dependency of the
# lookoutequipment helper imported in the next cell -- TODO confirm).
!pip install --quiet markdown

In [None]:
# Helper functions for managing Lookout for Equipment API calls.
# The lookoutequipment package is imported from a source checkout located two
# folders up, so that folder must be on sys.path before the import runs.
sys.path.append('../../amazon-lookout-for-equipment-python-sdk/src')
import lookoutequipment as lookout
import sagemaker

In [None]:
# Parameters of the dataset to create.
# BUCKET and PREFIX look like an S3 bucket name and key prefix (presumably
# where the train-data folder is expected to be uploaded -- TODO confirm);
# neither is referenced by the cells that follow in this notebook.
DATASET_NAME = 'pump'
BUCKET       = 'pump-anomaly-detection'
PREFIX       = 'train-data/'
# Role returned by the SageMaker SDK for the current execution environment.
ROLE_ARN     = sagemaker.get_execution_role()

In [None]:
# Instantiate the SDK dataset helper, pointing it at the local folder that
# holds one sub-folder per tag (TRAIN_DATA) and at the execution role.
# NOTE(review): presumably the helper derives the dataset schema from that
# folder layout -- confirm against the SDK.
lookout_dataset = lookout.LookoutEquipmentDataset(
    dataset_name=DATASET_NAME,
    component_root_dir=TRAIN_DATA,
    access_role_arn=ROLE_ARN
)

In [None]:
# Show the schema exposed by the dataset helper as the cell output.
lookout_dataset.dataset_schema