In [1]:
import json
import pandas as pd
import numpy as np
import torch

In [2]:
# Sliding-window hyperparameters.
# NOTE(review): neither name is referenced by the cells visible in this notebook
# (the visible unroll() call passes literal sizes) — confirm whether they are
# still needed or should be passed to unroll().
window_length: int = 30  # presumably samples per window — TODO confirm
step_size: int = 1  # presumably stride between consecutive windows — TODO confirm

In [5]:
# Load the NYC-taxi CSV and convert its 'timestamp' column to datetimes so it
# can be compared against the anomaly-window boundaries later on.
# Note: despite its name, data_dir holds the path of the CSV file itself.
data_dir = "./NAB/data/realKnownCause/nyc_taxi.csv"
data = pd.read_csv(data_dir).assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'])
)

In [6]:
# Read the anomaly-window labels from JSON.
# Note: despite its name, label_dir holds the path of the JSON file itself.
label_dir = './NAB/labels/combined_windows.json'

with open(label_dir) as label_file:
    j_label = json.loads(label_file.read())

In [7]:
def get_label(df_x, j_label, label_key='realKnownCause/nyc_taxi.csv'):
    """Return the series values and a per-row anomaly label tensor.

    Parameters
    ----------
    df_x : pandas.DataFrame
        Frame with a 'timestamp' column (comparable with the result of
        ``pd.to_datetime``) and a numeric 'value' column.
    j_label : dict
        Maps a dataset key to a list of anomaly spans; the first two items
        of each span are its start and end timestamps.
    label_key : str, optional
        Key of the dataset inside ``j_label``. Defaults to the NYC-taxi
        series, which was previously hard-coded.

    Returns
    -------
    xx : torch.Tensor
        Tensor built from ``df_x['value']`` (dtype follows the column).
    y : torch.Tensor
        float32 tensor of length ``len(df_x)``; 1.0 where the row's timestamp
        lies inside any span (both ends inclusive), 0.0 elsewhere.

    Raises
    ------
    KeyError
        If ``label_key`` is missing from ``j_label`` or a required column is
        missing from ``df_x``.
    """
    ano_spans = j_label[label_key]
    timestamps = df_x['timestamp']

    # Build the mask by row position, not by index label, so frames with a
    # non-default index (filtered, sliced, re-indexed) are labelled correctly.
    is_anomaly = np.zeros(len(df_x), dtype=bool)
    for ano_span in ano_spans:
        ano_start = pd.to_datetime(ano_span[0])
        ano_end = pd.to_datetime(ano_span[1])
        in_span = (timestamps >= ano_start) & (timestamps <= ano_end)
        is_anomaly |= in_span.to_numpy()

    y = torch.from_numpy(is_anomaly.astype(np.float32))
    xx = torch.from_numpy(df_x['value'].values)

    return xx, y

In [8]:
# Build the full-series value tensor and the per-row anomaly labels.
xx, label = get_label(data, j_label)

In [9]:
# Sanity check: values and labels should have the same length.
xx.shape, label.shape

(torch.Size([10320]), torch.Size([10320]))

In [10]:
def split_dataset(x, y, train_ratio, val_ratio):
    """Split ``x`` and ``y`` in order into train / validation / test parts.

    The split is positional (no shuffling): the first ``train_ratio`` share
    of the samples is the training set, the next ``val_ratio`` share is the
    validation set, and everything after that is the test set.

    Parameters
    ----------
    x, y : sliceable sequences of equal length (e.g. tensors, arrays, lists)
    train_ratio : float
        Fraction of ``len(x)`` used for training.
    val_ratio : float
        Fraction of ``len(x)`` used for validation.

    Returns
    -------
    train, val, test : tuple
        Each is an ``(x_part, y_part)`` pair.
    """
    n_train = int(train_ratio * len(x))
    val_end = n_train + int(val_ratio * len(x))

    train = x[:n_train], y[:n_train]
    # Validation starts where training ends; slicing from 0 here would make
    # the validation set contain the whole training set.
    val = x[n_train:val_end], y[n_train:val_end]
    test = x[val_end:], y[val_end:]

    return train, val, test

In [11]:
# Positional split with ratios 0.6 (train) and 0.1 (validation); the remainder is the test set.
train, val, test = split_dataset(xx, label, 0.6,0.1)

In [12]:
def get_minmax(train_data):
    """Return ``(minimum, maximum)`` of ``train_data`` as Python scalars.

    ``train_data`` must provide ``.min()`` / ``.max()`` whose results have
    ``.item()`` (e.g. a torch tensor).
    """
    lowest = train_data.min().item()
    highest = train_data.max().item()
    return lowest, highest

def standardize(data, min, max):
    """Min-max scale ``data``: map ``min`` to 0 and ``max`` to 1.

    Despite the name, this is min-max scaling rather than z-score
    standardization. Values outside ``[min, max]`` fall outside ``[0, 1]``.
    The parameter names shadow the built-ins only inside this function.
    """
    value_range = max - min
    shifted = data - min
    return shifted / value_range

In [13]:
# Scaling bounds are taken from the training values only.
# NOTE(review): this rebinds the built-in names min/max for the rest of the
# kernel session; later cells read these names, so rename them together.
min, max = get_minmax(train[0])
# Preview the first ten scaled training values.
standardize(train[0], min, max)[0:10]

tensor([0.2492, 0.1773, 0.1265, 0.0854, 0.0633, 0.0382, 0.0248, 0.0168, 0.0209,
        0.0193])

In [14]:
def unroll(xx, yy, window_size, step_size):
    """Cut a 1-D series into sliding windows with one label per window.

    Parameters
    ----------
    xx : 1-D tensor of values.
    yy : 1-D tensor of per-sample labels aligned with ``xx``.
    window_size : int
        Number of consecutive samples per window; must be positive.
    step_size : int
        Offset between the starts of consecutive windows; must be positive.

    Returns
    -------
    window_data : torch.Tensor
        Shape ``(n_windows, window_size)`` in torch's default float dtype.
    window_label : torch.Tensor
        Shape ``(n_windows,)``; 1 when the labels inside the window sum to a
        positive value, else 0.

    If ``xx`` is shorter than ``window_size`` both results are empty
    (``n_windows == 0``).

    Raises
    ------
    ValueError
        If ``window_size`` or ``step_size`` is not positive.
    """
    if window_size <= 0 or step_size <= 0:
        raise ValueError("window_size and step_size must be positive")

    # Last valid start is len(xx) - window_size; an empty range (series too
    # short) yields zero windows instead of a negative tensor dimension.
    starts = range(0, len(xx) - window_size + 1, step_size)

    window_data = torch.zeros((len(starts), window_size))
    window_label = torch.zeros((len(starts), ))

    for seq_idx, start in enumerate(starts):
        stop = start + window_size
        window_data[seq_idx] = xx[start:stop]
        window_label[seq_idx] = 1 if yy[start:stop].sum().item() > 0 else 0

    return window_data, window_label

In [16]:
# Window the scaled training series: window size 40, step 5.
# NOTE(review): `input` rebinds the built-in input() for the rest of the kernel
# session, and 40 / 5 are literals rather than the window_length / step_size
# constants defined near the top of the notebook — confirm which is intended.
input, label2=unroll(standardize(train[0], min, max), train[1], 40, 5)

In [18]:
# Sanity check: one label per window.
input.shape, label2.shape

(torch.Size([1231, 40]), torch.Size([1231]))