In [482]:
# Clear all user-defined names from the kernel namespace (-f: do not prompt for
# confirmation) so that 'Run All' starts clean and no stale variables from
# earlier runs are left hanging around.
%reset -f

In [483]:
import pandas as pd
import numpy as np
from pivottablejs import pivot_ui
import random
import json
from collections import defaultdict
from functools import partial

In [484]:
# Load the hourly welder measurements and index the frame by local timestamp.
df = (
    pd.read_csv("data/welder_time_local.csv")
    .set_index('time_local')
)

# Spot-check ten random rows.
df.sample(n=10)

Unnamed: 0_level_0,welder_is_on,day_name,hour,day_index,day
time_local,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-09-11 02:00:00+03:00,0,Tuesday,2,1,1_Tuesday
2018-09-16 21:00:00+03:00,0,Sunday,21,6,6_Sunday
2018-09-18 18:00:00+03:00,0,Tuesday,18,1,1_Tuesday
2018-09-11 12:00:00+03:00,0,Tuesday,12,1,1_Tuesday
2018-09-08 23:00:00+03:00,0,Saturday,23,5,5_Saturday
2018-09-19 17:00:00+03:00,0,Wednesday,17,2,2_Wednesday
2018-09-11 01:00:00+03:00,0,Tuesday,1,1,1_Tuesday
2018-09-14 09:00:00+03:00,5,Friday,9,4,4_Friday
2018-09-11 06:00:00+03:00,0,Tuesday,6,1,1_Tuesday
2018-09-10 20:00:00+03:00,0,Monday,20,0,0_Monday


In [485]:
# To come up with the config options, use this example to output the config:
# https://pivottable.js.org/examples/onrefresh.html
#
# Average of `welder_is_on` with weekdays as rows and hours of the day as columns.
#
# pivot_ui() writes its HTML page to `outfile_path` and returns an IFrame that
# points at that file. The default path is the same for every call, so a pivot
# left on the default gets overwritten by any later pivot_ui() call and its
# IFrame then shows the wrong table. Give this pivot its own output file.
pivot_ui(df,
         outfile_path="pivot_welder_average_by_day_hour.html",
         rows=['day'],
         cols=['hour'],
         rendererName="Table Barchart",
         aggregatorName="Average",
         vals=["welder_is_on"])

In [486]:
# Same day-by-hour layout as the average pivot, but using the "Sum over Sum"
# aggregator on the two listed values.
# NOTE(review): presumably this shows sum(welder_is_on) / sum(hour) per cell;
# confirm against the pivottable.js aggregator documentation.
#
# pivot_ui() writes its HTML page to `outfile_path` and returns an IFrame that
# points at that file. The default path is the same for every call, so a pivot
# left on the default gets overwritten by any later pivot_ui() call and its
# IFrame then shows the wrong table. Give this pivot its own output file.
pivot_ui(df,
         outfile_path="pivot_welder_sum_over_sum_by_day_hour.html",
         rows=['day'],
         cols=['hour'],
         rendererName="Table Barchart",
         aggregatorName="Sum over Sum",
         vals=["welder_is_on", "hour"])

## Example functions for next section:

In [487]:
# Toy example: pick one value at random from a list of "measured" samples.
measured = list(range(1, 11))  # the integers 1 through 10
s1 = random.choice(measured)
s1

7

In [488]:
# Toy example: three random rows restricted to a single weekday (Monday).
df.loc[df['day_name'] == 'Monday'].sample(n=3)

Unnamed: 0_level_0,welder_is_on,day_name,hour,day_index,day
time_local,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-09-10 09:00:00+03:00,0,Monday,9,0,0_Monday
2018-09-17 02:00:00+03:00,0,Monday,2,0,0_Monday
2018-09-17 04:00:00+03:00,0,Monday,4,0,0_Monday


## Create dataset for the load profile generator

Collect every measured sample for each hour of each weekday. For example:

```
measured_usage = {
  "fri_09": [0, 1, 0, 3, 0, 1, 1],     # Friday @ 9am
  "sat_10": [0, 0, 1, 30, 10, 0, 2],   # Saturday @ 10am
  ...
}

```

In [489]:
# Weekday indices 0-6 (the data's day_index column runs from 0 = Monday to 6 = Sunday).
day_range = range(7)
# Hours of the day are 0-23. range(25) would also yield 24, which is not a valid
# hour and would give 7 x 25 slots instead of the expected 7 x 24 = 168.
hour_range = range(24)

In [490]:
# Quick reminder of the columns available in the measured data (five random rows).
df.sample(n=5)

Unnamed: 0_level_0,welder_is_on,day_name,hour,day_index,day
time_local,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-09-16 07:00:00+03:00,0,Sunday,7,6,6_Sunday
2018-09-10 00:00:00+03:00,0,Monday,0,0,0_Monday
2018-09-13 15:00:00+03:00,0,Thursday,15,3,3_Thursday
2018-09-17 06:00:00+03:00,0,Monday,6,0,0_Monday
2018-09-18 01:00:00+03:00,0,Tuesday,1,1,1_Tuesday


In [491]:
def shorten_day_name(day_string):
    """Return the first three characters of a day name, lower-cased.

    Example: "Saturday" -> "sat". Inputs shorter than three characters are
    returned lower-cased in full.
    """
    prefix = day_string[:3]
    return prefix.lower()

def composite_key(day_name, hour):
    """Build the string key that identifies one weekday/hour slot.

    Python allows tuples as dict keys, for example ("Saturday", 10): [0, 0, 2, 4, 0],
    but this data structure is exported to JSON, which only allows string keys.
    So the key is a string made of the shortened day name and the hour,
    zero-padded to two characters: composite_key("Saturday", 10) -> "sat_10",
    composite_key("Friday", 9) -> "fri_09".
    """
    hour_part = str(hour).zfill(2)
    day_part = shorten_day_name(day_name)
    return "{}_{}".format(day_part, hour_part)

In [498]:
def create_usage_profile_data(df):
    """Group the measured `welder_is_on` values by weekday/hour slot.

    Reads the `day_name`, `hour` and `welder_is_on` columns of `df` and collects,
    for every row, the `welder_is_on` value under the key produced by
    composite_key(day_name, hour), e.g. "sat_10". Values keep the row order of `df`.

    Args:
        df: DataFrame with `day_name`, `hour` and `welder_is_on` columns.

    Returns:
        A defaultdict(list) mapping each slot key to the list of values measured
        in that slot. Because it is a defaultdict, looking up a key that was
        never measured returns (and stores) an empty list rather than raising.
    """
    # Do not name this `dict`: that would shadow the builtin inside the function.
    usage_by_slot = defaultdict(list)

    # Walk the three needed columns in lockstep instead of using iterrows(),
    # which builds a full Series object for every row. tolist() yields plain
    # Python scalars, so the result stays JSON-serializable.
    slots = zip(
        df['day_name'].tolist(),
        df['hour'].tolist(),
        df['welder_is_on'].tolist(),
    )
    for day_name, hour, welder_is_on in slots:
        usage_by_slot[composite_key(day_name, hour)].append(welder_is_on)
    return usage_by_slot

# Build the per-weekday/hour lookup of measured samples from the loaded welder data.
measured_usage = create_usage_profile_data(df)
# Uncomment the next line to inspect the full lookup:
# measured_usage

In [493]:
# Make sure there is a key for every hour of every day of the week
# represented: (7x24 = 168).
# The assert makes 'Run All' stop here if a slot is missing; a bare comparison
# would only display False and let the following cells run on incomplete data.
expected_key_count = 7 * 24
assert len(measured_usage) == expected_key_count, (
    "expected {} weekday/hour keys, found {}".format(
        expected_key_count, len(measured_usage)))
len(measured_usage) == expected_key_count

True

In [494]:
# Export the measured samples as JSON so the web app can import them.
# This is all the app needs to generate a 52-week usage profile by
# sampling (more on that below).
with open('data/welder_usage_generator_data.json', mode='w') as json_file:
    json.dump(measured_usage, json_file)

## Create Usage Profile Generator
Day 1 of the year (Sat @ 10am): sample(measured_usage.sat_10)

Day 7 of the year (Fri @ 9am): sample(measured_usage.fri_09)

Check: see whether the sum, average and standard deviation roughly match the measured values

In [495]:
def create_year_range_df(year=2018):
    """Build an hourly frame covering 8760 hours starting at Jan 1 of `year`.

    Args:
        year: Calendar year the range starts in (default 2018).

    Returns:
        DataFrame indexed by hourly timestamps beginning at `year`-01-01 00:00,
        with columns:
          * hour_of_year: 0-based position of the hour in the range (0..8759)
          * day_name:     weekday name of the timestamp (e.g. "Monday")
          * hour_of_day:  hour of the timestamp (0..23)

    Note:
        The range is always 8760 hours (365 days) long. For a leap year it
        therefore ends on Dec 30 23:00 rather than Dec 31 23:00.
    """
    hours_per_year = 8760  # 365 days * 24 hours

    # Start exactly at Jan 1 of the requested year. Deriving the start as
    # "Jan 1 of next year minus 365 days" lands on Jan 2 for leap years, which
    # would mislabel hour_of_year 0.
    start_date = pd.Timestamp(year=year, month=1, day=1)

    # A Timedelta frequency avoids the 'H' string alias, which is deprecated in
    # recent pandas versions.
    date_range = pd.date_range(
        start_date, periods=hours_per_year, freq=pd.Timedelta(hours=1))
    df_year = pd.DataFrame(
        {"hour_of_year": list(range(hours_per_year))}, index=date_range)

    # Now add day of week and hour of day columns
    df_year['day_name'] = df_year.index.day_name()
    df_year['hour_of_day'] = df_year.index.hour
    return df_year

# Build the hourly frame for the default year (2018) and spot-check five random rows.
df_year = create_year_range_df()
df_year.sample(5)

Unnamed: 0,hour_of_year,day_name,hour_of_day
2018-05-16 01:00:00,3241,Wednesday,1
2018-12-14 13:00:00,8341,Friday,13
2018-05-19 13:00:00,3325,Saturday,13
2018-06-28 09:00:00,4281,Thursday,9
2018-03-31 18:00:00,2154,Saturday,18


In [504]:
# Scratch check: random.choice returns one element picked at random from the list.
random.choice([1, 2, 3])

3

In [508]:
# TODO: (Problem)
# random.choice on a list of length 1 always returns that single value.
# So if a weekday/hour slot has only one measured value, that value is picked
# every time, which likely over-represents usage.
# Options:
# 1. Always pad a list with a zero if there is only 1 value (this can be done easily in sample_usage())
# 2. Sample across multiple hours or days if there is only 1 value
# 3. Sample across multiple hours or days if the measured samples are below ~5-10
# 4. Always sample across multiple hours or days
def sample_usage(measured_usage, row):
    """Pick one measured value at random for the weekday/hour slot of `row`.

    Args:
        measured_usage: mapping from slot key (see composite_key) to the list
            of values measured in that slot.
        row: row-like object providing 'day_name' and 'hour_of_day'.

    Returns:
        One element of the slot's list, chosen with random.choice.
    """
    slot_key = composite_key(row['day_name'], row['hour_of_day'])
    candidates = measured_usage[slot_key]
    return random.choice(candidates)
    
def generate_usage_profile(measured_usage, year=2018, seed=None):
    """Generate an hourly usage profile by sampling the measured values.

    Builds the hourly frame for `year` with create_year_range_df() and adds a
    'welder' column whose value for each hour comes from sample_usage(), i.e.
    a random draw from the values measured in the same weekday/hour slot.

    Args:
        measured_usage: mapping from slot key (e.g. "sat_10") to the list of
            values measured in that slot.
        year: year passed to create_year_range_df() (default 2018).
        seed: optional seed for the module-level `random` generator. When
            given, the generator is re-seeded before sampling so the profile
            is reproducible. When None (the default), the generator is left
            untouched, as before.

    Returns:
        The hourly DataFrame with the additional 'welder' column.
    """
    if seed is not None:
        # Re-seeds the global `random` state that sample_usage draws from.
        random.seed(seed)

    # Named df_profile so it cannot be confused with the notebook-level `df`.
    df_profile = create_year_range_df(year)
    df_profile['welder'] = df_profile.apply(
        partial(sample_usage, measured_usage), axis=1)
    return df_profile

# Generate a profile for the default year (2018) from the measured samples
# and preview its first 20 hourly rows.
df_usage_profile = generate_usage_profile(measured_usage)
df_usage_profile.head(20)

Unnamed: 0,hour_of_year,day_name,hour_of_day,welder
2018-01-01 00:00:00,0,Monday,0,0
2018-01-01 01:00:00,1,Monday,1,0
2018-01-01 02:00:00,2,Monday,2,0
2018-01-01 03:00:00,3,Monday,3,1
2018-01-01 04:00:00,4,Monday,4,0
2018-01-01 05:00:00,5,Monday,5,0
2018-01-01 06:00:00,6,Monday,6,0
2018-01-01 07:00:00,7,Monday,7,0
2018-01-01 08:00:00,8,Monday,8,0
2018-01-01 09:00:00,9,Monday,9,8


In [506]:
# Count how often each sampled `welder` value (columns) occurs per weekday and
# hour of day (rows) in the generated profile.
#
# pivot_ui() writes its HTML page to `outfile_path` and returns an IFrame that
# points at that file. The default path is the same for every call, so a pivot
# left on the default shares its page with every other default pivot_ui() call
# and the IFrames end up showing whichever was written last. Give this pivot
# its own output file.
pivot_ui(df_usage_profile,
         outfile_path="pivot_usage_profile_counts.html",
         rows=['day_name', 'hour_of_day'],
         cols=['welder'],
         rendererName="Table",
         aggregatorName="Count")