In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import os   
import h5py
from sklearn.preprocessing import MinMaxScaler
from utils import (remove_days, standardize_format, 
                   remove_non_positive_values, split_train_val_test_datasets, 
                   get_weather_data, calculate_stats_and_plot_hist, remove_outliers)

from timezonefinder import TimezoneFinder
import time

import warnings
warnings.filterwarnings('ignore')



In [2]:

# Set working directory
os.chdir(r"..") # should be the git repo root directory
print("Current working directory: " + os.getcwd())
assert os.getcwd()[-8:] == "WattCast", "Current working directory is not the git repo root directory"

raw_data_path = os.path.join(os.getcwd(),'data','raw_data')
if not os.path.exists(raw_data_path):
    os.makedirs(raw_data_path)
    print("Created directory: " + raw_data_path)

save_path = os.path.join(os.getcwd(),'data','clean_data')
if not os.path.exists(save_path):
    os.makedirs(save_path)
    print("Created directory: " + save_path)

Current working directory: /Users/nikolaushouben/Desktop/WattCast


# <u>Raw data import, visualization & cleaning<u/>



Structure of Blocks:

1. Each block starts with the Name of the dataset and its download link, alternatively you can follow this [link](https://www.dropbox.com/sh/fvx3wune2qg2x43/AADP4F3UwIqrS9tYnN6mTob5a?dl=0) to download the raw and cleaned data directory from our gdrive.
2. The overview states for which aggregation level the data was used. For example, while (1) BA dataset is only used on the county level, the (2) Substation dataset from Germany is used for both the neighborhood level in an aggregated form and on the household level in its disaggregated form.
3. Power and Weather data are then imported and cleaned, wherever necessary.
4. Each block also provides an interactive visualization of the timeseries, often resampled to a lower temporal resolution to make it run faster.
5. The block ends with spliting* the data into one train (1 year) and two test datasets (one in summer and winter); once more 60 minute and, if available, in 15 minute resolution

a) <u>Note</u> on the train test splits: Not all data was recorded in the same temporal resolution and during the same time. However, training and testing set lengths were kept consistent to ensure a fair comparison of algorithms across scales. 

b) <u>Note</u> on the timezones: Data comes in various timezones and are always processed to match the local timezone. This helps with interepretability of results due to known patters of human behaviour.

## &#x2460; 'Portland Feeder'


Used for: 
* 2_town
----------
* Duration: 1 year
* Resolution: 60 minutes
* Power Level: 1-50 MW

### Data Prep

Load

In [3]:
# set this path to the directory where the data is stored

dir_path = os.path.join(raw_data_path,'Portland_feeder_ts.csv')

dir_path

'/Users/nikolaushouben/Desktop/WattCast/data/raw_data/Portland_feeder_ts.csv'

In [4]:

def train_test_split_month(df, train_months=None, val_months = None, test_months = None):

    '''
    Splits a dataframe into train, validation and test sets based on the month of the year.
    The months are first ranked based on the maximum value in that month.
    Train should contain the months with the highest and lowest maximum values.
    Validation should contain the month with the second highest maximum value.
    Test should contain the month with the second lowest maximum value.
    
    '''

    if train_months is None:
        months_ranked = df.groupby(df.index.month).max().sort_values(by = df.columns[0], ascending = False).index
        train_months = [months_ranked[0]] + [months_ranked[-1]] + months_ranked[2:-2].tolist()
        val_months = [months_ranked[-2]]
        test_months = [months_ranked[1]]

    df_train = df[df.index.month.isin(train_months)]
    df_val = df[df.index.month.isin(val_months)]
    df_test = df[df.index.month.isin(test_months)]

    return df_train, df_val, df_test, train_months, val_months, test_months



def train_val_test_split_visual(df, train_months = None, val_months = None, test_months = None):

    '''
    Splits a dataframe into train, validation and test sets based on the month of the year.
    Determined visually
    
    '''

    if train_months is None:
        train_months = [1,3,4,5,7,8,9,10,11,12]
        val_months = [2]
        test_months = [6]

    df_train = df[df.index.month.isin(train_months)]
    df_val = df[df.index.month.isin(val_months)]
    df_test = df[df.index.month.isin(test_months)]

    return df_train, df_val, df_test, train_months, val_months, test_months



In [15]:
df_ami = pd.read_csv(dir_path, parse_dates=True, index_col=0)

df_ami.columns = [col.replace(" ", ".").replace("(", "").replace(")","") for col in df_ami.columns]

scales = list(set([col.split("-")[0] for col in df_ami.columns.tolist()[:-1]]))


In [16]:
df_ami.columns

Index(['GLENDOVEER-13596.MWh', 'GLENDOVEER-13597.MWh', 'GLENDOVEER-13598.MWh',
       'GLENDOVEER-13599.MWh', 'GLENDOVEER-CLIFFGATE.MWh',
       'GLENDOVEER-NORTHEAST.MWh', 'KELLY.BUTTE-BINNSMEAD.MWh',
       'KELLY.BUTTE-FAIRLAWN.MWh', 'KELLY.BUTTE-MALL.205.MWh',
       'KELLY.BUTTE-MCGREW.MWh', 'LENTS-13101.MWh', 'LENTS-HAPPY.VALLEY.MWh',
       'LENTS-MT.SCOTT.MWh', 'LENTS-NORTH.MWh', 'MIDWAY-DIVISION.MWh',
       'MIDWAY-DOUGLAS.MWh', 'MIDWAY-LYNCH.MWh', 'MIDWAY-POWELLHURST.MWh',
       'RAMAPO-EMERALD.MWh', 'RAMAPO-GILBERT.MWh', 'RAMAPO-RAMAPO.13.MWh',
       'Temperature.°C'],
      dtype='object')

In [17]:

location_acronnyms = {col.replace("(", "").replace(")",""):col.split("-")[1].replace("(", "").replace(")","") for col in df_ami.columns[:-1]}
coords = (45.514904014844944, -122.65894786115099)

unit = "MW" # unit of the demand data
temp_resolutions = [60] # minutes, this dataset is only available in hourly resolution

In [18]:
scales

['MIDWAY', 'LENTS', 'KELLY.BUTTE', 'GLENDOVEER', 'RAMAPO']

In [19]:
location_acronnyms

{'GLENDOVEER-13596.MWh': '13596.MWh',
 'GLENDOVEER-13597.MWh': '13597.MWh',
 'GLENDOVEER-13598.MWh': '13598.MWh',
 'GLENDOVEER-13599.MWh': '13599.MWh',
 'GLENDOVEER-CLIFFGATE.MWh': 'CLIFFGATE.MWh',
 'GLENDOVEER-NORTHEAST.MWh': 'NORTHEAST.MWh',
 'KELLY.BUTTE-BINNSMEAD.MWh': 'BINNSMEAD.MWh',
 'KELLY.BUTTE-FAIRLAWN.MWh': 'FAIRLAWN.MWh',
 'KELLY.BUTTE-MALL.205.MWh': 'MALL.205.MWh',
 'KELLY.BUTTE-MCGREW.MWh': 'MCGREW.MWh',
 'LENTS-13101.MWh': '13101.MWh',
 'LENTS-HAPPY.VALLEY.MWh': 'HAPPY.VALLEY.MWh',
 'LENTS-MT.SCOTT.MWh': 'MT.SCOTT.MWh',
 'LENTS-NORTH.MWh': 'NORTH.MWh',
 'MIDWAY-DIVISION.MWh': 'DIVISION.MWh',
 'MIDWAY-DOUGLAS.MWh': 'DOUGLAS.MWh',
 'MIDWAY-LYNCH.MWh': 'LYNCH.MWh',
 'MIDWAY-POWELLHURST.MWh': 'POWELLHURST.MWh',
 'RAMAPO-EMERALD.MWh': 'EMERALD.MWh',
 'RAMAPO-GILBERT.MWh': 'GILBERT.MWh',
 'RAMAPO-RAMAPO.13.MWh': 'RAMAPO.13.MWh'}

In [22]:
lat, lng = coords
temp_resolution = temp_resolutions[0]
tf = TimezoneFinder()
tz = tf.timezone_at(lng=lng, lat=lat)
for spatial_scale in scales:
    store = pd.HDFStore(os.path.join(save_path, f"{spatial_scale}.h5"), mode='w')
    dfs = {}
    df_scale = df_ami.filter(like=spatial_scale)
    for col in df_scale.columns:
        print(f"Preparing data for {col}...")
        # power
        value = location_acronnyms[col]
        df = df_scale[[col]]
        print(df.columns)
        # df = df.tz_localize('UTC').tz_convert(tz).tz_localize(None), data is already local
        df = standardize_format(df, 'load', temp_resolution, col, unit)
        df = remove_outliers(df, df.columns[0], lower_percentile=1, upper_percentile=99.9)
        df = remove_non_positive_values(df, set_nan=True)
        train, val, test, train_months, val_months, test_months = train_val_test_split_visual(df)
        dfs[value] = df
        store.put(f'{value}/{temp_resolution}min/train_target', train, format='table')
        store.put(f'{value}/{temp_resolution}min/val_target', val, format='table')
        store.put(f'{value}/{temp_resolution}min/test_target', test, format='table')

        # weather
        start_date = df.index[0].strftime("%Y-%m-%d")
        end_date = df.index[-1].strftime("%Y-%m-%d")
        df_weather = get_weather_data(lat, lng, start_date, end_date, variables=['temperature_2m'], keep_UTC=False).tz_localize(None)
        df_weather = standardize_format(df_weather, 'temperature', temp_resolution, value, "C")
        df_weather = df_weather.reindex(df.index).dropna() 
        dfs[value + 'weather'] = df_weather
        train_weather, val_weather, test_weather, _, _, _ = train_val_test_split_visual(df_weather, train_months, val_months, test_months)

        store.put(f'{value}/{temp_resolution}min/train_cov', train_weather, format='table')
        store.put(f'{value}/{temp_resolution}min/val_cov', val_weather, format='table')
        store.put(f'{value}/{temp_resolution}min/test_cov', test_weather, format='table')

    store.close()


Preparing data for MIDWAY-DIVISION.MWh...
Index(['MIDWAY-DIVISION.MWh'], dtype='object')
Fetching timezone from coordinates
Preparing data for MIDWAY-DOUGLAS.MWh...
Index(['MIDWAY-DOUGLAS.MWh'], dtype='object')
Fetching timezone from coordinates
Preparing data for MIDWAY-LYNCH.MWh...
Index(['MIDWAY-LYNCH.MWh'], dtype='object')
Fetching timezone from coordinates
Preparing data for MIDWAY-POWELLHURST.MWh...
Index(['MIDWAY-POWELLHURST.MWh'], dtype='object')
Fetching timezone from coordinates
Preparing data for LENTS-13101.MWh...
Index(['LENTS-13101.MWh'], dtype='object')
Fetching timezone from coordinates
Preparing data for LENTS-HAPPY.VALLEY.MWh...
Index(['LENTS-HAPPY.VALLEY.MWh'], dtype='object')
Fetching timezone from coordinates
Preparing data for LENTS-MT.SCOTT.MWh...
Index(['LENTS-MT.SCOTT.MWh'], dtype='object')
Fetching timezone from coordinates
Preparing data for LENTS-NORTH.MWh...
Index(['LENTS-NORTH.MWh'], dtype='object')
Fetching timezone from coordinates
Preparing data for KEL

In [23]:
from utils import get_hdf_keys

clean_data_path = os.path.join(os.getcwd(), "data", "clean_data")


In [24]:
get_hdf_keys(clean_data_path)

({'GLENDOVEER.h5': ['13596.MWh',
   '13597.MWh',
   '13598.MWh',
   '13599.MWh',
   'CLIFFGATE.MWh',
   'NORTHEAST.MWh'],
  'LENTS.h5': ['13101.MWh', 'HAPPY.VALLEY.MWh', 'MT.SCOTT.MWh', 'NORTH.MWh'],
  'MIDWAY.h5': ['DIVISION.MWh', 'DOUGLAS.MWh', 'LYNCH.MWh', 'POWELLHURST.MWh'],
  'RAMAPO.h5': ['EMERALD.MWh', 'GILBERT.MWh', 'RAMAPO.13.MWh'],
  'KELLY.BUTTE.h5': ['BINNSMEAD.MWh',
   'FAIRLAWN.MWh',
   'MALL.205.MWh',
   'MCGREW.MWh']},
 {'GLENDOVEER.h5': ['60min'],
  'LENTS.h5': ['60min'],
  'MIDWAY.h5': ['60min'],
  'RAMAPO.h5': ['60min'],
  'KELLY.BUTTE.h5': ['60min']})

### Visualization

In [25]:
fig = plot_location_splits(clean_data_path,3,1)

NameError: name 'plot_location_splits' is not defined