In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import os   
import h5py
from sklearn.preprocessing import MinMaxScaler
from utils import (remove_days, standardize_format, 
                   remove_non_positive_values, split_train_val_test_datasets, 
                   get_weather_data, calculate_stats_and_plot_hist, remove_outliers)

from timezonefinder import TimezoneFinder
import time

import warnings
warnings.filterwarnings('ignore')



In [2]:

# Set working directory to the git repo root.
# The step up is guarded so that re-running this cell does not keep climbing
# the directory tree (which would make the assertion below fail on the 2nd run).
if not os.getcwd().endswith("WattCast"):
    os.chdir(r"..") # should be the git repo root directory
print("Current working directory: " + os.getcwd())
assert os.getcwd().endswith("WattCast"), "Current working directory is not the git repo root directory"

# Directory the raw input files are read from; created on first run.
raw_data_path = os.path.join(os.getcwd(),'data','raw_data')
if not os.path.exists(raw_data_path):
    os.makedirs(raw_data_path)
    print("Created directory: " + raw_data_path)

# Directory the cleaned outputs are written to; created on first run.
save_path = os.path.join(os.getcwd(),'data','clean_data')
if not os.path.exists(save_path):
    os.makedirs(save_path)
    print("Created directory: " + save_path)

Current working directory: /Users/nikolaushouben/Desktop/WattCast


# <u>Raw data import, visualization & cleaning</u>



Structure of Blocks:

1. Each block starts with the name of the dataset and its download link; alternatively, you can follow this [link](https://www.dropbox.com/sh/fvx3wune2qg2x43/AADP4F3UwIqrS9tYnN6mTob5a?dl=0) to download the raw and cleaned data directory from our Dropbox.
2. The overview states for which aggregation level the data was used. For example, while (1) BA dataset is only used on the county level, the (2) Substation dataset from Germany is used for both the neighborhood level in an aggregated form and on the household level in its disaggregated form.
3. Power and Weather data are then imported and cleaned, wherever necessary.
4. Each block also provides an interactive visualization of the timeseries, often resampled to a lower temporal resolution to make it run faster.
5. The block ends with splitting* the data into one train (1 year) and two test datasets (one in summer and one in winter), once in 60-minute and, if available, once more in 15-minute resolution.

a) <u>Note</u> on the train test splits: Not all data was recorded in the same temporal resolution and during the same time. However, training and testing set lengths were kept consistent to ensure a fair comparison of algorithms across scales. 

b) <u>Note</u> on the timezones: Data comes in various timezones and is always processed to match the local timezone. This helps with interpretability of results due to known patterns of human behaviour.

## &#x2460; 'Portland Feeder'


Used for: 
* 2_town
----------
* Duration: 3 years
* Resolution: 60 minutes
* Power Level: 1-50 GW

### Data Prep

Load

In [5]:
# Path of the raw Portland feeder time-series CSV inside `raw_data_path`;
# adjust it if the file is stored somewhere else.
dir_path = os.path.join(raw_data_path, "Portland_feeder_ts.csv")

dir_path

'/Users/nikolaushouben/Desktop/WattCast/data/raw_data/Portland_feeder_ts.csv'

In [6]:

def train_test_split_month(df, train_months=None, val_months = None, test_months = None):

    '''
    Splits a dataframe into train, validation and test sets based on the month of the year.

    If `train_months` is None, the calendar months present in `df.index` are ranked in
    descending order of the maximum value of the first column within each month, and all
    three month lists are derived from that ranking (any `val_months` / `test_months`
    passed in are ignored in this case):

    * Train contains the months with the highest and the lowest maximum, followed by
      every month ranked between the third highest and the third lowest.
    * Validation contains the month with the second lowest maximum.
    * Test contains the month with the second highest maximum.

    Note that with fewer than four distinct months in `df` the derived lists overlap.

    If `train_months` is given, the supplied lists are used as they are; a
    `val_months` / `test_months` left at None is treated as "no months" and yields an
    empty dataframe for that split.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with a DatetimeIndex; the first column is used for the ranking.
    train_months, val_months, test_months : list of int, optional
        Month numbers (1-12) assigned to each split.

    Returns
    -------
    tuple
        (df_train, df_val, df_test, train_months, val_months, test_months)
    '''

    if train_months is None:
        # Rank the months by their peak value in the first column, highest first
        months_ranked = (
            df.groupby(df.index.month)
            .max()
            .sort_values(by=df.columns[0], ascending=False)
            .index
        )
        train_months = [months_ranked[0]] + [months_ranked[-1]] + months_ranked[2:-2].tolist()
        val_months = [months_ranked[-2]]
        test_months = [months_ranked[1]]
    else:
        # `isin(None)` raises a TypeError, so map a missing selection to an empty one
        val_months = [] if val_months is None else val_months
        test_months = [] if test_months is None else test_months

    month_of_row = df.index.month
    df_train = df[month_of_row.isin(train_months)]
    df_val = df[month_of_row.isin(val_months)]
    df_test = df[month_of_row.isin(test_months)]

    return df_train, df_val, df_test, train_months, val_months, test_months


In [7]:
# Read the raw feeder time series; the first CSV column is used as the index,
# with date parsing enabled for it.
df_ami = pd.read_csv(dir_path, index_col=0, parse_dates=True)

# Column name -> short name (the text before the first space in the column name).
# NOTE(review): the last column is left out on purpose by the slice — confirm what it holds.
county_acronnyms = dict(
    (column, column.split(" ")[0]) for column in df_ami.columns[:-1]
)

# Every short name is given the same (latitude, longitude) pair.
county_coordinates = dict.fromkeys(
    county_acronnyms.values(),
    (45.514904014844944, -122.65894786115099),
)

unit = "MW"  # unit of the demand data
temp_resolutions = [60]  # minutes, this dataset is only available in hourly resolution
spatial_scale = "2_town"  # spatial scale of the data

In [8]:

# Write the train/val/test splits of the load series and the matching temperature
# covariate for every location into `<save_path>/<spatial_scale>.h5`.

# Created once instead of once per location: the finder does not depend on the loop variables.
tf = TimezoneFinder()

# The context manager closes the HDF5 file even if the cell is interrupted or a step
# raises (e.g. during the weather download), so no open handle is left behind.
with pd.HDFStore(os.path.join(save_path, f"{spatial_scale}.h5"), mode='w') as store:
    for temp_resolution in temp_resolutions:
        dfs = {}
        for key, value in county_acronnyms.items():

            lat, lng = county_coordinates[value]
            # NOTE: `tz` is only needed by the disabled timezone conversion below
            tz = tf.timezone_at(lng=lng, lat=lat)
            # power

            df = df_ami[[key]]
            # df = df.tz_localize('UTC').tz_convert(tz).tz_localize(None), data is already local
            df = standardize_format(df, 'load', temp_resolution, key, unit)
            df = remove_non_positive_values(df, set_nan=True)
            # The month assignment is derived from the load series and reused for the weather below
            train, val, test, train_months, val_months, test_months = train_test_split_month(df)
            dfs[value] = df
            store.put(f'{value}/{temp_resolution}min/train_target', train, format='table')
            store.put(f'{value}/{temp_resolution}min/val_target', val, format='table')
            store.put(f'{value}/{temp_resolution}min/test_target', test, format='table')

            # weather
            start_date = df.index[0].strftime("%Y-%m-%d")
            end_date = df.index[-1].strftime("%Y-%m-%d")
            df_weather = get_weather_data(lat, lng, start_date, end_date, variables=['temperature_2m'], keep_UTC=False).tz_localize(None)
            df_weather = standardize_format(df_weather, 'temperature', temp_resolution, value, "C")
            # Align the weather to the load timestamps and drop rows without a weather value
            df_weather = df_weather.reindex(df.index).dropna()
            dfs[value + 'weather'] = df_weather
            train_weather, val_weather, test_weather, _, _, _ = train_test_split_month(df_weather, train_months, val_months, test_months)

            store.put(f'{value}/{temp_resolution}min/train_cov', train_weather, format='table')
            store.put(f'{value}/{temp_resolution}min/val_cov', val_weather, format='table')
            store.put(f'{value}/{temp_resolution}min/test_cov', test_weather, format='table')

# Merge all the dataframes into one
# (`dfs` holds the frames of the last processed temporal resolution)
df_county = pd.concat(dfs.values(), axis=1)
df_county.columns = dfs.keys()
df_county.head()

Fetching timezone from coordinates


KeyboardInterrupt: 

### Visualization

In [None]:
# Plotting helper from the project's `utils` module
from utils import plot_location_splits

# Plot the splits found in the clean-data directory below the current working directory
fig = plot_location_splits(
    os.path.join(os.getcwd(), "data", "clean_data")
)

### Statistics

In [None]:
# Statistics and histogram for every column whose name does not contain 'weather'
calculate_stats_and_plot_hist(
    df_county[[name for name in df_county.columns if 'weather' not in name]]
)