# Efficiency and BTUs: Home Electricity, Gas, and Water Use Compared with Solar Insolation

The `Efficiency-BTUs-data.ipynb` notebook in this same directory will also perform the database queries and export the results as zipped CSV files to this directory. If you use it, you just need to change the `pd.read_csv()` calls below to look in the current directory instead of the `/shared` directory.

In [None]:
#import packages
import pandas as pd
import matplotlib.pyplot as plt
import psycopg2
import sqlalchemy as sqla
import os
import sys
sys.path.insert(0,'..')
from config.read_config import get_database_config
import numpy as np
%matplotlib inline
sys.executable  # shows you your path to the python you're using

In [None]:
# Read the database credentials from ../config/config.txt.
# * Make sure you have added your credentials to ../config/config.txt first! *

# The call below is disabled by being wrapped in a triple-quoted string.
# To use the live database queries, remove the two triple-quote lines around it
# so that `database_config` is actually defined.
'''
database_config = get_database_config("../config/config.txt")
'''
#

In [None]:
# get our DB connection

# uncomment if you want to use the live queries to the database instead of the prepared data
'''
engine = sqla.create_engine('postgresql://{}:{}@{}:{}/{}'.format(database_config['username'],
                                                                     database_config['password'],
                                                                     database_config['hostname'],
                                                                     database_config['port'],
                                                                     database_config['database']
                                                                     ))
'''


In [None]:
# Select a list of Texas homes from dataport metadata having good gas data availability

# Uncomment the following block to use live database queries
'''
query = """select distinct dataid, egauge_1min_data_availability, gas_data_availability, grid, solar from other_datasets.metadata 
                                         
                                          where grid = 'yes'
                                          and egauge_1min_min_time <= '2018-03-01' 
                                          and egauge_1min_max_time > '2018-09-01'
                                          and city='Austin'
                                          and (egauge_1min_data_availability like '100%' 
                                               or 
                                               egauge_1min_data_availability like '9%')
                                          and gas_ert_min_time <= '2018-03-01'
                                          and gas_ert_max_time > '2018-09-01'
                                          and
                                              (
                                              gas_data_availability like '100%'
                                              or
                                              gas_data_availability like '9%'
                                              or
                                              gas_data_availability like '8%'
                                              or
                                              gas_data_availability like '7%'
                                              )
                                          ;
         """

df = pd.read_sql_query(sqla.text(query), engine)
df
'''

# otherwise we'll read from the prepared/extracted zipped data files
df = pd.read_csv('/shared/JupyterHub-Examples-Data/efficiency/efficiency_btus_metadata.zip', compression='zip')
df

In [None]:
# grab dataids and convert them to a string to put into the SQL query
dataids_list = df['dataid'].tolist()
print("{} dataids selected listed here:".format(len(dataids_list)))
dataids_str = ','.join(list(map(str, dataids_list)))

In [None]:
# Uncomment the following block to use live database queries

'''
#Pull electricity data for selected homes.
data = """select dataid,localminute::timestamp,solar,grid 
               from electricity.eg_realpower_1min 
               where localminute >= '2018-03-01' and localminute <  '2018-09-01' """
data = data + """AND dataid in ({})""".format(dataids_str)

# create a dataframe with the data from the sql query
data_df = pd.read_sql_query(sqla.text(data), engine)
'''

# otherwise we'll read in the already prepared electricity data
data_df = pd.read_csv('/shared/JupyterHub-Examples-Data/efficiency/efficiency_btus_electricity_data.zip', compression='zip')

data_df

In [None]:
# convert 'localminute' to a datetime
data_df['datetime'] = pd.to_datetime(data_df['localminute'])

# index by datetime 
data_df = data_df.set_index('datetime')

# bring to central timezone
data_df = data_df.tz_localize(tz='US/Central')
data_df

In [None]:
# add hour of day to df
data_df['hour'] = data_df.index.hour
data_df

In [None]:
# If there's no grid value, throw the row out
data_df.dropna(subset=['grid'], inplace=True)
data_df

In [None]:
# replace solar NaNs with 0
data_df['solar'].fillna(value=0, inplace=True)
print(data_df['solar'].isna().sum())

# calculate the use, the grid minus the solar (we're actually adding them because solar generation shows up negative in the database)
data_df['use'] = data_df['grid'] + data_df['solar']
data_df

In [None]:
# group by the hour and take the mean to get the hourly average use
usage = data_df.groupby(['hour']).mean()
usage

In [None]:
# calc btus for kWh (1kWh = 3412 BTUs)
usage['Electrical BTUs'] = usage['use'] * 3412
usage

In [None]:
# Read in insolation data downloaded from Solar Forecast Arbitor for Austin for the same timeframe
# https://dashboard.solarforecastarbiter.org/observations/c6d40462-7e49-11e9-aef1-0a580a8003e9
insol = pd.read_csv('/shared/JupyterHub-Examples-Data/efficiency/Austin_TX_ghi_2018-03-01T06_00_00+00_00-2018-10-01T06_00_00+00_00.csv.zip', skiprows=2)
insol['datetime'] = pd.to_datetime(insol['timestamp'])
insol = insol.set_index('datetime')
insol = insol.tz_convert(tz='US/Central')
insol

In [None]:
# add hour of day to df
insol['hour'] = insol.index.hour
insol

In [None]:
# one could potentially look at the quality flag to determine if we want to keep the row or not
insol = insol.drop(columns=['quality_flag'])
insol.describe()

In [None]:
# group into hour of the day and take the mean
grouped = insol.groupby(['hour']).mean()
grouped

In [None]:
# Plot insolation vs grid usage for hour of the day.
# Two y axes are used because the two series are in different units.

fig, ax1 = plt.subplots(figsize=(20,10))

color = 'tab:orange'

# left axis: average insolation per hour of day
ax1.set_xlabel('Hour of Day')
ax1.set_ylabel('Insolation or GHI (W/m^2)')
p1, = ax1.plot(grouped.index, grouped['value'],label="Insolation or GHI (W/m^2)", color=color)
ax1.tick_params(axis='y', labelcolor=color)

# right axis: average usage per hour of day.
# Each series is plotted against its own hour index; using the insolation
# index for the usage values would mislabel (or fail on) any hour that is
# present in one frame but not the other.
ax2 = ax1.twinx()
color = 'tab:blue'
ax2.set_ylabel('Usage (kWh)')
p2, = ax2.plot(usage.index, usage['use'], label="Usage (kWh)", color=color)
ax2.tick_params(axis='y', labelcolor=color)

# one combined legend for the lines on both axes
ax1.legend((p1, p2), ('Insolation (W/m^2)', 'Usage (kWh)'), loc='upper right', shadow=True)

plt.show()

In [None]:
# Read gas data for same homes and time period

# uncomment the following block to do the live db query
'''
gas_sql = """select * 
               from water_and_gas.gas_ert 
               where readtime >= '2018-03-01' and readtime <  '2018-09-01' """
gas_sql = gas_sql + """AND dataid in ({})""".format(dataids_str)

# create a dataframe with the data from the sql query
gas_df = pd.read_sql_query(sqla.text(gas_sql), engine)

'''

# read from the prepared csv.zip file
gas_df = pd.read_csv('/shared/JupyterHub-Examples-Data/efficiency/efficiency_btus_gas_data.zip', compression='zip')

gas_df

In [None]:
# convert readtime to a datetime, set the index, and convert to Central Time
pd.options.display.max_rows = 500
gas_df['datetime'] = pd.to_datetime(gas_df['readtime'], utc=True)
gas_df = gas_df.set_index('datetime')
gas_df = gas_df.tz_convert(tz='US/Central')
gas_df = gas_df.drop(columns=['readtime'])
gas_df

In [None]:
# the gas (and water data for that matter) are cumulative meter readings, meaning that they gradually increase as more gas flows. 
# So you can have the same reading many times in a row, or you can have gaps with no readings until more gas is used.
# You can also have the meter get reset to 0, so we're going to do some gymnastics to deal with all of that

# group by dataid and hour then run a diff on the rows
gas_df_group = gas_df.groupby(['dataid', pd.Grouper(freq='H')]).max().diff()

gas_df_group = gas_df_group.dropna()

# zero out the negative diffs because that's the meter being reset and going from some high number reading to 0 (This Needs Looking AT!!!! IDK if it's the right thing to do!)
gas_df_group['meter_value'] = gas_df_group['meter_value'].clip(0)
gas_df_group.describe()

In [None]:
# gas is measured in ft^3 convert to BTUs with (1CCF = 103,700BTU = 30.4kWh). (Per EIA's calculator, 1 cubic ft of natural gas is 1,037BTU.)

# calculate the BTUs of gas used
gas_df_group['Gas BTUs'] = gas_df_group['meter_value'] * 1037
gas_df_group

In [None]:
# add hour of day to gas
gas_df_group = gas_df_group.reset_index()
gas_df_group = gas_df_group.set_index('datetime')
gas_df_group['hour'] = gas_df_group.index.hour
gas_df_group

In [None]:
# take the mean over all the same hours of the day across all dataids
gas_hr_mean = gas_df_group.groupby(['hour']).mean()
gas_hr_mean

In [None]:
# don't need those columns anymore
gas_hr_mean = gas_hr_mean.drop(columns=['dataid','meter_value'])

In [None]:
# the first row is a bit meaningless as a diff, so we're dropping it
gas_hr_mean = gas_hr_mean.drop([0])
gas_hr_mean

In [None]:
# Let's have a peek at it
gas_hr_mean.plot(figsize=(20,10), grid=True, x_compat=True, title="Gas Use in BTUs Per Hour of Day")

In [None]:
# Combine electrical usage and gas as BTUs, one row per hour of day.
# Joining on the shared 'hour' index (inner: only hours present in both) keeps
# the hour as the row label. Merging on the raw index arrays and then dropping
# the generated key column would leave a 0..n-1 index that no longer says which
# hour each row belongs to.
combined = usage[['Electrical BTUs']].join(gas_hr_mean[['Gas BTUs']], how='inner')
combined

In [None]:
# Plot hourly electrical and gas use, both expressed in BTUs.
# The two series get separate y axes because their scales differ a lot.

elec_color = 'tab:orange'
gas_color = 'tab:blue'

fig, ax1 = plt.subplots(figsize=(20,10))

# left axis: electricity
ax1.set_xlabel('Hour of Day')
ax1.set_ylabel('Electrical Use as BTUs')
p1 = ax1.plot(
    usage.index,
    usage['Electrical BTUs'],
    label="Electrical Use as BTUs",
    color=elec_color,
)[0]
ax1.tick_params(axis='y', labelcolor=elec_color)

# right axis (shares the x axis): gas
ax2 = ax1.twinx()
ax2.set_ylabel('Gas Use as BTUs')
p2 = ax2.plot(
    gas_hr_mean.index,
    gas_hr_mean['Gas BTUs'],
    label="Gas Use as BTUs",
    color=gas_color,
)[0]
ax2.tick_params(axis='y', labelcolor=gas_color)

# single legend covering the line from each axis
ax1.legend(
    (p1, p2),
    ('Electrical Use as BTUs', 'Gas Use as BTUs'),
    loc='upper right',
    shadow=True,
)

plt.show()

In [None]:
# Combining into a single dataframe
gas_hr_mean['Elec BTUs'] = usage['Electrical BTUs']
gas_hr_mean

In [None]:
# More combining and renaming

btus = usage.merge(gas_hr_mean, on=['hour'], how='left')
btus = btus.drop(labels=['dataid', 'solar', 'grid', 'use', 'Elec BTUs'], axis=1)
btus = btus.rename({'BTUs':'Electrical BTUs', 'Gas BTUs':'Gas BTUs'},axis='columns')
btus


In [None]:
# lets go get some blucube water data now

# uncomment this block for a direct database pull
'''
#Pull data for selected homes.
water_sql = """SELECT * FROM water_and_gas.blucube_water_data
               where epoch_timestamp >= '2018-03-01' and epoch_timestamp <  '2018-09-01' """
water_sql = water_sql + """AND dataid in ({})""".format(dataids_str)

# create a dataframe with the data from the sql query
water_df = pd.read_sql_query(sqla.text(water_sql), engine)

'''

# read in the prepared csv file
water_df = pd.read_csv('/shared/JupyterHub-Examples-Data/efficiency/efficiency_btus_water_data.zip')

water_df

In [None]:
# don't need the meter id
water_df = water_df.drop(columns=['met_id']) 
water_df

In [None]:
# convert 'localminute' to a datetime
water_df['datetime'] = pd.to_datetime(water_df['epoch_timestamp'], utc=True)

# index by datetime 
water_df = water_df.set_index('datetime')

# bring to central timezone
water_df = water_df.tz_convert(tz='US/Central')
water_df

In [None]:
water = water_df.drop(columns=['epoch_timestamp'])
water

In [None]:
# group by dataid and hour then run a diff on the rows
water_group = water.groupby(['dataid', pd.Grouper(freq='H')]).max().diff()

water_group = water_group.dropna()

# zero out the negative diffs because that's the meter being reset and going from some high number reading to 0
water_group['reading_in_gal'] = water_group['reading_in_gal'].clip(0)
water_group.describe()

In [None]:
water_group

In [None]:
# add hour of day to water
water_group = water_group.reset_index()
water_group = water_group.set_index('datetime')
water_group['hour'] = water_group.index.hour
water_group

In [None]:
# take the mean by hour of the day
water_mean = water_group.groupby(['hour']).mean()
water_mean = water_mean.drop(columns=['dataid'])
water_mean

In [None]:
# convert gallons to kWh via 1,000,000 gallons =  5000kWh for Austin (reduced this is 200g to 1kWh, or divide the gallons by 200 to get kWh)
########### PUT REF HERE!!!!!!!!!!!
water_mean['Water BTUs'] = (water_mean['reading_in_gal'] / 200.0) * 3412
water_mean

In [None]:
# again the first row is a bit meaningless because it's a diff against nothing
water_mean = water_mean.drop([0])
water_mean = water_mean.drop(columns=['reading_in_gal'])
water_mean.plot(figsize=(20,10), grid=True, x_compat=True)

In [None]:
water_mean

In [None]:
btus

In [None]:
# combine all three into one dataframe
btus = btus.merge(water_mean, on=['hour'], how='left')
btus

In [None]:
# let's make some pretty pictures
btus.plot(figsize=(20,10), grid=True, x_compat=True)

In [None]:
# now time to bring the insolation in, let's get it named properly
grouped = grouped.rename({'value':'Insolation or GHI (W/m^2)'} , axis='columns')
grouped

In [None]:
# Add insolation to the btus dataframe
btus = btus.merge(grouped, on=['hour'], how='left')

# calculate the sum of the 3 different btu usages
btus['Sum BTUs'] = btus['Electrical BTUs'] + btus['Gas BTUs'] + btus['Water BTUs']
btus

In [None]:
# Plot every BTU series together with insolation, by hour of day.
# Left y axis: BTUs. Right y axis: insolation.

btu_axis_color = 'tab:orange'
insolation_color = 'tab:blue'

# (column, legend label, line color) for each BTU series, in drawing order
btu_series = [
    ('Electrical BTUs', "Electrical Use in BTUs", btu_axis_color),
    ('Gas BTUs', "Gas use in BTUs", 'tab:green'),
    ('Water BTUs', "Water use in BTUs", 'tab:purple'),
    ('Sum BTUs', "Sum BTUs", 'tab:red'),
]

fig, ax1 = plt.subplots(figsize=(20,10))

ax1.set_xlabel('Hour of Day')
ax1.set_ylabel('BTUs')
for column, label, line_color in btu_series:
    ax1.plot(btus.index, btus[column], label=label, color=line_color)
ax1.legend(loc='upper left')
ax1.tick_params(axis='y', labelcolor=btu_axis_color)

# second y axis sharing the same x axis, for insolation
ax2 = ax1.twinx()
ax2.set_ylabel('Insolation or GHI (W/m^2)')
ax2.plot(
    btus.index,
    btus['Insolation or GHI (W/m^2)'],
    label="Insolation or GHI (W/m^2)",
    color=insolation_color,
)
ax2.tick_params(axis='y', labelcolor=insolation_color)

ax2.legend(loc='upper right')
ax2.set_title('Home BTU Usage and Solar Insolation Per Hour of Day')

plt.show()

In [None]:
# Plot only the summed BTUs against insolation, by hour of day.
# Left y axis: BTUs. Right y axis: insolation.

btu_axis_color = 'tab:orange'
insolation_color = 'tab:blue'

fig, ax1 = plt.subplots(figsize=(20,10))

ax1.set_xlabel('Hour of Day')
ax1.set_ylabel('BTUs')
ax1.plot(btus.index, btus['Sum BTUs'], label="Sum BTUs", color='tab:red')
ax1.legend(loc='upper left')
ax1.tick_params(axis='y', labelcolor=btu_axis_color)

# second y axis sharing the same x axis, for insolation
ax2 = ax1.twinx()
ax2.set_ylabel('Insolation or GHI (W/m^2)')
ax2.plot(
    btus.index,
    btus['Insolation or GHI (W/m^2)'],
    label="Insolation or GHI (W/m^2)",
    color=insolation_color,
)
ax2.tick_params(axis='y', labelcolor=insolation_color)

ax2.legend(loc='upper right')
ax2.set_title('Sum of Home BTU Usage and Solar Insolation Per Hour of Day')

plt.show()

# Have Scott write something up here. Or maybe Cavan. Welcome aboard Cavan! 