In [40]:
#import packages
import os
import sys
from urllib.parse import quote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
import sqlalchemy as sqla

from config.read_config import get_database_config
%matplotlib inline
sys.executable  # shows you your path to the python you're using

'/usr/bin/python3.8'

In [41]:
# Load the database credentials from config/config.txt.
# * Make sure you have added your credentials to that file first! *
CONFIG_PATH = "./config/config.txt"
database_config = get_database_config(CONFIG_PATH)

In [42]:
# Build the SQLAlchemy engine for the PostgreSQL database described by
# database_config (keys: username, password, hostname, port, database).
#
# The username and password are percent-encoded before being placed in the
# URL: the connection string is parsed as a URL, so a raw '@', ':', '/' or '%'
# in a credential would otherwise be read as a URL delimiter. quote(safe='')
# is used so that spaces become %20 and '+' becomes %2B, which decode the same
# way regardless of how the URL parser unquotes them.
# NOTE(review): this assumes config.txt stores the raw (not already
# percent-encoded) credentials -- confirm.
engine = sqla.create_engine(
    'postgresql://{user}:{password}@{host}:{port}/{database}'.format(
        user=quote(str(database_config['username']), safe=''),
        password=quote(str(database_config['password']), safe=''),
        host=database_config['hostname'],
        port=database_config['port'],
        database=database_config['database'],
    )
)

In [43]:
# Pick the distinct dataids from the dataport metadata for Texas homes that have
# both car1 and solar configured, whose 1-minute eGauge data starts before
# 2018-01-01 and ends after 2019-01-01, and whose 1-minute data availability
# string starts with "100" or "99".
query = """select distinct dataid from other_datasets.metadata 
                                          where car1='yes' and solar='yes' 
                                          and egauge_1min_min_time < '2018-01-01' 
                                          and egauge_1min_max_time > '2019-01-01'
                                          and state='Texas'
                                          and (egauge_1min_data_availability like '100%' 
                                               or 
                                               egauge_1min_data_availability like '99%');
         """

# Run the metadata query and keep the matching dataids in a DataFrame.
df = pd.read_sql_query(sql=sqla.text(query), con=engine)

In [44]:
# Collect the selected dataids and build the comma-separated string that the
# completeness query formats into its SQL "in (...)" filter.
dataids_list = df['dataid'].tolist()
print("{} dataids selected listed here:".format(len(dataids_list)))

# Stop here with a clear message when nothing matched: an empty list would
# render as "in ()" in the SQL built from dataids_str, which is not valid.
if not dataids_list:
    raise ValueError("No dataids matched the metadata filters; cannot build the SQL id list.")

dataids_str = ','.join(map(str, dataids_list))

# Only the last expression of a cell is displayed, so show the list alone.
dataids_list

24 dataids selected listed here:


[114,
 379,
 1169,
 1354,
 2814,
 3368,
 3829,
 3967,
 5109,
 5357,
 5450,
 5749,
 6139,
 6248,
 6691,
 7024,
 7850,
 7989,
 8142,
 8645,
 8857,
 9647,
 9776,
 9932]

In [45]:
# Count the 2018 records per dataid (for the dataids selected from the metadata
# above) so that data completeness can be checked in the next step.
query2 = (
    """select dataid,count(*) total_rec from electricity.eg_realpower_1min 
            where dataid in ({})"""
    """ and localminute >= '2018-01-01' and localminute < '2019-01-01' group by 1"""
).format(dataids_str)

df2 = pd.read_sql_query(sql=sqla.text(query2), con=engine)

In [46]:
# Keep only the homes with at least 99% data availability for year 2018.
MINUTES_IN_2018 = 365 * 24 * 60  # 525600 one-minute slots in a 365-day year
MIN_COVERAGE_PCT = 99

# Percentage of the year's minutes for which each dataid has a record.
df2['perc'] = df2['total_rec'] / MINUTES_IN_2018 * 100
final_dataids = df2.loc[df2['perc'] >= MIN_COVERAGE_PCT]

# Number of homes that passed the coverage threshold.
final_dataids['dataid'].count()

23

In [47]:
# Collect the dataids that passed the completeness check and build the
# comma-separated string used in the SQL "in (...)" filters of the data pulls.
final_dataids_list = final_dataids['dataid'].tolist()
print("{} dataids selected listed here:".format(len(final_dataids_list)))

# Stop here with a clear message when no home qualified: an empty list would
# render as "in ()" in the SQL built from final_dataids_str, which is not valid.
if not final_dataids_list:
    raise ValueError("No dataids passed the completeness check; cannot build the SQL id list.")

final_dataids_str = ','.join(map(str, final_dataids_list))

# Only the last expression of a cell is displayed, so show the list alone.
final_dataids_list

23 dataids selected listed here:


[114,
 379,
 1169,
 1354,
 2814,
 3368,
 3829,
 3967,
 5109,
 5357,
 5450,
 5749,
 6139,
 6248,
 6691,
 7024,
 7850,
 7989,
 8142,
 8645,
 9647,
 9776,
 9932]

In [48]:
#fall
fall = """select localminute::timestamp,car1,solar,grid 
               from electricity.eg_realpower_1min 
               where localminute >= '2018-09-01' and localminute <  '2018-12-01' """
fall = fall + """AND dataid in ({})""".format(final_dataids_str)

fall_df = pd.read_sql_query(sqla.text(fall), engine)

In [49]:
#spring
spring = """select localminute::timestamp,car1,solar,grid 
               from electricity.eg_realpower_1min 
               where localminute >= '2018-03-01' and localminute <  '2018-06-01' """
spring = spring + """AND dataid in ({})""".format(final_dataids_str)

spring_df = pd.read_sql_query(sqla.text(spring), engine)

In [50]:
#summer
summer = """select localminute::timestamp,car1,solar,grid 
               from electricity.eg_realpower_1min 
               where localminute >= '2018-06-01' and localminute <  '2018-09-01' """
summer = summer + """AND dataid in ({})""".format(final_dataids_str)

# create a dataframe with the data from the sql query
summer_df = pd.read_sql_query(sqla.text(summer), engine)

In [51]:
#winter
winter = """select localminute::timestamp,car1,solar,grid 
               from electricity.eg_realpower_1min 
               where localminute >= '2018-12-01' and localminute <  '2019-03-01' """
winter = winter + """AND dataid in ({})""".format(final_dataids_str)

# create a dataframe with the data from the sql query
winter_df = pd.read_sql_query(sqla.text(winter), engine)