In [1]:
from rfo_core.aws.iam import ensure_glue_service_role_exists, resolve_role_arn, get_aws_session
from rfo_core.aws.s3 import ensure_s3_bucket_exists
from rfo_core.aws.s3 import get_bucket_name, create_s3_subfolders
from rfo_core.configuration import (
    aws_key, aws_secret, aws_service_role_name,
    aws_region_default, aws_default_sync_mode, aws_versioning_on
)

In [2]:
import awswrangler as wr
import boto3 as bt

In [3]:
def aws_get_session(region: str = aws_region_default) -> bt.Session:
    """Open an AWS session for ``region`` with the configured credentials.

    Convenience wrapper around ``get_aws_session`` that fills in ``aws_key``
    and ``aws_secret`` (imported from ``rfo_core.configuration``) so callers
    only have to choose a region.

    Args:
        region: AWS region name; defaults to ``aws_region_default``.

    Returns:
        Whatever ``get_aws_session`` returns (annotated as a boto3 ``Session``).
    """
    credentials = {"aws_key": aws_key, "aws_secret": aws_secret}
    return get_aws_session(aws_region=region, **credentials)

In [4]:
# Session handed to the Athena queries in this notebook (passed as
# `boto3_session`); the region is set explicitly to us-east-1 here instead of
# relying on `aws_region_default`.
session = aws_get_session(region='us-east-1')

In [5]:
# Name of the Athena table that the main query selects from (it is
# interpolated into the FROM clause when the SQL text is built).
table="rfo_load_enriched"

In [None]:
# Exploratory check: list every distinct `datatype` value present in the
# weather table (note: this targets rfo_weather_enriched, not `table`).
sql = "SELECT DISTINCT datatype FROM rfo_weather_enriched"
unique_datatypes = wr.athena.read_sql_query(sql=sql, database="rfo_analytics", boto3_session=session)
print(unique_datatypes["datatype"].to_list())

            datatype
0       electric_hdd
1   relativeHumidity
2       weighted_hdd
3      windDirection
4     population_cdd
5    temperature_avg
6            gas_cdd
7       electric_cdd
8        temperature
9          heatIndex
10         windSpeed
11        cloudCover
12         windChill
13          dewpoint
14   temperature_min
15           gas_hdd
16   temperature_max
17      weighted_cdd
18    population_hdd


In [6]:
# Values of the `name` column whose load data is imported for MIDC.
# Kept as a tuple; it is used to build the `WHERE name IN (...)` filter.
names = (
    'AVRN',
    'Avista Corporation',
    'BPA',
    'City of Tacoma, Department of Public Utilities',
    'Idaho Power Company',
    'PUD No. 1 of Douglas County',
    'PUD No. 2 of Grant County, Washington',
    'PacifiCorp West',
    'Portland General Electric Company',
    'Puget Sound',
    'Puget Sound Energy, Inc.',
    'Seattle City Light',
)

In [7]:
# Optional WHERE clause that is appended verbatim to the main query.
#
# Weather-table variant, kept for reference:
# conditions='''WHERE region IN ('CAISO', 'NWPP', 'SOUTHWEST')
#   AND datatype = 'temperature'
#   AND CAST(datetime AS DATE) BETWEEN DATE '2016-01-01' AND DATE '2025-12-31' '''

# Render the IN-list explicitly instead of interpolating the Python tuple
# repr: repr() emits a trailing comma for a one-element tuple and switches to
# double quotes for any value containing an apostrophe, and neither is a valid
# SQL string literal. Embedded single quotes are escaped by doubling them.
names_sql = ", ".join("'" + str(name).replace("'", "''") + "'" for name in names)
filtered_conditions = f'''WHERE name IN ({names_sql})
    AND CAST(datetime AS DATE) BETWEEN DATE '2016-01-01' AND DATE '2025-12-31' '''

# The filter is switched off by default, i.e. the query reads the whole table.
# This keeps the previous effective behaviour (the clause used to be built and
# then immediately reset to '') while making the choice explicit.
# Set apply_filter = True to restrict the query to `names` and 2016-2025.
apply_filter = False
conditions = filtered_conditions if apply_filter else ''

In [8]:
# --- Column selection --------------------------------------------------------
# Option A: look up every column of the table from the catalog.
# table_info = wr.catalog.table(
#     database="rfo_analytics",
#     table=table,
#     boto3_session=session
# )
# columns = table_info['Column Name'].values

# Option B: name the columns explicitly.
# Weather table:
# columns = ["datetime","datatype","avgvalue","siteid","name","station_name","state","region","location","timezone"]
# Load table:
columns = ["datetime", "avgvalue", "objectid", "name", "yes_objectid", "iso"]

# --- SELECT list -------------------------------------------------------------
# A column called "datetime" (case-insensitive) is cast to timestamp and keeps
# its name; every other column is selected as-is.
select_parts = [
    f"CAST({column} AS timestamp) as {column}" if column.lower() == 'datetime' else column
    for column in columns
]
select_clause = ",\n    ".join(select_parts)

# --- Query -------------------------------------------------------------------
# `conditions` is appended verbatim; an empty string means "no WHERE clause".
sql = f"""SELECT 
    {select_clause}
FROM {table}
{conditions}
"""

df = wr.athena.read_sql_query(sql=sql, database="rfo_analytics", boto3_session=session)

In [15]:
# Save df as a CSV file (without the index column).
# NOTE(review): the file name suggests a 10-year temperature extract, while the
# query in this notebook selects load columns from `table` -- confirm the name
# is still right before re-running, otherwise an older export may be overwritten.
df.to_csv("Temp_CAINWPPSW_10yrs.csv", index=False)

In [9]:
# Preview the first 30 rows of the query result.
df.head(30)

Unnamed: 0,datetime,avgvalue,objectid,name,yes_objectid,iso
0,2012-02-12 15:00:00,6603.916667,L000060,BPA,10001845403,CAISO
1,2012-02-12 16:00:00,7009.583333,L000060,BPA,10001845403,CAISO
2,2012-02-12 17:00:00,7274.5,L000060,BPA,10001845403,CAISO
3,2012-02-12 18:00:00,7206.916667,L000060,BPA,10001845403,CAISO
4,2012-02-12 19:00:00,6641.166667,L000060,BPA,10001845403,CAISO
5,2012-02-12 20:00:00,6416.75,L000060,BPA,10001845403,CAISO
6,2012-02-12 21:00:00,6257.666667,L000060,BPA,10001845403,CAISO
7,2012-02-12 22:00:00,6136.416667,L000060,BPA,10001845403,CAISO
8,2012-02-12 23:00:00,6045.0,L000060,BPA,10001845403,CAISO
9,2012-02-13 00:00:00,6056.5,L000060,BPA,10001845403,CAISO


In [11]:
# Column dtypes, row count and memory footprint of the query result.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51691908 entries, 0 to 51691907
Data columns (total 6 columns):
 #   Column        Dtype         
---  ------        -----         
 0   datetime      datetime64[ns]
 1   avgvalue      float64       
 2   objectid      string        
 3   name          string        
 4   yes_objectid  Int64         
 5   iso           string        
dtypes: Int64(1), datetime64[ns](1), float64(1), string(3)
memory usage: 2.4 GB


In [13]:
# Inspect the 'Puget Sound' rows in chronological order (datetime as index).
(
    df.loc[df["name"] == "Puget Sound"]
    .set_index("datetime")
    .sort_index()
)

Unnamed: 0_level_0,avgvalue,objectid,name,yes_objectid,iso
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2012-01-01 01:00:00,,L000117,Puget Sound,10001865762,CAISO
2012-01-01 02:00:00,,L000117,Puget Sound,10001865762,CAISO
2012-01-01 03:00:00,,L000117,Puget Sound,10001865762,CAISO
2012-01-01 04:00:00,,L000117,Puget Sound,10001865762,CAISO
2012-01-01 05:00:00,,L000117,Puget Sound,10001865762,CAISO
...,...,...,...,...,...
2013-12-31 20:00:00,,L000117,Puget Sound,10001865762,CAISO
2013-12-31 21:00:00,,L000117,Puget Sound,10001865762,CAISO
2013-12-31 22:00:00,,L000117,Puget Sound,10001865762,CAISO
2013-12-31 23:00:00,,L000117,Puget Sound,10001865762,CAISO
