In [1]:
from rfo_core.aws.iam import ensure_glue_service_role_exists, resolve_role_arn, get_aws_session
from rfo_core.aws.s3 import ensure_s3_bucket_exists
from rfo_core.aws.s3 import get_bucket_name, create_s3_subfolders
from rfo_core.configuration import (
    aws_key, aws_secret, aws_service_role_name,
    aws_region_default, aws_default_sync_mode, aws_versioning_on
)

In [2]:
import awswrangler as wr
import boto3 as bt

In [3]:
def aws_get_session(region: str = aws_region_default) -> bt.Session:
    """Build an AWS session for ``region`` from the configured credentials.

    Args:
        region: Region name forwarded to ``get_aws_session`` as ``aws_region``.
            Defaults to ``aws_region_default`` from the project configuration.

    Returns:
        The object returned by ``get_aws_session`` (annotated as ``bt.Session``).
    """
    # Credentials come from rfo_core.configuration; only the region is caller-supplied.
    credentials = {"aws_key": aws_key, "aws_secret": aws_secret}
    return get_aws_session(aws_region=region, **credentials)

In [4]:
# Session for us-east-1; the Athena calls in this notebook receive it as `boto3_session`.
session = aws_get_session(region="us-east-1")

In [5]:
# Name of the table that the main SELECT query reads from (`FROM {table}`).
table = "rfo_load_enriched"

In [None]:
# List every distinct `datatype` value present in the weather table.
sql = "SELECT DISTINCT datatype FROM rfo_weather_enriched"
unique_datatypes = wr.athena.read_sql_query(sql=sql, database="rfo_analytics", boto3_session=session)
# Print the values as a plain Python list rather than as a DataFrame.
print(unique_datatypes["datatype"].tolist())

            datatype
0       electric_hdd
1   relativeHumidity
2       weighted_hdd
3      windDirection
4     population_cdd
5    temperature_avg
6            gas_cdd
7       electric_cdd
8        temperature
9          heatIndex
10         windSpeed
11        cloudCover
12         windChill
13          dewpoint
14   temperature_min
15           gas_hdd
16   temperature_max
17      weighted_cdd
18    population_hdd


In [14]:
# Optional filter text that is placed verbatim on the line after `FROM {table}`
# in the main query. An empty string means no filtering (whole table is read).
#
# Example filter (note: `region` and `datatype` appear in the weather column
# list, not in the currently selected columns):
# conditions = '''WHERE region IN ('CAISO', 'NWPP', 'SOUTHWEST')
#   AND datatype = 'temperature'
#   AND CAST(datetime AS DATE) BETWEEN DATE '2016-01-01' AND DATE '2025-12-31' '''
conditions = ""

In [None]:
# --- Column selection ---------------------------------------------------------
# Option A: look up every column of the table from the catalog.
# table_info = wr.catalog.table(
#     database="rfo_analytics",
#     table=table,
#     boto3_session=session
# )
# columns = table_info['Column Name'].values

# Option B: name the columns explicitly.
# Column list used for the weather table:
# columns = ["datetime","datatype","avgvalue","siteid","name","station_name","state","region","location","timezone"]
# Column list currently in use:
columns = ["datetime", "avgvalue", "objectid", "name", "yes_objectid", "iso"]

# --- SELECT list ----------------------------------------------------------------
# A column named "datetime" (compared case-insensitively) is cast to timestamp
# and aliased back to its own name; every other column is selected unchanged.
select_parts = []
for col_name in columns:
    select_parts.append(
        f"CAST({col_name} AS timestamp) as {col_name}"
        if col_name.lower() == 'datetime'
        else col_name
    )

select_clause = ",\n    ".join(select_parts)

# --- Statement ------------------------------------------------------------------
# Layout: SELECT list, FROM <table>, then the (possibly empty) `conditions` text.
sql = f"SELECT \n    {select_clause}\nFROM {table}\n{conditions}\n"

# --- Execution ------------------------------------------------------------------
# Run the statement on Athena and load the result into a DataFrame.
df = wr.athena.read_sql_query(sql=sql, database="rfo_analytics", boto3_session=session)

In [15]:
# Save the query result to a CSV file in the working directory, without the row index.
# NOTE(review): the file name suggests a temperature extract, while `table` and
# `columns` point at the load table with no filter — confirm the intended name.
df.to_csv(path_or_buf="Temp_CAINWPPSW_10yrs.csv", index=False)

In [20]:
# Display the distinct values of the `name` column in sorted order.
sorted(df["name"].unique().tolist())

['.H.INTERNAL_HUB',
 '.Z.CONNECTICUT',
 '.Z.MAINE',
 '.Z.NEMASSBOST',
 '.Z.NEWHAMPSHIRE',
 '.Z.RHODEISLAND',
 '.Z.SEMASS',
 '.Z.VERMONT',
 '.Z.WCMASS',
 'AECO',
 'AECO Sub Zone',
 'AEP',
 'AEPAPT',
 'AEPIMP',
 'AEPKPT',
 'AEPOPT',
 'AP',
 'ATSI',
 'AVRN',
 'Arizona Public Service Company',
 'Associated Electric Cooperative, Inc.',
 'Avista Corporation',
 'BANCMID',
 'BANCRDNG',
 'BANCRSVL',
 'BANCSMUD',
 'BANCWASN',
 'BGE',
 'BPA',
 'Balancing Authority of Northern California',
 'CAISO',
 'CAPITL',
 'CENTRL',
 'CNCT',
 'COMED',
 'California',
 'Carolinas',
 'Central',
 'Central (old)',
 'City of Tacoma, Department of Public Utilities',
 'DAYTON',
 'DEOK',
 'DOMINION',
 'DPL',
 'DPLCO',
 'DUNWOOD',
 'DUQUESNE',
 'Duke Energy Carolinas',
 'Duke Energy Florida, Inc.',
 'Duke Energy Progress East',
 'Duke Energy Progress West',
 'EASTON',
 'EKPC (PJMISO)',
 'East (old)',
 'El Paso Electric Company',
 'Electric Energy, Inc.',
 'Florida',
 'Florida Municipal Power Pool',
 'Florida Power & Li