In [1]:
from rfo_core.aws.iam import ensure_glue_service_role_exists, resolve_role_arn, get_aws_session
from rfo_core.aws.s3 import ensure_s3_bucket_exists
from rfo_core.aws.s3 import get_bucket_name, create_s3_subfolders
from rfo_core.configuration import (
    aws_key, aws_secret, aws_service_role_name,
    aws_region_default, aws_default_sync_mode, aws_versioning_on
)

In [2]:
import awswrangler as wr
import boto3 as bt

In [3]:
def aws_get_session(region: str = aws_region_default) -> bt.Session:
    """Return an AWS session for ``region`` built from the configured credentials.

    Delegates to ``get_aws_session``, forwarding the ``aws_key`` and
    ``aws_secret`` values imported from ``rfo_core.configuration``.

    Args:
        region: AWS region name. Defaults to ``aws_region_default``.

    Returns:
        The object returned by ``get_aws_session`` (annotated as ``boto3.Session``).
    """
    credentials = {"aws_key": aws_key, "aws_secret": aws_secret}
    return get_aws_session(aws_region=region, **credentials)

In [4]:
# Session shared by every Athena query below; the region is pinned to us-east-1
# here rather than taken from the configured default.
session = aws_get_session("us-east-1")

In [5]:
# Name of the Athena table read by the main extraction query.
table = "rfo_weather_enriched"

In [6]:
# List every distinct `region` value present in rfo_weather_enriched.
sql = "SELECT DISTINCT region FROM rfo_weather_enriched"
unique_datatypes = wr.athena.read_sql_query(
    sql, database="rfo_analytics", boto3_session=session
)
print(unique_datatypes["region"].tolist())

['MISO', 'MIDWEST', 'CAISO', 'PACIFIC', 'OIL/GAS FIELDS', 'PRODUCING', 'SPP', 'CONUS', 'AESO', 'CONSUM. EAST', 'IESO', 'NWPP', 'PJM', 'SOUTHEAST', <NA>, 'NYISO', 'ERCOT', 'MOUNTAIN', 'NEISO', 'SOUTHWEST', 'CONSUM. WEST', 'SOUTH CENTRAL', 'EAST']


In [8]:
# checking all unique ISO values in rfo_load_enriched
# NOTE: the result is bound to `unique_datatypes` even though it holds `iso`
# values; that name is reused by every exploratory query in this notebook.
sql = "SELECT DISTINCT iso FROM rfo_load_enriched"
unique_datatypes = wr.athena.read_sql_query(
    sql=sql,
    database="rfo_analytics",
    boto3_session=session
)
print(unique_datatypes['iso'].tolist())

['MISO', 'CAISO', 'NYISO', 'PJMISO', 'NEISO', <NA>, 'IESO']


In [None]:
# List the distinct `datatype` values stored in rfo_weather_enriched.
sql = "SELECT DISTINCT datatype FROM rfo_weather_enriched"
unique_datatypes = wr.athena.read_sql_query(
    sql,
    boto3_session=session,
    database="rfo_analytics",
)
print(unique_datatypes["datatype"].tolist())

In [10]:
# checking all unique station names in the table with region as "NWPP"
# The query is SELECT DISTINCT, so each station name appears once and
# value_counts() reports a count of 1 per station.
sql = "SELECT DISTINCT station_name FROM rfo_weather_enriched WHERE region = 'NWPP'"
unique_datatypes = wr.athena.read_sql_query(
    sql=sql,
    database="rfo_analytics",
    boto3_session=session
)
print(unique_datatypes['station_name'].value_counts())

station_name
Tacoma            1
Boise             1
Eugene            1
Portland          1
NWPP              1
Vancouver         1
Yakima            1
Pendleton         1
Billings          1
Medford           1
Reno              1
Salt Lake City    1
Great Falls       1
Klamath Falls     1
Spokane           1
Seattle           1
Name: count, dtype: Int64


In [None]:
# Distinct weather station names whose region is 'CAISO'.
sql = "SELECT DISTINCT station_name FROM rfo_weather_enriched WHERE region = 'CAISO'"
unique_datatypes = wr.athena.read_sql_query(
    sql, database="rfo_analytics", boto3_session=session
)
print(unique_datatypes["station_name"].value_counts())

In [7]:
# Print the distinct regions of the weather table.
# This cell runs its own query instead of reading `unique_datatypes`: that name
# is rebound by every exploratory query in this notebook, and the query that
# precedes this cell in file order selects only `station_name`, so on a
# top-to-bottom run `unique_datatypes['region']` raises a KeyError.
unique_regions = wr.athena.read_sql_query(
    sql="SELECT DISTINCT region FROM rfo_weather_enriched",
    database="rfo_analytics",
    boto3_session=session
)
print(unique_regions['region'].tolist())

['SOUTHWEST', 'MISO', 'NEISO', 'OIL/GAS FIELDS', 'MOUNTAIN', 'CONSUM. WEST', 'CONSUM. EAST', 'PRODUCING', 'PACIFIC', 'AESO', 'IESO', 'NWPP', 'SOUTHEAST', <NA>, 'CAISO', 'SPP', 'EAST', 'CONUS', 'SOUTH CENTRAL', 'ERCOT', 'PJM', 'NYISO', 'MIDWEST']


In [7]:
# Alternative, narrower filter (three regions, temperature only), kept for reference:
# conditions='''WHERE region IN ('CAISO', 'NWPP', 'SOUTHWEST')
#   AND datatype = 'temperature'
#   AND CAST(datetime AS DATE) BETWEEN DATE '2016-01-01' AND DATE '2025-12-31' '''

# Active filter: date range only, no region or datatype restriction.
conditions = (
    "WHERE CAST(datetime AS DATE) "
    "BETWEEN DATE '2016-01-01' AND DATE '2025-12-31'"
)

In [8]:
# Option A: fetch every column name of the table via wr.catalog.table.
# table_info = wr.catalog.table(
#     database="rfo_analytics", 
#     table=table, 
#     boto3_session=session
# )
# columns = table_info['Column Name'].values

# Option B: name the columns to query explicitly.
columns = [
    "datetime", "datatype", "avgvalue", "siteid",
    "station_name", "region", "timezone",
]  # for weather
# columns = ["datetime","avgvalue","objectid","name","yes_objectid","iso"]

# SELECT list: a column named "datetime" (case-insensitive) is cast to
# timestamp and aliased back to its own name; all other columns pass through.
select_parts = [
    f"CAST({name} AS timestamp) as {name}" if name.lower() == "datetime" else name
    for name in columns
]
select_clause = ",\n    ".join(select_parts)

# Assemble the statement line by line; the trailing empty item yields the
# final newline, and "SELECT " keeps its trailing space.
sql = "\n".join([
    "SELECT ",
    f"    {select_clause}",
    f"FROM {table}",
    conditions,
    "",
])

# Run the statement through Athena and load the result into `df`.
df = wr.athena.read_sql_query(
    sql,
    database="rfo_analytics",
    boto3_session=session,
)

In [15]:
# Save df as a CSV file (index column omitted).
# NOTE(review): the file name suggests temperature data for CAISO/NWPP/SOUTHWEST,
# but the active `conditions` filter only restricts the date range — confirm
# which filter produced the frame before relying on this file.
df.to_csv("Temp_CAINWPPSW_10yrs.csv", index=False)

In [9]:
# Summarise the loaded frame: row count, column dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185196783 entries, 0 to 185196782
Data columns (total 7 columns):
 #   Column        Dtype         
---  ------        -----         
 0   datetime      datetime64[ns]
 1   datatype      string        
 2   avgvalue      float64       
 3   siteid        string        
 4   station_name  string        
 5   region        string        
 6   timezone      string        
dtypes: datetime64[ns](1), float64(1), string(5)
memory usage: 9.7 GB
