In [1]:
from pathlib import Path

import awswrangler as wr
import boto3 as bt

from rfo_core.aws.iam import get_aws_session
from rfo_core.configuration import (aws_key, aws_secret, aws_region_default)

In [2]:
def aws_get_session(region: str = aws_region_default) -> bt.Session:
    """Build an AWS session for ``region`` from the rfo_core configuration values.

    Args:
        region: AWS region name; defaults to ``aws_region_default``.

    Returns:
        Whatever ``get_aws_session`` returns for the configured key/secret and
        the requested region.
    """
    return get_aws_session(
        aws_key=aws_key,
        aws_secret=aws_secret,
        aws_region=region,
    )


# Session object passed to the Athena queries in this notebook.
session = aws_get_session(region='us-east-1')

In [3]:
# Name of the table that the main data query selects FROM.
table = "rfo_weather_hourly_forecasts_enriched"

In [5]:
# List every distinct value of the `region` column in the table.
# The table name comes from `table` so this query stays in sync with the main data query.
sql = f"SELECT DISTINCT region FROM {table}"
# Named `unique_regions` (not `unique_datatypes`) so the frame's name matches its contents.
unique_regions = wr.athena.read_sql_query(
    sql=sql,
    database="rfo_analytics",
    boto3_session=session
)
print(unique_regions['region'].tolist())

['EAST', 'SOUTHEAST', 'CONSUM. WEST', 'SOUTH CENTRAL', 'MIDWEST', 'CAISO', 'NWPP', 'IESO', 'PACIFIC', 'AESO', 'CONSUM. EAST', 'NYISO', 'MISO', 'OIL/GAS FIELDS', 'PRODUCING', 'SPP', 'NEISO', 'PJM', 'SOUTHWEST', 'ERCOT', 'MOUNTAIN']


In [6]:
# List every distinct value of the `datatype` column in the table.
# The table name comes from `table` so this query stays in sync with the main data query.
sql = f"SELECT DISTINCT datatype FROM {table}"
unique_datatypes = wr.athena.read_sql_query(
    sql=sql,
    database="rfo_analytics",
    boto3_session=session
)
# Bare last expression: the notebook displays the list as the cell output.
unique_datatypes['datatype'].tolist()

['feelsliketemp',
 'winddir',
 'tempdiff',
 'feelsliketempdiff',
 'tempnormal',
 'temp',
 'ghirradiance',
 'windspeed_mph',
 'dewpoint',
 'cloud_cover',
 'precip']

In [7]:
# List every distinct station name in the table for the 'NWPP' region.
# The table name comes from `table` so this query stays in sync with the main data query.
sql = f"SELECT DISTINCT station_name FROM {table} WHERE region = 'NWPP'"
# Named `unique_stations` (not `unique_datatypes`) so the frame's name matches its contents.
unique_stations = wr.athena.read_sql_query(
    sql=sql,
    database="rfo_analytics",
    boto3_session=session
)
# DISTINCT already de-duplicates, so every count printed by value_counts() is 1;
# the output is effectively one station name per row.
print(unique_stations['station_name'].value_counts())

station_name
Medford           1
Reno              1
Vancouver         1
Boise             1
Salt Lake City    1
Great Falls       1
Yakima            1
NWPP              1
Spokane           1
Eugene            1
Portland          1
Seattle           1
Pendleton         1
Tacoma            1
Klamath Falls     1
Billings          1
Name: count, dtype: Int64


In [10]:
# List every distinct forecast model name in the table for the 'NWPP' region.
# The table name comes from `table` so this query stays in sync with the main data query.
sql = f"SELECT DISTINCT model_name FROM {table} WHERE region = 'NWPP'"
# Named `unique_models` (not `unique_datatypes`) so the frame's name matches its contents.
unique_models = wr.athena.read_sql_query(
    sql=sql,
    database="rfo_analytics",
    boto3_session=session
)
# DISTINCT already de-duplicates, so every count printed by value_counts() is 1;
# the output is effectively one model name per row.
print(unique_models['model_name'].value_counts())

model_name
WSI    1
Name: count, dtype: Int64


In [8]:
# WHERE clause appended to the main data query: the 'NWPP' region, the station that is
# itself named 'NWPP', and rows whose date falls between 2016-01-01 and 2025-12-31
# (BETWEEN is inclusive on both ends).
conditions = (
    "WHERE region = 'NWPP'\n"
    "  AND station_name = 'NWPP'\n"
    "  AND CAST(datetime AS DATE) BETWEEN DATE '2016-01-01' AND DATE '2025-12-31' "
)

In [11]:
# Option A - query every column: read the column names from the catalog instead.
# table_info = wr.catalog.table(
#     database="rfo_analytics",
#     table=table,
#     boto3_session=session
# )
# columns = table_info['Column Name'].values

# Option B - query a known subset of columns.
columns = ["datetime", "datatype", "avgvalue", "model_date", "model_hour", "timezone"]  # for weather
# columns = ["datetime","avgvalue","objectid","name","yes_objectid","iso"]

# SELECT expressions: `datetime` is cast to timestamp (keeping its name),
# every other column is selected unchanged.
select_parts = [
    f"CAST({name} AS timestamp) as {name}" if name.lower() == 'datetime' else name
    for name in columns
]
select_clause = ",\n    ".join(select_parts)

# Assemble the statement: SELECT list, source table, then the WHERE clause held in `conditions`.
sql = f"SELECT \n    {select_clause}\nFROM {table}\n{conditions}\n"

# Run the query on Athena and load the result into a DataFrame.
df = wr.athena.read_sql_query(
    sql=sql,
    boto3_session=session,
    database="rfo_analytics",
)

In [14]:
# Preview the first five rows of the query result.
df.head(n=5)

Unnamed: 0,datetime,datatype,avgvalue,model_date,model_hour,timezone
0,2025-08-26 14:00:00,ghirradiance,,2025-08-20,18,
1,2025-08-26 15:00:00,ghirradiance,,2025-08-20,18,
2,2025-08-26 16:00:00,ghirradiance,,2025-08-20,18,
3,2025-08-26 17:00:00,ghirradiance,,2025-08-20,18,
4,2025-08-26 18:00:00,ghirradiance,,2025-08-20,18,


In [16]:
# Reshape df from long to wide: each distinct value of `datatype` becomes its own column,
# filled from `avgvalue`, with one row per (datetime, model_date, model_hour).
# Only the five columns listed below are used; any other column of df is dropped.
# DataFrame.pivot raises ValueError if the same (datetime, model_date, model_hour, datatype)
# combination occurs more than once.
# The guard makes the cell safe to re-run: after the first run `datatype` is no longer a
# column, so pivoting again would raise KeyError.
if 'datatype' in df.columns:
    df = (
        df[['datetime', 'model_date', 'model_hour', 'datatype', 'avgvalue']]
        .pivot(index=['datetime', 'model_date', 'model_hour'], columns='datatype', values='avgvalue')
        .reset_index()
        .rename_axis(None, axis=1)  # remove the leftover 'datatype' name from the columns axis
    )

In [17]:
# Preview the first 20 rows of df.
df.head(n=20)

Unnamed: 0,datetime,model_date,model_hour,cloud_cover,dewpoint,feelsliketemp,feelsliketempdiff,ghirradiance,precip,temp,tempdiff,tempnormal,winddir,windspeed_mph
0,2025-08-20 07:00:00,2025-08-20,18,40.6,50.72,67.88,-0.85,,,68.15,-0.58,63.99,,5.6
1,2025-08-20 08:00:00,2025-08-20,18,34.35,51.46,67.47,0.56,,,67.47,0.56,61.4,,6.4
2,2025-08-20 09:00:00,2025-08-20,18,37.16,51.42,66.18,0.76,,,66.22,0.81,59.42,,5.8
3,2025-08-20 10:00:00,2025-08-20,18,36.41,51.02,65.13,1.07,,,65.13,1.07,58.74,,5.8
4,2025-08-20 11:00:00,2025-08-20,18,42.06,50.39,64.12,1.17,,,64.12,1.17,58.36,,6.0
5,2025-08-20 12:00:00,2025-08-20,18,54.7,49.79,63.41,1.11,,,63.41,1.11,58.09,,5.0
6,2025-08-20 13:00:00,2025-08-20,18,58.84,48.79,62.92,1.38,,,62.92,1.38,57.75,,6.4
7,2025-08-20 14:00:00,2025-08-20,18,46.18,49.08,62.73,0.99,,,62.73,0.99,58.89,,6.6
8,2025-08-20 15:00:00,2025-08-20,18,52.26,49.4,64.55,0.52,,,64.55,0.52,61.48,,6.8
9,2025-08-20 16:00:00,2025-08-20,18,51.55,49.52,66.47,0.19,,,66.7,0.08,65.37,,6.8


In [18]:
# Save df as a CSV file (without the index column).
# The parent directory is created first so the write does not fail with OSError
# when `data/` does not exist yet.
output_path = Path("data/ALLWeathForecastDTypes_NWPP_10yrs.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

In [8]:
# Print df's column names, non-null counts and dtypes.
# NOTE(review): the saved output of this cell lists 24 columns (siteid, station_name,
# region, ...) and the cell is numbered In [8], which does not match the pivoted df built
# earlier in this notebook. The output looks stale; re-run top to bottom to confirm.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1359168 entries, 0 to 1359167
Data columns (total 24 columns):
 #   Column            Non-Null Count    Dtype         
---  ------            --------------    -----         
 0   datetime          1359168 non-null  datetime64[ns]
 1   siteid            1359168 non-null  string        
 2   station_name      1359168 non-null  string        
 3   region            1359168 non-null  string        
 4   timezone          1274220 non-null  string        
 5   cloudCover        1359168 non-null  float64       
 6   dewpoint          1359168 non-null  float64       
 7   electric_cdd      56208 non-null    float64       
 8   electric_hdd      56208 non-null    float64       
 9   gas_cdd           56208 non-null    float64       
 10  gas_hdd           56208 non-null    float64       
 11  heatIndex         1359168 non-null  float64       
 12  population_cdd    56208 non-null    float64       
 13  population_hdd    56208 non-null    float6