In [1]:
from rfo_core.aws.iam import get_aws_session
from rfo_core.configuration import (aws_key, aws_secret, aws_region_default)
import awswrangler as wr
import boto3 as bt

In [2]:
def aws_get_session(region: str = aws_region_default) -> bt.Session:
    """Return the session that ``get_aws_session`` creates for ``region``.

    The key and secret are the ``aws_key`` / ``aws_secret`` values imported
    from ``rfo_core.configuration``; only the region can be overridden.
    """
    credentials = {"aws_key": aws_key, "aws_secret": aws_secret}
    return get_aws_session(aws_region=region, **credentials)


# Session handed to the Athena queries in the cells below.
session = aws_get_session(region='us-east-1')

In [5]:
# Athena database and table that the queries in this notebook read from.
db = "rfo_analytics"
table = "rfo_weather_hourly_forecasts_enriched"

In [6]:
# Look up the most recent `model_date` present in the table.
# The result is kept in its own variable rather than `df`, so re-running this
# cell never overwrites the main dataset frame that other cells keep in `df`.
latest_model_date_df = wr.athena.read_sql_query(
    f"SELECT DISTINCT model_date FROM {table} ORDER BY model_date DESC LIMIT 1",
    database=db,
    boto3_session=session,
)
latest_model_date = latest_model_date_df['model_date'].iloc[0]
print(f"Latest model_date in {table}: {latest_model_date}")

Latest model_date in rfo_weather_hourly_forecasts_enriched: 2025-09-23


In [None]:
# Collect the distinct values of each descriptive column in `table`,
# one result frame per column, keyed by the column name.
# Every row is tagged with the literal 'CAISO' in an `iso` column; the query
# itself applies no region/ISO filter.
unique_datatypes = {}
for column in ('model_name', 'datatype'):
    sql = f"SELECT DISTINCT {column}, 'CAISO' as iso FROM {table}"
    unique_datatypes[column] = wr.athena.read_sql_query(
        sql=sql, database=db, boto3_session=session
    )

KeyboardInterrupt: 

In [8]:
# Distinct `model_name` / `datatype` values in the
# `rfo_weather_forecasts_enriched` table (note: NOT the hourly table held in
# `table`), one result frame per column, keyed by the column name.
# NOTE(review): 'CAISO' is a constant label added to every row; the query has
# no WHERE clause, so nothing restricts the rows to CAISO — confirm intended.
forecast_table = "rfo_weather_forecasts_enriched"
unique_datatypes = {}
for column in ('model_name', 'datatype'):
    sql = f"SELECT DISTINCT {column}, 'CAISO' as iso FROM {forecast_table}"
    unique_datatypes[column] = wr.athena.read_sql_query(
        sql=sql,
        database=db,  # shared database name instead of a second hard-coded copy
        boto3_session=session,
    )


In [9]:
# Show the distinct model names returned by the distinct-value query.
unique_datatypes['model_name']

Unnamed: 0,model_name,iso
0,ECM Op,CAISO
1,ECM Ens MOS,CAISO
2,WSI - Prev. Evening,CAISO
3,GFS Op,CAISO
4,GEM Ens Mean,CAISO
5,GFS Ens Mean,CAISO
6,WSI - Morning,CAISO
7,WSI,CAISO
8,ECM Ens Mean,CAISO
9,GEM Op,CAISO


In [10]:
# Show the distinct datatype values returned by the distinct-value query.
unique_datatypes['datatype']

Unnamed: 0,datatype,iso
0,min_temp,CAISO
1,max_temp,CAISO


In [7]:
# Distinct station names in the hourly table for region 'NWPP'.
# Stored under its own name instead of `unique_datatypes`, so the dict of
# distinct model/datatype frames is not replaced by this single frame.
sql = f"SELECT DISTINCT station_name FROM {table} WHERE region = 'NWPP'"
nwpp_station_names = wr.athena.read_sql_query(
    sql=sql,
    database=db,
    boto3_session=session,
)
# Each name appears once because of DISTINCT, so every count printed here is 1.
print(nwpp_station_names['station_name'].value_counts())

station_name
Medford           1
Reno              1
Vancouver         1
Boise             1
Salt Lake City    1
Great Falls       1
Yakima            1
NWPP              1
Spokane           1
Eugene            1
Portland          1
Seattle           1
Pendleton         1
Tacoma            1
Klamath Falls     1
Billings          1
Name: count, dtype: Int64


In [10]:
# Distinct model names in the hourly table for region 'NWPP'.
# Stored under its own name instead of `unique_datatypes`, so the dict of
# distinct model/datatype frames is not replaced by this single frame.
sql = f"SELECT DISTINCT model_name FROM {table} WHERE region = 'NWPP'"
nwpp_model_names = wr.athena.read_sql_query(
    sql=sql,
    database=db,
    boto3_session=session,
)
# Each name appears once because of DISTINCT, so every count printed here is 1.
print(nwpp_model_names['model_name'].value_counts())

model_name
WSI    1
Name: count, dtype: Int64


In [4]:
# Filter parameters for the main query; edit these instead of the SQL text.
query_region = 'NWPP'
query_start_date = '2025-01-01'  # lower bound of BETWEEN, YYYY-MM-DD
query_end_date = '2025-12-31'    # upper bound of BETWEEN, YYYY-MM-DD

# WHERE clause that the main query places after its FROM clause.
conditions = f'''WHERE region = '{query_region}'
  AND CAST(datetime AS DATE) BETWEEN DATE '{query_start_date}' AND DATE '{query_end_date}' '''

In [5]:
# To query all columns, read the column list from the catalog instead:
# table_info = wr.catalog.table(
#     database=db,
#     table=table,
#     boto3_session=session
# )
# columns = table_info['Column Name'].values

# If you know which columns to query
columns = ["datetime","datatype","avgvalue","model_date","model_hour","station_name","timezone"] # for weather
# columns = ["datetime","avgvalue","objectid","name","yes_objectid","iso"]

# Build the SELECT list: the `datetime` column (matched case-insensitively) is
# cast to timestamp, every other column is selected as-is.
select_parts = [
    f"CAST({col_name} AS timestamp) as {col_name}" if col_name.lower() == 'datetime' else col_name
    for col_name in columns
]
select_clause = ",\n    ".join(select_parts)

# Assemble the statement: SELECT list, source table, then the WHERE clause
# held in `conditions`.
sql = f"""SELECT 
    {select_clause}
FROM {table}
{conditions}
"""

# Run the query; `db` is the shared database name used by the other cells.
df = wr.athena.read_sql_query(
    sql=sql,
    database=db,
    boto3_session=session,
)

In [7]:
# Preview the first rows of the query result.
df.head()

Unnamed: 0,datetime,datatype,avgvalue,model_date,model_hour,station_name,timezone
0,2025-09-05 09:00:00,cloud_cover,0.0,2025-09-05,7,Boise,America/Boise
1,2025-09-05 11:00:00,cloud_cover,30.0,2025-09-05,7,Boise,America/Boise
2,2025-09-05 12:00:00,cloud_cover,34.0,2025-09-05,7,Boise,America/Boise
3,2025-09-05 14:00:00,cloud_cover,7.0,2025-09-05,7,Boise,America/Boise
4,2025-09-05 19:00:00,cloud_cover,5.0,2025-09-05,7,Boise,America/Boise


In [6]:
# Reshape from long to wide: each distinct `datatype` value becomes its own
# column holding the matching `avgvalue`, with one row per
# (datetime, model_date, model_hour, station_name).
# The guard makes the cell safe to re-run: once pivoted, `df` no longer has the
# `datatype` / `avgvalue` columns and pivoting again would raise a KeyError.
key_columns = ['datetime', 'model_date', 'model_hour', 'station_name']
if {'datatype', 'avgvalue'}.issubset(df.columns):
    df = (
        df[key_columns + ['datatype', 'avgvalue']]
        .pivot(index=key_columns, columns='datatype', values='avgvalue')
        .reset_index()
        .rename_axis(None, axis=1)  # remove the leftover 'datatype' axis name
    )

In [14]:
import pandas as pd

In [10]:
# Earliest and latest `datetime` values in `df`.
df['datetime'].min(), df['datetime'].max() 

(Timestamp('2025-08-20 06:00:00'), Timestamp('2025-09-21 06:00:00'))

In [17]:
# Spot check: Portland rows with model_date 2025-09-04 and model_hour 11
# whose `datetime` falls on 2025-09-10 (first 20 matches).
model_run_day = pd.to_datetime('9/4/2025').date()
target_day = pd.to_datetime('9/10/2025').date()

is_model_run = df['model_date'] == model_run_day
is_target_day = df['datetime'].dt.date == target_day
is_portland = df['station_name'] == 'Portland'
is_hour_11 = df['model_hour'] == 11

df[is_model_run & is_target_day & is_portland & is_hour_11].head(20)

Unnamed: 0,datetime,model_date,model_hour,station_name,cloud_cover,dewpoint,feelsliketemp,feelsliketempdiff,ghirradiance,precip,temp,tempdiff,tempnormal,winddir,windspeed_mph


In [11]:
# Save `df` as a CSV file (row index omitted).
# NOTE(review): the file name says 22Aug25-20Sep25; confirm it matches the
# date range actually held in `df` before sharing the file.
output_csv = "../data/WFC_RNWPP_22Aug25-20Sep25.csv"
df.to_csv(output_csv, index=False)

In [7]:
# Preview the first rows of the reshaped (wide) frame.
df.head()

Unnamed: 0,datetime,model_date,model_hour,station_name,cloud_cover,dewpoint,feelsliketemp,feelsliketempdiff,ghirradiance,precip,temp,tempdiff,tempnormal,winddir,windspeed_mph
0,2025-08-20 06:00:00,2025-08-20,18,Billings,0.0,37.04,81.7,3.98,0.0,0.0,84.02,6.3,63.68,110.0,8.1
1,2025-08-20 06:00:00,2025-08-20,18,Boise,0.0,48.92,77.0,-1.08,0.0,0.0,77.0,-1.08,67.43,120.0,3.4
2,2025-08-20 06:00:00,2025-08-20,18,Great Falls,0.0,51.98,71.06,-0.18,0.0,0.0,71.06,-0.18,57.73,0.0,15.0
3,2025-08-20 06:00:00,2025-08-20,18,Salt Lake City,25.0,35.96,78.07,-1.09,0.0,0.0,80.06,0.9,72.05,170.0,4.7
4,2025-08-20 07:00:00,2025-08-20,18,Billings,0.0,37.94,77.0,2.16,0.0,0.0,77.0,2.16,60.74,350.0,3.4
