In [1]:
from rfo_core.aws.iam import get_aws_session
from rfo_core.configuration import (aws_key, aws_secret, aws_region_default)
import awswrangler as wr
import boto3 as bt

In [2]:
def aws_get_session(region: str = aws_region_default) -> bt.Session:
    return get_aws_session(aws_key=aws_key, aws_secret=aws_secret, aws_region=region)
session = aws_get_session(region='us-east-1')

In [4]:
table="rfo_generation_enriched"

In [19]:

# checking all unique names in the table
unique_datatypes = dict()
for things in ['name', 'datatype']:
    sql = f"SELECT DISTINCT {things}, 'CAISO' as iso FROM {table}"
    unique_datatypes[things] = wr.athena.read_sql_query(
        sql=sql,
        database="rfo_analytics",
        boto3_session=session
    )

In [15]:
print(unique_datatypes[unique_datatypes['datatype'].str.contains('EIA930')]['datatype'].to_string(index=False))

  EIA930_NET_GEN_COAL
   EIA930_NET_GEN_NUC
   EIA930_NET_GEN_OIL
EIA930_NET_GENERATION
  EIA930_NET_GEN_WIND
   EIA930_NET_GEN_WAT
   EIA930_NET_GEN_GAS
 EIA930_NET_GEN_OTHER
   EIA930_NET_GEN_SUN


In [26]:
for name in names:
    print(name,name.lower() in unique_datatypes['name']['name'].str.lower().values)

AVRN False
Avista Corporation True
BPA True
City of Tacoma, Department of Public Utilities True
Idaho Power Company True
PUD No. 1 of Douglas County True
PUD No. 2 of Grant County, Washington True
PacifiCorp West True
Portland General Electric Company True
Puget Sound False
Puget Sound Energy, Inc. True
Seattle City Light True


In [17]:
#PRINT unique_datatypes['name'] sorted alphabetically
print(unique_datatypes['name'].sort_values().to_string(index=False))


KeyError: 'name'

In [25]:
#for importing load data for MIDC
names = ('AVRN',
'Avista Corporation',
'BPA',
'City of Tacoma, Department of Public Utilities',
 'Idaho Power Company',
'PUD No. 1 of Douglas County',
 'PUD No. 2 of Grant County, Washington',
'PacifiCorp West',
 'Portland General Electric Company',
'Puget Sound',
 'Puget Sound Energy, Inc.',
'Seattle City Light',
)

In [9]:
conditions='''WHERE region = 'NWPP'
  AND station_name = 'NWPP'
  AND CAST(datetime AS DATE) BETWEEN DATE '2016-01-01' AND DATE '2025-12-31' '''

In [10]:
#To query all columns
# table_info = wr.catalog.table(
#     database="rfo_analytics", 
#     table=table, 
#     boto3_session=session
# )
# columns = table_info['Column Name'].values

#If you know which columns to query
columns = ["datetime","datatype","avgvalue","siteid","station_name","region","timezone"] # for weather
# columns = ["datetime","avgvalue","objectid","name","yes_objectid","iso"]
# Build SELECT clause with automatic datetime casting
select_parts = []
for col_name in columns:
    if col_name.lower() == 'datetime':
        select_parts.append(f"CAST({col_name} AS timestamp) as {col_name}")
    else:
        select_parts.append(col_name)

select_clause = ",\n    ".join(select_parts)

# Build and execute query
sql = f"""SELECT 
    {select_clause}
FROM {table}
{conditions}
"""

df = wr.athena.read_sql_query(
    sql=sql,
    database="rfo_analytics",
    boto3_session=session
)

In [12]:
#df's datatype column has two values: 'temperature' and 'temperature_avg', and the corresponding value is in avgvalue column. I want two separate columns 'temperature' and 'temperature_avg' with their values taken from avgvalue column.
df = df[['datetime','datatype','avgvalue']].pivot(index=['datetime'], columns='datatype', values='avgvalue').reset_index()
df = df.rename_axis(None, axis=1)  # remove the axis name

In [13]:
df.head(20)

Unnamed: 0,datetime,cloudCover,dewpoint,electric_cdd,electric_hdd,gas_cdd,gas_hdd,heatIndex,population_cdd,population_hdd,relativeHumidity,temperature,temperature_avg,temperature_max,temperature_min,weighted_cdd,weighted_hdd,windChill,windDirection,windSpeed
0,2016-01-01 00:00:00,10.0,18.0,0.0,0.0,0.0,0.0,32.0,0.0,0.0,57.0,32.0,27.5,35.0,20.0,0.0,37.6,24.0,81.0,10.0
1,2016-01-01 01:00:00,12.0,18.0,,,,,30.0,,,63.0,30.0,,,,,,23.0,78.0,7.0
2,2016-01-01 02:00:00,4.0,18.0,,,,,29.0,,,66.0,29.0,,,,,,20.0,96.0,10.0
3,2016-01-01 03:00:00,5.0,17.0,,,,,28.0,,,65.0,28.0,,,,,,19.0,97.0,9.0
4,2016-01-01 04:00:00,6.0,17.0,,,,,27.0,,,66.0,27.0,,,,,,19.0,97.0,8.0
5,2016-01-01 05:00:00,8.0,16.0,,,,,26.0,,,68.0,26.0,,,,,,16.0,89.0,10.0
6,2016-01-01 06:00:00,10.0,16.0,,,,,26.0,,,67.0,26.0,,,,,,16.0,84.0,9.0
7,2016-01-01 07:00:00,7.0,16.0,,,,,25.0,,,69.0,25.0,,,,,,16.0,85.0,9.0
8,2016-01-01 08:00:00,5.0,16.0,,,,,23.0,,,72.0,23.0,,,,,,14.0,76.0,8.0
9,2016-01-01 09:00:00,9.0,17.0,,,,,24.0,,,75.0,24.0,,,,,,15.0,78.0,9.0


In [14]:
#save df a csv file
df.to_csv("data/ALLWeathDTypes_NWPP_10yrs.csv", index=False)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1359168 entries, 0 to 1359167
Data columns (total 24 columns):
 #   Column            Non-Null Count    Dtype         
---  ------            --------------    -----         
 0   datetime          1359168 non-null  datetime64[ns]
 1   siteid            1359168 non-null  string        
 2   station_name      1359168 non-null  string        
 3   region            1359168 non-null  string        
 4   timezone          1274220 non-null  string        
 5   cloudCover        1359168 non-null  float64       
 6   dewpoint          1359168 non-null  float64       
 7   electric_cdd      56208 non-null    float64       
 8   electric_hdd      56208 non-null    float64       
 9   gas_cdd           56208 non-null    float64       
 10  gas_hdd           56208 non-null    float64       
 11  heatIndex         1359168 non-null  float64       
 12  population_cdd    56208 non-null    float64       
 13  population_hdd    56208 non-null    float6