In [1]:
import awswrangler as wr
import boto3 as bt
import pandas as pd

from rfo_core.aws.iam import get_aws_session
from rfo_core.configuration import (aws_key, aws_secret, aws_region_default)

In [2]:
def aws_get_session(region: str = aws_region_default) -> bt.Session:
    """Build an AWS session for ``region`` from the configured rfo_core credentials.

    Args:
        region: AWS region name; defaults to ``aws_region_default`` from
            ``rfo_core.configuration``.

    Returns:
        The object returned by ``get_aws_session`` (annotated as a boto3 ``Session``).
    """
    credentials = {"aws_key": aws_key, "aws_secret": aws_secret}
    return get_aws_session(aws_region=region, **credentials)


# Session reused by every Athena query in this notebook; pinned to us-east-1.
session = aws_get_session('us-east-1')

In [3]:
# Name of the Athena table that the query-building cell interpolates into its FROM clause.
table = "rfo_weather_enriched"

In [4]:
# List every distinct region present in the weather table.
# The result gets its own name (`unique_regions`) instead of reusing the
# copy-pasted `unique_datatypes`, which the datatype cell also assigns.
# NOTE(review): the recorded run of this cell was interrupted; a DISTINCT with
# no WHERE clause presumably scans the whole table -- confirm before re-running.
sql = "SELECT DISTINCT region FROM rfo_weather_enriched"
unique_regions = wr.athena.read_sql_query(
    sql=sql,
    database="rfo_analytics",
    boto3_session=session
)
print(unique_regions['region'].tolist())

KeyboardInterrupt: 

In [4]:
# Collect the distinct values of the `datatype` column and show them as a list.
sql = "SELECT DISTINCT datatype FROM rfo_weather_enriched"
unique_datatypes = wr.athena.read_sql_query(
    boto3_session=session,
    database="rfo_analytics",
    sql=sql,
)
unique_datatypes['datatype'].to_list()

['temperature_avg',
 'cloudCover',
 'temperature_max',
 'windChill',
 'electric_cdd',
 'population_hdd',
 'windDirection',
 'weighted_hdd',
 'gas_hdd',
 'relativeHumidity',
 'temperature_min',
 'gas_cdd',
 'weighted_cdd',
 'temperature',
 'electric_hdd',
 'heatIndex',
 'dewpoint',
 'windSpeed',
 'population_cdd']

In [26]:
# Fetch station_name / avgvalue / date for every non-null 'gas_cdd' row
# whose datetime falls on 2021-09-01.
sql = (
    "SELECT station_name, avgvalue, CAST(datetime AS DATE) AS date "
    "FROM rfo_weather_enriched "
    "WHERE datatype = 'gas_cdd' "
    "AND avgvalue IS NOT NULL "
    "AND CAST(datetime AS DATE) = DATE '2021-09-01'"
)
random_entries = wr.athena.read_sql_query(
    boto3_session=session,
    database="rfo_analytics",
    sql=sql,
)
print(random_entries)

     station_name  avgvalue        date
0             JFK       0.0  2021-09-01
1    Central Park       0.0  2021-09-01
2      Charleston       0.0  2021-09-01
3       Vancouver       0.0  2021-09-01
4     Parkersburg       0.0  2021-09-01
..            ...       ...         ...
255  Williamsport       0.0  2021-09-01
256    Burlington       0.0  2021-09-01
257        Dayton       0.0  2021-09-01
258        Peoria       0.0  2021-09-01
259   Houston IAH       0.0  2021-09-01

[260 rows x 3 columns]


In [29]:
# Print the complete query result (row/column truncation disabled for this
# block only), ordered by avgvalue with the largest values first.
# Relies on pandas being imported as `pd` in the notebook's import cell;
# without that import this cell raises NameError on a fresh kernel.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(random_entries.sort_values(by='avgvalue', ascending=False))

          station_name  avgvalue        date
75                <NA>       6.9  2021-09-01
158              CONUS       6.9  2021-09-01
242       CONSUM. EAST       3.1  2021-09-01
231          PRODUCING       2.4  2021-09-01
7        SOUTH CENTRAL       2.3  2021-09-01
173               EAST       1.8  2021-09-01
45        CONSUM. WEST       1.5  2021-09-01
113            MIDWEST       1.3  2021-09-01
105            PACIFIC       0.9  2021-09-01
89            MOUNTAIN       0.6  2021-09-01
243            Medford       0.0  2021-09-01
2           Charleston       0.0  2021-09-01
229          Milwaukee       0.0  2021-09-01
230          San Diego       0.0  2021-09-01
232           Columbia       0.0  2021-09-01
233          Rochester       0.0  2021-09-01
234          Jonesboro       0.0  2021-09-01
235             Toledo       0.0  2021-09-01
236          Flagstaff       0.0  2021-09-01
237           Stockton       0.0  2021-09-01
238          Lexington       0.0  2021-09-01
239       

In [34]:
# Fetch station_name / avgvalue / date for every non-null 'population_cdd' row
# whose datetime falls on 2021-09-01.
sql = (
    "SELECT station_name, avgvalue, CAST(datetime AS DATE) AS date "
    "FROM rfo_weather_enriched "
    "WHERE datatype = 'population_cdd' "
    "AND avgvalue IS NOT NULL "
    "AND CAST(datetime AS DATE) = DATE '2021-09-01'"
)
random_entries = wr.athena.read_sql_query(
    boto3_session=session,
    database="rfo_analytics",
    sql=sql,
)
print(random_entries)

     station_name  avgvalue        date
0    Williamsport       0.0  2021-09-01
1            MISO       0.0  2021-09-01
2         Buffalo       0.0  2021-09-01
3         Sudbury       0.0  2021-09-01
4       St. Louis       0.0  2021-09-01
..            ...       ...         ...
255      Syracuse       0.0  2021-09-01
256       Decatur       0.0  2021-09-01
257   New Orleans       0.0  2021-09-01
258      Scranton       0.0  2021-09-01
259          AESO       0.0  2021-09-01

[260 rows x 3 columns]


In [35]:
# Print the complete query result (row/column truncation disabled for this
# block only), ordered by avgvalue with the largest values first.
# Relies on pandas being imported as `pd` in the notebook's import cell;
# without that import this cell raises NameError on a fresh kernel.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(random_entries.sort_values(by='avgvalue', ascending=False))

          station_name  avgvalue        date
183              CONUS       8.6  2021-09-01
52                <NA>       8.6  2021-09-01
127       CONSUM. EAST       4.1  2021-09-01
28           PRODUCING       3.1  2021-09-01
50       SOUTH CENTRAL       3.1  2021-09-01
21                EAST       3.0  2021-09-01
244       CONSUM. WEST       1.3  2021-09-01
76             MIDWEST       1.1  2021-09-01
80             PACIFIC       0.8  2021-09-01
237           MOUNTAIN       0.6  2021-09-01
228            Roanoke       0.0  2021-09-01
19          Wilmington       0.0  2021-09-01
242         Des Moines       0.0  2021-09-01
241         Charleston       0.0  2021-09-01
239               Reno       0.0  2021-09-01
240        Thunder Bay       0.0  2021-09-01
238        Springfield       0.0  2021-09-01
236           Edmonton       0.0  2021-09-01
1                 MISO       0.0  2021-09-01
3              Sudbury       0.0  2021-09-01
233          Riverside       0.0  2021-09-01
232       

In [10]:
# List the distinct station names recorded for region 'NWPP'.
# (The previous comment said "CAISO", but the query filters on NWPP.)
sql = "SELECT DISTINCT station_name FROM rfo_weather_enriched WHERE region = 'NWPP'"
nwpp_stations = wr.athena.read_sql_query(
    sql=sql,
    database="rfo_analytics",
    boto3_session=session
)
# The query already returns DISTINCT names, so every count printed here is 1;
# value_counts() is kept only to preserve the listing format of the recorded output.
print(nwpp_stations['station_name'].value_counts())

station_name
Tacoma            1
Boise             1
Eugene            1
Portland          1
NWPP              1
Vancouver         1
Yakima            1
Pendleton         1
Billings          1
Medford           1
Reno              1
Salt Lake City    1
Great Falls       1
Klamath Falls     1
Spokane           1
Seattle           1
Name: count, dtype: Int64


In [4]:
# WHERE clause appended verbatim to the generated SELECT statement:
# region NWPP, dates 2021-09-01 through 2025-12-31 inclusive (BETWEEN).
conditions = (
    "WHERE region = 'NWPP'\n"
    "  AND CAST(datetime AS DATE) BETWEEN DATE '2021-09-01' AND DATE '2025-12-31' "
)

In [5]:
# Option A -- query every column: read the column names from the catalog.
# table_info = wr.catalog.table(
#     database="rfo_analytics",
#     table=table,
#     boto3_session=session
# )
# columns = table_info['Column Name'].values

# Option B -- query an explicit subset of columns.
columns = ["datetime","datatype","avgvalue","siteid","station_name","timezone"] # for weather
# columns = ["datetime","avgvalue","objectid","name","yes_objectid","iso"]

# SELECT list: every column is passed through unchanged, except a column
# named "datetime" (case-insensitive), which is cast to timestamp and aliased
# back to its own name.
select_parts = []
for col_name in columns:
    select_parts.append(
        f"CAST({col_name} AS timestamp) as {col_name}"
        if col_name.lower() == 'datetime'
        else col_name
    )

select_clause = ",\n    ".join(select_parts)

# Assemble the statement line by line; the trailing "" yields the final newline.
sql = "\n".join([
    "SELECT ",
    f"    {select_clause}",
    f"FROM {table}",
    conditions,
    "",
])

# Run the query on Athena and load the result into a DataFrame.
df = wr.athena.read_sql_query(
    boto3_session=session,
    database="rfo_analytics",
    sql=sql,
)

In [6]:
# Map station_name -> timezone from the distinct (station_name, timezone) pairs in df.
# NOTE(review): this assumes each station has exactly one timezone value. If a
# station appears with several distinct values (a null counts as one), the
# last pair silently wins -- confirm against the data.
timezone_dict = (
    df.loc[:, ['station_name', 'timezone']]
    .drop_duplicates()
    .set_index('station_name')['timezone']
    .to_dict()
)

In [16]:
# Persist the station -> timezone mapping with joblib; the call's return value
# (the list of written file names) is what the cell displays.
import joblib
joblib.dump(value=timezone_dict, filename='../data/timezone_dict.pkl')


['../data/timezone_dict.pkl']

In [None]:
import os
import sys
from datetime import datetime

import pytz

# Put the parent directory on sys.path so the local `helpers` module can be imported.
sys.path.append(os.path.abspath(".."))
from helpers import utc_to_local

# Quick check of utc_to_local using an entry from timezone_dict: take the
# current UTC time and pass it along with the timezone stored for 'Vancouver'.
# NOTE(review): presumably the helper returns the time in that zone -- confirm in helpers.
test_time = datetime.now(tz=pytz.UTC)
local_time = utc_to_local(test_time, timezone_dict['Vancouver'])

In [None]:
# df arrives in long format: `datatype` names the measurement (e.g. 'temperature',
# 'temperature_avg') and `avgvalue` holds its reading. Reshape to wide format:
# one row per (station_name, datetime) and one column per distinct datatype value.
#
# The guard makes the cell safe to re-run: the pivot overwrites `df`, so a second
# run on the already-wide frame would otherwise raise KeyError on 'datatype'.
# NOTE(review): DataFrame.pivot raises ValueError if a (station_name, datetime,
# datatype) combination occurs more than once -- confirm the source rows are unique.
if 'datatype' in df.columns:
    df = (
        df[['station_name', 'datetime', 'datatype', 'avgvalue']]
        .pivot(index=['station_name', 'datetime'], columns='datatype', values='avgvalue')
        .reset_index()
        .rename_axis(None, axis=1)  # drop the leftover "datatype" label on the columns axis
    )

In [8]:
# Show the latest timestamp present in df's `datetime` column.
df['datetime'].max()

Timestamp('2025-09-22 11:00:00')

In [10]:
# Write df to a CSV file under ../data, leaving out the index column.
df.to_csv(path_or_buf="../data/ALLWeath_regNWPP_2021-2025.csv", index=False)

In [8]:
# Print df's column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1359168 entries, 0 to 1359167
Data columns (total 24 columns):
 #   Column            Non-Null Count    Dtype         
---  ------            --------------    -----         
 0   datetime          1359168 non-null  datetime64[ns]
 1   siteid            1359168 non-null  string        
 2   station_name      1359168 non-null  string        
 3   region            1359168 non-null  string        
 4   timezone          1274220 non-null  string        
 5   cloudCover        1359168 non-null  float64       
 6   dewpoint          1359168 non-null  float64       
 7   electric_cdd      56208 non-null    float64       
 8   electric_hdd      56208 non-null    float64       
 9   gas_cdd           56208 non-null    float64       
 10  gas_hdd           56208 non-null    float64       
 11  heatIndex         1359168 non-null  float64       
 12  population_cdd    56208 non-null    float64       
 13  population_hdd    56208 non-null    float6