In [20]:
#papermill_description=imports

import json
import os
import logging
import sys
import geopandas as gpd
from io import StringIO
import pandas as pd
from datetime import datetime
from gis_utils.dataframe import get_bbox_from_geodf
from aws_utils import S3Utils
import calendar
import time
from gis_utils.meteo import OpenMeteoAPI, convert_epoch_to_timezone, map_months_to_numbers
import pytz
from datetime import datetime, timedelta

# Configure logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL).
# basicConfig attaches a stdout handler to the root logger, so records from
# imported libraries at INFO and above show up in the cell output as well.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# Module-level logger for messages emitted by this notebook.
logger = logging.getLogger(__name__)

In [21]:
#papermill_description=parameters

# Prefix used in the generated CSV file names.
model = "weather"
# Years and month names expanded into per-month date ranges by get_date_ranges().
years = [2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
# Second component of the generated CSV file names.
propertyName = "test"
# NOTE(review): output_type is not read by any cell visible in this notebook.
output_type = "weather"

# Timezone name passed to the forecast request.
timezone = "Australia/Sydney"

# Toggles: fetch the per-month historical archive and/or the daily forecast.
historical = False
forecast = True

# Identifiers written as columns into every output CSV.
workspaceId = "018f9876-b3d7-73aa-97e6-0cf7a874383d"
propertyId = "018f99ea-564e-72fa-a4b7-2dbd24b65c3e"
# When True, outputs go to /workspace/notebook_outputs instead of /tmp.
local=True


In [22]:
def process_weather_data(weather_data, variable_order, utc=True):
    """
    Flatten one section of a weather response into a DataFrame.

    Parameters:
        weather_data: object exposing ``Variables(i)``, ``Time()``,
            ``TimeEnd()`` and ``Interval()`` (e.g. the result of
            ``response.Daily()`` or ``response.Hourly()``).
        variable_order: variable names in the same order they were requested;
            position ``i`` in this list is read from ``Variables(i)``.
        utc: forwarded to ``pd.to_datetime``; True yields UTC-aware timestamps.

    Returns:
        A DataFrame with a ``date`` column followed by one column per variable.
    """
    # sunrise / sunset are read through the int64 accessor; every other
    # variable goes through the default accessor.
    int64_variables = ['sunrise', 'sunset']

    columns = {}
    for position, variable_name in enumerate(variable_order):
        variable = weather_data.Variables(position)
        if variable_name in int64_variables:
            columns[variable_name] = variable.ValuesInt64AsNumpy()
        else:
            columns[variable_name] = variable.ValuesAsNumpy()

    # Timestamps are intentionally not converted to a local timezone here:
    # doing so was observed to extend the last date for some months.
    first_timestamp = pd.to_datetime(weather_data.Time(), unit='s', utc=utc)
    last_timestamp = pd.to_datetime(weather_data.TimeEnd(), unit='s', utc=utc)
    step = pd.Timedelta(seconds=weather_data.Interval())

    # Left-inclusive: the end timestamp itself is not part of the series.
    timestamps = pd.date_range(
        start=first_timestamp,
        end=last_timestamp,
        freq=step,
        inclusive="left",
    )

    return pd.DataFrame({"date": timestamps, **columns})


def get_month_date_range(year, month, month_numbers, today=None, lag_days=2):
    """
    Return the ``YYYY-MM-DD`` start and end date strings for one calendar month.

    The end date is capped at ``today - lag_days`` so that a request never
    extends past the most recent date assumed to be available. The cap is
    applied uniformly, including when the month ends today or yesterday.

    Parameters:
        year: calendar year as an int.
        month: month name; looked up case-insensitively in ``month_numbers``.
        month_numbers: mapping of lowercase month name to month number (1-12).
        today: optional ``date``/``datetime`` to use as "today"; defaults to
            the current local date. Mainly useful for deterministic runs.
        lag_days: number of days before ``today`` that is treated as the
            latest usable date. Defaults to 2 — presumably the publication
            delay of the archive data source; confirm against its docs.

    Returns:
        ``(datetime_from, datetime_end)`` as strings. When the cap falls
        before the first of the month (e.g. the current month on its first or
        second day), ``datetime_end`` is earlier than ``datetime_from``;
        callers should treat such a range as empty.

    Raises:
        KeyError: if ``month`` is not present in ``month_numbers``.
    """
    month_number = month_numbers[month.lower()]  # lowercase lookup avoids case sensitivity issues
    last_day = calendar.monthrange(year, month_number)[1]  # number of days in the month
    datetime_from = f"{year}-{month_number:02d}-01"
    datetime_end = f"{year}-{month_number:02d}-{last_day:02d}"

    if today is None:
        today = datetime.today().date()
    elif isinstance(today, datetime):
        # Normalise to a plain date so the comparison below is date-to-date.
        today = today.date()
    latest_available = today - timedelta(days=lag_days)

    month_end = datetime.strptime(datetime_end, '%Y-%m-%d').date()
    if month_end > latest_available:
        datetime_end = latest_available.strftime('%Y-%m-%d')

    return datetime_from, datetime_end

def get_date_ranges(years, months):
    """
    Compute the (start, end) date strings for each month of each given year.

    Months and years that lie after the current date are skipped. Skipping is
    done per entry (``continue``), so the result does not depend on ``months``
    being listed in calendar order. Ranges whose end date precedes the start
    date are dropped as well; ``get_month_date_range`` can return such a
    range for the current month when its capped end date falls before the
    first of the month.

    Parameters:
        years: iterable of int years.
        months: iterable of month names; passed to ``map_months_to_numbers``
            and looked up in its result by lowercase name.

    Returns:
        dict mapping ``"<year> <month>"`` to ``(datetime_from, datetime_end)``.
    """
    month_numbers = map_months_to_numbers(months)
    today = datetime.today().date()  # read once so year/month are consistent
    date_ranges = {}
    for year in years:
        if year > today.year:
            continue  # nothing to fetch for a year that has not started
        for month in months:
            month_number = month_numbers[month.lower()]
            if year == today.year and month_number > today.month:
                continue  # future month of the current year
            datetime_from, datetime_end = get_month_date_range(year, month, month_numbers)
            # ISO YYYY-MM-DD strings order the same way as the dates they encode.
            if datetime_end < datetime_from:
                continue  # no usable day in this month yet
            date_ranges[f"{year} {month}"] = (datetime_from, datetime_end)
    return date_ranges

In [23]:
# Variable names requested from the weather API. Order matters: values are
# read back by position, so each list must match the order used in the request.

# Hourly variables for the historical archive.
historical_hourly = [
    "temperature_2m",
    "relative_humidity_2m",
    "dew_point_2m",
    "precipitation",
    "weather_code",
    "cloud_cover",
    "et0_fao_evapotranspiration",
    "wind_speed_10m",
    "wind_speed_40m",
]

# Daily variables for the historical archive.
historical_daily = [
    "weather_code",
    "sunrise",
    "sunset",
    "uv_index_max",
    "temperature_2m_max",
    "temperature_2m_min",
    "apparent_temperature_max",
    "apparent_temperature_min",
    "daylight_duration",
    "sunshine_duration",
    "uv_index_clear_sky_max",
    "precipitation_sum",
    "precipitation_hours",
    "wind_direction_10m_dominant",
    "shortwave_radiation_sum",
    "et0_fao_evapotranspiration",
]

# Hourly variables for the forecast.
forecast_hourly = [
    "temperature_2m",
    "apparent_temperature",  # added because apparent temperature is in the daily forecast
    "relative_humidity_2m",
    "dew_point_2m",
    "precipitation",
    "weather_code",
    "cloud_cover",
    "et0_fao_evapotranspiration",
    "wind_speed_10m",
    "wind_speed_40m",
    "wind_direction_10m",
    "wind_direction_40m",  # open question: is data at both 10m and 40m above ground needed?
    "wind_gusts_10m",
    "sunshine_duration",  # seconds of sunshine in the preceding hour
    "visibility",
    "soil_temperature_0_to_10cm",
    "soil_moisture_0_to_10cm",
]

# Daily variables for the forecast.
forecast_daily = [
    "weather_code",
    "sunrise",
    "sunset",
    "uv_index_max",
    "temperature_2m_max",
    "temperature_2m_min",
    "apparent_temperature_max",
    "apparent_temperature_min",
    "daylight_duration",
    "sunshine_duration",  # a UV variable was dropped from the forecast here, as BOM may not include it
    "precipitation_sum",
    "precipitation_hours",
    "wind_speed_10m_max",  # added for completeness
    "wind_gusts_10m_max",  # added for completeness
    "wind_direction_10m_dominant",
    "shortwave_radiation_sum",
    "et0_fao_evapotranspiration",
]

In [24]:
#papermill_description=processing_bounding_box

# Single client instance; process_date_range and process_date_range_forecast
# below both call api.fetch_weather_data on it.
api = OpenMeteoAPI()

def process_date_range(date_range, dates, storage_directory, notebook_key, latitude, longitude, boundary_id):
    """
    Fetch historical daily and hourly weather for one date range and save
    each as a CSV under ``<storage_directory>/<notebook_key>/<year>/<month>``.

    Parameters:
        date_range: label of the form ``"<year> <month>"``; split on the
            space to build the output sub-directories.
        dates: sequence whose first two items are the start and end date
            strings sent to the API and embedded in the file names.
        storage_directory: root directory for outputs.
        notebook_key: boundary name; used as a sub-directory and written to
            the ``boundary_name`` column.
        latitude: latitude of the point to query.
        longitude: longitude of the point to query.
        boundary_id: identifier written to the ``boundary_id`` column.

    Reads the notebook-level names ``model``, ``propertyName``,
    ``workspaceId``, ``propertyId``, ``historical_daily``,
    ``historical_hourly`` and ``api``. Returns None.
    """
    calculated_date_from = dates[0]
    calculated_date_end = dates[1]

    # Ensure the specific output directory for this date range exists
    year, month = date_range.split(" ")
    output_directory = os.path.join(storage_directory, notebook_key, year, month)
    os.makedirs(output_directory, exist_ok=True)

    # Define the filenames for daily and hourly weather data outputs
    weather_output_daily_filename = os.path.join(output_directory, f"{model}_{propertyName}_{calculated_date_from}_{calculated_date_end}_daily_weather.csv")
    weather_output_hourly_filename = os.path.join(output_directory, f"{model}_{propertyName}_{calculated_date_from}_{calculated_date_end}_hourly_weather.csv")

    responses = api.fetch_weather_data(
        latitude=latitude,
        longitude=longitude,
        start_date=calculated_date_from,
        end_date=calculated_date_end,
        url="https://archive-api.open-meteo.com/v1/archive",
        daily=historical_daily,
        hourly=historical_hourly,
        timezone=None,
        timeformat="unixtime"
    )
    response = responses[0]  # a single location is requested, so use the first response

    daily_dataframe = process_weather_data(response.Daily(), historical_daily, utc=True)
    hourly_dataframe = process_weather_data(response.Hourly(), historical_hourly, utc=True)

    # Only the daily variables include sunrise/sunset, so only that frame is converted.
    daily_dataframe = convert_epoch_to_timezone(daily_dataframe, ["sunrise", "sunset"])

    # Tag every row with the location and ownership identifiers. The
    # longitude/latitude columns previously received each other's values.
    for frame in (hourly_dataframe, daily_dataframe):
        frame["longitude"] = longitude
        frame["latitude"] = latitude
        frame["boundary_id"] = boundary_id
        frame["boundary_name"] = notebook_key
        frame["workspace_id"] = workspaceId
        frame["property_id"] = propertyId

    # Save the dataframes to CSV
    daily_dataframe.to_csv(weather_output_daily_filename, index=False)
    hourly_dataframe.to_csv(weather_output_hourly_filename, index=False)

    time.sleep(1)  # Pause between API requests

def process_date_range_forecast(storage_directory, notebook_key, gpd_lat, gpd_lon, boundary_id, forecast_days=15):
    """
    Fetch the daily forecast for one location and save it as a CSV under
    ``<storage_directory>/<notebook_key>/forecast/<today as YYYY-MM-DD>``.

    Parameters:
        storage_directory: root directory for outputs.
        notebook_key: boundary name; used as a sub-directory and written to
            the ``boundary_name`` column.
        gpd_lat: latitude of the point to query.
        gpd_lon: longitude of the point to query.
        boundary_id: identifier written to the ``boundary_id`` column.
        forecast_days: number of forecast days requested from the API.

    Reads the notebook-level names ``model``, ``propertyName``, ``timezone``,
    ``workspaceId``, ``propertyId``, ``forecast_daily`` and ``api``.
    Returns None.
    """
    # One folder per run date.
    run_date = (datetime.now().date()).strftime('%Y-%m-%d')
    output_directory = f"{storage_directory}/{notebook_key}/forecast/{run_date}"
    os.makedirs(output_directory, exist_ok=True)

    csv_name = f"{model}_{propertyName}_daily_weather_forecast.csv"
    csv_path = os.path.join(output_directory, csv_name)

    # No explicit dates: the window is defined by forecast_days.
    api_responses = api.fetch_weather_data(
        latitude=gpd_lat,
        longitude=gpd_lon,
        start_date=None,
        end_date=None,
        url="https://api.open-meteo.com/v1/forecast",
        daily=forecast_daily,
        hourly=None,
        timezone=timezone,
        timeformat="unixtime",
        forecast_days=forecast_days
    )
    daily_section = api_responses[0].Daily()

    forecast_frame = process_weather_data(daily_section, forecast_daily, utc=True)
    forecast_frame = convert_epoch_to_timezone(forecast_frame, ["sunrise", "sunset"])

    # Location and ownership identifiers, appended in this column order.
    boundary_columns = {
        "longitude": gpd_lon,
        "latitude": gpd_lat,
        "boundary_id": boundary_id,
        "boundary_name": notebook_key,
        "workspace_id": workspaceId,
        "property_id": propertyId,
    }
    for column_name, column_value in boundary_columns.items():
        forecast_frame[column_name] = column_value

    forecast_frame.to_csv(csv_path, index=False)
    time.sleep(1)  # pause between API requests

In [25]:
#papermill_description=process_variables

# Choose the output root: the workspace folder for local runs, /tmp otherwise.
storage_directory = "/tmp"
if local:
    storage_directory = "/workspace/notebook_outputs"

# Ensure the storage directory exists
os.makedirs(storage_directory, exist_ok=True)

workspace_directory = "/workspace/geojsons"

date_ranges = get_date_ranges(years, months)

# Iterate over each boundary file in the workspace directory. Sorting makes
# the processing order deterministic (os.listdir order is arbitrary).
for geojson_file in sorted(os.listdir(workspace_directory)):
    geojson_path = os.path.join(workspace_directory, geojson_file)

    # Skip hidden entries and sub-directories (e.g. .ipynb_checkpoints),
    # which would otherwise make open()/json.load() fail.
    if geojson_file.startswith(".") or not os.path.isfile(geojson_path):
        logger.info("Skipping non-boundary entry %s", geojson_path)
        continue

    # Read the file and close it before any network work starts.
    with open(geojson_path, "r") as file:
        geojson_data = json.load(file)

    # Convert the GeoJSON content to a GeoDataFrame
    gdf = gpd.read_file(StringIO(json.dumps(geojson_data)))

    # Get bounding box from GeoJSON
    # NOTE(review): the indexing below assumes (min_lon, min_lat, max_lon,
    # max_lat) order — confirm against get_bbox_from_geodf.
    bbox = get_bbox_from_geodf(geojson_data)

    # Positional access to the first feature's id, independent of index labels.
    boundary_id = gdf['boundaryId'].iloc[0]

    # get the central latitude and longitude of the bounding box
    gpd_lon = (bbox[0] + bbox[2]) / 2
    gpd_lat = (bbox[1] + bbox[3]) / 2
    centroid = [gpd_lon, gpd_lat]  # not read in this cell; kept in case another cell uses it

    notebook_key = geojson_data['name']

    # One archive request per precomputed monthly date range.
    if historical:
        for date_range, dates in date_ranges.items():
            process_date_range(
                date_range=date_range,
                dates=dates,
                storage_directory=storage_directory,
                notebook_key=notebook_key,
                latitude=gpd_lat,
                longitude=gpd_lon,
                boundary_id=boundary_id
            )
    if forecast:
        process_date_range_forecast(
            storage_directory,
            notebook_key,
            gpd_lat,
            gpd_lon,
            boundary_id
        )



INFO:botocore.credentials:Found credentials in environment variables.


INFO:botocore.credentials:Found credentials in environment variables.
INFO:botocore.credentials:Found credentials in environment variables.
INFO:botocore.credentials:Found credentials in environment variables.
INFO:botocore.credentials:Found credentials in environment variables.
INFO:botocore.credentials:Found credentials in environment variables.
INFO:botocore.credentials:Found credentials in environment variables.
INFO:botocore.credentials:Found credentials in environment variables.


In [27]:
import os
from aws_utils import S3Utils

# Upload the combined weather parquet files to S3.
# NOTE(review): this hard-codes the local output path and overrides the
# storage_directory chosen from `local` earlier in the notebook — confirm
# that is intended for non-local runs.
storage_directory = "/workspace/notebook_outputs"

# Target bucket name; the region is taken from the environment
# (os.getenv returns None when AWS_DEFAULT_REGION is unset).
aws_s3_notebook_output = "notebook-uploads"
aws_default_region = os.getenv('AWS_DEFAULT_REGION')

# NOTE(review): `folder` is not read by any cell visible in this notebook.
folder = os.path.join(storage_directory)
s3_client = S3Utils(
		region_name=aws_default_region,
		s3_bucket=aws_s3_notebook_output,
		prefix='test'
)

# NOTE(review): the visible cells only write CSV files; nothing shown here
# creates these parquet files — presumably a cell not shown produces them.
# Confirm they exist on a fresh Restart & Run All.
daily_parquet_file = f"{storage_directory}/daily_weather.parquet"
hourly_parquet_file = f"{storage_directory}/hourly_weather.parquet"
daily_forecast_parquet_file = f"{storage_directory}/daily_weather_forecast.parquet"

s3_client.upload_file(file_path=daily_parquet_file)
s3_client.upload_file(file_path=hourly_parquet_file)
# Last expression of the cell: its return value is displayed as the cell output.
s3_client.upload_file(file_path=daily_forecast_parquet_file)

--- Logging error ---
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/logging/__init__.py", line 1100, in emit
    msg = self.format(record)
  File "/usr/local/lib/python3.10/logging/__init__.py", line 943, in format
    return fmt.format(record)
  File "/usr/local/lib/python3.10/logging/__init__.py", line 678, in format
    record.message = record.getMessage()
  File "/usr/local/lib/python3.10/logging/__init__.py", line 368, in getMessage
    msg = msg % self.args
TypeError: %x format: an integer is required, not str
Call stack:
  File "/usr/local/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/local/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in laun

INFO:aws_utils.s3_utils:File daily_weather.parquet uploaded successfully.


--- Logging error ---
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/logging/__init__.py", line 1100, in emit
    msg = self.format(record)
  File "/usr/local/lib/python3.10/logging/__init__.py", line 943, in format
    return fmt.format(record)
  File "/usr/local/lib/python3.10/logging/__init__.py", line 678, in format
    record.message = record.getMessage()
  File "/usr/local/lib/python3.10/logging/__init__.py", line 368, in getMessage
    msg = msg % self.args
TypeError: %x format: an integer is required, not str
Call stack:
  File "/usr/local/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/local/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in laun

INFO:aws_utils.s3_utils:File hourly_weather.parquet uploaded successfully.


--- Logging error ---
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/logging/__init__.py", line 1100, in emit
    msg = self.format(record)
  File "/usr/local/lib/python3.10/logging/__init__.py", line 943, in format
    return fmt.format(record)
  File "/usr/local/lib/python3.10/logging/__init__.py", line 678, in format
    record.message = record.getMessage()
  File "/usr/local/lib/python3.10/logging/__init__.py", line 368, in getMessage
    msg = msg % self.args
TypeError: %x format: an integer is required, not str
Call stack:
  File "/usr/local/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/local/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in laun

INFO:aws_utils.s3_utils:File daily_weather_forecast.parquet uploaded successfully.


True