In [17]:
#papermill_description=imports

import json
import os
import logging
import sys
import geopandas as gpd
from io import StringIO
import pandas as pd
from datetime import datetime
from gis_utils.dataframe import get_bbox_from_geodf
from aws_utils import S3Utils
import calendar
import time
from gis_utils.meteo import OpenMeteoAPI

# Configure logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
# Records at INFO and above are sent to stdout (rather than the default stderr).
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# Module-level logger. NOTE(review): the visible cells report progress with print()
# and never call this logger — confirm whether it is used elsewhere.
logger = logging.getLogger(__name__)

In [18]:
def calculate_days_between(date_str1: str, date_str2: str, format = "%Y-%m-%d") -> int:
    """
    Return the absolute number of whole days separating two date strings.

    Parameters:
    - date_str1 (str): The first date string.
    - date_str2 (str): The second date string.
    - format (str): ``datetime.strptime`` pattern both strings are parsed with.
      Defaults to ISO style ``"%Y-%m-%d"``.

    Returns:
    - int: Non-negative day difference; the argument order does not matter.

    Raises:
    - ValueError: If either string does not match ``format``.

    Example:
    ```python
    days_difference = calculate_days_between('2024-05-01', '2024-04-25')
    print(f"The difference in days is: {days_difference}")  # 6
    ```
    """
    # Parse both inputs with the same pattern, first argument first.
    first, second = (
        datetime.strptime(text, format) for text in (date_str1, date_str2)
    )
    # abs() is applied to the day component so the sign of the subtraction is irrelevant.
    return abs((first - second).days)

In [19]:
#papermill_description=parameters

# Run parameters for this notebook. The notes below describe how the visible
# cells use each value; anything marked "not read" has no reader in this notebook
# as shown.

# `model` and `propertyName` are interpolated into the output CSV filenames.
model = "weather"
# `years` x `months` are expanded into per-month date ranges by get_date_ranges().
#years = [2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]
years = [2024]
#months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
months = ["January", "February", "March", "April"]
propertyName = "test"
# NOTE(review): `output_type`, `datetime_from`, `datetime_end`, `boundaryName` and
# `historical` are not read by the visible cells (the fetch helpers compute their
# own dates and pass `historical` as a literal) — confirm whether they are still needed.
output_type = "weather"
datetime_from="2023-12-01"
datetime_end="2023-12-31"
boundaryName = "boundaryName"
# Passed unchanged as the `timezone` argument of api.fetch_weather_data().
timezone = "Australia/Sydney"
historical = True

# Written as constant `workspace_id` / `property_id` columns on every output row.
workspaceId = "018f9876-b3d7-73aa-97e6-0cf7a874383d"
propertyId = "018f99ea-564e-72fa-a4b7-2dbd24b65c3e"
# When true, outputs go to /workspace/notebook_outputs instead of /tmp.
local=True


In [20]:
def process_weather_data(weather_data, variable_order):
    """
    Flatten one block of an API response into a DataFrame.

    Parameters:
    - weather_data: Object exposing ``Variables(i).ValuesAsNumpy()``, ``Time()``,
      ``TimeEnd()`` and ``Interval()`` (times and interval in seconds).
    - variable_order: Column names, in the same order the variables were requested;
      position ``i`` names ``Variables(i)``.

    Returns:
    - pd.DataFrame with a UTC ``date`` column followed by one column per variable.

    NOTE(review): if ``weather_data`` is an openmeteo_sdk block, integer-valued
    variables such as sunrise/sunset may not come back through ``ValuesAsNumpy``
    as expected — confirm against the SDK.
    """
    # Pull each variable's values by position and label it with the requested name.
    columns = {}
    for position, label in enumerate(variable_order):
        columns[label] = weather_data.Variables(position).ValuesAsNumpy()

    # Rebuild the timestamp axis: [Time, TimeEnd) stepped by Interval seconds.
    window_start = pd.to_datetime(weather_data.Time(), unit="s", utc=True)
    window_end = pd.to_datetime(weather_data.TimeEnd(), unit="s", utc=True)
    step = pd.Timedelta(seconds=weather_data.Interval())
    timestamps = pd.date_range(
        start=window_start,
        end=window_end,
        freq=step,
        inclusive="left",  # the end instant itself is excluded
    )

    return pd.DataFrame({"date": timestamps, **columns})

def map_months_to_numbers(months):
    """
    Translate full month names into calendar numbers (January -> 1 ... December -> 12).

    Matching is case-insensitive and the returned dictionary is keyed by the
    lower-cased name. A name that is not a full month name raises ``KeyError``.
    """
    # calendar.month_name[0] is an empty placeholder, so start from index 1.
    lookup = {
        name.lower(): number
        for number, name in enumerate(calendar.month_name[1:], start=1)
    }
    numbered = {}
    for month in months:
        key = month.lower()
        numbered[key] = lookup[key]
    return numbered

def get_month_date_range(year, month, month_numbers):
    """
    Return ``(first_day, last_day)`` of a month as ``YYYY-MM-DD`` strings.

    Parameters:
    - year: Calendar year.
    - month: Month name; looked up lower-cased in ``month_numbers``.
    - month_numbers: Mapping of lower-cased month name to month number.
    """
    number = month_numbers[month.lower()]
    # monthrange yields (weekday of the 1st, number of days); only the length is needed.
    _, days_in_month = calendar.monthrange(year, number)
    year_month = f"{year}-{number:02d}"
    return f"{year_month}-01", f"{year_month}-{days_in_month:02d}"

def get_date_ranges(years, months):
    """
    Build the first/last-day pair for every requested month of every requested year.

    Returns a dictionary keyed by ``"<year> <month>"`` (month exactly as supplied)
    whose values are ``(datetime_from, datetime_end)`` string tuples, ordered by
    year and then by month in the order given.
    """
    month_numbers = map_months_to_numbers(months)
    return {
        f"{year} {month}": get_month_date_range(year, month, month_numbers)
        for year in years
        for month in months
    }

In [21]:
# Hourly variables requested by the historical fetch (process_date_range).
# The order matters: process_weather_data() labels columns by position.
hourly = [
    "temperature_2m",
    "relative_humidity_2m",
    "dew_point_2m",
    "precipitation",
    "weather_code",
    "cloud_cover",
    "et0_fao_evapotranspiration",
    "wind_speed_10m",
    "wind_speed_40m",
]
# Daily variables requested by the historical fetch (process_date_range).
# The order matters: process_weather_data() labels columns by position.
daily = [
    "weather_code",
    "temperature_2m_max",
    "temperature_2m_min",
    "apparent_temperature_max",
    "apparent_temperature_min",
    "sunrise",
    "sunset",
    "daylight_duration",
    "sunshine_duration",
    "uv_index_max",
    "uv_index_clear_sky_max",
    "precipitation_sum",
    "precipitation_hours",
    "wind_direction_10m_dominant",
    "shortwave_radiation_sum",
    "et0_fao_evapotranspiration",
]

# Hourly variables requested by the forecast fetch (process_date_range_forecast).
# The order matters: process_weather_data() labels columns by position.
forecast_hourly = [
    "temperature_2m",
    "apparent_temperature",  # included because the daily forecast carries apparent temperature too
    "relative_humidity_2m",
    "dew_point_2m",
    "precipitation",
    "weather_code",
    "cloud_cover",
    "et0_fao_evapotranspiration",
    "wind_speed_10m",
    "wind_speed_40m",
    "wind_direction_10m",
    "wind_direction_40m",  # TODO: decide whether both the 10 m and 40 m heights are needed
    "wind_gusts_10m",
    "sunshine_duration",  # seconds of sunshine in the preceding hour
    "visibility",
    "soil_temperature_0_to_10cm",
    "soil_moisture_0_to_10cm",
]

# Daily variables requested by the forecast fetch (process_date_range_forecast).
# The order matters: process_weather_data() labels columns by position.
# NOTE(review): an earlier note here said UV was removed from the forecast because
# BOM may not provide it, yet "uv_index_max" is still requested — confirm which is intended.
forecast_daily = [
    "weather_code",
    "temperature_2m_max",
    "temperature_2m_min",
    "apparent_temperature_max",
    "apparent_temperature_min",
    "sunrise",
    "sunset",
    "uv_index_max",
    "daylight_duration",
    "sunshine_duration",
    "precipitation_sum",
    "precipitation_hours",
    "wind_speed_10m_max",  # added for completeness
    "wind_gusts_10m_max",  # added for completeness
    "wind_direction_10m_dominant",
    "shortwave_radiation_sum",
    "et0_fao_evapotranspiration",
]

In [22]:
#papermill_description=processing_bounding_box

# Single client instance, created with no arguments, that both fetch helpers
# below call via api.fetch_weather_data().
api = OpenMeteoAPI()

def process_date_range(date_range, dates, storage_directory, notebook_key, gpd_lat, gpd_lon, boundary_id):
    """
    Fetch historical weather for one month at one location and write it to CSV.

    Parameters:
    - date_range: Label of the form ``"<year> <month>"``; split on the space to
      build the output sub-directory.
    - dates: Sequence whose first two items are the start and end date strings.
    - storage_directory: Root directory for outputs.
    - notebook_key: Boundary name; used as a path component and as the
      ``boundary_name`` column.
    - gpd_lat, gpd_lon: Coordinates sent to the API and written to each row.
    - boundary_id: Written to each row as ``boundary_id``.

    Reads the notebook globals ``api``, ``model``, ``propertyName``, ``daily``,
    ``hourly``, ``timezone``, ``workspaceId`` and ``propertyId``.

    Side effects: creates ``<storage_directory>/<notebook_key>/<year>/<month>``,
    writes one daily and one hourly CSV there, then sleeps 1.5 s.
    """
    print(f"For {date_range}: Start Date = {dates[0]}, End Date = {dates[1]}")
    calculated_date_from, calculated_date_end = dates[0], dates[1]

    # One folder per boundary / year / month.
    year, month = date_range.split(" ")
    output_directory = os.path.join(storage_directory, notebook_key, year, month)
    os.makedirs(output_directory, exist_ok=True)

    # Daily and hourly files share everything but the trailing suffix.
    filename_stem = f"{model}_{propertyName}_{calculated_date_from}_{calculated_date_end}"
    weather_output_daily_filename = os.path.join(output_directory, f"{filename_stem}_daily_weather.csv")
    weather_output_hourly_filename = os.path.join(output_directory, f"{filename_stem}_hourly_weather.csv")

    responses = api.fetch_weather_data(
        latitude=gpd_lat,
        longitude=gpd_lon,
        start_date=calculated_date_from,
        end_date=calculated_date_end,
        historical=True,
        daily=daily,
        hourly=hourly,
        timezone=timezone
    )
    # Only one location is requested, so only the first response is used.
    response = responses[0]

    hourly_dataframe = process_weather_data(response.Hourly(), hourly)
    daily_dataframe = process_weather_data(response.Daily(), daily)

    # Stamp every row of both frames with the same location/ownership metadata.
    metadata = {
        "longitude": gpd_lon,
        "latitude": gpd_lat,
        "boundary_id": boundary_id,
        "boundary_name": notebook_key,
        "workspace_id": workspaceId,
        "property_id": propertyId,
    }
    for frame in (hourly_dataframe, daily_dataframe):
        for column, value in metadata.items():
            frame[column] = value

    # Save the dataframes to CSV
    daily_dataframe.to_csv(weather_output_daily_filename, index=False)
    hourly_dataframe.to_csv(weather_output_hourly_filename, index=False)

    time.sleep(1.5)  # Pause between API requests

def process_date_range_forecast(storage_directory, notebook_key, gpd_lat, gpd_lon, boundary_id):
    """
    Fetch the weather forecast starting today for one location and write it to CSV.

    Parameters:
    - storage_directory: Root directory for outputs.
    - notebook_key: Boundary name; used as a path component and as the
      ``boundary_name`` column.
    - gpd_lat, gpd_lon: Coordinates sent to the API and written to each row.
    - boundary_id: Written to each row as ``boundary_id``.

    Reads the notebook globals ``api``, ``model``, ``propertyName``,
    ``forecast_daily``, ``forecast_hourly``, ``timezone``, ``workspaceId`` and
    ``propertyId``.

    Side effects: creates ``<storage_directory>/<notebook_key>/forecast/<today>``,
    writes one daily and one hourly CSV there, then sleeps 1.5 s.
    """
    # NOTE(review): the original note says 7 days is the API maximum; Open-Meteo's
    # docs appear to describe 7 as the default with more available — confirm for
    # the endpoint OpenMeteoAPI targets.
    forecast_days = 7

    # Read the clock once so the start and end dates cannot straddle midnight.
    # NOTE(review): this is the machine's local date, while the request is made in
    # `timezone`; confirm they agree where this notebook runs.
    today = datetime.now().date()
    calculated_date_from = today.strftime("%Y-%m-%d")
    # NOTE(review): the range is inclusive, so it spans forecast_days + 1 calendar
    # dates; kept as-is to preserve the existing output.
    calculated_date_end = (today + pd.Timedelta(days=forecast_days)).strftime("%Y-%m-%d")

    print(f"start date: {calculated_date_from}")
    print(f"end date: {calculated_date_end}")

    output_directory = f"{storage_directory}/{notebook_key}/forecast/{calculated_date_from}"
    os.makedirs(output_directory, exist_ok=True)

    weather_output_forecast_daily_filename = os.path.join(output_directory, f"{model}_{propertyName}_daily_weather.csv")
    weather_output_forecast_hourly_filename = os.path.join(output_directory, f"{model}_{propertyName}_hourly_weather.csv")

    responses = api.fetch_weather_data(
        latitude=gpd_lat,
        longitude=gpd_lon,
        start_date=calculated_date_from,
        end_date=calculated_date_end,
        historical=False,
        daily=forecast_daily,
        hourly=forecast_hourly,
        timezone=timezone
    )
    # Only one location is requested, so only the first response is used.
    response = responses[0]

    forecast_hourly_dataframe = process_weather_data(response.Hourly(), forecast_hourly)
    forecast_daily_dataframe = process_weather_data(response.Daily(), forecast_daily)

    # Stamp every row of both frames with the same location/ownership metadata.
    metadata = {
        "longitude": gpd_lon,
        "latitude": gpd_lat,
        "boundary_id": boundary_id,
        "boundary_name": notebook_key,
        "workspace_id": workspaceId,
        "property_id": propertyId,
    }
    for frame in (forecast_hourly_dataframe, forecast_daily_dataframe):
        for column, value in metadata.items():
            frame[column] = value

    print(weather_output_forecast_daily_filename)
    forecast_daily_dataframe.to_csv(weather_output_forecast_daily_filename, index=False)
    forecast_hourly_dataframe.to_csv(weather_output_forecast_hourly_filename, index=False)
    time.sleep(1.5)  # Pause between API requests

In [23]:
#papermill_description=processing_file_io

# req = geojson
# geojson_data = req['body']  # Directly accessing the 'body' since it's already a dictionary in this mock setup

# # Convert the GeoJSON string to a GeoDataFrame
# gdf = gpd.read_file(StringIO(json.dumps(geojson_data)))

In [24]:
#papermill_description=process_variables

# Set up the initial directory based on the environment
storage_directory = "/workspace/notebook_outputs" if local else "/tmp"

# Ensure the storage directory exists
os.makedirs(storage_directory, exist_ok=True)

workspace_directory = "/workspace/geojsons"

# Month ranges for the historical back-fill; only consumed by the commented-out
# loop further down.
date_ranges = get_date_ranges(years, months)

# os.listdir returns entries in arbitrary order and includes sub-directories and
# stray files, so keep only regular GeoJSON/JSON files and process them in a
# stable order.
geojson_files = sorted(
    entry
    for entry in os.listdir(workspace_directory)
    if entry.lower().endswith((".geojson", ".json"))
    and os.path.isfile(os.path.join(workspace_directory, entry))
)

# Iterate over each geojson file in workspace directory
for geojson_file in geojson_files:
    # Read the geojson file; the handle is released before any network or CSV work.
    with open(os.path.join(workspace_directory, geojson_file), "r") as file:
        geojson_data = json.load(file)

    # Convert the GeoJSON string to a GeoDataFrame
    gdf = gpd.read_file(StringIO(json.dumps(geojson_data)))

    # Get bounding box from GeoJSON
    # NOTE(review): the helper is named for a GeoDataFrame but is given the raw
    # dict here — confirm this is the input it expects.
    bbox = get_bbox_from_geodf(geojson_data)

    # Positional lookup: take the first feature's id regardless of index labels.
    boundary_id = gdf['boundaryId'].iloc[0]

    print(f"Boundary ID: {boundary_id}")

    # get the central latitude and longitude of the bounding box
    # (bbox is indexed as [min_lon, min_lat, max_lon, max_lat])
    gpd_lon = (bbox[0] + bbox[2]) / 2
    gpd_lat = (bbox[1] + bbox[3]) / 2
    centroid = [gpd_lon, gpd_lat]
    print(f"Centroid: {centroid}")

    notebook_key = geojson_data['name']

    # Historical back-fill is currently disabled; re-enable to iterate over each
    # date range calculated previously:
    # for date_range, dates in date_ranges.items():
    #     process_date_range(
    #         date_range,
    #         dates,
    #         storage_directory,
    #         notebook_key,
    #         gpd_lat,
    #         gpd_lon,
    #         boundary_id
    #     )
    process_date_range_forecast(
        storage_directory,
        notebook_key,
        gpd_lat,
        gpd_lon,
        boundary_id
    )



INFO:botocore.credentials:Found credentials in environment variables.


Boundary ID: 018f9f82-ea97-749d-baea-cd65d3ecc989
Centroid: [145.63004997472387, -38.40358558670786]
start date: 2024-05-28
start date: 2024-06-04


OpenMeteoRequestsError: {'reason': "Data corrupted at path ''. Cannot initialize ForecastVariableDaily from invalid String value weather_codetemperature_2m_max.", 'error': True}

In [None]:
# aws_s3_notebook_output = os.getenv('AWS_S3_BUCKET_NOTEBOOK_OUTPUT')
# aws_default_region = os.getenv('AWS_DEFAULT_REGION')
# s3_client = S3Utils(
# 		region_name=aws_default_region,
# 		s3_bucket=aws_s3_notebook_output,
# 		prefix=notebook_key
# )

In [None]:
# Save dataframes to CSV
# output_dir = f"{storage_directory}/{notebook_key}/{calculated_date_from}_{calculated_date_end}"
# os.makedirs(output_dir, exist_ok=True)
# daily_dataframe.to_csv(weather_output_daily_filename, index=False)
# hourly_dataframe.to_csv(weather_output_hourly_filename, index=False)


# s3_client.upload_file(
# 		file_path=weather_output_daily_filename,
# )

# s3_client.upload_file(
# 		file_path=weather_output_hourly_filename,
# )