In [None]:
#papermill_description=imports

import json
import os
import logging
import sys
import geopandas as gpd
from io import StringIO
import pandas as pd
from datetime import datetime
from gis_utils.dataframe import get_bbox_from_geodf
from aws_utils import S3Utils
import calendar
import time
from gis_utils.meteo import OpenMeteoAPI, convert_epoch_to_timezone, map_months_to_numbers
import pytz
from datetime import datetime, timedelta

# getLogger() with no name returns the root logger; no handlers or levels
# are configured in this cell.
logger = logging.getLogger()

In [None]:
#papermill_description=parameters

# Default parameter values for a local run.
# NOTE(review): this cell is tagged as the papermill parameters cell, so an
# orchestrated run presumably overrides these values - confirm.

# Key used to build the scratch storage directory name.
notebook_key = "localjupyter"
# Single year (wrapped into a one-element list by the processing cell) and
# the English month names for which historical date ranges are generated.
historical_years = 2024
historical_months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
# Sample boundary: a GeoJSON FeatureCollection nested under a 'body' key and
# holding a single Polygon feature. The declared CRS is CRS84, so every
# coordinate pair is [longitude, latitude]; the ring is closed (first and
# last positions are identical).
# NOTE(review): no visible cell reads `geojson` - the processing loop loads
# GeoJSON files from disk instead. Confirm whether this default is still used.
geojson = {
    'body': {
        "type": "FeatureCollection",
        "name": "dissolved-boundaries",
        "crs": {
            "type": "name",
            "properties": {
                "name": "urn:ogc:def:crs:OGC:1.3:CRS84" 
            }
        },
        "features": [
            {
                "type": "Feature",
                "properties": {
                    "fid": 1
                },
                "geometry": {
                    "type": "Polygon",
                    "coordinates": [
                        [
                            [116.26012130269045, -29.225295369642396],
                            [116.261724812149055, -29.241374854584375],
                            [116.283751968396274, -29.256813692452539],
                            [116.284342735038919, -29.268250184258388],
                            [116.292247755352392, -29.265992437426529],
                            [116.292360282331941, -29.293057573630019],
                            [116.314865678242256, -29.293523728033122],
                            [116.326259034921833, -29.293033039128805],
                            [116.326315298411629, -29.305397680579894],
                            [116.355065941687045, -29.307016748931797],
                            [116.355065941687045, -29.306575187382712],
                            [116.383366477044206, -29.307384715430175],
                            [116.384322956370426, -29.290407813444993],
                            [116.387586238777402, -29.282629879611861],
                            [116.386517232471661, -29.259807919053017],
                            [116.359201308185533, -29.259488866292969],
                            [116.359229439930417, -29.259243440415627],
                            [116.35242155766754, -29.259292525638209],
                            [116.352140240218716, -29.220237788279107],
                            [116.302234524787593, -29.223503148505326],
                            [116.281388901825679, -29.2239696200396],
                            [116.26012130269045, -29.225295369642396]
                        ]
                    ]
                }
            }
        ]
    }
}
# NOTE(review): propertyName, output_type, workspaceId and propertyId are not
# read by any cell visible here - presumably consumed further down the
# notebook or by the orchestrator; confirm.
propertyName = "test"
output_type = "weather"

# IANA timezone name. The only visible reference to it is in commented-out
# code inside process_weather_data, which currently works in UTC.
timezone = "Australia/Sydney"

# Switches read by the processing loop: `historical` gates the per-date-range
# historical pass, `forecast` gates the forecast pass.
historical = False
forecast = True

workspaceId = "018f9876-b3d7-73aa-97e6-0cf7a874383d"
propertyId = "018f99ea-564e-72fa-a4b7-2dbd24b65c3e"


In [None]:
def process_weather_data(weather_data, variable_order, utc=True):
    """
    Flatten one weather response section into a DataFrame.

    Parameters
    ----------
    weather_data
        Object exposing ``Variables(i)``, ``Time()``, ``TimeEnd()`` and
        ``Interval()``. Presumably an Open-Meteo SDK response section
        (hourly/daily) - confirm against the caller.
    variable_order : sequence of str
        Column names, in the same positional order as the variables held by
        ``weather_data``.
    utc : bool, default True
        Forwarded to ``pd.to_datetime``; when true the ``date`` column is
        timezone-aware UTC, otherwise it is naive.

    Returns
    -------
    pandas.DataFrame
        A ``date`` column followed by one column per entry of
        ``variable_order``.

    Notes
    -----
    Timestamps are deliberately not converted to the local timezone here:
    an earlier attempt to do so extended the last date for some months.
    """
    # These two fields are read through the int64 accessor; every other
    # variable goes through the generic accessor.
    int64_fields = ('sunrise', 'sunset')

    columns_by_name = {}
    for position, field in enumerate(variable_order):
        variable = weather_data.Variables(position)
        if field in int64_fields:
            columns_by_name[field] = variable.ValuesInt64AsNumpy()
        else:
            columns_by_name[field] = variable.ValuesAsNumpy()

    # Time()/TimeEnd() are interpreted as epoch seconds.
    range_start = pd.to_datetime(weather_data.Time(), unit='s', utc=utc)
    range_end = pd.to_datetime(weather_data.TimeEnd(), unit='s', utc=utc)
    step = pd.Timedelta(seconds=weather_data.Interval())

    # Half-open range: the end timestamp itself is excluded.
    timeline = pd.date_range(
        start=range_start,
        end=range_end,
        freq=step,
        inclusive="left",
    )

    # "date" is inserted first so it becomes the leading column.
    return pd.DataFrame({"date": timeline, **columns_by_name})


def get_month_date_range(year, month, month_numbers, today=None):
    """
    Build the inclusive ``(start, end)`` date strings for one calendar month.

    Parameters
    ----------
    year : int
        Calendar year.
    month : str
        Month name; it is lower-cased before being looked up.
    month_numbers : dict
        Mapping of lower-case month name to month number (1-12).
    today : datetime.date or datetime.datetime, optional
        Reference "current" date. Defaults to the system date. Exposed so the
        function is deterministic in tests and the clock is read only once.

    Returns
    -------
    tuple of (str, str)
        ``(start, end)`` formatted as ``YYYY-MM-DD``. ``start`` is always the
        first day of the month. ``end`` is the last day of the month, unless
        that day lies after ``today``; in that case ``end`` is pulled back to
        two days before ``today``, but never earlier than ``start`` so the
        range cannot be inverted.

    Raises
    ------
    KeyError
        If ``month`` (lower-cased) is not a key of ``month_numbers``.
    """
    # The clamp deliberately lands two days before today, not on today.
    # NOTE(review): presumably this covers the data provider's publication
    # lag - confirm the intended value.
    availability_lag = timedelta(days=2)

    if today is None:
        today = datetime.today().date()
    elif isinstance(today, datetime):
        # A datetime cannot be compared with a date; keep only the date part.
        today = today.date()

    month_number = month_numbers[month.lower()]  # lower-case lookup avoids case mismatches
    last_day = calendar.monthrange(year, month_number)[1]

    datetime_from = f"{year}-{month_number:02d}-01"
    datetime_end = f"{year}-{month_number:02d}-{last_day:02d}"

    month_start = datetime(year, month_number, 1).date()
    month_end = datetime(year, month_number, last_day).date()

    if month_end > today:
        latest_available = today - availability_lag
        if latest_available < month_start:
            # On the 1st or 2nd of the month the lagged date falls in the
            # previous month; clamp to the start instead of returning an
            # end date that precedes the start date.
            datetime_end = datetime_from
        else:
            datetime_end = latest_available.strftime('%Y-%m-%d')

    return datetime_from, datetime_end

def get_date_ranges(years, months):
    """
    Compute the date range of every requested month in every requested year.

    Months that lie after the current calendar month (including every month
    of a future year) are skipped. Skipping is done per entry, so the result
    does not depend on ``months`` being listed in chronological order.

    Parameters
    ----------
    years : iterable of int
        Calendar years to cover.
    months : iterable of str
        Month names. They are passed to ``map_months_to_numbers``, whose
        result is expected to be keyed by the lower-cased month name.

    Returns
    -------
    dict
        Maps ``"<year> <month>"`` to the ``(start, end)`` tuple returned by
        ``get_month_date_range``, in iteration order.
    """
    month_numbers = map_months_to_numbers(months)

    # Read the clock once so year and month come from the same instant.
    now = datetime.today()
    current_period = (now.year, now.month)

    date_ranges = {}
    for year in years:
        for month in months:
            # `continue`, not `break`: a future month must not discard later
            # entries of `months` that are already in the past.
            if (year, month_numbers[month.lower()]) > current_period:
                continue
            datetime_from, datetime_end = get_month_date_range(year, month, month_numbers)
            date_ranges[f"{year} {month}"] = (datetime_from, datetime_end)
    return date_ranges

In [None]:
#papermill_description=process_variables

# Scratch directory for this run, keyed by the notebook key.
# The f-prefix matters: without it a directory literally named
# "{notebook_key}" is created under /tmp.
storage_directory = f"/tmp/{notebook_key}"

# Ensure the storage directory exists
os.makedirs(storage_directory, exist_ok=True)

# `historical_years` may be a single year or a list/tuple of years; normalise
# to a list so a list input is not wrapped a second time.
if isinstance(historical_years, (list, tuple)):
    years = list(historical_years)
else:
    years = [historical_years]
months = historical_months

# Maps "<year> <month>" to its (start, end) date strings.
date_ranges = get_date_ranges(years, months)

# Process every entry found in the workspace directory as a GeoJSON file.
# NOTE(review): `workspace_directory`, `process_date_range` and
# `process_date_range_forecast` are not defined in the cells visible here -
# confirm they are defined before this cell runs on a fresh kernel.
# NOTE(review): os.listdir also returns sub-directories and non-GeoJSON
# files; they are not filtered out here.
for geojson_file in os.listdir(workspace_directory):
    # GeoJSON is UTF-8 by specification, so do not rely on the locale default.
    # The handle is only needed for parsing and is closed before the
    # (potentially slow) per-boundary processing starts.
    with open(os.path.join(workspace_directory, geojson_file), "r", encoding="utf-8") as file:
        geojson_data = json.load(file)

    # Convert the GeoJSON document to a GeoDataFrame
    gdf = gpd.read_file(StringIO(json.dumps(geojson_data)))

    # Get bounding box from GeoJSON.
    # NOTE(review): the helper receives the raw dict, not `gdf` - confirm
    # that is the input it expects.
    bbox = get_bbox_from_geodf(geojson_data)

    # Boundary id of the first feature (positional, independent of the index labels).
    boundary_id = gdf['boundaryId'].iloc[0]

    # Centre of the bounding box; indices 0/2 are treated as longitudes and
    # 1/3 as latitudes (assumes (min_lon, min_lat, max_lon, max_lat) - TODO confirm).
    gpd_lon = (bbox[0] + bbox[2]) / 2
    gpd_lat = (bbox[1] + bbox[3]) / 2
    centroid = [gpd_lon, gpd_lat]

    # The key is rebound per file from the collection's "name"; any path
    # built from an earlier value of `notebook_key` is not recomputed here.
    notebook_key = geojson_data['name']

    # Historical pass: one call per previously computed date range.
    if historical:
        for date_range, dates in date_ranges.items():
            process_date_range(
                date_range=date_range,
                dates=dates,
                storage_directory=storage_directory,
                notebook_key=notebook_key,
                latitude=gpd_lat,
                longitude=gpd_lon,
                boundary_id=boundary_id
            )
    # Forecast pass: a single call per boundary.
    if forecast:
        process_date_range_forecast(
            storage_directory,
            notebook_key,
            gpd_lat,
            gpd_lon,
            boundary_id
        )

