In [2]:
import openmeteo_requests
import requests_cache
from retry_requests import retry

from datetime import datetime
import pandas as pd
from time import sleep
import os

In [3]:
def generate_month_dates(start_date: datetime, end_date: datetime) -> list[str]:
    """
    Generates one date label per month between start_date and end_date.

    The first label is start_date itself (whatever its day is); every
    following label is the first day of the next month. Labels are produced
    while the current date is not later than end_date.

    Args:
    start_date (datetime): A starting date of time series.
    end_date (datetime): An ending date of time series (inclusive bound).

    Returns:
    dates (list): List of 'YYYY-MM-DD' strings, one per month. Empty when
    start_date is later than end_date.
    """

    dates = []
    current = start_date
    while current <= end_date:
        dates.append(current.strftime('%Y-%m-%d'))
        # December has no "month + 1": roll over to January of the next year.
        if current.month == 12:
            current = current.replace(year=current.year + 1, month=1, day=1)
        else:
            # day=1 is set in the same call, so a start on the 29th-31st
            # never produces an invalid date in a shorter month.
            current = current.replace(month=current.month + 1, day=1)

    return dates

def generate_year_dates(start_year: int, end_year: int) -> list[str]:
    """
    Generates the last date (31 December) of every year from start_year to end_year.

    Args:
    start_year (int): First year of the time series (inclusive).
    end_year (int): Last year of the time series (inclusive).

    Returns:
    dates (list): List of 'YYYY-12-31' strings, one per year. Empty when
    start_year is greater than end_year.
    """

    # int() keeps numeric strings such as '2011' usable as inputs.
    return [
        datetime(year, 12, 31).strftime('%Y-%m-%d')
        for year in range(int(start_year), int(end_year) + 1)
    ]

def create_variable_template(df: pd.DataFrame, time_series: str = 'yearly') -> dict[str, pd.DataFrame]:
    """
    Build one empty table per requested weather variable.

    Every table shares the same layout: rows are the (latitude, longitude)
    pairs of df, columns are the periods of the time series (year labels
    for 'yearly', month date labels for 'monthly').

    Note: the time range and the variable names are not parameters; they are
    read from the module-level names start_date, end_date and params['daily'].

    Args:
    df (DataFrame): A data frame of interest that has both 'latitude' and 'longitude' columns.
    time_series (string): {'yearly', 'monthly'}, default 'yearly'
    Determine if the time series is the series of month or year.

    Returns:
    variable_dict (dict): [variable names: keys, variable data: values]

    Raises:
    ValueError: If time_series is neither 'monthly' nor 'yearly'.
    """

    allowed = ['monthly', 'yearly']
    if time_series not in allowed:
        raise ValueError(f"Invalid time series frequency. Choose from: {', '.join(allowed)}")

    location_index = pd.MultiIndex.from_frame(df[['latitude', 'longitude']])

    if time_series == 'yearly':
        # Only the year part of each 'YYYY-MM-DD' label is kept as column name.
        period_labels = [label.split('-')[0] for label in generate_year_dates(start_date.year, end_date.year)]
    else:
        period_labels = list(generate_month_dates(start_date, end_date))

    blank_table = pd.DataFrame(index=location_index, columns=period_labels)

    # Each variable gets its own copy so that filling one never touches another.
    return {name: blank_table.copy() for name in params['daily']}

def detect_exist_dataset(variable_dict: dict[str, pd.DataFrame], start_date: datetime, end_date: datetime, time_series: str = 'yearly') -> dict[str, pd.DataFrame]: 
    """
    Reload previously exported variable tables so they are not requested again.

    For every variable in 'variable_dict' the file
    '<variable>_<start_date>_<end_date>_<time_series>.csv' is looked up in the
    module-level 'output_folder'. When the file exists its content replaces the
    entry in 'variable_dict' (indexed by latitude/longitude when a 'latitude'
    column is present); otherwise the entry is left as it is.

    Args:
    variable_dict (dict): [variable names: keys, variable data: values]
    start_date (datetime): A starting date of time series.
    end_date (datetime): A ending date of time series.
    time_series (string): {'yearly', 'monthly'}, default 'yearly'
    Determine if the time series is the series of month or year.

    Returns:
    variable_dict (dict): The same dictionary object, updated in place.
    """

    first_day = datetime.strftime(start_date, '%Y-%m-%d')
    last_day = datetime.strftime(end_date, '%Y-%m-%d')

    for variable in variable_dict:
        csv_name = f"{variable}_{first_day}_{last_day}_{time_series}.csv"
        csv_path = os.path.join(output_folder, csv_name)

        # Nothing exported yet for this variable: keep the current entry.
        if not os.path.exists(csv_path):
            continue

        loaded = pd.read_csv(csv_path)
        variable_dict[variable] = loaded
        if 'latitude' in loaded.columns:
            variable_dict[variable] = loaded.set_index(['latitude', 'longitude'])

    return variable_dict

def export_csv(variable_dict: dict[str, pd.DataFrame], start_date: datetime, end_date: datetime, time_series: str = 'yearly') -> None:
    """
    Export each variable data to a csv file in the desired directory.

    Files are written to the module-level 'output_folder' and named
    '<variable>_<start_date>_<end_date>_<time_series>.csv', the same pattern
    'detect_exist_dataset' looks for. The data frame index is written too.

    Args:
    variable_dict (dict): [variable names: keys, variable data: values]
    start_date (datetime): A starting date of time series.
    end_date (datetime): A ending date of time series.
    time_series (string): {'yearly', 'monthly'}, default 'yearly'
    Determine if the time series is the series of month or year.
    """

    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(output_folder, exist_ok=True)

    first_day = datetime.strftime(start_date, '%Y-%m-%d')
    last_day = datetime.strftime(end_date, '%Y-%m-%d')

    for variable, table in variable_dict.items():
        file = f"{variable}_{first_day}_{last_day}_{time_series}.csv"
        table.to_csv(os.path.join(output_folder, file), index=True)

In [3]:
# Open-Meteo API client: requests go through a cached session wrapped with retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after=-1)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

# The path is relative: make sure to be in GIS_MANIPULATION directory
hazard_df = pd.read_csv('./dataset/ddpm_amphoe_hazard_dataset.csv')

# Bounds of the requested time series
start_date = datetime(2011, 1, 1)
end_date = datetime(2020, 12, 31)

# Folder used by export_csv / detect_exist_dataset
output_folder = './dataset/open-meteo'

url = "https://archive-api.open-meteo.com/v1/archive"

# Make sure all required weather variables are listed in "daily".
# Their order matters: the processing cell reads the response variables by position.
# "latitude" / "longitude" start empty and are filled in for every request batch.
params = {
    "latitude": [],
    "longitude": [],
    "start_date": start_date.strftime("%Y-%m-%d"),
    "end_date": end_date.strftime("%Y-%m-%d"),
    "daily": [
        "weather_code",
        "temperature_2m_max",
        "temperature_2m_min",
        "temperature_2m_mean",
        "apparent_temperature_max",
        "apparent_temperature_min",
        "apparent_temperature_mean",
        "sunrise",
        "sunset",
        "daylight_duration",
        "sunshine_duration",
        "precipitation_sum",
        "rain_sum",
        "snowfall_sum",
        "precipitation_hours",
        "wind_speed_10m_max",
        "wind_gusts_10m_max",
        "wind_direction_10m_dominant",
        "shortwave_radiation_sum",
        "et0_fao_evapotranspiration",
    ],
    "timezone": "Asia/Bangkok",
}

# _______________ SYSTEM CONFIG ___________________

In [4]:
variable_dict = create_variable_template(hazard_df, time_series='monthly')
variable_dict = detect_exist_dataset(variable_dict, start_date, end_date, time_series='monthly')

# Determine start point(index) for pulling data.
# A location counts as done only when its row is complete in EVERY variable table,
# so the resume point is the smallest number of complete rows across all tables
# (0 as soon as one table has none). This assumes the complete rows are the
# leading rows of each table, which is how the processing cell fills them.
start_point = min((len(table.dropna()) for table in variable_dict.values()), default=0)

step = 150  # number of locations sent in one API request
responses_list = []
total_locations = len(hazard_df)

# Start API request
print(f'start point: {start_point}')
for location_start in range(start_point, total_locations, step):
    location_stop = location_start + step
    batch = hazard_df.iloc[location_start:location_stop]

    # Build a fresh parameter dict per request instead of mutating the shared
    # `params`, and pass plain Python lists rather than pandas Series.
    request_params = {
        **params,
        'latitude': batch['latitude'].tolist(),
        'longitude': batch['longitude'].tolist(),
    }

    try:
        responses = openmeteo.weather_api(url, params=request_params)
    except Exception as e:
        # Best effort: keep what was downloaded so far (typically an API rate limit).
        print(f"Stop downloading at Start Point ({location_start}), Stop Point ({location_stop}): {e}")
        break

    responses_list.extend(responses)
    print(f"Downloaded: Start Point ({location_start}), Stop Point ({location_stop})")

    # Preventing Minutely API request limit exceed; no need to wait after the last batch.
    if location_stop < total_locations:
        sleep(10)

start point: 300
Downloaded: Start Point (300), Stop Point (450)
Stop downloading at Start Point (450), Stop Point (600): {'error': True, 'reason': 'Hourly API request limit exceeded. Please try again in the next hour.'}


In [None]:
# Process the location.
# responses_list[i] belongs to row `start_point + i` of every variable table,
# because the download cell requested the locations contiguously from start_point.
for response_position, response in enumerate(responses_list):

    daily = response.Daily()

    # Open-Meteo returns daily timestamps as UTC epoch seconds. With a request
    # timezone such as Asia/Bangkok each local day starts on the previous UTC
    # day, so the UTC offset is added to get local calendar dates; otherwise the
    # first day of every month is counted in the previous month.
    utc_offset_seconds = response.UtcOffsetSeconds()
    daily_data = {"date": pd.date_range(
        start=pd.to_datetime(daily.Time() + utc_offset_seconds, unit="s"),
        end=pd.to_datetime(daily.TimeEnd() + utc_offset_seconds, unit="s"),
        freq=pd.Timedelta(seconds=daily.Interval()),
        inclusive="left",
    )}

    # Process daily data. The order of variables needs to be the same as requested,
    # so the requested list itself drives the positional lookup.
    # NOTE(review): "sunrise"/"sunset" may need ValuesInt64AsNumpy() in the
    # Open-Meteo SDK - confirm that ValuesAsNumpy() returns real data for them.
    for variable_position, variable in enumerate(params["daily"]):
        daily_data[variable] = daily.Variables(variable_position).ValuesAsNumpy()

    daily_dataframe = pd.DataFrame(data=daily_data)
    monthly_dataframe = daily_dataframe.groupby(pd.Grouper(key='date', freq='1ME')).median()  # Group by month
    # Filter out dates that aren't in the time series if we accidentally got one.
    in_range = (monthly_dataframe.index >= start_date) & (monthly_dataframe.index <= end_date)
    monthly_dataframe = monthly_dataframe[in_range].T  # rows: variables, columns: months

    # Determine which location of data frame should we start inputting data
    target_row = start_point + response_position
    try:
        for variable in variable_dict.keys():
            # Values are written by position (month order), not by column label.
            variable_dict[variable].iloc[target_row] = monthly_dataframe.loc[variable].to_numpy()
    except Exception as e:
        print(f'Error at index {response_position} of response_list')
        print(f'Error message: {e}')

    print(f"Inputing {response.Latitude()}°N {response.Longitude()}°E, Elevation {response.Elevation()} m asl")

In [46]:
# Write every variable table to its own CSV file in output_folder
export_csv(
    variable_dict,
    start_date,
    end_date,
    time_series='monthly',
)