# Weather Data Parsing for Korkeasaari Zoo

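This notebook fetches historical daily weather data (daily rain sum and daily maximum 2 m temperature) from the Open-Meteo archive API for Korkeasaari Zoo's location and saves it to `../data/clean/weather.csv` for further analysis. Only dates that are present in the visitor data but missing from the saved weather file are fetched. The data will be used as predictors in our visitor count model.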

In [94]:
import pandas as pd
from datetime import timedelta
from pathlib import Path
import openmeteo_requests
import requests_cache
from retry_requests import retry

In [74]:
# --- Location ---------------------------------------------------------------
# Korkeasaari coordinates (sent as latitude/longitude to the weather API)
LATITUDE = 60.1783
LONGITUDE = 24.9883

# --- Filesystem layout ------------------------------------------------------
# All paths are relative to the notebook's working directory.
DATA_DIR = Path("../data")
CLEAN_DATA_DIR = DATA_DIR.joinpath("clean")
WEATHER_CSV = CLEAN_DATA_DIR.joinpath("weather.csv")

# Create the clean-data folder up front so later reads/writes cannot fail on a
# missing directory; a no-op when it already exists.
CLEAN_DATA_DIR.mkdir(exist_ok=True, parents=True)

# --- Open-Meteo API client --------------------------------------------------
# HTTP responses are cached in ".cache" (expire_after=3600) and failed requests
# are retried up to 5 times with a backoff factor of 0.2.
cache_session = requests_cache.CachedSession(".cache", expire_after=3600)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

In [75]:
def fetch_weather_data(start_date: str, end_date: str) -> pd.DataFrame:
    """
    Fetch daily weather data from the Open-Meteo archive API for Korkeasaari.

    Requests the daily ``rain_sum`` and ``temperature_2m_max`` variables for
    the configured LATITUDE/LONGITUDE with ``timezone="Europe/Helsinki"``.

    Args:
        start_date: Start date in YYYY-MM-DD format
        end_date: End date in YYYY-MM-DD format

    Returns:
        DataFrame with one row per day, indexed by ``date`` (timezone-aware
        UTC timestamps), with columns ``rain_sum`` and ``temperature_2m_max``.
    """
    url = "https://archive-api.open-meteo.com/v1/archive"

    # The response exposes variables by position only, so this list is the
    # single source of truth for both the request and the column names.
    daily_variables = ["rain_sum", "temperature_2m_max"]

    params = {
        "latitude": LATITUDE,
        "longitude": LONGITUDE,
        "start_date": start_date,
        "end_date": end_date,
        "daily": daily_variables,
        "timezone": "Europe/Helsinki",
    }

    responses = openmeteo.weather_api(url, params=params)
    response = responses[0]

    # Process daily data. The order of variables needs to be the same as requested.
    daily = response.Daily()
    columns = {
        name: daily.Variables(position).ValuesAsNumpy()
        for position, name in enumerate(daily_variables)
    }

    # Build exactly one label per returned value instead of deriving the index
    # from Time()/TimeEnd(): a start/end span that is not an exact multiple of
    # one day would otherwise yield an index whose length differs from the
    # data and make DataFrame construction fail.
    #
    # Labels start one day after Time(), which is what the previous
    # `inclusive="right"` range produced.
    # NOTE(review): presumably chosen so the UTC calendar date of each label
    # equals the local Helsinki date — confirm against the API's Time() semantics.
    first_day_start = pd.to_datetime(daily.Time(), unit="s", utc=True)
    n_days = len(columns[daily_variables[0]])
    dates = pd.date_range(
        start=first_day_start + pd.Timedelta(days=1),
        periods=n_days,
        freq="D",
        name="date",
    )

    return pd.DataFrame(data=columns, index=dates)

In [122]:
def load_and_check_weather_data(
    visitor_data: pd.DataFrame = None,
) -> tuple[pd.DataFrame, list]:
    """
    Read the saved weather CSV and report which visitor dates it lacks.

    Args:
        visitor_data: Optional DataFrame with a datetime index; when given,
            its dates are compared (as YYYY-MM-DD strings) with the dates
            already present in the weather file.

    Returns:
        Tuple of (weather_data DataFrame, list of missing dates). The list
        holds YYYY-MM-DD strings and is empty when no visitor data is given.
        When the weather CSV does not exist, an empty DataFrame and an empty
        list are returned.
    """
    # Nothing saved yet: hand back an empty frame so the caller can fetch all.
    if not WEATHER_CSV.exists():
        return pd.DataFrame(), []

    stored_weather = pd.read_csv(WEATHER_CSV, parse_dates=["date"], index_col="date")

    # Calendar days already covered by the saved weather table
    covered_days = stored_weather.index.unique().strftime("%Y-%m-%d")

    # Without visitor data there is nothing to compare against.
    if visitor_data is None:
        return stored_weather, []

    # Visitor days that have no matching weather row
    wanted_days = visitor_data.index.strftime("%Y-%m-%d")
    is_covered = wanted_days.isin(covered_days)
    uncovered_days = wanted_days[~is_covered]

    return stored_weather, uncovered_days.tolist()

In [125]:
# Load visitor data to get date range
visitor_data = pd.read_csv(
    CLEAN_DATA_DIR / "visitors.csv", parse_dates=["date"], index_col="date"
)

print(
    f"Visitor data date range: {visitor_data.index.min()} to {visitor_data.index.max()}"
)

# Load existing weather data and check for missing dates
weather_data, missing_dates = load_and_check_weather_data(visitor_data)

if len(weather_data) == 0:
    print("No existing weather data found. Fetching all data...")
    # Fetch all data
    weather_data = fetch_weather_data(
        visitor_data.index.min().strftime("%Y-%m-%d"),
        visitor_data.index.max().strftime("%Y-%m-%d"),
    )
elif len(missing_dates) > 0:
    print(f"Found {len(missing_dates)} missing dates. Fetching missing data...")

    # Sort missing dates and group them into continuous ranges so that each
    # run of consecutive days needs only one API request.
    missing_dates = pd.to_datetime(missing_dates).sort_values()
    ranges = []
    range_start = missing_dates[0]
    prev_date = missing_dates[0]

    for date in missing_dates[1:]:
        # A gap of more than one day closes the current range.
        if date - prev_date > timedelta(days=1):
            ranges.append((range_start, prev_date))
            range_start = date
        prev_date = date
    ranges.append((range_start, prev_date))

    # Fetch data for each range, then combine everything in a single concat
    # (avoids re-copying the growing frame on every iteration).
    frames = [weather_data]
    for start, end in ranges:
        print(f"Fetching data from {start} to {end}")
        frames.append(
            fetch_weather_data(start.strftime("%Y-%m-%d"), end.strftime("%Y-%m-%d"))
        )
    weather_data = pd.concat(frames)

    # Remove duplicates BEFORE sorting: the boolean mask is positional, so it
    # must be computed on and applied to the same (unsorted) frame. Rows
    # already on disk come first in the concat and therefore win over
    # re-fetched rows with the same timestamp.
    weather_data = weather_data.loc[~weather_data.index.duplicated()].sort_index()
else:
    print("Weather data is up to date!")

Visitor data date range: 2018-01-01 00:00:00 to 2023-12-31 00:00:00
Found 1 missing dates. Fetching missing data...
Fetching data from 2023-12-31 00:00:00 to 2023-12-31 00:00:00


In [126]:
# Save the daily weather table to WEATHER_CSV — the same path the loader reads,
# so the next run picks up exactly what was written here.
weather_data.to_csv(WEATHER_CSV)
print(f"Saved daily weather to {WEATHER_CSV}")

# Show summary statistics of the daily weather data
print("\nWeather Data Summary:")
print(weather_data.describe())

Saved daily weather to ../data/clean/weather.csv

Weather Data Summary:
          rain_sum  temperature_2m_max
count  2192.000000         2192.000000
mean      1.642929            9.444744
std       3.527515            8.780896
min       0.000000          -15.128500
25%       0.000000            2.371500
50%       0.100000            8.971499
75%       1.425000           17.321500
max      37.000000           31.321500
