# Melbourne Weather Historical Data Collection (HOURLY)
# Research Notebook for Environmental Analysis - Raw Data Ingestion

## Overview
This notebook's **sole responsibility** is to fetch raw, high-resolution **hourly** historical weather data from the Open-Meteo API. It targets 23 monitoring locations across Melbourne, creating a foundational dataset for subsequent processing and analysis.

### Key Features:
- **Single Responsibility**: Focuses exclusively on ingesting raw hourly data.
- **High-Quality Source**: Utilizes the Open-Meteo API for comprehensive historical weather data.
- **Robust Retry Logic**: Automatically retries rate-limited requests (HTTP 429) with exponential backoff, up to 5 attempts per location; locations that still fail are logged and skipped.
- **Clean Raw Output**: Saves all data into a single, clean hourly CSV file, preserving the highest level of detail.

## 1. Environment Setup and Dependencies

In [6]:
# =============================================================================
# CELL 1: SETUP, IMPORTS, AND LOGGER CONFIGURATION
# =============================================================================
import os
import sys
import requests
import time
import logging
from datetime import datetime
from typing import Dict, List, Tuple
from tqdm.notebook import tqdm
import pandas as pd

# --- 1. UTF-8 Aware Logger Setup ---
def setup_logger(log_file='logs/02_weather_melbourne_data_collection.log', level=logging.INFO):
    """Configure the root logger to write to stdout and to a UTF-8 log file.

    Handlers already attached to the root logger are removed *and closed*
    first, so re-running the cell neither duplicates log lines nor leaves the
    previous log file handle open.

    Args:
        log_file: Path of the log file. It is opened in ``'w'`` mode, so an
            existing file is truncated. Missing parent directories are created.
        level: Minimum level set on the root logger.

    Returns:
        The configured root logger.
    """
    logger = logging.getLogger()
    logger.setLevel(level)

    # Iterate over a copy: removeHandler() mutates logger.handlers.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    # FileHandler does not create directories; without this a fresh checkout
    # (no 'logs/' folder yet) fails with FileNotFoundError.
    log_dir = os.path.dirname(log_file)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    console_handler = logging.StreamHandler(sys.stdout)
    console_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    console_handler.setFormatter(console_formatter)

    # Explicit UTF-8 so the emoji in log messages survive on any platform.
    file_handler = logging.FileHandler(log_file, mode='w', encoding='utf-8')
    file_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(file_formatter)

    logger.addHandler(console_handler)
    logger.addHandler(file_handler)

    return logger

# --- 2. Initialize Environment ---
# Configure logging once for the whole notebook, then emit the start-up banner.
logger = setup_logger()
for banner_line in (
    "🌦️ Melbourne HOURLY Weather Data Collection System (Open-Meteo)",
    "=" * 60,
    "🔬 Research Environment Initialized",
):
    logger.info(banner_line)

2025-06-20 23:32:57,811 - INFO - 🌦️ Melbourne HOURLY Weather Data Collection System (Open-Meteo)
2025-06-20 23:32:57,814 - INFO - 🔬 Research Environment Initialized


## 2. Configuration and Research Parameters

In [7]:
# =============================================================================
# CELL 2: RESEARCH CONFIGURATION FOR HOURLY WEATHER DATA
# =============================================================================

class WeatherDataConfig:
    """Settings for the Open-Meteo archive download.

    Args:
        start_date: First day to request, formatted ``YYYY-MM-DD``.
        end_date: Last day to request, formatted ``YYYY-MM-DD``.

    Raises:
        ValueError: If either date cannot be parsed as ``YYYY-MM-DD`` or if
            ``start_date`` is later than ``end_date``.
    """

    def __init__(self, start_date: str = "2020-11-25", end_date: str = "2025-01-04"):
        # Fail fast on a bad range instead of discovering it one rejected
        # request at a time.
        first_day = datetime.strptime(start_date, "%Y-%m-%d")
        last_day = datetime.strptime(end_date, "%Y-%m-%d")
        if first_day > last_day:
            raise ValueError(
                f"start_date ({start_date}) must not be after end_date ({end_date})"
            )

        self.BASE_URL = "https://archive-api.open-meteo.com/v1/archive"
        self.START_DATE = start_date
        self.END_DATE = end_date
        self.REQUEST_TIMEOUT = 60  # passed as the `timeout` of each HTTP request

        # The file name embeds the requested date range.
        self.OUTPUT_CSV_PATH = f"../../data/raw/melbourne_raw_weather_openmeteo_{self.START_DATE}_to_{self.END_DATE}.csv"

        # Location name -> [latitude, longitude]
        self.MONITORING_LOCATIONS = {
            "Melbourne CBD": [-37.8136, 144.9631],
            "Footscray": [-37.7997, 144.9020],
            "Brooklyn": [-37.8161, 144.8415],
            "Alphington": [-37.7833, 145.0333],
            "Spotswood": [-37.8335, 144.8863],
            "Box Hill": [-37.8185, 145.1225],
            "Brighton": [-37.9056, 145.0028],
            "Dandenong": [-37.9875, 145.2149],
            "Mooroolbark": [-37.7825, 145.3168],
            "Altona North": [-37.8410, 144.8490],
            "Melton": [-37.6833, 144.5833],
            "Point Cook": [-37.9148, 144.7509],
            "Macleod": [-37.7333, 145.0667],
            "Carlton": [-37.8001, 144.9656],
            "Richmond": [-37.8183, 145.0014],
            "St Kilda": [-37.8676, 144.9801],
            "Yarraville": [-37.8167, 144.9000],
            "Frankston": [-38.1421, 145.1256],
            "Ringwood": [-37.8136, 145.2306],
            "Werribee": [-37.9009, 144.6590],
            "Craigieburn": [-37.5986, 144.9425],
            "Pakenham": [-38.0753, 145.4834],
            "Broadmeadows": [-37.6839, 144.9169],
        }

# --- Initialize configuration ---
try:
    config = WeatherDataConfig()
except Exception as e:
    # Every later cell needs `config`; swallowing the error here would only
    # postpone the failure to a confusing NameError, so log it and re-raise.
    logger.error(f"Could not initialize configuration: {e}")
    raise
else:
    logger.info("🎯 Weather Research Configuration Loaded")
    logger.info(f"💾 Raw hourly data will be saved to: {config.OUTPUT_CSV_PATH}")

2025-06-20 23:32:57,860 - INFO - 🎯 Weather Research Configuration Loaded
2025-06-20 23:32:57,861 - INFO - 💾 Raw hourly data will be saved to: ../../data/raw/melbourne_raw_weather_openmeteo_2020-11-25_to_2025-01-04.csv


## 3. Data Collection Functions

In [8]:
# =============================================================================
# CELL 3: CORE HOURLY DATA COLLECTION CLASS
# =============================================================================

class WeatherDataCollector:
    """Handles fetching and saving raw hourly weather data from Open-Meteo.

    One HTTP request is made per monitoring location. Each successful response
    becomes a DataFrame tagged with its location name; all frames are finally
    concatenated and written to ``config.OUTPUT_CSV_PATH``.
    """

    def __init__(self, config: WeatherDataConfig):
        self.config = config
        self.session = requests.Session()
        self.all_hourly_dataframes = []  # One DataFrame per successfully fetched location
        self.failed_locations = []       # Names of locations that could not be fetched

    def fetch_for_location(self, location: str, lat: float, lon: float) -> None:
        """Fetch hourly data for one location and append it to the result list.

        Rate-limit responses (HTTP 429) are retried with exponential backoff
        plus jitter; other request errors are retried after a fixed pause.
        Client errors that a retry cannot fix (4xx other than 408/429) and
        malformed payloads give up immediately. On any failure the location is
        recorded in ``self.failed_locations``; nothing is raised to the caller.

        Args:
            location: Name stored in the ``location`` column.
            lat: Latitude sent to the API.
            lon: Longitude sent to the API.
        """
        params = {
            "latitude": lat, "longitude": lon,
            "start_date": self.config.START_DATE, "end_date": self.config.END_DATE,
            "hourly": "temperature_2m,relative_humidity_2m,precipitation,surface_pressure,wind_speed_10m",
            "timezone": "Australia/Melbourne"
        }
        max_retries = 5
        base_backoff_seconds = 5

        for attempt in range(max_retries):
            # There is no point sleeping after the last attempt: nothing follows it.
            is_last_attempt = attempt == max_retries - 1
            try:
                response = self.session.get(self.config.BASE_URL, params=params, timeout=self.config.REQUEST_TIMEOUT)
                if response.status_code == 429:
                    if is_last_attempt:
                        logger.warning(f"🚦 Rate limit for {location} on final attempt ({attempt + 1}/{max_retries}).")
                        break
                    # Exponential backoff with up to 1s of random jitter.
                    wait_time = base_backoff_seconds * (2 ** attempt) + (os.urandom(1)[0] / 255.0)
                    logger.warning(f"🚦 Rate limit for {location}. Retrying in {wait_time:.1f}s... (Attempt {attempt + 1}/{max_retries})")
                    time.sleep(wait_time)
                    continue
                response.raise_for_status()

                data = response.json()
                df = pd.DataFrame(data['hourly'])

            except requests.exceptions.RequestException as e:
                logger.error(f"❌ Request failed for {location} on attempt {attempt + 1}: {e}")
                # Compare against None explicitly: a Response with an error
                # status is falsy, so a plain truthiness test would hide it.
                status = getattr(getattr(e, "response", None), "status_code", None)
                if status is not None and 400 <= status < 500 and status != 408:
                    logger.error(f"⛔ HTTP {status} for {location} is not retryable. It will be skipped.")
                    self.failed_locations.append(location)
                    return
                if not is_last_attempt:
                    time.sleep(base_backoff_seconds)

            except (KeyError, TypeError, ValueError) as e:
                # The body was not the expected {'hourly': {...}} structure.
                # Skip this location instead of aborting the whole run.
                logger.error(f"❌ Unexpected response payload for {location}: {e!r}. It will be skipped.")
                self.failed_locations.append(location)
                return

            else:
                df['location'] = location
                self.all_hourly_dataframes.append(df)
                logger.info(f"✅ Fetched {len(df)} hourly records for {location}.")
                return

        logger.error(f"💥 All {max_retries} retries failed for {location}. It will be skipped.")
        self.failed_locations.append(location)

    def run_collection_and_save(self) -> Tuple[bool, int]:
        """Fetch every configured location and write the combined CSV.

        Returns:
            ``(success, total_records)``. ``success`` is True only when every
            location was fetched and the file was written. ``total_records``
            is the number of rows written, or 0 when nothing was saved.
        """
        logger.info("🚀 Starting hourly weather data collection...")
        # Reset state so the method can be re-run on the same instance.
        self.all_hourly_dataframes = []
        self.failed_locations = []

        location_progress = tqdm(self.config.MONITORING_LOCATIONS.items(), desc="🌍 Fetching Locations", unit="location")
        for location, (lat, lon) in location_progress:
            location_progress.set_postfix_str(f"📍 {location}")
            self.fetch_for_location(location, lat, lon)

        if not self.all_hourly_dataframes:
            logger.error("No data was collected. Aborting save.")
            return False, 0

        # Combine all collected dataframes and save to a single CSV
        try:
            final_df = pd.concat(self.all_hourly_dataframes, ignore_index=True)
            final_df = final_df.rename(columns={
                'time': 'datetime',
                'temperature_2m': 'temperature',
                'relative_humidity_2m': 'humidity',
                'surface_pressure': 'pressure',
                'wind_speed_10m': 'wind_speed'
            })

            # Reorder columns for consistency
            cols_order = ['location', 'datetime', 'temperature', 'humidity', 'precipitation', 'pressure', 'wind_speed']
            final_df = final_df[cols_order]

            # to_csv does not create missing directories.
            output_dir = os.path.dirname(self.config.OUTPUT_CSV_PATH)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

            final_df.to_csv(self.config.OUTPUT_CSV_PATH, index=False, encoding='utf-8')
            total_records = len(final_df)
            logger.info(f"💾 Combined and saved {total_records:,} hourly records to disk.")
            return not self.failed_locations, total_records
        except Exception as e:
            # Boundary catch-all: report any combine/save failure via the return value.
            logger.critical(f"💥 CRITICAL ERROR during final save: {e}")
            return False, 0

# --- Initialize the collector ---
# Single shared instance; the pipeline cell below drives it.
collector = WeatherDataCollector(config=config)
logger.info("🔧 Hourly weather data collector class defined and instance created.")

2025-06-20 23:32:57,945 - INFO - 🔧 Hourly weather data collector class defined and instance created.


## 4. Execute Data Collection

In [9]:
# =============================================================================
# CELL 4: DATA COLLECTION ORCHESTRATOR
# =============================================================================

def run_data_collection_pipeline() -> None:
    """Run the collector, then log and print a summary of the outcome.

    Uses the module-level ``collector``, ``config`` and ``logger``. The
    summary distinguishes three outcomes: full success, a partial result
    (file written but some locations failed), and nothing saved at all.
    """
    start_time = datetime.now()
    logger.info(f"🕐 Collection started at: {start_time.strftime('%Y-%m-%d %H:%M:%S')}")

    success, total_records = collector.run_collection_and_save()

    duration = datetime.now() - start_time
    # The collector reports 0 records when it had no data or the save failed.
    records_saved = total_records > 0

    logger.info("=" * 60)
    if success:
        logger.info("✅ HOURLY DATA COLLECTION COMPLETED SUCCESSFULLY!")
    else:
        logger.warning(f"⚠️ COLLECTION COMPLETED WITH ERRORS. Failed locations: {collector.failed_locations}")

    # Only claim the data was saved when records were actually written.
    if records_saved:
        logger.info(f"📁 Data saved to: {config.OUTPUT_CSV_PATH}")
    else:
        logger.error("📁 No hourly records were saved to disk.")
    logger.info(f"📊 Total hourly records collected: {total_records:,}")
    logger.info(f"⏱️  Total duration: {duration}")
    if duration.total_seconds() > 0:
        logger.info(f"🎯 Collection efficiency: {total_records / duration.total_seconds():.2f} records/second")

    if success:
        print("\n🎉 Raw hourly data is ready for the next processing step.")
    elif records_saved:
        print("\n❌ DATA COLLECTION FAILED for some locations. Check logs for details.")
    else:
        print("\n❌ DATA COLLECTION FAILED: no data was saved. Check logs for details.")

# --- Execute the Pipeline ---
# Runs the whole fetch-and-save pipeline as soon as this cell is executed.
run_data_collection_pipeline()

2025-06-20 23:32:58,021 - INFO - 🕐 Collection started at: 2025-06-20 23:32:58
2025-06-20 23:32:58,023 - INFO - 🚀 Starting hourly weather data collection...


🌍 Fetching Locations:   0%|          | 0/23 [00:00<?, ?location/s]

2025-06-20 23:33:01,142 - INFO - ✅ Fetched 36048 hourly records for Melbourne CBD.
2025-06-20 23:33:02,656 - INFO - ✅ Fetched 36048 hourly records for Footscray.
2025-06-20 23:33:04,159 - INFO - ✅ Fetched 36048 hourly records for Brooklyn.
2025-06-20 23:33:06,926 - INFO - ✅ Fetched 36048 hourly records for Alphington.
2025-06-20 23:33:08,698 - INFO - ✅ Fetched 36048 hourly records for Spotswood.
2025-06-20 23:33:10,239 - INFO - ✅ Fetched 36048 hourly records for Box Hill.
2025-06-20 23:33:11,941 - INFO - ✅ Fetched 36048 hourly records for Brighton.
2025-06-20 23:33:13,533 - INFO - ✅ Fetched 36048 hourly records for Dandenong.
2025-06-20 23:33:15,009 - INFO - ✅ Fetched 36048 hourly records for Mooroolbark.
2025-06-20 23:33:16,514 - INFO - ✅ Fetched 36048 hourly records for Altona North.
2025-06-20 23:33:18,088 - INFO - ✅ Fetched 36048 hourly records for Melton.
2025-06-20 23:33:19,560 - INFO - ✅ Fetched 36048 hourly records for Point Cook.
2025-06-20 23:33:21,072 - INFO - ✅ Fetched 3604

## 5. Data Validation (Quick Check)

In [11]:
# =============================================================================
# CELL 5: QUICK DATA VALIDATION
# =============================================================================

def validate_output_file():
    """Print a short sanity report for the CSV at ``config.OUTPUT_CSV_PATH``.

    Logs an error and returns early when the file does not exist. Otherwise
    the file is loaded and its shape, number of distinct locations, column
    names, dtypes and first rows are shown. Any exception raised while
    reading or reporting is logged rather than propagated.
    """
    csv_path = config.OUTPUT_CSV_PATH

    if not os.path.exists(csv_path):
        logger.error(f"❌ No data file found at '{csv_path}'.")
        return

    logger.info(f"📊 Validating output file: '{csv_path}'...")
    try:
        frame = pd.read_csv(csv_path)

        print("\n--- File Validation Report ---")
        print(f"   - File shape (rows, columns): {frame.shape}")
        print(f"   - Number of unique locations: {frame['location'].nunique()}")

        print("\nColumns found:")
        print(frame.columns.tolist())

        print("\nData types:")
        print(frame.dtypes)

        print("\nSample data:")
        display(frame.head())  # `display` is not defined in this cell; expected from the notebook runtime

        print("--- End of Report ---")
    except Exception as e:
        logger.error(f"❌ Validation failed: {e}")

# --- Run the validation ---
# Reads the CSV at config.OUTPUT_CSV_PATH and prints a short report.
validate_output_file()

2025-06-20 23:36:41,000 - INFO - 📊 Validating output file: '../../data/raw/melbourne_raw_weather_openmeteo_2020-11-25_to_2025-01-04.csv'...

--- File Validation Report ---
   - File shape (rows, columns): (829104, 7)
   - Number of unique locations: 23

Columns found:
['location', 'datetime', 'temperature', 'humidity', 'precipitation', 'pressure', 'wind_speed']

Data types:
location          object
datetime          object
temperature      float64
humidity           int64
precipitation    float64
pressure         float64
wind_speed       float64
dtype: object

Sample data:


Unnamed: 0,location,datetime,temperature,humidity,precipitation,pressure,wind_speed
0,Melbourne CBD,2020-11-25T00:00,15.1,87,0.0,1010.9,4.2
1,Melbourne CBD,2020-11-25T01:00,14.1,90,0.0,1010.2,5.1
2,Melbourne CBD,2020-11-25T02:00,14.4,90,0.0,1009.8,0.4
3,Melbourne CBD,2020-11-25T03:00,12.6,97,0.0,1009.6,3.1
4,Melbourne CBD,2020-11-25T04:00,11.7,99,0.0,1009.5,4.5


--- End of Report ---
