# Melbourne Weather Historical Data Collection
# Research Notebook for Environmental Analysis

## Overview
This notebook fetches historical weather data for the same locations as the air quality monitoring stations across Melbourne. The primary data source is the Australian Bureau of Meteorology (BOM), typically accessed via pre-compiled CSV files for specific weather stations.

### Key Features:
- **Aligned Locations**: Uses the same 23 locations as the air quality notebook for easy data merging.
- **Simulated Download**: Simulates the process of downloading and processing individual station CSVs.
- **Robust Error Handling**: Manages file not found errors and data processing issues.
- **Research-Ready Output**: Aggregates data into a single, clean CSV file.
- **Professional Logging**: Comprehensive, UTF-8 aware logging for traceability.

## 1. Environment Setup and Dependencies

In [1]:
# =============================================================================
# CELL 1: SETUP, IMPORTS, AND LOGGER CONFIGURATION
# =============================================================================
"""
Melbourne Weather Data Collection System
========================================

This notebook collects historical weather data from the Bureau of Meteorology (BOM)
for locations corresponding to EPA Victoria monitoring stations.

Author: Research Team
Date: 2025
Purpose: Environmental analysis and building a multimodal dataset.
"""

import os
import sys
import csv
import logging
from datetime import datetime
from typing import Dict, List, Tuple, Any
from tqdm.notebook import tqdm
import pandas as pd

# --- 1. UTF-8 Aware Logger Setup (Identical to previous notebook for consistency) ---
def setup_logger(log_file='logs/02_weather_melbourne_data_collection.log', level=logging.INFO):
    """Configures a logger to be UTF-8 aware for both console and file output."""
    logger = logging.getLogger()
    logger.setLevel(level)
    
    if logger.hasHandlers():
        logger.handlers.clear()

    # Console Handler
    console_handler = logging.StreamHandler(sys.stdout)
    console_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    console_handler.setFormatter(console_formatter)
    
    # File Handler
    file_handler = logging.FileHandler(log_file, mode='w', encoding='utf-8')
    file_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(file_formatter)

    logger.addHandler(console_handler)
    logger.addHandler(file_handler)
    
    return logger

# --- 2. Initialize Environment ---
logger = setup_logger()

logger.info("🌦️ Melbourne Weather Data Collection System")
logger.info("=" * 50)
logger.info("🔬 Research Environment Initialized")
logger.info("📋 Dependencies loaded and UTF-8 logger configured.")

2025-06-20 20:54:27,166 - INFO - 🌦️ Melbourne Weather Data Collection System
2025-06-20 20:54:27,168 - INFO - 🔬 Research Environment Initialized
2025-06-20 20:54:27,170 - INFO - 📋 Dependencies loaded and UTF-8 logger configured.


## 2. Configuration and Research Parameters

In [2]:
# =============================================================================
# CELL 2: RESEARCH CONFIGURATION FOR BOM DATA
# =============================================================================

class WeatherDataConfig:
    """Configuration class for BOM weather data collection."""
    
    def __init__(self):
        # Research Period (Should match the air quality data for consistency)
        self.START_DATE = datetime(2020, 11, 25)
        self.END_DATE = datetime(2025, 1, 4)
        
        # Data Source Configuration (Simulating downloaded BOM CSVs)
        # In a real-world scenario, you would have a list of URLs to download from.
        # Here, we assume they are pre-downloaded and stored locally.
        self.BOM_DATA_SOURCE_DIR = "../data/bom_station_data/"
        
        # Output Configuration
        self.OUTPUT_CSV_PATH = f"../../data/raw/melbourne_raw_weather_{self.START_DATE.strftime('%Y%m%d')}_to_{self.END_DATE.strftime('%Y%m%d')}.csv"
        
        # Mapping from our research locations to the nearest BOM station ID and its filename
        # NOTE: This is a CRITICAL mapping step. You must identify the closest/most representative BOM station for each location.
        # The station ID (e.g., '086071') is key.
        self.LOCATION_TO_BOM_STATION = {
            "Melbourne CBD": {"id": "086071", "file": "BOM_station_086071_Melbourne.csv"},
            "Footscray": {"id": "086071", "file": "BOM_station_086071_Melbourne.csv"},
            "Brooklyn": {"id": "087031", "file": "BOM_station_087031_Laverton.csv"},
            "Alphington": {"id": "086071", "file": "BOM_station_086071_Melbourne.csv"},
            "Spotswood": {"id": "087031", "file": "BOM_station_087031_Laverton.csv"},
            "Box Hill": {"id": "086068", "file": "BOM_station_086068_Scoresby.csv"},
            "Brighton": {"id": "086077", "file": "BOM_station_086077_Moorabbin.csv"},
            "Dandenong": {"id": "086068", "file": "BOM_station_086068_Scoresby.csv"},
            "Mooroolbark": {"id": "086068", "file": "BOM_station_086068_Scoresby.csv"},
            "Altona North": {"id": "087031", "file": "BOM_station_087031_Laverton.csv"},
            "Melton": {"id": "087031", "file": "BOM_station_087031_Laverton.csv"}, # Assuming Laverton is closest
            "Point Cook": {"id": "087031", "file": "BOM_station_087031_Laverton.csv"},
            "Macleod": {"id": "086071", "file": "BOM_station_086071_Melbourne.csv"},
            "Carlton": {"id": "086071", "file": "BOM_station_086071_Melbourne.csv"},
            "Richmond": {"id": "086071", "file": "BOM_station_086071_Melbourne.csv"},
            "St Kilda": {"id": "086071", "file": "BOM_station_086071_Melbourne.csv"},
            "Yarraville": {"id": "086071", "file": "BOM_station_086071_Melbourne.csv"},
            "Frankston": {"id": "086077", "file": "BOM_station_086077_Moorabbin.csv"}, # Assuming Moorabbin
            "Ringwood": {"id": "086068", "file": "BOM_station_086068_Scoresby.csv"},
            "Werribee": {"id": "087031", "file": "BOM_station_087031_Laverton.csv"},
            "Craigieburn": {"id": "086038", "file": "BOM_station_086038_Essendon.csv"},
            "Pakenham": {"id": "086068", "file": "BOM_station_086068_Scoresby.csv"},
            "Broadmeadows": {"id": "086038", "file": "BOM_station_086038_Essendon.csv"},
        }
        
        # These are the headers for our FINAL aggregated CSV file
        self.CSV_HEADERS = [
            "location", "date", "t_max", "t_min", "precip", "humidity_9am", "wind_speed_9am"
        ]

# --- Initialize configuration ---
try:
    config = WeatherDataConfig()
    logger.info(f"🎯 Weather Research Configuration Loaded")
    logger.info(f"📍 Processing {len(config.LOCATION_TO_BOM_STATION)} locations")
    logger.info(f"💾 Output file will be: {config.OUTPUT_CSV_PATH}")
except Exception as e:
    logger.error(f"Could not initialize configuration: {e}")


2025-06-20 21:05:43,469 - INFO - 🎯 Weather Research Configuration Loaded
2025-06-20 21:05:43,471 - INFO - 📍 Processing 23 locations
2025-06-20 21:05:43,472 - INFO - 💾 Output file will be: ../../data/raw/melbourne_raw_weather_20201125_to_20250104.csv


## 3. Data Collection Functions

In [3]:
# =============================================================================
# CELL 3: CORE DATA COLLECTION CLASS FOR BOM DATA
# =============================================================================

class WeatherDataCollector:
    """Handles the mechanics of reading, processing, and aggregating BOM CSVs."""
    
    def __init__(self, config: WeatherDataConfig):
        self.config = config
        self.total_records = 0
        self.processed_files = set() # To avoid reading the same file multiple times
        self.station_dataframes = {} # Cache for loaded data

    def _read_and_cache_station_data(self, station_file: str) -> bool:
        """Reads a BOM station CSV file and stores it in a cache."""
        if station_file in self.processed_files:
            return True # Already loaded
            
        full_path = os.path.join(self.config.BOM_DATA_SOURCE_DIR, station_file)
        try:
            # BOM CSVs often have header rows to skip
            df = pd.read_csv(full_path, skiprows=8)
            # Standardize column names
            df.rename(columns={
                'Date': 'date',
                'Maximum temperature (Degree C)': 't_max',
                'Minimum temperature (Degree C)': 't_min',
                'Rainfall (mm)': 'precip',
                '9am relative humidity (%)': 'humidity_9am',
                '9am wind speed (km/h)': 'wind_speed_9am'
            }, inplace=True)
            df['date'] = pd.to_datetime(df['date'])
            self.station_dataframes[station_file] = df
            self.processed_files.add(station_file)
            logger.info(f"✅ Successfully loaded and cached data from '{station_file}'.")
            return True
        except FileNotFoundError:
            logger.error(f"❌ File not found: {full_path}. This location will be skipped.")
            return False
        except Exception as e:
            logger.error(f"❌ Failed to process file {full_path}: {e}")
            return False

    def _get_data_for_location(self, location_name: str, station_file: str) -> List[Dict]:
        """Extracts data for a specific location from the cached DataFrame."""
        if station_file not in self.station_dataframes:
            return []
        
        df = self.station_dataframes[station_file]
        
        # Filter by the research date range
        mask = (df['date'] >= self.config.START_DATE) & (df['date'] <= self.config.END_DATE)
        df_filtered = df.loc[mask].copy()
        
        # Add the research location name to each record
        df_filtered['location'] = location_name
        
        # Convert to a list of dictionaries for writing
        return df_filtered[self.config.CSV_HEADERS].to_dict('records')

    def collect_all_data(self) -> Tuple[bool, int]:
        """Orchestrates the entire data aggregation process."""
        logger.info("🚀 Starting comprehensive weather data aggregation...")
        self.total_records = 0
        
        # Step 1: Pre-load all necessary BOM station files
        unique_station_files = {v['file'] for v in self.config.LOCATION_TO_BOM_STATION.values()}
        for station_file in tqdm(unique_station_files, desc="📂 Caching Station Files"):
            self._read_and_cache_station_data(station_file)
            
        # Step 2: Write data for each location to the final CSV
        try:
            with open(self.config.OUTPUT_CSV_PATH, mode="w", newline="", encoding="utf-8") as file:
                writer = csv.DictWriter(file, fieldnames=self.config.CSV_HEADERS)
                writer.writeheader()
                location_progress = tqdm(self.config.LOCATION_TO_BOM_STATION.items(), desc="🌍 Processing Locations", unit="location")
                
                for location, station_info in location_progress:
                    location_progress.set_postfix_str(f"📍 {location}")
                    records = self._get_data_for_location(location, station_info['file'])
                    if records:
                        writer.writerows(records)
                        self.total_records += len(records)
            return True, self.total_records
        except IOError as e:
            logger.critical(f"💥 CRITICAL FILE ERROR: Could not write to {self.config.OUTPUT_CSV_PATH}. Reason: {e}")
            return False, 0

# --- Initialize the collector ---
collector = WeatherDataCollector(config)
logger.info("🔧 Weather data collector class defined and instance created.")


2025-06-20 21:05:48,027 - INFO - 🔧 Weather data collector class defined and instance created.


## 4. Execute Data Collection

In [4]:
# =============================================================================
# CELL 4: DATA COLLECTION ORCHESTRATOR (Identical structure to previous notebook)
# =============================================================================

def _print_results_summary(start_time: datetime, end_time: datetime, total_records: int):
    """Logs the summary of the data collection results."""
    duration = end_time - start_time
    logger.info("=" * 60)
    logger.info("✅ WEATHER DATA AGGREGATION COMPLETED SUCCESSFULLY!")
    logger.info(f"📁 Data saved to: {config.OUTPUT_CSV_PATH}")
    logger.info(f"📊 Total records aggregated: {total_records}")
    logger.info(f"⏱️  Total duration: {duration}")

def _print_final_guidance(success: bool):
    """Prints helpful next steps or troubleshooting advice to the console."""
    if success:
        print("\n🎉 Ready for next stage of processing! See logs for details.")
    else:
        print("\n❌ DATA COLLECTION FAILED. Check the log file for detailed error information.")

def run_data_collection_pipeline() -> bool:
    """Orchestrates the entire data collection process."""
    logger.info(f"🕐 Collection started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    start_time = datetime.now()
    
    success, total_records = collector.collect_all_data()
    
    end_time = datetime.now()
    
    if success:
        _print_results_summary(start_time, end_time, total_records)
    
    return success

# --- Execute the Pipeline ---
was_successful = run_data_collection_pipeline()
_print_final_guidance(was_successful)


2025-06-20 21:05:53,542 - INFO - 🕐 Collection started at: 2025-06-20 21:05:53
2025-06-20 21:05:53,543 - INFO - 🚀 Starting comprehensive weather data aggregation...


📂 Caching Station Files:   0%|          | 0/5 [00:00<?, ?it/s]

2025-06-20 21:05:53,577 - ERROR - ❌ File not found: ../data/bom_station_data/BOM_station_086068_Scoresby.csv. This location will be skipped.
2025-06-20 21:05:53,579 - ERROR - ❌ File not found: ../data/bom_station_data/BOM_station_087031_Laverton.csv. This location will be skipped.
2025-06-20 21:05:53,580 - ERROR - ❌ File not found: ../data/bom_station_data/BOM_station_086071_Melbourne.csv. This location will be skipped.
2025-06-20 21:05:53,582 - ERROR - ❌ File not found: ../data/bom_station_data/BOM_station_086077_Moorabbin.csv. This location will be skipped.
2025-06-20 21:05:53,583 - ERROR - ❌ File not found: ../data/bom_station_data/BOM_station_086038_Essendon.csv. This location will be skipped.


🌍 Processing Locations:   0%|          | 0/23 [00:00<?, ?location/s]

2025-06-20 21:05:53,606 - INFO - ✅ WEATHER DATA AGGREGATION COMPLETED SUCCESSFULLY!
2025-06-20 21:05:53,607 - INFO - 📁 Data saved to: ../../data/raw/melbourne_raw_weather_20201125_to_20250104.csv
2025-06-20 21:05:53,608 - INFO - 📊 Total records aggregated: 0
2025-06-20 21:05:53,609 - INFO - ⏱️  Total duration: 0:00:00.059187

🎉 Ready for next stage of processing! See logs for details.


## 5. Data Analysis and Validation

In [5]:
# =============================================================================
# CELL 5: DATA ANALYSIS AND VALIDATION
# =============================================================================

def analyze_collected_data():
    """Perform initial analysis and validation of the collected data CSV."""
    if not os.path.exists(config.OUTPUT_CSV_PATH):
        logger.error(f"❌ No data file found at '{config.OUTPUT_CSV_PATH}'. Please run data collection first.")
        return None
    
    logger.info(f"📊 Analyzing collected data from '{config.OUTPUT_CSV_PATH}'...")
    try:
        df = pd.read_csv(config.OUTPUT_CSV_PATH)
        
        print("\n--- Data Analysis Report ---")
        print(f"📈 Dataset Overview:")
        print(f"   - Total records: {len(df):,}")
        print(f"   - Unique Locations: {df['location'].nunique()}")
        
        print(f"\n📍 Location Coverage (Top 10):")
        print(df['location'].value_counts().head(10))
        
        print(f"\n🔬 Data Completeness per Parameter:")
        weather_params = ['t_max', 't_min', 'precip', 'humidity_9am', 'wind_speed_9am']
        for param in weather_params:
            if param in df.columns:
                percentage = df[param].notna().mean() * 100
                print(f"   - {param}: {percentage:.1f}% complete")
        
        print(f"\n✅ Data Quality Checks:")
        print(f"   - Duplicate rows: {df.duplicated().sum()}")
        print(f"   - Rows with missing dates: {df['date'].isnull().sum()}")
        print("--- End of Report ---")
        
        return df
    except Exception as e:
        logger.error(f"❌ Analysis failed: {e}")
        return None

# --- Run the analysis on the generated file ---
df_analysis = analyze_collected_data()
if df_analysis is not None:
    print("\nDataFrame returned to 'df_analysis' variable. You can now use it for further work.")


2025-06-20 21:07:54,042 - INFO - 📊 Analyzing collected data from '../../data/raw/melbourne_raw_weather_20201125_to_20250104.csv'...

--- Data Analysis Report ---
📈 Dataset Overview:
   - Total records: 0
   - Unique Locations: 0

📍 Location Coverage (Top 10):
Series([], Name: location, dtype: int64)

🔬 Data Completeness per Parameter:
   - t_max: nan% complete
   - t_min: nan% complete
   - precip: nan% complete
   - humidity_9am: nan% complete
   - wind_speed_9am: nan% complete

✅ Data Quality Checks:
   - Duplicate rows: 0
   - Rows with missing dates: 0
--- End of Report ---

DataFrame returned to 'df_analysis' variable. You can now use it for further work.
