In [None]:
# %% [markdown]
# # 02. Raw Measurements Data Exploration
#
# ## Objective
# Load and explore the raw measurement data from airquality.am

# %% [code]
# Import libraries
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Add project root to path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

# Import custom modules
from src.data.data_loader import AirQualityDataLoader

# Set plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

print("Setup complete!")

# %% [code]
# Build the loader on the raw-data directory (relative to the notebook's
# working directory) and print an overview of what it reports.
data_dir = Path("../data/raw")
loader = AirQualityDataLoader(data_dir)

# Summary dictionary returned by the loader; the keys read below are
# 'measurements_directory', 'sensors_file', 'measurements' and 'sensors'.
summary = loader.get_data_summary()

print("📊 Data Summary:")
print("-" * 50)
print(f"Measurements directory: {summary['measurements_directory']}")
print(f"Sensors file: {summary['sensors_file']}")

# Measurement files: only reported when the loader flags them as existing.
measurements_info = summary['measurements']
if measurements_info['exists']:
    print("\n📁 Measurement Files:")
    print(f"  Total files: {measurements_info['file_count']}")
    print(f"  Total size: {measurements_info['total_size_gb']} GB")
    print(f"  Years available: {measurements_info['years_available']}")

    # Show the last five entries of the file list.
    print("\n  Recent files:")
    for file_info in measurements_info['files'][-5:]:
        print(f"    📄 {file_info['filename']} ({file_info['size_mb']} MB)")

# Sensor metadata: only reported when the loader flags it as existing.
sensors_info = summary['sensors']
if sensors_info['exists']:
    print("\n📍 Sensors:")
    print(f"  Total sensors: {sensors_info['count']}")
    print(f"  Columns: {sensors_info['columns']}")

# %% [code]
# Load sensors metadata through the loader created in the previous cell.
sensors_df = loader.load_sensors_metadata()
print("\n🔍 First few sensors:")
# Trailing bare expression: the notebook renders it as the cell output.
sensors_df.head()

# %% [code]
# Load the 2025 PM2.5 measurements, asking the loader to include metadata
# (include_metadata=True), then print a quick overview of the result.
print("Loading 2025 measurements...")
df_2025 = loader.get_pm25_data(years=2025, include_metadata=True)

# 'station_id' is treated as optional: the count falls back to 'N/A' when the
# column is absent from the loaded frame.
if 'station_id' in df_2025.columns:
    station_count = df_2025['station_id'].nunique()
else:
    station_count = 'N/A'

print(f"\n📊 Data shape: {df_2025.shape}")
print(f"Date range: {df_2025['date'].min()} to {df_2025['date'].max()}")
print(f"Number of sensors: {df_2025['sensor_id'].nunique()}")
print(f"Number of stations: {station_count}")

# %% [code]
# Descriptive statistics (as produced by Series.describe) for the 'pm25'
# column of the 2025 data.
print("📈 PM2.5 Statistics for 2025:")
print(df_2025['pm25'].describe())

# %% [code]
# Plot mean PM2.5 per station (left panel) and per sensor location (right
# panel). The whole figure is skipped when coordinates are not available.
if 'latitude' in df_2025.columns and 'longitude' in df_2025.columns:
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    # Left panel: horizontal bar chart of the mean PM2.5 per station,
    # sorted ascending so the highest mean ends up at the top.
    if 'station_id' in df_2025.columns:
        station_avg = df_2025.groupby('station_id')['pm25'].mean().sort_values()
        bar_positions = range(len(station_avg))
        axes[0].barh(bar_positions, station_avg.values)
        axes[0].set_yticks(bar_positions)
        axes[0].set_yticklabels(station_avg.index)
        axes[0].set_xlabel('Mean PM2.5 (µg/m³)')
        axes[0].set_title('Average PM2.5 by Station')
    else:
        # No station information: hide the panel instead of showing an
        # empty frame next to the scatter plot.
        axes[0].set_visible(False)

    # Right panel: one point per (sensor, latitude, longitude) combination,
    # coloured by that sensor's mean PM2.5.
    sensor_avg = (
        df_2025.groupby(['sensor_id', 'latitude', 'longitude'])['pm25']
        .mean()
        .reset_index()
    )
    scatter = axes[1].scatter(sensor_avg['longitude'], sensor_avg['latitude'],
                              c=sensor_avg['pm25'], cmap='viridis', s=100, alpha=0.7)
    axes[1].set_xlabel('Longitude')
    axes[1].set_ylabel('Latitude')
    axes[1].set_title('PM2.5 Levels by Sensor Location')
    plt.colorbar(scatter, ax=axes[1], label='Mean PM2.5')

    plt.tight_layout()
    plt.show()

# %% [code]
# Daily PM2.5 time series for the five sensors with the highest mean PM2.5.
top_sensors = df_2025.groupby('sensor_id')['pm25'].mean().nlargest(5).index

fig, ax = plt.subplots(figsize=(15, 6))
for sensor_id in top_sensors:
    sensor_data = df_2025[df_2025['sensor_id'] == sensor_id]
    # Resample to daily means for a cleaner plot. The 'pm25' column is
    # selected before resampling so only that column is re-indexed.
    # NOTE(review): resample('D') needs 'date' to be datetime-like; this
    # assumes the loader already parses it. Confirm against the loader.
    daily_avg = sensor_data.set_index('date')['pm25'].resample('D').mean()
    ax.plot(daily_avg.index, daily_avg.values, label=f"Sensor {sensor_id}", alpha=0.7)

ax.set_xlabel('Date')
ax.set_ylabel('PM2.5 (µg/m³)')
ax.set_title('Daily PM2.5 Levels - Top 5 Sensors (2025)')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()