# Air Quality Staged Data EDA

This notebook analyzes the staged air quality data to verify the staging pipeline worked correctly.

## Objectives
1. Load and inspect all staged air quality Parquet files
2. Verify data quality and structure
3. Check station mapping with surrogate keys
4. Analyze temporal coverage and data completeness
5. Compare with expected outputs from the staging pipeline

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import glob
import yaml
import warnings
# Suppress every warning category for the rest of the kernel session
warnings.filterwarnings('ignore')

# pandas display: no limit on printed columns, at most 20 printed rows
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 20),
):
    pd.set_option(option_name, option_value)

# Plot appearance: matplotlib's default style with seaborn's "husl" palette
plt.style.use('default')
sns.set_palette("husl")

print("Libraries loaded successfully")

## 1. Load Configuration and Discover Files

In [None]:
# Locations of the staged Parquet files and of the pipeline configuration
staged_dir = Path("/home/jovyan/work/data/staged")
config_dir = Path("/home/jovyan/work/src/configs")

# Parse the station mapping YAML into plain Python objects
station_mapping_path = config_dir / "station_mapping.yaml"
with open(station_mapping_path, 'r') as config_file:
    station_config = yaml.safe_load(config_file)

print(f"Staged data directory: {staged_dir}")
print(f"Directory exists: {staged_dir.exists()}")

# Collect every Parquet file in staged_dir whose name starts with "air_quality_"
air_quality_files = list(staged_dir.glob("air_quality_*.parquet"))
print(f"\nFound {len(air_quality_files)} air quality staged files:")
for staged_file in sorted(air_quality_files):
    size_kb = staged_file.stat().st_size / 1024
    print(f"  - {staged_file.name} ({size_kb:.1f} KB)")

## 2. Load and Inspect Station Mapping

In [None]:
# Pull the two configuration sections this notebook relies on.
# NOTE(review): stations are read from the 'wind_stations' section although
# this notebook is about air quality - confirm this is the intended key.
stations = station_config['wind_stations']['station_mappings']
air_quality_metrics = station_config['air_quality_metrics']

# Index stations by their 'station_pk' and metrics by their 'metric_code'
station_lookup = dict((entry['station_pk'], entry) for entry in stations)
pollutant_lookup = dict(
    (entry['metric_code'], entry) for entry in air_quality_metrics
)

print("Station Mapping:")
for station_pk, station_entry in station_lookup.items():
    station_name = station_entry['station_name']
    station_code = station_entry['station_code']
    print(f"  {station_pk}: {station_name} ({station_code})")

print("\nAir Quality Pollutants:")
for metric_code, metric_entry in pollutant_lookup.items():
    metric_name = metric_entry['metric_name']
    metric_unit = metric_entry['unit']
    print(f"  {metric_code}: {metric_name} ({metric_unit})")

## 3. Load and Analyze All Staged Data

In [None]:
# Load every discovered staged file and record one summary row per file.
#
# Results left in the notebook namespace:
#   air_quality_data : dict mapping "<pollutant>_<year>" -> loaded DataFrame
#   file_summary     : list of per-file summary dicts (one per loaded file)
#   load_failures    : list of (file name, error text) for files that failed
#   summary_df       : DataFrame of file_summary; always has SUMMARY_COLUMNS,
#                      even when no file could be loaded
SUMMARY_COLUMNS = [
    'file', 'pollutant', 'year', 'rows', 'columns', 'date_range', 'memory_mb'
]

air_quality_data = {}
file_summary = []
load_failures = []

for file_path in sorted(air_quality_files):
    try:
        # Extract pollutant and year from the stem: air_quality_[pollutant]_[year]
        parts = file_path.stem.split('_')
        pollutant = parts[2] if len(parts) >= 3 else 'unknown'
        year = parts[3] if len(parts) >= 4 else 'unknown'
        key = f"{pollutant}_{year}"

        # Load data
        df = pd.read_parquet(file_path)

        # Date span covered by the file, when it has a 'datetime' column
        date_info = "No datetime"
        if 'datetime' in df.columns:
            date_info = (
                f"{df['datetime'].min().date()} to "
                f"{df['datetime'].max().date()}"
            )

        summary_row = {
            'file': file_path.name,
            'pollutant': pollutant,
            'year': year,
            'rows': len(df),
            'columns': len(df.columns),
            'date_range': date_info,
            'memory_mb': df.memory_usage(deep=True).sum() / 1024 / 1024
        }

        # Two files parsing to the same key would silently replace each other
        if key in air_quality_data:
            print(f"⚠️ Duplicate dataset key {key}: {file_path.name} replaces an earlier file")

        # Register the dataset only once its summary row was built, so that
        # air_quality_data and file_summary never disagree after a failure
        air_quality_data[key] = df
        file_summary.append(summary_row)

        print(f"✅ Loaded {key}: {len(df)} rows, {len(df.columns)} columns")

    except Exception as e:
        # Best effort: report the problem and continue with the remaining files
        load_failures.append((file_path.name, str(e)))
        print(f"❌ Error loading {file_path.name}: {e}")

# Create summary DataFrame; explicit columns keep the column lookups below
# (and in later cells) valid even when file_summary is empty
summary_df = pd.DataFrame(file_summary, columns=SUMMARY_COLUMNS)
print(f"\nLoaded {len(air_quality_data)} datasets successfully")
if load_failures:
    print(f"Failed to load {len(load_failures)} file(s)")
print(f"Total memory usage: {summary_df['memory_mb'].sum():.2f} MB")

In [None]:
# Display file summary
# Shows summary_df (one row per successfully loaded staged file) as a rich
# table. display() is not imported in this notebook; it is expected to be
# supplied by the IPython/Jupyter environment.
print("File Summary:")
display(summary_df)

## 4. Visualizations

In [None]:
# Build a 2x2 overview figure of the staged datasets described by summary_df:
#   top-left     files per year, split by pollutant
#   top-right    record count per staged file
#   bottom-left  textual overview (totals, years, pollutants)
#   bottom-right memory footprint summed per pollutant
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Air Quality Staged Data Analysis', fontsize=16)

# Every panel reads columns of summary_df; those only exist when at least
# one file was loaded, so all column access is guarded by this flag
has_summary = len(summary_df) > 0


def _mark_panel_empty(ax, title):
    """Replace a panel with a titled 'No data loaded' placeholder."""
    ax.text(
        0.5, 0.5, 'No data loaded',
        ha='center', va='center', fontsize=12,
        transform=ax.transAxes
    )
    ax.set_title(title)
    ax.axis('off')


# 1. File count by year and pollutant
if has_summary:
    year_pollutant_counts = (
        summary_df.groupby(['year', 'pollutant'])
        .size()
        .unstack(fill_value=0)
    )
    year_pollutant_counts.plot(kind='bar', ax=axes[0, 0])
    axes[0, 0].set_title('Files by Year and Pollutant')
    axes[0, 0].set_xlabel('Year')
    axes[0, 0].set_ylabel('Number of Files')
    axes[0, 0].legend(
        title='Pollutant',
        bbox_to_anchor=(1.05, 1),
        loc='upper left'
    )
else:
    _mark_panel_empty(axes[0, 0], 'Files by Year and Pollutant')

# 2. Data volume by dataset
if has_summary:
    summary_df.plot(x='file', y='rows', kind='bar', ax=axes[0, 1])
    axes[0, 1].set_title('Records per Dataset')
    axes[0, 1].set_xlabel('Dataset')
    axes[0, 1].set_ylabel('Number of Records')
    axes[0, 1].tick_params(axis='x', rotation=45)
else:
    _mark_panel_empty(axes[0, 1], 'Records per Dataset')

# 3. Simple data overview
# The separators are real newlines ('\n'); an escaped '\\n' would be drawn
# as the two characters backslash + n inside the figure.
if has_summary:
    overview_text = (
        f'Total Files: {len(air_quality_files)}\n'
        f'Total Records: {summary_df["rows"].sum():,}\n'
        f'Years: {", ".join(sorted(summary_df["year"].unique()))}\n'
        f'Pollutants: {", ".join(sorted(summary_df["pollutant"].unique()))}'
    )
else:
    overview_text = (
        f'Total Files: {len(air_quality_files)}\n'
        'No datasets were loaded'
    )
axes[1, 0].text(
    0.5, 0.5, overview_text,
    ha='center', va='center', fontsize=12,
    transform=axes[1, 0].transAxes
)
axes[1, 0].set_title('Data Overview')
axes[1, 0].axis('off')

# 4. Memory usage, summed over all files of the same pollutant so that each
#    pollutant appears exactly once on the x axis
if has_summary:
    memory_by_pollutant = summary_df.groupby('pollutant')['memory_mb'].sum()
    memory_by_pollutant.plot(kind='bar', ax=axes[1, 1])
    axes[1, 1].set_title('Memory Usage by Pollutant')
    axes[1, 1].set_xlabel('Pollutant')
    axes[1, 1].set_ylabel('Memory (MB)')
    axes[1, 1].tick_params(axis='x', rotation=45)
else:
    _mark_panel_empty(axes[1, 1], 'Memory Usage by Pollutant')

plt.tight_layout()
plt.show()

## 5. Summary and Validation

In [None]:
# Final validation report: compares what was staged and loaded against the
# year/pollutant combinations the staging pipeline is expected to produce.
print("AIR QUALITY STAGING PIPELINE VALIDATION SUMMARY")
print("=" * 60)

# summary_df columns are only read when at least one file was loaded
has_summary = len(summary_df) > 0
total_rows = int(summary_df['rows'].sum()) if has_summary else 0
total_memory_mb = summary_df['memory_mb'].sum() if has_summary else 0.0
years_found = sorted(summary_df['year'].unique()) if has_summary else []
pollutants_found = sorted(summary_df['pollutant'].unique()) if has_summary else []

# File statistics
print("\n📊 DATA VOLUME:")
print(f"  • Total staged files: {len(air_quality_files)}")
print(f"  • Total records: {total_rows:,}")
print(f"  • Total memory usage: {total_memory_mb:.2f} MB")
print(f"  • Years covered: {years_found}")
print(f"  • Pollutants found: {pollutants_found}")

# Expected outputs validation
expected_years = ['2019', '2020', '2021', '2022']
expected_pollutants = ['no2', 'o3', 'pm10', 'pm25', 'so2']

# Note: 2019 may not have PM2.5
expected_files_2019 = ['no2', 'o3', 'pm10', 'so2']  # 4 files
expected_files_2020_plus = ['no2', 'o3', 'pm10', 'pm25', 'so2']  # 5 files each

# Every (year, pollutant) pair the pipeline should have produced
expected_combinations = {
    (expected_year, expected_pollutant)
    for expected_year in expected_years
    for expected_pollutant in (
        expected_files_2019 if expected_year == '2019'
        else expected_files_2020_plus
    )
}
# Pairs that were actually loaded (parsed from the file names)
if has_summary:
    actual_combinations = set(zip(summary_df['year'], summary_df['pollutant']))
else:
    actual_combinations = set()
missing_combinations = sorted(expected_combinations - actual_combinations)
unexpected_combinations = sorted(actual_combinations - expected_combinations)

print("\n🎯 VALIDATION RESULTS:")
if has_summary:
    actual_files_by_year = summary_df.groupby('year')['pollutant'].count()
    for year_label, files_in_year in actual_files_by_year.items():
        print(f"  • Files loaded for {year_label}: {files_in_year}")

files_2019 = len(expected_files_2019)
files_other_years = (len(expected_years) - 1) * len(expected_files_2020_plus)
total_expected_files = files_2019 + files_other_years
total_actual_files = len(air_quality_files)
total_loaded_files = len(summary_df)

# Three independent checks: a matching total alone cannot detect a wrong mix
# of files or files that were discovered but failed to load
count_ok = total_actual_files == total_expected_files
all_loaded = total_loaded_files == total_actual_files
coverage_ok = not missing_combinations and not unexpected_combinations
validation_passed = count_ok and all_loaded and coverage_ok

print(f"  • Expected total files: {total_expected_files}")
print(f"  • Actual total files: {total_actual_files}")
print(f"  • Files loaded successfully: {total_loaded_files}")
print(f"  • File count validation: {'PASSED' if count_ok else 'FAILED'}")
print(f"  • Load validation: {'PASSED' if all_loaded else 'FAILED'}")
print(f"  • Coverage validation: {'PASSED' if coverage_ok else 'FAILED'}")
for missing_year, missing_pollutant in missing_combinations:
    print(f"      - missing: {missing_pollutant} {missing_year}")
for extra_year, extra_pollutant in unexpected_combinations:
    print(f"      - unexpected: {extra_pollutant} {extra_year}")

print("\n🚀 STATUS:")
if validation_passed:
    print("  • ✅ Air quality staging pipeline working correctly")
    print("  • ✅ Ready to proceed with normalization pipeline")
    print("  • ✅ Data structure ready for analysis")
else:
    print("  • ❌ Review staging pipeline for issues")

final_status = 'SUCCESS' if validation_passed else 'NEEDS REVIEW'
print(f"\n📝 STAGING PIPELINE STATUS: {final_status}")