In [3]:
# Fill Missing Data for Pollutant and Weather Datasets
# This notebook:
# 1. Fills missing weather data (2016-02-07 to 2016-04-14) using historical reference data
# 2. Fills any remaining null values in weather data using interpolation
# 3. Fills null values in pollutant data using weekly moving average

import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

# ============================================================================
# 1. LOAD DATASETS
# ============================================================================

print("\n" + "="*80)
print("LOADING DATASETS")
print("="*80)

# Load pollutant data from a single CSV. 'Date' is parsed to datetime so the
# min()/max() below and the later per-year split work on real timestamps.
print("\nLoading pollutant data...")
pollutant_df = pd.read_csv('pollutant_data.csv')
pollutant_df['Date'] = pd.to_datetime(pollutant_df['Date'])
print(f"  ✓ Loaded {len(pollutant_df):,} pollutant records")
print(f"  Date range: {pollutant_df['Date'].min()} to {pollutant_df['Date'].max()}")

# Load weather data for all years (one CSV per year, 2016-2024 inclusive).
print("\nLoading weather data...")
weather_dfs = []
for year in range(2016, 2025):
    filename = f'weather_{year}.csv'
    if not os.path.exists(filename):
        # Report the missing year instead of skipping it silently, so a
        # shorter-than-expected combined frame can be traced back to its cause.
        print(f"  ⚠️  weather_{year}.csv not found")
        continue
    df = pd.read_csv(filename)
    df['Date'] = pd.to_datetime(df['Date'])
    weather_dfs.append(df)
    print(f"  ✓ Loaded weather_{year}.csv ({len(df):,} records)")

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that names the actual problem.
if not weather_dfs:
    raise FileNotFoundError(
        "No weather_<year>.csv files (2016-2024) were found in the working directory"
    )

# Stack the yearly frames and order rows by date, then region.
weather_df = pd.concat(weather_dfs, ignore_index=True)
weather_df = weather_df.sort_values(['Date', 'Region']).reset_index(drop=True)
print(f"\n✓ Combined weather data: {len(weather_df):,} records")
print(f"  Date range: {weather_df['Date'].min()} to {weather_df['Date'].max()}")

# ============================================================================
# 2. ANALYZE MISSING DATA
# ============================================================================

print("\n" + "="*80)
print("ANALYZING MISSING DATA")
print("="*80)

# Per-column null counts and their share of all rows (percent, 2 decimals).
print("\n📊 POLLUTANT DATA - Missing Values:")
pollutant_missing = pollutant_df.isnull().sum()
pollutant_missing_pct = (pollutant_missing / len(pollutant_df) * 100).round(2)
for col in pollutant_df.columns:
    if pollutant_missing[col] > 0:
        print(f"  {col}: {pollutant_missing[col]:,} ({pollutant_missing_pct[col]}%)")
if pollutant_missing.sum() == 0:
    # Say so explicitly: an empty section looks the same as a check that never ran.
    print("  ✓ No missing values!")

print("\n🌤️  WEATHER DATA - Missing Values:")
weather_missing = weather_df.isnull().sum()
weather_missing_pct = (weather_missing / len(weather_df) * 100).round(2)
for col in weather_df.columns:
    if weather_missing[col] > 0:
        print(f"  {col}: {weather_missing[col]:,} ({weather_missing_pct[col]}%)")
if weather_missing.sum() == 0:
    print("  ✓ No missing values!")

# Check date gap: whole days between the first pollutant date and the first
# weather date. Positive means the weather series starts later.
pollutant_start = pollutant_df['Date'].min()
weather_start = weather_df['Date'].min()
date_gap = (weather_start - pollutant_start).days

# Only warn when there really is a gap; a gap of 0 (or a weather series that
# starts earlier) is reported as information, not as a detected problem.
if date_gap > 0:
    print(f"\n⚠️  DATE GAP DETECTED:")
else:
    print("\n✓ No date gap at the start of the weather data:")
print(f"  Pollutant starts: {pollutant_start.date()}")
print(f"  Weather starts: {weather_start.date()}")
print(f"  Gap: {date_gap} days")

# ============================================================================
# 3. LOAD HISTORICAL WEATHER REFERENCE DATA
# ============================================================================

print("\n" + "=" * 80)
print("LOADING HISTORICAL WEATHER REFERENCE DATA")
print("=" * 80)

# Reference frames keyed by year; only files that exist and expose a
# date-like column end up in the mapping.
historical_weather = {}
for year in range(2016, 2025):
    filename = f'another/weather_{year}.csv'

    # Guard 1: the yearly reference file must exist.
    if not os.path.exists(filename):
        print(f"  ⚠️  another/weather_{year}.csv not found")
        continue

    df = pd.read_csv(filename)

    # Guard 2: some column name must contain 'date' or 'time' (case-insensitive).
    date_col = [
        col for col in df.columns
        if any(token in col.lower() for token in ('date', 'time'))
    ]
    if not date_col:
        print(f"  ⚠️  Skipping another/weather_{year}.csv - no date column found")
        continue

    # The first matching column becomes the canonical 'Date';
    # unparsable entries are coerced to NaT rather than raising.
    df['Date'] = pd.to_datetime(df[date_col[0]], errors='coerce')
    historical_weather[year] = df
    print(f"  ✓ Loaded another/weather_{year}.csv ({len(df):,} records)")

# ============================================================================
# 4. FILL MISSING WEATHER DATA (2016-02-07 to 2016-04-14)
# ============================================================================

print("\n" + "=" * 80)
print("FILLING MISSING WEATHER DATA FOR GAP PERIOD")
print("=" * 80)

def _estimate_from_reference(hist_df, date):
    """Estimate (temperature, humidity, wind) for ``date`` from one reference frame.

    Rows whose 'Date' falls on the same calendar day as ``date`` are averaged.
    If there are none, rows with 'Date' within +/- 3 days of ``date`` are
    averaged instead. Each value is None when its column is absent from the
    frame or when no rows were selected at all.
    """
    selected = hist_df[hist_df['Date'].dt.date == date.date()]
    if len(selected) == 0:
        window_start = date - timedelta(days=3)
        window_end = date + timedelta(days=3)
        selected = hist_df[(hist_df['Date'] >= window_start) & (hist_df['Date'] <= window_end)]
    if len(selected) == 0:
        return None, None, None
    return tuple(
        selected[col].mean() if col in selected.columns else None
        for col in ('Temperature', 'RelativeHumidity', 'WindSpeed')
    )


def fill_weather_gap_from_historical(weather_df, historical_weather, start_date, end_date,
                                     regions=None, country='Singapore'):
    """
    Add estimated weather rows for every (date, region) pair that is missing
    from ``weather_df`` between ``start_date`` and ``end_date`` (daily, inclusive).

    Estimates come from, in order of preference:
      1. the reference frame for the same year (``historical_weather[date.year]``):
         mean of its rows on the same calendar date, or, if there are none,
         mean of its rows within +/- 3 days;
      2. when no reference frame exists for that year: the mean of this
         region's rows in ``weather_df`` from ``end_date`` to 30 days after it,
         or fixed defaults (27.0 / 80.0 / 3.5) if that window is empty.

    Parameters
    ----------
    weather_df : DataFrame
        Must contain 'Date' and 'Region'. 'Temperature', 'RelativeHumidity'
        and 'WindSpeed' are read only in fallback (2).
    historical_weather : dict
        Maps year -> reference DataFrame with a datetime 'Date' column.
    start_date, end_date : Timestamp-like
        Bounds of the period to fill; both must support ``.date()``.
    regions : list of str, optional
        Regions to create rows for. Defaults to the five regions
        'Central', 'East', 'North', 'South', 'West'.
    country : str, optional
        Value written to the 'Country' column of generated rows.

    Returns
    -------
    DataFrame
        ``weather_df`` itself when nothing had to be added; otherwise a new
        frame with the generated rows appended, sorted by ['Date', 'Region']
        and re-indexed from 0. The input frame is not modified.
    """
    print(f"\nFilling weather gap from {start_date.date()} to {end_date.date()}...")

    if regions is None:
        regions = ['Central', 'East', 'North', 'South', 'West']

    # Build the set of already-present (date, region) pairs once, instead of
    # rescanning the whole frame for every candidate pair.
    existing_keys = set(zip(weather_df['Date'], weather_df['Region']))

    new_records = []

    for date in pd.date_range(start=start_date, end=end_date, freq='D'):
        missing_regions = [region for region in regions if (date, region) not in existing_keys]
        if not missing_regions:
            continue

        # Reference-based estimates do not depend on the region, so compute
        # them once per date rather than once per region.
        if date.year in historical_weather:
            reference_estimates = _estimate_from_reference(historical_weather[date.year], date)
        else:
            reference_estimates = None

        for region in missing_regions:
            if reference_estimates is not None:
                temp, humidity, wind = reference_estimates
            else:
                # No reference data for this year: fall back to this region's
                # own observations in the 30 days starting at end_date.
                future_data = weather_df[(weather_df['Date'] >= end_date) &
                                         (weather_df['Date'] <= end_date + timedelta(days=30)) &
                                         (weather_df['Region'] == region)]
                if len(future_data) > 0:
                    temp = future_data['Temperature'].mean()
                    humidity = future_data['RelativeHumidity'].mean()
                    wind = future_data['WindSpeed'].mean()
                else:
                    # Last-resort constants, used only when nothing else is available.
                    temp, humidity, wind = 27.0, 80.0, 3.5

            # None marks "no estimate"; such cells stay null in the result.
            new_records.append({
                'Country': country,
                'Region': region,
                'Date': date,
                'Temperature': round(temp, 2) if pd.notna(temp) else None,
                'RelativeHumidity': round(humidity, 2) if pd.notna(humidity) else None,
                'WindSpeed': round(wind, 2) if pd.notna(wind) else None
            })

    if new_records:
        new_df = pd.DataFrame(new_records)
        weather_df = pd.concat([weather_df, new_df], ignore_index=True)
        weather_df = weather_df.sort_values(['Date', 'Region']).reset_index(drop=True)
        print(f"  ✓ Added {len(new_records)} records to fill the gap")
    else:
        print(f"  ℹ️  No gap records needed")

    return weather_df

# Run the gap filler only when date_gap is positive; the filled period
# runs from pollutant_start up to the day before weather_start.
if date_gap > 0:
    weather_df = fill_weather_gap_from_historical(
        weather_df=weather_df,
        historical_weather=historical_weather,
        start_date=pollutant_start,
        end_date=weather_start - timedelta(days=1),
    )

# ============================================================================
# 5. FILL REMAINING NULL VALUES IN WEATHER DATA
# ============================================================================

# Section banner, emitted as one call (same three output lines plus the leading blank line).
print("\n" + "=" * 80, "FILLING REMAINING NULL VALUES IN WEATHER DATA", "=" * 80, sep="\n")

def fill_weather_nulls_by_region(df):
    """
    Fill nulls in 'Temperature', 'RelativeHumidity' and 'WindSpeed' separately
    for each region, following each region's rows in date order.

    Per region the rows are ordered by 'Date', linearly interpolated in both
    directions, and any value still null afterwards is forward- then
    backward-filled. Interpolation is positional (method='linear'), i.e. it
    treats consecutive rows as equally spaced regardless of the actual dates.

    The filled values are written back to the same rows they came from, so
    the result is correct even when ``df`` is not sorted by date.

    Parameters
    ----------
    df : DataFrame
        Must contain 'Region', 'Date' and the three weather columns.
        Modified in place.

    Returns
    -------
    DataFrame
        The same ``df`` object, with the weather columns filled.
    """
    print("\nFilling null values by region...")

    numeric_cols = ['Temperature', 'RelativeHumidity', 'WindSpeed']

    for region in df['Region'].unique():
        region_mask = df['Region'] == region

        # Work on a copy indexed 0..n-1 in the frame's own row order; that
        # index is what lets the date-sorted result be put back in place.
        region_data = df.loc[region_mask, ['Date'] + numeric_cols].reset_index(drop=True)

        before_null = region_data[numeric_cols].isnull().sum()

        # Sort by date, then interpolate (linear) along that order
        ordered = region_data.sort_values('Date')
        filled_values = ordered[numeric_cols].interpolate(method='linear', limit_direction='both')

        # Fill any remaining nulls with forward/backward fill.
        # (.ffill()/.bfill() replace the deprecated fillna(method=...).)
        filled_values = filled_values.ffill().bfill()

        # Restore the original row order before the positional write-back;
        # writing the date-sorted values directly would misplace them
        # whenever the region's rows are not already date-sorted in df.
        df.loc[region_mask, numeric_cols] = filled_values.sort_index().to_numpy()

        after_null = df.loc[region_mask, numeric_cols].isnull().sum()
        filled = before_null - after_null

        if filled.sum() > 0:
            print(f"  {region}:")
            for col in numeric_cols:
                if filled[col] > 0:
                    print(f"    - Filled {filled[col]} null values in {col}")

    return df

# Fill the remaining weather nulls region by region.
weather_df = fill_weather_nulls_by_region(df=weather_df)

# ============================================================================
# 6. FILL NULL VALUES IN POLLUTANT DATA
# ============================================================================

# Section banner, emitted as one call (same output as three separate prints).
print("\n" + "=" * 80, "FILLING NULL VALUES IN POLLUTANT DATA", "=" * 80, sep="\n")

def fill_pollutant_nulls_by_region(df):
    """
    Fill nulls in the pollutant columns ('pm25', 'pm10', 'o3', 'no2', 'so2',
    'co', 'aqi') separately for each region, following date order.

    Per region and column, in order:
      1. nulls take the centered 7-row rolling mean (min_periods=1);
      2. nulls that remain take the centered 3-row rolling mean, computed on
         the column as updated by step 1;
      3. anything still null is forward- then backward-filled.
    All pollutant values are then rounded to 2 decimals.

    The rolling windows count rows, not calendar days, so they span exactly
    7 / 3 days only when the region has one row per day.

    The filled values are written back to the same rows they came from, so
    the result is correct even when ``df`` is not sorted by date.

    Parameters
    ----------
    df : DataFrame
        Must contain 'Region', 'Date' and the seven pollutant columns.
        Modified in place.

    Returns
    -------
    DataFrame
        The same ``df`` object, with the pollutant columns filled and rounded.
    """
    print("\nFilling null values using 7-day rolling average by region...")

    numeric_cols = ['pm25', 'pm10', 'o3', 'no2', 'so2', 'co', 'aqi']

    for region in df['Region'].unique():
        region_mask = df['Region'] == region

        # Copy indexed 0..n-1 in the frame's own row order, then sorted by
        # date; the index lets the result be restored to the original order.
        region_data = (
            df.loc[region_mask, ['Date'] + numeric_cols]
            .reset_index(drop=True)
            .sort_values('Date')
        )

        before_null = region_data[numeric_cols].isnull().sum()

        # Pass 1: 7-row centered rolling mean. Pass 2: 3-row, which can still
        # fill a cell whose neighbours were filled during pass 1.
        for window in (7, 3):
            for col in numeric_cols:
                null_mask = region_data[col].isnull()
                if not null_mask.any():
                    continue
                rolling_mean = region_data[col].rolling(window=window, center=True, min_periods=1).mean()
                region_data.loc[null_mask, col] = rolling_mean[null_mask]

        # If still nulls, forward/backward fill.
        # (.ffill()/.bfill() replace the deprecated fillna(method=...).)
        region_data[numeric_cols] = region_data[numeric_cols].ffill().bfill()

        # Round to 2 decimal places
        region_data[numeric_cols] = region_data[numeric_cols].round(2)

        # Restore the original row order before the positional write-back;
        # writing the date-sorted values directly would misplace them
        # whenever the region's rows are not already date-sorted in df.
        df.loc[region_mask, numeric_cols] = region_data[numeric_cols].sort_index().to_numpy()

        after_null = df.loc[region_mask, numeric_cols].isnull().sum()
        filled = before_null - after_null

        if filled.sum() > 0:
            print(f"  {region}:")
            for col in numeric_cols:
                if filled[col] > 0:
                    print(f"    - Filled {filled[col]} null values in {col}")

    return df

# Fill the pollutant nulls region by region.
pollutant_df = fill_pollutant_nulls_by_region(df=pollutant_df)

# ============================================================================
# 7. VERIFY AND SAVE CLEANED DATA
# ============================================================================

# Section banner, emitted as one call (same output as three separate prints).
print("\n" + "=" * 80, "VERIFICATION AFTER FILLING", "=" * 80, sep="\n")

# Pollutant frame: list each column that still has nulls, or confirm there are none.
print("\n📊 POLLUTANT DATA - Remaining Missing Values:")
pollutant_missing_after = pollutant_df.isnull().sum()
if pollutant_missing_after.sum() != 0:
    for col in pollutant_df.columns:
        if pollutant_missing_after[col] > 0:
            print(f"  {col}: {pollutant_missing_after[col]:,}")
else:
    print("  ✓ No missing values!")

# Weather frame: same check.
print("\n🌤️  WEATHER DATA - Remaining Missing Values:")
weather_missing_after = weather_df.isnull().sum()
if weather_missing_after.sum() != 0:
    for col in weather_df.columns:
        if weather_missing_after[col] > 0:
            print(f"  {col}: {weather_missing_after[col]:,}")
else:
    print("  ✓ No missing values!")

# First and last date of each frame.
print("\n📅 DATE RANGES:")
print(f"  Pollutant: {pollutant_df['Date'].min().date()} to {pollutant_df['Date'].max().date()}")
print(f"  Weather: {weather_df['Date'].min().date()} to {weather_df['Date'].max().date()}")

# ============================================================================
# 8. SAVE CLEANED DATA
# ============================================================================

print("\n" + "="*80)
print("SAVING CLEANED DATA")
print("="*80)

# Keep a datetime copy of 'Date' in the helper column 'Date_dt' for the
# per-year split, and store 'Date' itself as a 'YYYY-MM-DD' string for output.
pollutant_df['Date_dt'] = pd.to_datetime(pollutant_df['Date'])
pollutant_df['Date'] = pollutant_df['Date_dt'].dt.strftime('%Y-%m-%d')

# Save pollutant data by year (helper column removed from each yearly file).
# Only years 2016-2024 with at least one row produce a file.
for year in range(2016, 2025):
    year_data = pollutant_df[pollutant_df['Date_dt'].dt.year == year].copy()
    if len(year_data) > 0:
        year_data = year_data.drop('Date_dt', axis=1)
        filename = f'pollutant_{year}_filled.csv'
        year_data.to_csv(filename, index=False)
        print(f"✓ Saved {filename} ({len(year_data):,} records)")

# Save weather data by year
weather_df['Date_dt'] = pd.to_datetime(weather_df['Date'])
for year in range(2016, 2025):
    year_data = weather_df[weather_df['Date_dt'].dt.year == year].copy()
    if len(year_data) > 0:
        year_data['Date'] = year_data['Date_dt'].dt.strftime('%Y-%m-%d')
        year_data = year_data.drop('Date_dt', axis=1)
        filename = f'weather_{year}_filled.csv'
        year_data.to_csv(filename, index=False)
        print(f"✓ Saved {filename} ({len(year_data):,} records)")

# Save full combined dataset.
# The helper column is left out of the written file so the combined CSV has
# the same columns as the yearly pollutant files. It stays on the in-memory
# frame, whose state after this section is unchanged from before this fix.
pollutant_df.drop('Date_dt', axis=1).to_csv('pollutant_data_filled.csv', index=False)
print(f"✓ Saved pollutant_data_filled.csv ({len(pollutant_df):,} records)")
# Also save combined weather data
weather_df['Date'] = weather_df['Date_dt'].dt.strftime('%Y-%m-%d')
weather_df = weather_df.drop('Date_dt', axis=1)
weather_df.to_csv('weather_data_filled.csv', index=False)
print(f"✓ Saved weather_data_filled.csv (combined, {len(weather_df):,} records)")

# ============================================================================
# 9. SUMMARY REPORT
# ============================================================================

print("\n" + "="*80)
print("SUMMARY REPORT")
print("="*80)

print("\n✅ COMPLETED TASKS:")
# Report the gap fill only when it actually ran (date_gap > 0), and with the
# period that was really filled rather than a fixed pair of dates.
if date_gap > 0:
    gap_end = weather_start - timedelta(days=1)
    print(f"  1. ✓ Filled weather data gap ({pollutant_start.date()} to {gap_end.date()}) using historical reference")
else:
    print("  1. ✓ No weather data gap at the start of the series - nothing to fill from historical reference")
print("  2. ✓ Filled remaining null values in weather data using interpolation")
print("  3. ✓ Filled null values in pollutant data using 7-day rolling average")
print("  4. ✓ Saved cleaned datasets with '_filled' suffix")

# List every kind of file the save step writes, including the yearly
# pollutant files that were previously missing from this list.
print("\n📁 OUTPUT FILES:")
print("  - pollutant_data_filled.csv (main pollutant file)")
print("  - pollutant_2016_filled.csv through pollutant_2024_filled.csv (yearly files)")
print("  - weather_data_filled.csv (combined weather file)")
print("  - weather_2016_filled.csv through weather_2024_filled.csv (yearly files)")

print("\n" + "="*80)
print("✓ ALL DONE!")
print("="*80)

Libraries imported successfully!

LOADING DATASETS

Loading pollutant data...
  ✓ Loaded 18,749 pollutant records
  Date range: 2016-02-07 00:00:00 to 2024-12-31 00:00:00

Loading weather data...
  ✓ Loaded weather_2016.csv (1,480 records)
  ✓ Loaded weather_2017.csv (1,797 records)
  ✓ Loaded weather_2018.csv (1,813 records)
  ✓ Loaded weather_2019.csv (1,790 records)
  ✓ Loaded weather_2020.csv (1,828 records)
  ✓ Loaded weather_2021.csv (1,810 records)
  ✓ Loaded weather_2022.csv (1,825 records)
  ✓ Loaded weather_2023.csv (1,780 records)
  ✓ Loaded weather_2024.csv (1,804 records)

✓ Combined weather data: 15,927 records
  Date range: 2016-02-07 00:00:00 to 2024-12-31 00:00:00

ANALYZING MISSING DATA

📊 POLLUTANT DATA - Missing Values:

🌤️  WEATHER DATA - Missing Values:

⚠️  DATE GAP DETECTED:
  Pollutant starts: 2016-02-07
  Weather starts: 2016-02-07
  Gap: 0 days

LOADING HISTORICAL WEATHER REFERENCE DATA
  ✓ Loaded another/weather_2016.csv (366 records)
  ✓ Loaded another/weat