# Combine Weather and Pollutant Data - All Regions

This notebook merges the regional weather and pollutant data into one master file containing all regions.

**Input Files:**
- Regional weather files: `central_weather.csv`, `east_weather.csv`, etc.
- Regional pollutant files: `central_pollutant.csv`, `east_pollutant.csv`, etc.

**Main Output:**
- `SG.csv` - Single file with all 5 regions
- Columns: Country, Region, Date, AQI, Temperature, RelativeHumidity, WindSpeed


In [1]:
import pandas as pd
import os
from pathlib import Path


In [2]:
# --- CONFIGURATION ---
weather_folder = "../../data/singapore/clean/weather"
pollutant_folder = "../../data/singapore/clean/pollutant"
output_file = "../../data/singapore/SG.csv"

regions = ['Central', 'East', 'North', 'South', 'West', 'National']

print(f"Regions to combine: {regions}")


Regions to combine: ['Central', 'East', 'North', 'South', 'West', 'National']


In [3]:
# --- START ---
all_regions_data = []
combined_data_summary = []

print("="*80)
print("COMBINING WEATHER AND POLLUTANT DATA FOR ALL REGIONS")
print("="*80)

for region in regions:
    weather_file = os.path.join(weather_folder, f"{region.lower()}_weather.csv")
    pollutant_file = os.path.join(pollutant_folder, f"{region.lower()}_pollutant.csv")

    print(f"\nProcessing {region} region...")

    # Check if both files exist
    if not os.path.exists(weather_file):
        print(f"  ⚠ Warning: {weather_file} not found! Skipping {region}.")
        continue

    if not os.path.exists(pollutant_file):
        print(f"  ⚠ Warning: {pollutant_file} not found! Skipping {region}.")
        continue

    # --- READ FILES ---
    print(f"  Reading {weather_file}...")
    weather_df = pd.read_csv(weather_file)

    print(f"  Reading {pollutant_file}...")
    pollutant_df = pd.read_csv(pollutant_file)

    # --- CLEAN DATES ---
    weather_df['Date'] = pd.to_datetime(weather_df['Date'], errors='coerce')
    pollutant_df['Date'] = pd.to_datetime(pollutant_df['Date'], errors='coerce')

    # Drop invalid rows (if any)
    weather_df = weather_df.dropna(subset=['Date'])
    pollutant_df = pollutant_df.dropna(subset=['Date'])

    # --- MERGE ---
    print(f"  Merging weather and pollutant data...")
    combined_df = pd.merge(
        weather_df,
        pollutant_df[['Region', 'Date', 'aqi']],  # Only Region, Date, and AQI needed
        on=['Region', 'Date'],
        how='inner'
    )

    # --- REORDER & CLEAN ---
    expected_columns = ['Country', 'Region', 'Date', 'aqi', 'Temperature', 'RelativeHumidity', 'WindSpeed']
    combined_df = combined_df[[col for col in expected_columns if col in combined_df.columns]]
    combined_df.rename(columns={'aqi': 'AQI'}, inplace=True)
    combined_df = combined_df.sort_values('Date')

    # --- STORE ---
    all_regions_data.append(combined_df)

    print(f"  ✓ Added {len(combined_df):,} rows for {region}")
    print(f"    Date range: {combined_df['Date'].min().date()} → {combined_df['Date'].max().date()}")
    print(f"    Weather records: {len(weather_df):,}, Pollutant records: {len(pollutant_df):,}, Combined: {len(combined_df):,}")

    combined_data_summary.append({
        'Region': region,
        'Weather_Records': len(weather_df),
        'Pollutant_Records': len(pollutant_df),
        'Combined_Records': len(combined_df)
    })

# --- MASTER FILE ---
if all_regions_data:
    print("\n" + "="*80)
    print("CREATING MASTER FILE - ALL REGIONS COMBINED")
    print("="*80)

    master_df = pd.concat(all_regions_data, ignore_index=True)
    master_df = master_df.sort_values(['Region', 'Date'])

    # Round numeric columns to 2 decimal places
    numeric_cols = master_df.select_dtypes(include=['float64', 'int64']).columns
    master_df[numeric_cols] = master_df[numeric_cols].round(2)
    master_df.to_csv(output_file, index=False)

    print(f"\n✅ SUCCESSFULLY CREATED: {output_file}")
    print(f"   Total rows: {len(master_df):,}")
    print(f"   File size: {os.path.getsize(output_file) / (1024 * 1024):.2f} MB")
    print(f"   Columns: {list(master_df.columns)}")

    print("\n   Region breakdown:")
    for region, count in master_df['Region'].value_counts().sort_index().items():
        print(f"   - {region}: {count:,} rows")
else:
    print("\n⚠ No data found to create master file.")

COMBINING WEATHER AND POLLUTANT DATA FOR ALL REGIONS

Processing Central region...
  Reading ../../data/singapore/clean/weather/central_weather.csv...
  Reading ../../data/singapore/clean/pollutant/central_pollutant.csv...
  Merging weather and pollutant data...
  ✓ Added 2,790 rows for Central
    Date range: 2016-02-07 → 2023-12-31
    Weather records: 2,833, Pollutant records: 3,196, Combined: 2,790

Processing East region...
  Reading ../../data/singapore/clean/weather/east_weather.csv...
  Reading ../../data/singapore/clean/pollutant/east_pollutant.csv...
  Merging weather and pollutant data...
  ✓ Added 2,790 rows for East
    Date range: 2016-02-07 → 2023-12-31
    Weather records: 2,833, Pollutant records: 3,196, Combined: 2,790

Processing North region...
  Reading ../../data/singapore/clean/weather/north_weather.csv...
  Reading ../../data/singapore/clean/pollutant/north_pollutant.csv...
  Merging weather and pollutant data...
  ✓ Added 2,790 rows for North
    Date range: 20

In [4]:
# Display master file info and preview
master_file = "../../data/singapore/SG.csv"

if os.path.exists(master_file):
    master_df = pd.read_csv(master_file)
    
    print("\n" + "="*80)
    print("MASTER FILE INFO - SG.csv")
    print("="*80)
    
    print(f"\nTotal records: {len(master_df):,}")
    print(f"Date range: {pd.to_datetime(master_df['Date']).min().date()} to {pd.to_datetime(master_df['Date']).max().date()}")
    print(f"\nColumns: {list(master_df.columns)}")
    print(f"\nRegion distribution:")
    print(master_df['Region'].value_counts().sort_index())
    
    print("\n" + "-"*80)
    print("FIRST 10 ROWS:")
    print("-"*80)
    display(master_df.head(10))
    
    print("\n" + "-"*80)
    print("SAMPLE FROM EACH REGION:")
    print("-"*80)
    for region in regions:
        region_sample = master_df[master_df['Region'] == region].head(3)
        if len(region_sample) > 0:
            print(f"\n{region}:")
            display(region_sample)



MASTER FILE INFO - SG.csv

Total records: 13,908
Date range: 2016-02-07 to 2023-12-31

Columns: ['Country', 'Region', 'Date', 'AQI', 'Temperature', 'RelativeHumidity', 'WindSpeed']

Region distribution:
Region
Central    2790
East       2790
North      2790
South      2748
West       2790
Name: count, dtype: int64

--------------------------------------------------------------------------------
FIRST 10 ROWS:
--------------------------------------------------------------------------------


Unnamed: 0,Country,Region,Date,AQI,Temperature,RelativeHumidity,WindSpeed
0,Singapore,Central,2016-02-07,47.0,25.87,87.26,17.8
1,Singapore,Central,2016-02-08,57.0,26.26,86.48,16.4
2,Singapore,Central,2016-02-09,57.88,25.97,87.06,9.5
3,Singapore,Central,2016-02-10,54.67,25.68,87.64,12.0
4,Singapore,Central,2016-02-11,32.79,26.36,86.28,11.0
5,Singapore,Central,2016-02-12,39.0,26.66,85.68,10.2
6,Singapore,Central,2016-02-13,47.46,27.54,83.92,13.8
7,Singapore,Central,2016-02-14,37.71,26.56,85.88,9.2
8,Singapore,Central,2016-02-15,49.5,27.73,83.54,11.7
9,Singapore,Central,2016-02-16,50.74,27.44,84.12,16.0



--------------------------------------------------------------------------------
SAMPLE FROM EACH REGION:
--------------------------------------------------------------------------------

Central:


Unnamed: 0,Country,Region,Date,AQI,Temperature,RelativeHumidity,WindSpeed
0,Singapore,Central,2016-02-07,47.0,25.87,87.26,17.8
1,Singapore,Central,2016-02-08,57.0,26.26,86.48,16.4
2,Singapore,Central,2016-02-09,57.88,25.97,87.06,9.5



East:


Unnamed: 0,Country,Region,Date,AQI,Temperature,RelativeHumidity,WindSpeed
2790,Singapore,East,2016-02-07,47.0,26.17,84.66,18.6
2791,Singapore,East,2016-02-08,59.0,26.56,83.88,17.2
2792,Singapore,East,2016-02-09,61.5,26.27,84.46,10.3



North:


Unnamed: 0,Country,Region,Date,AQI,Temperature,RelativeHumidity,WindSpeed
5580,Singapore,North,2016-02-07,37.0,25.47,90.06,17.6
5581,Singapore,North,2016-02-08,54.0,25.86,89.28,16.2
5582,Singapore,North,2016-02-09,57.21,25.57,89.86,9.3



South:


Unnamed: 0,Country,Region,Date,AQI,Temperature,RelativeHumidity,WindSpeed
8370,Singapore,South,2016-02-07,44.0,26.27,83.46,18.9
8371,Singapore,South,2016-02-08,57.0,26.66,82.68,17.5
8372,Singapore,South,2016-02-09,58.46,26.37,83.26,10.6



West:


Unnamed: 0,Country,Region,Date,AQI,Temperature,RelativeHumidity,WindSpeed
11118,Singapore,West,2016-02-07,34.0,25.67,88.66,17.8
11119,Singapore,West,2016-02-08,56.0,26.06,87.88,16.4
11120,Singapore,West,2016-02-09,58.0,25.77,88.46,9.5


In [5]:
# Display statistics by region from master file
master_file = "../../data/singapore/SG.csv"

if os.path.exists(master_file):
    master_df = pd.read_csv(master_file)
    
    print("\n" + "="*80)
    print("STATISTICS BY REGION - SG.csv")
    print("="*80)
    
    for region in regions:
        region_df = master_df[master_df['Region'] == region]
        
        if len(region_df) > 0:
            print(f"\n{region.upper()} REGION:")
            print(f"  Total records: {len(region_df):,}")
            print(f"  AQI - Mean: {region_df['AQI'].mean():.2f}, Min: {region_df['AQI'].min():.2f}, Max: {region_df['AQI'].max():.2f}")
            print(f"  Temperature - Mean: {region_df['Temperature'].mean():.2f}°C, Min: {region_df['Temperature'].min():.2f}°C, Max: {region_df['Temperature'].max():.2f}°C")
            print(f"  Humidity - Mean: {region_df['RelativeHumidity'].mean():.2f}%, Min: {region_df['RelativeHumidity'].min():.2f}%, Max: {region_df['RelativeHumidity'].max():.2f}%")
            print(f"  Wind Speed - Mean: {region_df['WindSpeed'].mean():.2f} km/h, Min: {region_df['WindSpeed'].min():.2f} km/h, Max: {region_df['WindSpeed'].max():.2f} km/h")



STATISTICS BY REGION - SG.csv

CENTRAL REGION:
  Total records: 2,790
  AQI - Mean: 48.06, Min: 14.29, Max: 111.25
  Temperature - Mean: 27.99°C, Min: 22.73°C, Max: 30.60°C
  Humidity - Mean: 81.04%, Min: 61.06%, Max: 99.26%
  Wind Speed - Mean: 6.82 km/h, Min: 2.84 km/h, Max: 17.80 km/h

EAST REGION:
  Total records: 2,790
  AQI - Mean: 48.60, Min: 16.71, Max: 120.42
  Temperature - Mean: 28.23°C, Min: 23.21°C, Max: 30.78°C
  Humidity - Mean: 79.22%, Min: 60.84%, Max: 97.95%
  Wind Speed - Mean: 5.67 km/h, Min: 2.43 km/h, Max: 18.60 km/h

NORTH REGION:
  Total records: 2,790
  AQI - Mean: 47.42, Min: 15.79, Max: 131.79
  Temperature - Mean: 27.75°C, Min: 22.67°C, Max: 30.76°C
  Humidity - Mean: 79.33%, Min: 62.99%, Max: 97.41%
  Wind Speed - Mean: 4.42 km/h, Min: 1.57 km/h, Max: 17.60 km/h

SOUTH REGION:
  Total records: 2,748
  AQI - Mean: 47.79, Min: 16.62, Max: 128.50
  Temperature - Mean: 28.40°C, Min: 23.04°C, Max: 31.90°C
  Humidity - Mean: 78.37%, Min: 55.16%, Max: 99.80%
  Wi

In [6]:
# Check for missing values in master file
master_file = "../../data/singapore/SG.csv"

if os.path.exists(master_file):
    master_df = pd.read_csv(master_file)
    
    print("\n" + "="*80)
    print("DATA QUALITY CHECK - MISSING VALUES")
    print("="*80)
    
    # Overall missing values
    missing = master_df.isnull().sum()
    missing_pct = (missing / len(master_df)) * 100
    
    print(f"\nOVERALL ({len(master_df):,} total rows):")
    for col in master_df.columns:
        if missing[col] > 0:
            print(f"  - {col}: {missing[col]:,} missing ({missing_pct[col]:.2f}%)")
        else:
            print(f"  - {col}: ✓ No missing values")
    
    # Missing values by region
    print("\n" + "-"*80)
    print("BY REGION:")
    print("-"*80)
    
    for region in regions:
        region_df = master_df[master_df['Region'] == region]
        
        if len(region_df) > 0:
            region_missing = region_df.isnull().sum()
            region_missing_pct = (region_missing / len(region_df)) * 100
            
            print(f"\n{region.upper()} ({len(region_df):,} rows):")
            has_missing = False
            for col in region_df.columns:
                if region_missing[col] > 0:
                    print(f"  - {col}: {region_missing[col]:,} missing ({region_missing_pct[col]:.2f}%)")
                    has_missing = True
            if not has_missing:
                print("  ✓ No missing values!")



DATA QUALITY CHECK - MISSING VALUES

OVERALL (13,908 total rows):
  - Country: ✓ No missing values
  - Region: ✓ No missing values
  - Date: ✓ No missing values
  - AQI: ✓ No missing values
  - Temperature: ✓ No missing values
  - RelativeHumidity: ✓ No missing values
  - WindSpeed: ✓ No missing values

--------------------------------------------------------------------------------
BY REGION:
--------------------------------------------------------------------------------

CENTRAL (2,790 rows):
  ✓ No missing values!

EAST (2,790 rows):
  ✓ No missing values!

NORTH (2,790 rows):
  ✓ No missing values!

SOUTH (2,748 rows):
  ✓ No missing values!

WEST (2,790 rows):
  ✓ No missing values!


In [7]:
# Display summary of processing
print("\n" + "="*80)
print("PROCESSING SUMMARY")
print("="*80)

summary_df = pd.DataFrame(combined_data_summary)
print("\nRecords per region:")
display(summary_df)



PROCESSING SUMMARY

Records per region:


Unnamed: 0,Region,Weather_Records,Pollutant_Records,Combined_Records
0,Central,2833,3196,2790
1,East,2833,3196,2790
2,North,2833,3196,2790
3,South,2791,3196,2748
4,West,2833,3196,2790


In [8]:
# Final verification and completion message
master_file = "../../data/singapore/SG.csv"

print("\n" + "="*80)
print("FINAL VERIFICATION")
print("="*80)

if os.path.exists(master_file):
    file_size = os.path.getsize(master_file) / (1024 * 1024)  # Convert to MB
    df_check = pd.read_csv(master_file)
    
    print(f"\n✅ SUCCESS! Master file created:")
    print(f"\n   📁 File: {master_file}")
    print(f"   📊 Total rows: {len(df_check):,}")
    print(f"   💾 File size: {file_size:.2f} MB")
    print(f"   📋 Columns: {list(df_check.columns)}")
    print(f"\n   🌍 Regions included: {', '.join(df_check['Region'].unique())}")
    
    print("\n" + "="*80)
    print("✅ PROCESS COMPLETED SUCCESSFULLY!")
    print("="*80)
    print(f"\nYour combined data is ready in: {master_file}")
    print(f"This file contains weather and pollutant data for all {len(regions)} regions")
    print(f"from 2016-2024 with {len(df_check):,} total records.")
else:
    print(f"\n❌ ERROR: {master_file} was not created!")
    print("Please check the error messages above.")



FINAL VERIFICATION

✅ SUCCESS! Master file created:

   📁 File: ../../data/singapore/SG.csv
   📊 Total rows: 13,908
   💾 File size: 0.66 MB
   📋 Columns: ['Country', 'Region', 'Date', 'AQI', 'Temperature', 'RelativeHumidity', 'WindSpeed']

   🌍 Regions included: Central, East, North, South, West

✅ PROCESS COMPLETED SUCCESSFULLY!

Your combined data is ready in: ../../data/singapore/SG.csv
This file contains weather and pollutant data for all 6 regions
from 2016-2024 with 13,908 total records.
