# Separate Pollutant Data by Region

This notebook processes pollutant data (2016-2024) and separates it by region.

**Input:** Yearly pollutant CSV files (`pollutant_2016.csv` … `pollutant_2024.csv`) in `../../data/singapore/raw/pollutant`, which are combined into a single DataFrame

**Output:** Separate CSV files for each region (Central, East, North, South, West, National)


In [1]:
import pandas as pd
import os
from pathlib import Path


In [2]:
# --- CONFIGURATION ---
# Source folder (CSV files are read from here) and destination folder
# (one CSV per region is written here).
input_folder = "../../data/singapore/raw/pollutant"
output_folder = "../../data/singapore/clean/pollutant"

# Region labels to split the data on.
regions = ["Central", "East", "North", "South", "West", "National"]
print("Regions to separate:", regions)

# Create the destination (and any missing parents); a no-op if it already exists.
Path(output_folder).mkdir(parents=True, exist_ok=True)


Regions to separate: ['Central', 'East', 'North', 'South', 'West', 'National']


In [3]:
# --- READ ALL POLLUTANT FILES ---
# Walk the input folder in sorted filename order and load every CSV found there.
all_pollutant_data = []
for csv_name in sorted(os.listdir(input_folder)):
    if not csv_name.endswith(".csv"):
        continue  # ignore anything that is not a CSV file

    csv_path = os.path.join(input_folder, csv_name)
    print(f"📥 Reading {csv_path}...")
    file_df = pd.read_csv(csv_path)
    print(f"  - Loaded {len(file_df):,} rows from {csv_name}")
    all_pollutant_data.append(file_df)

# Nothing loaded means there is nothing to combine: stop here.
if not all_pollutant_data:
    raise FileNotFoundError(f"No CSV files found in {input_folder}")

# Stack the per-file frames into one DataFrame with a fresh 0..n-1 index.
pollutant_df = pd.concat(all_pollutant_data, ignore_index=True)
print(f"\n✅ Total combined pollutant rows: {len(pollutant_df):,}")

📥 Reading ../../data/singapore/raw/pollutant/pollutant_2016.csv...
  - Loaded 1,914 rows from pollutant_2016.csv
📥 Reading ../../data/singapore/raw/pollutant/pollutant_2017.csv...
  - Loaded 2,166 rows from pollutant_2017.csv
📥 Reading ../../data/singapore/raw/pollutant/pollutant_2018.csv...
  - Loaded 2,190 rows from pollutant_2018.csv
📥 Reading ../../data/singapore/raw/pollutant/pollutant_2019.csv...
  - Loaded 2,160 rows from pollutant_2019.csv
📥 Reading ../../data/singapore/raw/pollutant/pollutant_2020.csv...
  - Loaded 2,172 rows from pollutant_2020.csv
📥 Reading ../../data/singapore/raw/pollutant/pollutant_2021.csv...
  - Loaded 2,118 rows from pollutant_2021.csv
📥 Reading ../../data/singapore/raw/pollutant/pollutant_2022.csv...
  - Loaded 2,184 rows from pollutant_2022.csv
📥 Reading ../../data/singapore/raw/pollutant/pollutant_2023.csv...
  - Loaded 2,015 rows from pollutant_2023.csv
📥 Reading ../../data/singapore/raw/pollutant/pollutant_2024.csv...
  - Loaded 1,830 rows from po

In [4]:
# --- CLEAN & PREPARE DATA ---
# Parse the Date column; values that cannot be parsed become NaT (errors='coerce').
pollutant_df['Date'] = pd.to_datetime(pollutant_df['Date'], errors='coerce')

# Drop rows without a usable date, and report how many were removed so that
# data loss is never silent.
rows_before = len(pollutant_df)
pollutant_df = pollutant_df.dropna(subset=['Date'])
rows_dropped = rows_before - len(pollutant_df)
if rows_dropped:
    print(f"⚠ Dropped {rows_dropped:,} rows with missing or unparseable dates")

# Ensure no temporary columns remain: remove any column whose name contains
# 'Date_dt' (case-insensitive).
pollutant_df = pollutant_df.loc[:, ~pollutant_df.columns.str.contains('Date_dt', case=False)]

# --- INFO & SUMMARY ---
print("\nPollutant Data Info:")
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() -- that would add a stray "None" line after the report.
pollutant_df.info()
print("\nFirst few rows:")
display(pollutant_df.head(10))
print("\nColumn names:")
print(list(pollutant_df.columns))



Pollutant Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18749 entries, 0 to 18748
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    18749 non-null  datetime64[ns]
 1   Region  18749 non-null  object        
 2   pm25    18749 non-null  float64       
 3   pm10    18749 non-null  float64       
 4   o3      18749 non-null  float64       
 5   no2     18749 non-null  float64       
 6   so2     18749 non-null  float64       
 7   co      18749 non-null  float64       
 8   aqi     18749 non-null  float64       
dtypes: datetime64[ns](1), float64(7), object(1)
memory usage: 1.3+ MB
None

First few rows:


Unnamed: 0,Date,Region,pm25,pm10,o3,no2,so2,co,aqi
0,2016-02-07,Central,11.0,21.0,59.0,10.0,3.0,0.45,47.0
1,2016-02-07,East,11.0,19.0,56.0,4.0,2.0,0.41,47.0
2,2016-02-07,National,11.0,24.0,66.0,10.0,6.0,0.45,47.0
3,2016-02-07,North,9.0,22.0,66.0,4.0,6.0,0.25,37.0
4,2016-02-07,South,11.0,24.0,46.0,7.0,2.0,0.45,44.0
5,2016-02-07,West,8.0,16.0,56.0,5.0,3.0,0.25,34.0
6,2016-02-08,Central,17.0,34.0,49.0,15.0,4.0,0.42,57.0
7,2016-02-08,East,19.0,35.0,46.0,12.0,3.0,0.39,59.0
8,2016-02-08,National,19.0,43.0,49.0,22.0,6.0,0.42,59.0
9,2016-02-08,North,15.0,30.0,41.0,22.0,6.0,0.34,54.0



Column names:
['Date', 'Region', 'pm25', 'pm10', 'o3', 'no2', 'so2', 'co', 'aqi']


In [5]:
# Inspect which region labels occur in the data and how many rows each has
region_column = pollutant_df['Region']
unique_regions = region_column.unique()
print("Unique regions found in data: {}".format(unique_regions))
print("\nRegion counts:")
print(region_column.value_counts())


Unique regions found in data: ['Central' 'East' 'National' 'North' 'South' 'West']

Region counts:
Region
Central     3196
East        3196
North       3196
South       3196
West        3196
National    2769
Name: count, dtype: int64


In [6]:
# --- SAVE BY REGION ---
# Write one date-sorted CSV per configured region. Regions with no rows are
# recorded so the final status line reflects what actually happened.
regions_without_data = []
for region in regions:
    region_df = pollutant_df[pollutant_df['Region'] == region].copy()

    if region_df.empty:
        print(f"⚠ Warning: No data found for {region} region")
        regions_without_data.append(region)
        continue

    region_df = region_df.sort_values('Date')
    output_filename = os.path.join(output_folder, f"{region.lower()}_pollutant.csv")

    region_df.to_csv(output_filename, index=False)
    print(f"✓ Saved {len(region_df):,} rows to {output_filename}")
    print(f"  Date range: {region_df['Date'].min().date()} → {region_df['Date'].max().date()}")

# Rows whose Region label is not in `regions` end up in no output file at all;
# surface them instead of discarding them silently.
unmatched_regions = pollutant_df.loc[~pollutant_df['Region'].isin(regions), 'Region']
if not unmatched_regions.empty:
    # astype(str) keeps the sort safe even if some labels are missing (NaN).
    unmatched_labels = sorted(unmatched_regions.astype(str).unique())
    print(f"⚠ Warning: {len(unmatched_regions):,} rows with unexpected region labels were not saved: {unmatched_labels}")

# Only claim success when every configured region was written and no rows were left out.
if regions_without_data or not unmatched_regions.empty:
    print("\n⚠ Regional pollutant export finished with warnings (see above).")
else:
    print("\n✅ All regional pollutant files created successfully!")

✓ Saved 3,196 rows to ../../data/singapore/clean/pollutant/central_pollutant.csv
  Date range: 2016-02-07 → 2024-12-31
✓ Saved 3,196 rows to ../../data/singapore/clean/pollutant/east_pollutant.csv
  Date range: 2016-02-07 → 2024-12-31
✓ Saved 3,196 rows to ../../data/singapore/clean/pollutant/north_pollutant.csv
  Date range: 2016-02-07 → 2024-12-31
✓ Saved 3,196 rows to ../../data/singapore/clean/pollutant/south_pollutant.csv
  Date range: 2016-02-07 → 2024-12-31
✓ Saved 3,196 rows to ../../data/singapore/clean/pollutant/west_pollutant.csv
  Date range: 2016-02-07 → 2024-12-31
✓ Saved 2,769 rows to ../../data/singapore/clean/pollutant/national_pollutant.csv
  Date range: 2016-02-07 → 2023-10-30

✅ All regional pollutant files created successfully!


In [7]:
# Report mean / min / max of every pollutant column, one section per region
banner = "=" * 70
print("\n" + banner)
print("SUMMARY STATISTICS BY REGION")
print(banner)

# (display label, column name) pairs, in reporting order
pollutant_columns = [
    ("PM2.5", "pm25"),
    ("PM10", "pm10"),
    ("O3", "o3"),
    ("NO2", "no2"),
    ("SO2", "so2"),
    ("CO", "co"),
    ("AQI", "aqi"),
]

for region in regions:
    region_df = pollutant_df[pollutant_df['Region'] == region]

    # Regions with no rows are skipped entirely
    if len(region_df) == 0:
        continue

    print(f"\n{region.upper()} REGION:")
    print(f"  Total records: {len(region_df):,}")
    for label, column in pollutant_columns:
        values = region_df[column]
        # Labels are left-aligned to width 5 so the " - " separators line up
        print(f"  {label:<5} - Mean: {values.mean():.2f}, Min: {values.min():.2f}, Max: {values.max():.2f}")



SUMMARY STATISTICS BY REGION

CENTRAL REGION:
  Total records: 3,196
  PM2.5 - Mean: 13.60, Min: 3.09, Max: 65.83
  PM10  - Mean: 26.69, Min: 7.21, Max: 91.25
  O3    - Mean: 27.96, Min: 2.92, Max: 93.46
  NO2   - Mean: 27.17, Min: 4.00, Max: 81.50
  SO2   - Mean: 5.91, Min: 1.00, Max: 42.67
  CO    - Mean: 0.52, Min: 0.12, Max: 1.94
  AQI   - Mean: 48.25, Min: 14.29, Max: 111.25

EAST REGION:
  Total records: 3,196
  PM2.5 - Mean: 13.55, Min: 4.08, Max: 74.42
  PM10  - Mean: 26.48, Min: 9.04, Max: 118.33
  O3    - Mean: 23.82, Min: 2.08, Max: 73.79
  NO2   - Mean: 24.55, Min: 4.00, Max: 117.83
  SO2   - Mean: 6.23, Min: 1.00, Max: 41.50
  CO    - Mean: 0.55, Min: 0.12, Max: 1.69
  AQI   - Mean: 48.40, Min: 16.71, Max: 120.42

NORTH REGION:
  Total records: 3,196
  PM2.5 - Mean: 13.09, Min: 3.96, Max: 85.25
  PM10  - Mean: 26.23, Min: 9.12, Max: 114.75
  O3    - Mean: 27.84, Min: 2.33, Max: 93.75
  NO2   - Mean: 24.78, Min: 2.80, Max: 83.67
  SO2   - Mean: 5.33, Min: 1.00, Max: 32.79


In [9]:
# Data-quality report: missing-value counts per column, region by region
rule = "=" * 70
print("\n" + rule)
print("DATA QUALITY CHECK - MISSING VALUES")
print(rule)

for region in regions:
    region_df = pollutant_df[pollutant_df['Region'] == region]
    total_rows = len(region_df)

    # Nothing to report for a region without rows
    if total_rows == 0:
        continue

    missing = region_df.isnull().sum()
    missing_pct = missing / total_rows * 100

    print(f"\n{region.upper()} REGION:")
    print(f"  Total records: {total_rows:,}")

    # List only the columns that actually contain gaps, in column order
    for col, n_missing in missing[missing > 0].items():
        print(f"  - {col}: {n_missing:,} missing ({missing_pct[col]:.2f}%)")
    if missing.sum() == 0:
        print("  ✓ No missing values!")



DATA QUALITY CHECK - MISSING VALUES

CENTRAL REGION:
  Total records: 3,196
  ✓ No missing values!

EAST REGION:
  Total records: 3,196
  ✓ No missing values!

NORTH REGION:
  Total records: 3,196
  ✓ No missing values!

SOUTH REGION:
  Total records: 3,196
  ✓ No missing values!

WEST REGION:
  Total records: 3,196
  ✓ No missing values!

NATIONAL REGION:
  Total records: 2,769
  ✓ No missing values!
