# Separate Pollutant Data by Region

This notebook processes pollutant data (2016-2024) and separates it by region.

**Input:** pollutant_data.csv (combined pollutant data from all years)

**Output:** One CSV file per region (Central, East, North, South, West, National), each named `<region>_pollutant.csv` in lowercase (e.g. `central_pollutant.csv`)


In [1]:
import pandas as pd
import os
from pathlib import Path


In [2]:
# Region labels to split the combined dataset on.
# Each label is matched exactly against the 'Region' column further down.
regions = [
    'Central',
    'East',
    'North',
    'South',
    'West',
    'National',
]

print(f"Regions to separate: {regions}")


Regions to separate: ['Central', 'East', 'North', 'South', 'West', 'National']


In [3]:
# Load the combined pollutant CSV into `pollutant_df`.
# Fails fast with FileNotFoundError (after printing a short message) when the
# input file is not present in the working directory.
filename = "pollutant_data.csv"

if not os.path.exists(filename):
    print(f"Error: {filename} not found!")
    raise FileNotFoundError(f"{filename} does not exist")

print(f"Reading {filename}...")
pollutant_df = pd.read_csv(filename)
print(f"✓ Loaded {len(pollutant_df):,} rows")


Reading pollutant_data.csv...
✓ Loaded 437,382 rows


In [4]:
# Display basic information about the pollutant data:
# schema / non-null counts, a 10-row preview, and the column names.
print("\nPollutant Data Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
pollutant_df.info()
print("\nFirst few rows:")
display(pollutant_df.head(10))
print("\nColumn names:")
print(list(pollutant_df.columns))



Pollutant Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 437382 entries, 0 to 437381
Data columns (total 9 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Region  437382 non-null  object 
 1   Date    437382 non-null  object 
 2   pm25    437367 non-null  float64
 3   pm10    437367 non-null  float64
 4   o3      437306 non-null  float64
 5   no2     437257 non-null  float64
 6   so2     437367 non-null  float64
 7   co      406459 non-null  float64
 8   aqi     437367 non-null  float64
dtypes: float64(7), object(2)
memory usage: 30.0+ MB
None

First few rows:


Unnamed: 0,Region,Date,pm25,pm10,o3,no2,so2,co,aqi
0,Central,2016-02-07,11.0,21.0,59.0,10.0,3.0,0.45,47.0
1,East,2016-02-07,11.0,19.0,56.0,4.0,2.0,0.41,47.0
2,National,2016-02-07,11.0,24.0,66.0,10.0,6.0,0.45,47.0
3,North,2016-02-07,9.0,22.0,66.0,4.0,6.0,0.25,37.0
4,South,2016-02-07,11.0,24.0,46.0,7.0,2.0,0.45,44.0
5,West,2016-02-07,8.0,16.0,56.0,5.0,3.0,0.25,34.0
6,Central,2016-02-08,17.0,34.0,49.0,15.0,4.0,0.42,57.0
7,East,2016-02-08,19.0,35.0,46.0,12.0,3.0,0.39,59.0
8,National,2016-02-08,19.0,43.0,49.0,22.0,6.0,0.42,59.0
9,North,2016-02-08,15.0,30.0,41.0,22.0,6.0,0.34,54.0



Column names:
['Region', 'Date', 'pm25', 'pm10', 'o3', 'no2', 'so2', 'co', 'aqi']


In [5]:
# Inspect which region labels actually occur in the data and how many rows
# each one has, so they can be compared with the `regions` list.
region_column = pollutant_df['Region']
unique_regions = region_column.unique()

print(f"Unique regions found in data: {unique_regions}")
print("\nRegion counts:")
print(region_column.value_counts())


Unique regions found in data: ['Central' 'East' 'National' 'North' 'South' 'West']

Region counts:
Region
Central     74591
East        74591
North       74591
South       74591
West        74591
National    64427
Name: count, dtype: int64


In [6]:
# Separate data by region and save to individual CSV files.
# Each region with at least one row is written to "<region>_pollutant.csv"
# (lowercase region name) in the current working directory.
# Regions without rows are collected so the final status line reflects what
# actually happened instead of always reporting success.
regions_without_data = []

for region in regions:
    # Filter data for the current region; .copy() so the Date conversion below
    # operates on an independent frame rather than a slice of pollutant_df.
    region_df = pollutant_df[pollutant_df['Region'] == region].copy()

    if len(region_df) > 0:
        # Sort by date for better organization.
        # A stable sort is used because rows can share the same Date (the
        # per-region row counts far exceed the number of calendar days in the
        # printed date range); the default quicksort does not guarantee that
        # such rows keep their original relative order.
        region_df['Date'] = pd.to_datetime(region_df['Date'])
        region_df = region_df.sort_values('Date', kind='stable')

        # Create filename
        output_filename = f"{region.lower()}_pollutant.csv"

        # Save to CSV (index=False: the positional index carries no information)
        region_df.to_csv(output_filename, index=False)

        print(f"✓ Saved {len(region_df):,} rows to {output_filename}")
        print(f"  Date range: {region_df['Date'].min()} to {region_df['Date'].max()}")
    else:
        regions_without_data.append(region)
        print(f"⚠ Warning: No data found for {region} region")

# Only claim full success when every requested region produced a file.
if regions_without_data:
    print(f"\n⚠ Finished with warnings: no file written for {', '.join(regions_without_data)}")
else:
    print("\n✅ All regional pollutant files created successfully!")


✓ Saved 74,591 rows to central_pollutant.csv
  Date range: 2016-02-07 00:00:00 to 2024-12-31 00:00:00
✓ Saved 74,591 rows to east_pollutant.csv
  Date range: 2016-02-07 00:00:00 to 2024-12-31 00:00:00
✓ Saved 74,591 rows to north_pollutant.csv
  Date range: 2016-02-07 00:00:00 to 2024-12-31 00:00:00
✓ Saved 74,591 rows to south_pollutant.csv
  Date range: 2016-02-07 00:00:00 to 2024-12-31 00:00:00
✓ Saved 74,591 rows to west_pollutant.csv
  Date range: 2016-02-07 00:00:00 to 2024-12-31 00:00:00
✓ Saved 64,427 rows to national_pollutant.csv
  Date range: 2016-02-07 00:00:00 to 2023-10-30 00:00:00

✅ All regional pollutant files created successfully!


In [7]:
# Display summary statistics (mean / min / max of every pollutant column)
# for each region that has at least one row.
print("\n" + "="*70)
print("SUMMARY STATISTICS BY REGION")
print("="*70)

# (display label, DataFrame column) pairs, in the order they are printed.
# Driving the output from this table replaces seven copy-pasted print lines.
pollutant_columns = [
    ('PM2.5', 'pm25'),
    ('PM10', 'pm10'),
    ('O3', 'o3'),
    ('NO2', 'no2'),
    ('SO2', 'so2'),
    ('CO', 'co'),
    ('AQI', 'aqi'),
]

for region in regions:
    region_df = pollutant_df[pollutant_df['Region'] == region]

    if len(region_df) > 0:
        print(f"\n{region.upper()} REGION:")
        print(f"  Total records: {len(region_df):,}")
        for label, column in pollutant_columns:
            values = region_df[column]
            # Labels are left-aligned to 5 characters so the " - " separators
            # line up exactly as in the previous hand-written lines.
            print(f"  {label:<5} - Mean: {values.mean():.2f}, Min: {values.min():.2f}, Max: {values.max():.2f}")



SUMMARY STATISTICS BY REGION

CENTRAL REGION:
  Total records: 74,591
  PM2.5 - Mean: 13.59, Min: 3.00, Max: 84.00
  PM10  - Mean: 26.69, Min: 5.00, Max: 113.00
  O3    - Mean: 27.91, Min: 1.00, Max: 153.00
  NO2   - Mean: 27.13, Min: 1.00, Max: 158.00
  SO2   - Mean: 5.92, Min: 1.00, Max: 54.00
  CO    - Mean: 0.48, Min: 0.06, Max: 5.55
  AQI   - Mean: 48.23, Min: 12.00, Max: 131.00

EAST REGION:
  Total records: 74,591
  PM2.5 - Mean: 13.53, Min: 3.00, Max: 97.00
  PM10  - Mean: 26.46, Min: 7.00, Max: 147.00
  O3    - Mean: 23.74, Min: 1.00, Max: 121.00
  NO2   - Mean: 24.56, Min: 2.00, Max: 632.00
  SO2   - Mean: 6.24, Min: 1.00, Max: 45.00
  CO    - Mean: 0.51, Min: 0.08, Max: 2.00
  AQI   - Mean: 48.35, Min: 14.00, Max: 145.00

NORTH REGION:
  Total records: 74,591
  PM2.5 - Mean: 13.08, Min: 3.00, Max: 96.00
  PM10  - Mean: 26.21, Min: 8.00, Max: 139.00
  O3    - Mean: 27.68, Min: 1.00, Max: 191.00
  NO2   - Mean: 24.81, Min: 1.00, Max: 212.00
  SO2   - Mean: 5.33, Min: 1.00, Ma

In [8]:
# Confirm that every expected regional CSV exists on disk, and for each one
# report its row count, size in MB, and column names.
banner = "=" * 70
print("\n" + banner)
print("VERIFICATION OF OUTPUT FILES")
print(banner)

for region in regions:
    # NOTE: `filename` rebinds the notebook-level name that earlier held the
    # input CSV path ("pollutant_data.csv").
    filename = f"{region.lower()}_pollutant.csv"

    if not os.path.exists(filename):
        print(f"\n✗ {filename} - NOT FOUND")
        continue

    bytes_per_mb = 1024 * 1024
    file_size = os.path.getsize(filename) / bytes_per_mb  # size in MB
    df_check = pd.read_csv(filename)

    print(f"\n✓ {filename}")
    print(f"  - Rows: {len(df_check):,}")
    print(f"  - File size: {file_size:.2f} MB")
    print(f"  - Columns: {list(df_check.columns)}")



VERIFICATION OF OUTPUT FILES

✓ central_pollutant.csv
  - Rows: 74,591
  - File size: 3.72 MB
  - Columns: ['Region', 'Date', 'pm25', 'pm10', 'o3', 'no2', 'so2', 'co', 'aqi']

✓ east_pollutant.csv
  - Rows: 74,591
  - File size: 3.50 MB
  - Columns: ['Region', 'Date', 'pm25', 'pm10', 'o3', 'no2', 'so2', 'co', 'aqi']

✓ north_pollutant.csv
  - Rows: 74,591
  - File size: 3.57 MB
  - Columns: ['Region', 'Date', 'pm25', 'pm10', 'o3', 'no2', 'so2', 'co', 'aqi']

✓ south_pollutant.csv
  - Rows: 74,591
  - File size: 3.56 MB
  - Columns: ['Region', 'Date', 'pm25', 'pm10', 'o3', 'no2', 'so2', 'co', 'aqi']

✓ west_pollutant.csv
  - Rows: 74,591
  - File size: 3.49 MB
  - Columns: ['Region', 'Date', 'pm25', 'pm10', 'o3', 'no2', 'so2', 'co', 'aqi']

✓ national_pollutant.csv
  - Rows: 64,427
  - File size: 3.34 MB
  - Columns: ['Region', 'Date', 'pm25', 'pm10', 'o3', 'no2', 'so2', 'co', 'aqi']


In [9]:
# Data-quality report: for every region with rows, list each column that has
# missing values (count and percentage of that region's rows), or state that
# nothing is missing.
print("\n" + "="*70)
print("DATA QUALITY CHECK - MISSING VALUES")
print("="*70)

for region in regions:
    region_df = pollutant_df[pollutant_df['Region'] == region]

    # Regions absent from the data are skipped silently.
    if len(region_df) == 0:
        continue

    missing = region_df.isnull().sum()
    missing_pct = (missing / len(region_df)) * 100

    print(f"\n{region.upper()} REGION:")
    print(f"  Total records: {len(region_df):,}")

    if missing.sum() == 0:
        print("  ✓ No missing values!")
    else:
        for col in region_df.columns:
            if missing[col] == 0:
                continue
            print(f"  - {col}: {missing[col]:,} missing ({missing_pct[col]:.2f}%)")



DATA QUALITY CHECK - MISSING VALUES

CENTRAL REGION:
  Total records: 74,591
  - pm25: 3 missing (0.00%)
  - pm10: 3 missing (0.00%)
  - o3: 13 missing (0.02%)
  - no2: 25 missing (0.03%)
  - so2: 3 missing (0.00%)
  - co: 5,578 missing (7.48%)
  - aqi: 3 missing (0.00%)

EAST REGION:
  Total records: 74,591
  - pm25: 3 missing (0.00%)
  - pm10: 3 missing (0.00%)
  - o3: 13 missing (0.02%)
  - no2: 25 missing (0.03%)
  - so2: 3 missing (0.00%)
  - co: 6,399 missing (8.58%)
  - aqi: 3 missing (0.00%)

NORTH REGION:
  Total records: 74,591
  - pm25: 3 missing (0.00%)
  - pm10: 3 missing (0.00%)
  - o3: 19 missing (0.03%)
  - no2: 25 missing (0.03%)
  - so2: 3 missing (0.00%)
  - co: 3,584 missing (4.80%)
  - aqi: 3 missing (0.00%)

SOUTH REGION:
  Total records: 74,591
  - pm25: 3 missing (0.00%)
  - pm10: 3 missing (0.00%)
  - o3: 13 missing (0.02%)
  - no2: 25 missing (0.03%)
  - so2: 3 missing (0.00%)
  - co: 8,863 missing (11.88%)
  - aqi: 3 missing (0.00%)

WEST REGION:
  Total rec