# Combine Weather and Pollutant Data - All Regions

This notebook merges the regional weather and pollutant data into one master file containing all regions.

**Input Files:**
- Regional weather files: `central_weather.csv`, `east_weather.csv`, etc.
- Regional pollutant files: `central_pollutant.csv`, `east_pollutant.csv`, etc.

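**Main Output:**
- `SG.csv` - Single file with all 5 regions (Central, East, North, South, West)
- Columns: Country, Region, Date, AQI, Temperature, RelativeHumidity, WindSpeed
- Rows: one row per pollutant reading that has a weather record for the same Region and Date (inner join). A date can therefore appear many times within a region, with the same weather values repeated next to each AQI reading.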


In [1]:
import pandas as pd
import os
from pathlib import Path


In [2]:
# Regions whose weather/pollutant files are combined.
# 'National' is deliberately left out because the weather data has no such region.
regions = [
    'Central',
    'East',
    'North',
    'South',
    'West',
]

print(f"Regions to combine: {regions}")


Regions to combine: ['Central', 'East', 'North', 'South', 'West']


In [3]:
# Combine weather and pollutant data for all regions.
#
# For every entry in `regions` this cell reads `<region>_weather.csv` and
# `<region>_pollutant.csv` from the working directory, inner-joins them on
# (Region, Date), keeps the seven output columns and finally writes the
# concatenation of all regions to SG.csv.
#
# Names left in the kernel for later cells:
#   all_regions_data      - list with one merged DataFrame per processed region
#   combined_data_summary - list of dicts with per-region record counts
#   master_df             - concatenated frame (only defined when data was found)
master_file = "SG.csv"
all_regions_data = []
combined_data_summary = []

print("="*80)
print("COMBINING WEATHER AND POLLUTANT DATA FOR ALL REGIONS")
print("="*80)

for region in regions:
    weather_file = f"{region.lower()}_weather.csv"
    pollutant_file = f"{region.lower()}_pollutant.csv"

    print(f"\nProcessing {region} region...")

    # Both inputs are required; report the first one that is absent
    # (weather is checked before pollutant) and skip the region.
    missing_file = next(
        (path for path in (weather_file, pollutant_file) if not os.path.exists(path)),
        None,
    )
    if missing_file is not None:
        print(f"  ⚠ Warning: {missing_file} not found! Skipping {region}.")
        continue

    # Read the weather data
    print(f"  Reading {weather_file}...")
    weather_df = pd.read_csv(weather_file)

    # Read the pollutant data
    print(f"  Reading {pollutant_file}...")
    pollutant_df = pd.read_csv(pollutant_file)

    # Convert Date columns to datetime so both sides share one key dtype
    weather_df['Date'] = pd.to_datetime(weather_df['Date'])
    pollutant_df['Date'] = pd.to_datetime(pollutant_df['Date'])

    # Merge on Region and Date.
    # NOTE(review): the recorded run shows ~3.1k weather rows against ~74.6k
    # pollutant rows per region, so each weather row is repeated for every
    # pollutant reading sharing its date. Confirm this is intended before
    # treating SG.csv as a one-row-per-day dataset.
    print("  Merging weather and pollutant data...")
    merged_df = pd.merge(
        weather_df,
        pollutant_df[['Region', 'Date', 'aqi']],  # Only Region, Date and aqi are needed from pollutant
        on=['Region', 'Date'],
        how='inner',  # Only keep dates that exist in both datasets
    )

    # Select/reorder the output columns, rename 'aqi' -> 'AQI', and sort by Date.
    # A stable sort is used because many rows share the same Date; the default
    # algorithm gives no guarantee about the relative order of such ties.
    combined_df = (
        merged_df[['Country', 'Region', 'Date', 'aqi', 'Temperature', 'RelativeHumidity', 'WindSpeed']]
        .rename(columns={'aqi': 'AQI'})
        .sort_values('Date', kind='stable')
    )

    # Add to all_regions_data list for master file
    all_regions_data.append(combined_df)

    print(f"  ✓ Added {len(combined_df):,} rows for {region}")
    print(f"    Date range: {combined_df['Date'].min().date()} to {combined_df['Date'].max().date()}")
    print(f"    Weather records: {len(weather_df):,}, Pollutant records: {len(pollutant_df):,}, Combined: {len(combined_df):,}")

    # Store summary info (displayed by the processing-summary cell)
    combined_data_summary.append({
        'Region': region,
        'Weather_Records': len(weather_df),
        'Pollutant_Records': len(pollutant_df),
        'Combined_Records': len(combined_df)
    })

# Create the master file with all regions
if all_regions_data:
    print("\n" + "="*80)
    print("CREATING MASTER FILE - ALL REGIONS COMBINED")
    print("="*80)

    master_df = pd.concat(all_regions_data, ignore_index=True)

    # Sort by Region and Date
    master_df = master_df.sort_values(['Region', 'Date'])

    # Save master file (row index is not written)
    master_df.to_csv(master_file, index=False)

    print(f"\n✅ SUCCESSFULLY CREATED: {master_file}")
    print(f"   Total rows: {len(master_df):,}")
    print(f"   File size: {os.path.getsize(master_file) / (1024 * 1024):.2f} MB")
    print(f"   Columns: {list(master_df.columns)}")
    print(f"\n   Region breakdown:")
    for region, count in master_df['Region'].value_counts().sort_index().items():
        print(f"   - {region}: {count:,} rows")
else:
    print("\n⚠ No data found to create master file.")
    # Nothing was written in this run; a file already on disk is left over
    # from an earlier run and would otherwise be reported by later cells as
    # if it were fresh.
    if os.path.exists(master_file):
        print(f"  ⚠ Note: an existing {master_file} was NOT updated by this run.")


COMBINING WEATHER AND POLLUTANT DATA FOR ALL REGIONS

Processing Central region...
  Reading central_weather.csv...
  Reading central_pollutant.csv...
  Merging weather and pollutant data...
  ✓ Added 72,227 rows for Central
    Date range: 2016-04-15 to 2024-12-31
    Weather records: 3,126, Pollutant records: 74,591, Combined: 72,227

Processing East region...
  Reading east_weather.csv...
  Reading east_pollutant.csv...
  Merging weather and pollutant data...
  ✓ Added 72,347 rows for East
    Date range: 2016-04-15 to 2024-12-31
    Weather records: 3,131, Pollutant records: 74,591, Combined: 72,347

Processing North region...
  Reading north_weather.csv...
  Reading north_pollutant.csv...
  Merging weather and pollutant data...
  ✓ Added 72,347 rows for North
    Date range: 2016-04-15 to 2024-12-31
    Weather records: 3,131, Pollutant records: 74,591, Combined: 72,347

Processing South region...
  Reading south_weather.csv...
  Reading south_pollutant.csv...
  Merging weather an

In [4]:
# Preview the saved master file: row count, date span, columns,
# per-region counts, the first rows and a three-row sample per region.
master_file = "SG.csv"

if os.path.exists(master_file):
    master_df = pd.read_csv(master_file)
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print("\n" + heavy_rule)
    print("MASTER FILE INFO - SG.csv")
    print(heavy_rule)

    print(f"\nTotal records: {len(master_df):,}")
    # Date is read back from CSV as text, so parse it (once) for min/max
    parsed_dates = pd.to_datetime(master_df['Date'])
    print(f"Date range: {parsed_dates.min().date()} to {parsed_dates.max().date()}")
    print(f"\nColumns: {list(master_df.columns)}")
    print("\nRegion distribution:")
    print(master_df['Region'].value_counts().sort_index())

    print("\n" + light_rule)
    print("FIRST 10 ROWS:")
    print(light_rule)
    display(master_df.head(10))

    print("\n" + light_rule)
    print("SAMPLE FROM EACH REGION:")
    print(light_rule)
    for region in regions:
        region_sample = master_df.loc[master_df['Region'] == region].head(3)
        # Regions absent from the file are silently skipped
        if region_sample.empty:
            continue
        print(f"\n{region}:")
        display(region_sample)



MASTER FILE INFO - SG.csv

Total records: 360,146
Date range: 2016-04-15 to 2024-12-31

Columns: ['Country', 'Region', 'Date', 'AQI', 'Temperature', 'RelativeHumidity', 'WindSpeed']

Region distribution:
Region
Central    72227
East       72347
North      72347
South      70878
West       72347
Name: count, dtype: int64

--------------------------------------------------------------------------------
FIRST 10 ROWS:
--------------------------------------------------------------------------------


Unnamed: 0,Country,Region,Date,AQI,Temperature,RelativeHumidity,WindSpeed
0,Singapore,Central,2016-04-15,57.0,29.8,,
1,Singapore,Central,2016-04-15,62.0,29.8,,
2,Singapore,Central,2016-04-15,59.0,29.8,,
3,Singapore,Central,2016-04-15,60.0,29.8,,
4,Singapore,Central,2016-04-15,60.0,29.8,,
5,Singapore,Central,2016-04-15,61.0,29.8,,
6,Singapore,Central,2016-04-15,65.0,29.8,,
7,Singapore,Central,2016-04-15,64.0,29.8,,
8,Singapore,Central,2016-04-15,65.0,29.8,,
9,Singapore,Central,2016-04-15,65.0,29.8,,



--------------------------------------------------------------------------------
SAMPLE FROM EACH REGION:
--------------------------------------------------------------------------------

Central:


Unnamed: 0,Country,Region,Date,AQI,Temperature,RelativeHumidity,WindSpeed
0,Singapore,Central,2016-04-15,57.0,29.8,,
1,Singapore,Central,2016-04-15,62.0,29.8,,
2,Singapore,Central,2016-04-15,59.0,29.8,,



East:


Unnamed: 0,Country,Region,Date,AQI,Temperature,RelativeHumidity,WindSpeed
72227,Singapore,East,2016-04-15,55.0,29.55,,
72228,Singapore,East,2016-04-15,60.0,29.55,,
72229,Singapore,East,2016-04-15,58.0,29.55,,



North:


Unnamed: 0,Country,Region,Date,AQI,Temperature,RelativeHumidity,WindSpeed
144574,Singapore,North,2016-04-15,57.0,29.7,,
144575,Singapore,North,2016-04-15,61.0,29.7,,
144576,Singapore,North,2016-04-15,59.0,29.7,,



South:


Unnamed: 0,Country,Region,Date,AQI,Temperature,RelativeHumidity,WindSpeed
216921,Singapore,South,2016-04-15,60.0,29.53,,
216922,Singapore,South,2016-04-15,65.0,29.53,,
216923,Singapore,South,2016-04-15,64.0,29.53,,



West:


Unnamed: 0,Country,Region,Date,AQI,Temperature,RelativeHumidity,WindSpeed
287799,Singapore,West,2016-04-15,54.0,29.98,,
287800,Singapore,West,2016-04-15,58.0,29.98,,
287801,Singapore,West,2016-04-15,56.0,29.98,,


In [5]:
# Per-region mean/min/max of AQI and the three weather columns,
# computed from the master file as saved on disk.
master_file = "SG.csv"

if os.path.exists(master_file):
    master_df = pd.read_csv(master_file)
    rule = "=" * 80

    print("\n" + rule)
    print("STATISTICS BY REGION - SG.csv")
    print(rule)

    for region in regions:
        region_df = master_df.loc[master_df['Region'] == region]
        # Regions with no rows in the file produce no output
        if region_df.empty:
            continue

        print(f"\n{region.upper()} REGION:")
        print(f"  Total records: {len(region_df):,}")

        aqi = region_df['AQI']
        print(f"  AQI - Mean: {aqi.mean():.2f}, Min: {aqi.min():.2f}, Max: {aqi.max():.2f}")

        temperature = region_df['Temperature']
        print(f"  Temperature - Mean: {temperature.mean():.2f}°C, Min: {temperature.min():.2f}°C, Max: {temperature.max():.2f}°C")

        humidity = region_df['RelativeHumidity']
        print(f"  Humidity - Mean: {humidity.mean():.2f}%, Min: {humidity.min():.2f}%, Max: {humidity.max():.2f}%")

        wind = region_df['WindSpeed']
        print(f"  Wind Speed - Mean: {wind.mean():.2f} km/h, Min: {wind.min():.2f} km/h, Max: {wind.max():.2f} km/h")



STATISTICS BY REGION - SG.csv

CENTRAL REGION:
  Total records: 72,227
  AQI - Mean: 48.06, Min: 12.00, Max: 131.00
  Temperature - Mean: 28.05°C, Min: 22.73°C, Max: 30.60°C
  Humidity - Mean: 80.57%, Min: 61.06%, Max: 99.26%
  Wind Speed - Mean: 6.56 km/h, Min: 2.17 km/h, Max: 15.99 km/h

EAST REGION:
  Total records: 72,347
  AQI - Mean: 48.20, Min: 14.00, Max: 145.00
  Temperature - Mean: 28.29°C, Min: 23.21°C, Max: 30.78°C
  Humidity - Mean: 78.91%, Min: 60.84%, Max: 97.95%
  Wind Speed - Mean: 5.45 km/h, Min: 2.43 km/h, Max: 11.80 km/h

NORTH REGION:
  Total records: 72,347
  AQI - Mean: 46.82, Min: 14.00, Max: 143.00
  Temperature - Mean: 27.83°C, Min: 22.67°C, Max: 30.76°C
  Humidity - Mean: 79.09%, Min: 62.99%, Max: 97.41%
  Wind Speed - Mean: 4.09 km/h, Min: 1.57 km/h, Max: 11.69 km/h

SOUTH REGION:
  Total records: 70,878
  AQI - Mean: 46.90, Min: 13.00, Max: 154.00
  Temperature - Mean: 28.45°C, Min: 23.04°C, Max: 31.90°C
  Humidity - Mean: 78.03%, Min: 55.16%, Max: 99.80%


In [6]:
# Count missing values per column in the master file,
# first over the whole file and then region by region.
master_file = "SG.csv"

if os.path.exists(master_file):
    master_df = pd.read_csv(master_file)
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print("\n" + heavy_rule)
    print("DATA QUALITY CHECK - MISSING VALUES")
    print(heavy_rule)

    # Whole-file null counts and their share of all rows
    null_counts = master_df.isna().sum()
    null_share = null_counts / len(master_df) * 100

    print(f"\nOVERALL ({len(master_df):,} total rows):")
    for col, n_missing in null_counts.items():
        if n_missing > 0:
            print(f"  - {col}: {n_missing:,} missing ({null_share[col]:.2f}%)")
        else:
            print(f"  - {col}: ✓ No missing values")

    # Same check restricted to each region's rows
    print("\n" + light_rule)
    print("BY REGION:")
    print(light_rule)

    for region in regions:
        region_df = master_df.loc[master_df['Region'] == region]
        if region_df.empty:
            continue

        region_nulls = region_df.isna().sum()
        region_share = region_nulls / len(region_df) * 100
        # Only columns that actually have gaps are listed
        gaps = region_nulls[region_nulls > 0]

        print(f"\n{region.upper()} ({len(region_df):,} rows):")
        for col, n_missing in gaps.items():
            print(f"  - {col}: {n_missing:,} missing ({region_share[col]:.2f}%)")
        if gaps.empty:
            print("  ✓ No missing values!")



DATA QUALITY CHECK - MISSING VALUES

OVERALL (360,146 total rows):
  - Country: ✓ No missing values
  - Region: ✓ No missing values
  - Date: ✓ No missing values
  - AQI: 13 missing (0.00%)
  - Temperature: 1,458 missing (0.40%)
  - RelativeHumidity: 8,721 missing (2.42%)
  - WindSpeed: 31,989 missing (8.88%)

--------------------------------------------------------------------------------
BY REGION:
--------------------------------------------------------------------------------

CENTRAL (72,227 rows):
  - AQI: 3 missing (0.00%)
  - Temperature: 282 missing (0.39%)
  - RelativeHumidity: 1,725 missing (2.39%)
  - WindSpeed: 6,247 missing (8.65%)

EAST (72,347 rows):
  - AQI: 3 missing (0.00%)
  - Temperature: 282 missing (0.39%)
  - RelativeHumidity: 1,725 missing (2.38%)
  - WindSpeed: 5,581 missing (7.71%)

NORTH (72,347 rows):
  - AQI: 3 missing (0.00%)
  - Temperature: 282 missing (0.39%)
  - RelativeHumidity: 1,725 missing (2.38%)
  - WindSpeed: 5,581 missing (7.71%)

SOUTH (70,8

In [7]:
# Tabulate the per-region record counts gathered while combining the files.
# Relies on `combined_data_summary` filled by the combining cell in this kernel session.
banner = "=" * 80
print("\n" + banner, "PROCESSING SUMMARY", banner, sep="\n")

summary_df = pd.DataFrame(combined_data_summary)
print("\nRecords per region:")
display(summary_df)



PROCESSING SUMMARY

Records per region:


Unnamed: 0,Region,Weather_Records,Pollutant_Records,Combined_Records
0,Central,3126,74591,72227
1,East,3131,74591,72347
2,North,3131,74591,72347
3,South,3068,74591,70878
4,West,3131,74591,72347


In [8]:
# Final verification and completion message.
#
# Re-reads SG.csv from disk and reports what the file actually contains:
# the regions present and the year span are derived from the data instead of
# being assumed, so a run in which some region was skipped is reported as such.
master_file = "SG.csv"

print("\n" + "="*80)
print("FINAL VERIFICATION")
print("="*80)

if os.path.exists(master_file):
    file_size = os.path.getsize(master_file) / (1024 * 1024)  # Convert to MB
    df_check = pd.read_csv(master_file)

    # Regions present in the file (NaN dropped so the join below cannot fail)
    regions_found = [str(name) for name in df_check['Region'].dropna().unique()]
    # Requested regions that did not make it into the file
    regions_missing = [name for name in regions if name not in regions_found]

    # Year span taken from the Date column; unparseable/absent dates are ignored
    years = pd.to_datetime(df_check['Date']).dt.year.dropna()
    if years.empty:
        year_span = "an unknown period"
    else:
        year_span = f"{int(years.min())}-{int(years.max())}"

    print("\n✅ SUCCESS! Master file created:")
    print(f"\n   📁 File: {master_file}")
    print(f"   📊 Total rows: {len(df_check):,}")
    print(f"   💾 File size: {file_size:.2f} MB")
    print(f"   📋 Columns: {list(df_check.columns)}")
    print(f"\n   🌍 Regions included: {', '.join(regions_found)}")
    if regions_missing:
        print(f"   ⚠ Regions missing from file: {', '.join(regions_missing)}")

    print("\n" + "="*80)
    print("✅ PROCESS COMPLETED SUCCESSFULLY!")
    print("="*80)
    print(f"\nYour combined data is ready in: {master_file}")
    if regions_missing:
        covered = len(regions) - len(regions_missing)
        region_phrase = f"{covered} of {len(regions)} regions"
    else:
        region_phrase = f"all {len(regions)} regions"
    print(f"This file contains weather and pollutant data for {region_phrase}")
    print(f"from {year_span} with {len(df_check):,} total records.")
else:
    print(f"\n❌ ERROR: {master_file} was not created!")
    print("Please check the error messages above.")



FINAL VERIFICATION

✅ SUCCESS! Master file created:

   📁 File: SG.csv
   📊 Total rows: 360,146
   💾 File size: 16.57 MB
   📋 Columns: ['Country', 'Region', 'Date', 'AQI', 'Temperature', 'RelativeHumidity', 'WindSpeed']

   🌍 Regions included: Central, East, North, South, West

✅ PROCESS COMPLETED SUCCESSFULLY!

Your combined data is ready in: SG.csv
This file contains weather and pollutant data for all 5 regions
from 2016-2024 with 360,146 total records.
