# Separate Weather Data by Region

This notebook processes weather data from 2016-2024 and separates it by region (Central, East, North, South, West).

**Output:** Combined CSV files for each region containing data from all years (2016-2024)


In [1]:
import pandas as pd
import os
from pathlib import Path


In [2]:
# Define input folder
input_folder = "../../data/singapore/raw/weather"

# Define the years to process
years = range(2016, 2024)  # 2016 to 2024

# Define the regions
regions = ['Central', 'East', 'North', 'South', 'West']

print(f"Processing weather data for years: {list(years)}")
print(f"Regions to separate: {regions}")


Processing weather data for years: [2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]
Regions to separate: ['Central', 'East', 'North', 'South', 'West']


In [3]:
# Read all weather data files and combine them
all_weather_data = []

for year in years:
    filename = os.path.join(input_folder, f"weather_{year}.csv")
    
    if os.path.exists(filename):
        print(f"📥 Reading {filename}...")
        df = pd.read_csv(filename)
        print(f"  - Loaded {len(df):,} rows from {year}")
        all_weather_data.append(df)
    else:
        print(f"⚠️ Warning: {filename} not found!")

# Combine all years into one dataframe (if any files were found)
if all_weather_data:
    combined_df = pd.concat(all_weather_data, ignore_index=True)
    print(f"\n✅ Total combined rows: {len(combined_df):,}")
else:
    combined_df = pd.DataFrame()
    print("\n⚠️ No weather data files found!")


📥 Reading ../../data/singapore/raw/weather/weather_2016.csv...
  - Loaded 1,480 rows from 2016
📥 Reading ../../data/singapore/raw/weather/weather_2017.csv...
  - Loaded 1,797 rows from 2017
📥 Reading ../../data/singapore/raw/weather/weather_2018.csv...
  - Loaded 1,813 rows from 2018
📥 Reading ../../data/singapore/raw/weather/weather_2019.csv...
  - Loaded 1,790 rows from 2019
📥 Reading ../../data/singapore/raw/weather/weather_2020.csv...
  - Loaded 1,828 rows from 2020
📥 Reading ../../data/singapore/raw/weather/weather_2021.csv...
  - Loaded 1,810 rows from 2021
📥 Reading ../../data/singapore/raw/weather/weather_2022.csv...
  - Loaded 1,825 rows from 2022
📥 Reading ../../data/singapore/raw/weather/weather_2023.csv...
  - Loaded 1,780 rows from 2023

✅ Total combined rows: 14,123


In [4]:
# Display basic information about the combined data
print("\nCombined Data Info:")
print(combined_df.info())
print("\nFirst few rows:")
display(combined_df.head(10))



Combined Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14123 entries, 0 to 14122
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Country           14123 non-null  object 
 1   Region            14123 non-null  object 
 2   Date              14123 non-null  object 
 3   Temperature       14123 non-null  float64
 4   RelativeHumidity  14123 non-null  float64
 5   WindSpeed         14123 non-null  float64
dtypes: float64(3), object(3)
memory usage: 662.1+ KB
None

First few rows:


Unnamed: 0,Country,Region,Date,Temperature,RelativeHumidity,WindSpeed
0,Singapore,Central,2016-02-07,25.87,87.26,17.8
1,Singapore,East,2016-02-07,26.17,84.66,18.6
2,Singapore,North,2016-02-07,25.47,90.06,17.6
3,Singapore,South,2016-02-07,26.27,83.46,18.9
4,Singapore,West,2016-02-07,25.67,88.66,17.8
5,Singapore,Central,2016-02-08,26.26,86.48,16.4
6,Singapore,East,2016-02-08,26.56,83.88,17.2
7,Singapore,North,2016-02-08,25.86,89.28,16.2
8,Singapore,South,2016-02-08,26.66,82.68,17.5
9,Singapore,West,2016-02-08,26.06,87.88,16.4


In [5]:
# Check unique regions in the data
unique_regions = combined_df['Region'].unique()
print(f"Unique regions found in data: {unique_regions}")
print(f"\nRegion counts:")
print(combined_df['Region'].value_counts())


Unique regions found in data: ['Central' 'East' 'North' 'South' 'West']

Region counts:
Region
Central    2833
East       2833
North      2833
West       2833
South      2791
Name: count, dtype: int64


In [6]:
# Ensure the output folder exists
output_folder = "../../data/singapore/clean/weather"
os.makedirs(output_folder, exist_ok=True)

# Separate data by region and save to individual CSV files
for region in regions:
    # Filter data for the current region
    region_df = combined_df[combined_df['Region'] == region].copy()
    
    if len(region_df) > 0:
        # Sort by date for better organization
        region_df['Date'] = pd.to_datetime(region_df['Date'])
        region_df = region_df.sort_values('Date')
        
        # Create full output path (folder + filename)
        output_filename = os.path.join(output_folder, f"{region.lower()}_weather.csv")
        
        # Save to CSV
        region_df.to_csv(output_filename, index=False)
        
        print(f"✓ Saved {len(region_df):,} rows to {output_filename}")
        print(f"  Date range: {region_df['Date'].min().date()} to {region_df['Date'].max().date()}")
    else:
        print(f"⚠ Warning: No data found for {region} region")

print("\n All regional weather files created successfully!")

✓ Saved 2,833 rows to ../../data/singapore/clean/weather/central_weather.csv
  Date range: 2016-02-07 to 2023-12-31
✓ Saved 2,833 rows to ../../data/singapore/clean/weather/east_weather.csv
  Date range: 2016-02-07 to 2023-12-31
✓ Saved 2,833 rows to ../../data/singapore/clean/weather/north_weather.csv
  Date range: 2016-02-07 to 2023-12-31
✓ Saved 2,791 rows to ../../data/singapore/clean/weather/south_weather.csv
  Date range: 2016-02-07 to 2023-12-31
✓ Saved 2,833 rows to ../../data/singapore/clean/weather/west_weather.csv
  Date range: 2016-02-07 to 2023-12-31

 All regional weather files created successfully!


In [7]:
# Display summary statistics for each region
print("\n" + "="*60)
print("SUMMARY STATISTICS BY REGION")
print("="*60)

for region in regions:
    region_df = combined_df[combined_df['Region'] == region]
    
    if len(region_df) > 0:
        print(f"\n{region.upper()} REGION:")
        print(f"  Total records: {len(region_df)}")
        print(f"  Temperature - Mean: {region_df['Temperature'].mean():.2f}°C, Min: {region_df['Temperature'].min():.2f}°C, Max: {region_df['Temperature'].max():.2f}°C")
        print(f"  Humidity - Mean: {region_df['RelativeHumidity'].mean():.2f}%, Min: {region_df['RelativeHumidity'].min():.2f}%, Max: {region_df['RelativeHumidity'].max():.2f}%")
        print(f"  Wind Speed - Mean: {region_df['WindSpeed'].mean():.2f} km/h, Min: {region_df['WindSpeed'].min():.2f} km/h, Max: {region_df['WindSpeed'].max():.2f} km/h")



SUMMARY STATISTICS BY REGION

CENTRAL REGION:
  Total records: 2833
  Temperature - Mean: 27.99°C, Min: 22.73°C, Max: 30.60°C
  Humidity - Mean: 81.03%, Min: 61.06%, Max: 99.26%
  Wind Speed - Mean: 6.82 km/h, Min: 2.84 km/h, Max: 17.80 km/h

EAST REGION:
  Total records: 2833
  Temperature - Mean: 28.23°C, Min: 23.21°C, Max: 30.78°C
  Humidity - Mean: 79.22%, Min: 60.84%, Max: 97.95%
  Wind Speed - Mean: 5.67 km/h, Min: 2.43 km/h, Max: 18.60 km/h

NORTH REGION:
  Total records: 2833
  Temperature - Mean: 27.75°C, Min: 22.67°C, Max: 30.76°C
  Humidity - Mean: 79.32%, Min: 62.99%, Max: 97.41%
  Wind Speed - Mean: 4.43 km/h, Min: 1.57 km/h, Max: 17.60 km/h

SOUTH REGION:
  Total records: 2791
  Temperature - Mean: 28.39°C, Min: 23.04°C, Max: 31.90°C
  Humidity - Mean: 78.36%, Min: 55.16%, Max: 99.80%
  Wind Speed - Mean: 4.18 km/h, Min: 0.70 km/h, Max: 18.90 km/h

WEST REGION:
  Total records: 2833
  Temperature - Mean: 27.86°C, Min: 23.05°C, Max: 30.70°C
  Humidity - Mean: 79.65%, Min:

In [8]:
base_path = "../../data/singapore/clean/weather"  

print("\n" + "="*60)
print("VERIFICATION OF OUTPUT FILES")
print("="*60)

for region in regions:
    filename = f"{region.lower()}_weather.csv"
    file_path = os.path.join(base_path, filename)  # <-- full path
    
    if os.path.exists(file_path):
        file_size = os.path.getsize(file_path) / (1024 * 1024)  # MB
        df_check = pd.read_csv(file_path)
        print(f"\n✓ {filename}")
        print(f"  - Rows: {len(df_check):,}")
        print(f"  - File size: {file_size:.2f} MB")
        print(f"  - Columns: {list(df_check.columns)}")
    else:
        print(f"\n✗ {filename} - NOT FOUND")


VERIFICATION OF OUTPUT FILES

✓ central_weather.csv
  - Rows: 2,833
  - File size: 0.13 MB
  - Columns: ['Country', 'Region', 'Date', 'Temperature', 'RelativeHumidity', 'WindSpeed']

✓ east_weather.csv
  - Rows: 2,833
  - File size: 0.12 MB
  - Columns: ['Country', 'Region', 'Date', 'Temperature', 'RelativeHumidity', 'WindSpeed']

✓ north_weather.csv
  - Rows: 2,833
  - File size: 0.12 MB
  - Columns: ['Country', 'Region', 'Date', 'Temperature', 'RelativeHumidity', 'WindSpeed']

✓ south_weather.csv
  - Rows: 2,791
  - File size: 0.12 MB
  - Columns: ['Country', 'Region', 'Date', 'Temperature', 'RelativeHumidity', 'WindSpeed']

✓ west_weather.csv
  - Rows: 2,833
  - File size: 0.12 MB
  - Columns: ['Country', 'Region', 'Date', 'Temperature', 'RelativeHumidity', 'WindSpeed']
