# Separate Weather Data by Region

This notebook reads the yearly weather files for 2016–2024 (`weather_<year>.csv`), combines them into a single table, and separates the combined data by region (Central, East, North, South, West).

**Output:** One combined CSV file per region (`<region>_weather.csv`, e.g. `central_weather.csv`), each containing that region's data from all years (2016–2024), sorted by date.


In [1]:
import pandas as pd
import os
from pathlib import Path


In [2]:
# Year span covered by the input files (both ends inclusive)
FIRST_YEAR = 2016
LAST_YEAR = 2024
years = range(FIRST_YEAR, LAST_YEAR + 1)

# Region labels used to split the combined data
regions = [
    'Central',
    'East',
    'North',
    'South',
    'West',
]

print(f"Processing weather data for years: {[*years]}")
print(f"Regions to separate: {regions}")


Processing weather data for years: [2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
Regions to separate: ['Central', 'East', 'North', 'South', 'West']


In [3]:
# Load every yearly weather file that exists and stack them into one frame.
# Files are looked up by name (weather_<year>.csv) relative to the current
# working directory. A missing year is reported and skipped, not fatal.
all_weather_data = []
missing_years = []

for year in years:
    filename = f"weather_{year}.csv"

    if not os.path.exists(filename):
        print(f"Warning: {filename} not found!")
        missing_years.append(year)
        continue

    print(f"Reading {filename}...")
    df = pd.read_csv(filename)
    print(f"  - Loaded {len(df)} rows from {year}")
    all_weather_data.append(df)

# pd.concat raises a bare "No objects to concatenate" ValueError on an empty
# list; fail with the same exception type but say what was actually wrong.
if not all_weather_data:
    raise ValueError(
        f"No weather files were found for years {list(years)} "
        f"in {os.getcwd()!r}; expected files named weather_<year>.csv"
    )

# Combine all years into one dataframe; ignore_index gives a fresh 0..n-1 index
combined_df = pd.concat(all_weather_data, ignore_index=True)
print(f"\nTotal combined rows: {len(combined_df)}")

# Repeat the gaps at the end so they are not lost among the per-file messages
if missing_years:
    print(f"Warning: no data loaded for years: {missing_years}")


Reading weather_2016.csv...
  - Loaded 1140 rows from 2016
Reading weather_2017.csv...
  - Loaded 1797 rows from 2017
Reading weather_2018.csv...
  - Loaded 1813 rows from 2018
Reading weather_2019.csv...
  - Loaded 1790 rows from 2019
Reading weather_2020.csv...
  - Loaded 1828 rows from 2020
Reading weather_2021.csv...
  - Loaded 1810 rows from 2021
Reading weather_2022.csv...
  - Loaded 1825 rows from 2022
Reading weather_2023.csv...
  - Loaded 1780 rows from 2023
Reading weather_2024.csv...
  - Loaded 1804 rows from 2024

Total combined rows: 15587


In [4]:
# Overview of the combined frame: column schema / non-null counts, then a
# sample of the first rows.
print("\nCombined Data Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print(): doing so emits a stray "None" line.
combined_df.info()
print("\nFirst few rows:")
display(combined_df.head(10))



Combined Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15587 entries, 0 to 15586
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Country           15587 non-null  object 
 1   Region            15587 non-null  object 
 2   Date              15587 non-null  object 
 3   Temperature       15500 non-null  float64
 4   RelativeHumidity  15208 non-null  float64
 5   WindSpeed         14183 non-null  float64
dtypes: float64(3), object(3)
memory usage: 730.8+ KB
None

First few rows:


Unnamed: 0,Country,Region,Date,Temperature,RelativeHumidity,WindSpeed
0,Singapore,Central,2016-04-15,29.8,,
1,Singapore,East,2016-04-15,29.55,,
2,Singapore,North,2016-04-15,29.7,,
3,Singapore,South,2016-04-15,29.53,,
4,Singapore,West,2016-04-15,29.98,,
5,Singapore,Central,2016-04-29,28.12,,
6,Singapore,East,2016-04-29,28.49,,
7,Singapore,North,2016-04-29,27.55,,
8,Singapore,South,2016-04-29,28.9,,
9,Singapore,West,2016-04-29,27.96,,


In [5]:
# Inspect which region labels occur in the combined data and how many rows
# each one has
region_column = combined_df['Region']

unique_regions = region_column.unique()
print(f"Unique regions found in data: {unique_regions}")

# Heading and counts are emitted on separate lines via sep
print("\nRegion counts:", region_column.value_counts(), sep="\n")


Unique regions found in data: ['Central' 'East' 'North' 'South' 'West']

Region counts:
Region
East       3131
North      3131
West       3131
Central    3126
South      3068
Name: count, dtype: int64


In [6]:
# Split the combined frame by region and write one CSV per region.
# Each file is named <region>_weather.csv (lower-cased region name) and is
# written relative to the current working directory.

# Rows whose Region label is not listed in `regions` (including missing
# labels) match none of the filters below and would vanish without notice,
# so report them up front.
unassigned_mask = ~combined_df['Region'].isin(regions)
if unassigned_mask.any():
    unexpected_labels = combined_df.loc[unassigned_mask, 'Region'].unique().tolist()
    print(
        f"⚠ Warning: {int(unassigned_mask.sum())} rows have Region values "
        f"outside {regions} and will not be written: {unexpected_labels}"
    )

saved_regions = []
empty_regions = []

for region in regions:
    # Filter data for the current region; .copy() so the Date conversion
    # below does not touch combined_df
    region_df = combined_df[combined_df['Region'] == region].copy()

    if len(region_df) == 0:
        print(f"⚠ Warning: No data found for {region} region")
        empty_regions.append(region)
        continue

    # Parse dates so the sort is chronological rather than lexical, and use a
    # stable sort so rows sharing a date keep their original relative order
    # (makes the output file reproducible between runs).
    region_df['Date'] = pd.to_datetime(region_df['Date'])
    region_df = region_df.sort_values('Date', kind='stable')

    output_filename = f"{region.lower()}_weather.csv"
    region_df.to_csv(output_filename, index=False)
    saved_regions.append(region)

    print(f"✓ Saved {len(region_df)} rows to {output_filename}")
    print(f"  Date range: {region_df['Date'].min()} to {region_df['Date'].max()}")

# Only claim full success when every configured region produced a file
if empty_regions:
    print(
        f"\n⚠ Created {len(saved_regions)} of {len(regions)} regional weather files; "
        f"no data for: {empty_regions}"
    )
else:
    print("\n✅ All regional weather files created successfully!")


✓ Saved 3126 rows to central_weather.csv
  Date range: 2016-04-15 00:00:00 to 2024-12-31 00:00:00
✓ Saved 3131 rows to east_weather.csv
  Date range: 2016-04-15 00:00:00 to 2024-12-31 00:00:00
✓ Saved 3131 rows to north_weather.csv
  Date range: 2016-04-15 00:00:00 to 2024-12-31 00:00:00
✓ Saved 3068 rows to south_weather.csv
  Date range: 2016-04-15 00:00:00 to 2024-12-31 00:00:00
✓ Saved 3131 rows to west_weather.csv
  Date range: 2016-04-15 00:00:00 to 2024-12-31 00:00:00

✅ All regional weather files created successfully!


In [7]:
# Per-region mean / min / max for each weather measurement
banner = "=" * 60
print("\n" + banner)
print("SUMMARY STATISTICS BY REGION")
print(banner)

# (printed label, column in combined_df, unit suffix appended to each number)
metrics = [
    ("Temperature", "Temperature", "°C"),
    ("Humidity", "RelativeHumidity", "%"),
    ("Wind Speed", "WindSpeed", " km/h"),
]

for region in regions:
    region_df = combined_df[combined_df['Region'] == region]

    # Regions without any rows are skipped silently
    if len(region_df) == 0:
        continue

    print(f"\n{region.upper()} REGION:")
    print(f"  Total records: {len(region_df)}")

    for label, column, unit in metrics:
        values = region_df[column]
        print(
            f"  {label} - Mean: {values.mean():.2f}{unit}, "
            f"Min: {values.min():.2f}{unit}, "
            f"Max: {values.max():.2f}{unit}"
        )



SUMMARY STATISTICS BY REGION

CENTRAL REGION:
  Total records: 3126
  Temperature - Mean: 28.05°C, Min: 22.73°C, Max: 30.60°C
  Humidity - Mean: 80.59%, Min: 61.06%, Max: 99.26%
  Wind Speed - Mean: 6.57 km/h, Min: 2.17 km/h, Max: 15.99 km/h

EAST REGION:
  Total records: 3131
  Temperature - Mean: 28.28°C, Min: 23.21°C, Max: 30.78°C
  Humidity - Mean: 78.94%, Min: 60.84%, Max: 97.95%
  Wind Speed - Mean: 5.45 km/h, Min: 2.43 km/h, Max: 11.80 km/h

NORTH REGION:
  Total records: 3131
  Temperature - Mean: 27.82°C, Min: 22.67°C, Max: 30.76°C
  Humidity - Mean: 79.10%, Min: 62.99%, Max: 97.41%
  Wind Speed - Mean: 4.10 km/h, Min: 1.57 km/h, Max: 11.69 km/h

SOUTH REGION:
  Total records: 3068
  Temperature - Mean: 28.44°C, Min: 23.04°C, Max: 31.90°C
  Humidity - Mean: 78.05%, Min: 55.16%, Max: 99.80%
  Wind Speed - Mean: 3.67 km/h, Min: 0.70 km/h, Max: 20.70 km/h

WEST REGION:
  Total records: 3131
  Temperature - Mean: 27.94°C, Min: 23.05°C, Max: 30.70°C
  Humidity - Mean: 79.56%, Min:

In [8]:
# Confirm that each regional CSV exists on disk and report its row count,
# size and column names
divider = "=" * 60
print("\n" + divider)
print("VERIFICATION OF OUTPUT FILES")
print(divider)

BYTES_PER_MB = 1024 * 1024

for region in regions:
    filename = f"{region.lower()}_weather.csv"

    if not os.path.exists(filename):
        print(f"\n✗ {filename} - NOT FOUND")
        continue

    file_size = os.path.getsize(filename) / BYTES_PER_MB  # bytes -> MB
    df_check = pd.read_csv(filename)

    report_lines = [
        f"\n✓ {filename}",
        f"  - Rows: {len(df_check):,}",
        f"  - File size: {file_size:.2f} MB",
        f"  - Columns: {list(df_check.columns)}",
    ]
    print("\n".join(report_lines))



VERIFICATION OF OUTPUT FILES

✓ central_weather.csv
  - Rows: 3,126
  - File size: 0.14 MB
  - Columns: ['Country', 'Region', 'Date', 'Temperature', 'RelativeHumidity', 'WindSpeed']

✓ east_weather.csv
  - Rows: 3,131
  - File size: 0.13 MB
  - Columns: ['Country', 'Region', 'Date', 'Temperature', 'RelativeHumidity', 'WindSpeed']

✓ north_weather.csv
  - Rows: 3,131
  - File size: 0.13 MB
  - Columns: ['Country', 'Region', 'Date', 'Temperature', 'RelativeHumidity', 'WindSpeed']

✓ south_weather.csv
  - Rows: 3,068
  - File size: 0.13 MB
  - Columns: ['Country', 'Region', 'Date', 'Temperature', 'RelativeHumidity', 'WindSpeed']

✓ west_weather.csv
  - Rows: 3,131
  - File size: 0.13 MB
  - Columns: ['Country', 'Region', 'Date', 'Temperature', 'RelativeHumidity', 'WindSpeed']
