In [1]:
# Standard library
import os
import re
import gc
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path

# Geospatial
import h3
from osgeo import gdal, osr
import geopandas as gpd
import dask_geopandas as dgpd
from shapely.geometry import Point, shape
import rasterio
from rasterstats import zonal_stats

# Visualization
import plotly.express as px
import matplotlib.pyplot as plt

# Custom
from validator import validate

# Project paths
# Both roots default to the original author's local directories. Set the
# MAPINEQ_BASE_DIR / MAPINEQ_DATA_DIR environment variables to run the
# notebook on another machine without editing this cell.
BASE_DIR = Path(os.environ.get(
    'MAPINEQ_BASE_DIR',
    '/Users/wenlanzhang/PycharmProjects/Mapineq/src/data-wrangling/',
))
DATA_DIR = Path(os.environ.get(
    'MAPINEQ_DATA_DIR',
    '/Users/wenlanzhang/Downloads/PhD_UCL/Data/Oxford',
))

# Climatology

In [2]:
# Read the three per-geography climatology exports (ITL, EURO, NUTS - in that order).
WC_Climatology_ITL, WC_Climatology_EURO, WC_Climatology_NUTS = map(
    pd.read_csv,
    (
        DATA_DIR/'GEE/WC/WC_Climatology_ITL.csv',
        DATA_DIR/'GEE/WC/WC_Climatology_EURO.csv',
        DATA_DIR/'GEE/WC/WC_Climatology_NUTS.csv',
    ),
)

# Columns to retain: the three identifier columns first, then nine
# statistics for each of the four bands (prec, tavg, tmax, tmin).
cols_to_keep = [
    'geo', 'month', 'geo_source',
    # prec
    'prec_max', 'prec_mean', 'prec_median', 'prec_min',
    'prec_p10', 'prec_p25', 'prec_p75', 'prec_p90', 'prec_stdDev',
    # tavg
    'tavg_max', 'tavg_mean', 'tavg_median', 'tavg_min',
    'tavg_p10', 'tavg_p25', 'tavg_p75', 'tavg_p90', 'tavg_stdDev',
    # tmax
    'tmax_max', 'tmax_mean', 'tmax_median', 'tmax_min',
    'tmax_p10', 'tmax_p25', 'tmax_p75', 'tmax_p90', 'tmax_stdDev',
    # tmin
    'tmin_max', 'tmin_mean', 'tmin_median', 'tmin_min',
    'tmin_p10', 'tmin_p25', 'tmin_p75', 'tmin_p90', 'tmin_stdDev',
]

# Trim every table to exactly those columns, in that order.
WC_Climatology_ITL, WC_Climatology_EURO, WC_Climatology_NUTS = (
    frame[cols_to_keep]
    for frame in (WC_Climatology_ITL, WC_Climatology_EURO, WC_Climatology_NUTS)
)

# Report the row count of each table on a single line.
print(
    f"WC_Climatology_ITL: {len(WC_Climatology_ITL)}, "
    f"WC_Climatology_NUTS: {len(WC_Climatology_NUTS)}, "
    f"WC_Climatology_EURO: {len(WC_Climatology_EURO)}"
)

WC_Climatology_ITL

WC_Climatology_ITL: 5664, WC_Climatology_NUTS: 162204, WC_Climatology_EURO: 51384


Unnamed: 0,geo,month,geo_source,prec_max,prec_mean,prec_median,prec_min,prec_p10,prec_p25,prec_p75,...,tmax_stdDev,tmin_max,tmin_mean,tmin_median,tmin_min,tmin_p10,tmin_p25,tmin_p75,tmin_p90,tmin_stdDev
0,TLC,1,ITL2021,154.0,75.289812,69.000000,53.0,57.000000,61.000000,86.000000,...,9.178151,9.0,-6.452846,-4.0,-42.0,-19.0,-13.0,0.0,3.0,8.615390
1,TLD,1,ITL2021,188.0,102.855623,101.450308,58.0,71.518972,82.724042,121.437899,...,10.781052,18.0,-2.721406,-1.0,-44.0,-17.0,-9.0,6.0,10.0,10.575961
2,TLE,1,ITL2021,168.0,75.810575,65.000000,49.0,53.000000,58.000000,84.000000,...,7.833384,13.0,2.955453,6.0,-32.0,-8.0,0.0,8.0,9.0,7.459656
3,TLF,1,ITL2021,143.0,58.487949,54.000000,45.0,49.000000,50.000000,58.000000,...,3.498544,12.0,1.027150,0.0,-20.0,-3.0,-1.0,3.0,6.0,3.495255
4,TLG,1,ITL2021,162.0,69.491206,65.000000,51.0,56.000000,59.000000,75.000000,...,6.234675,16.0,2.024078,2.0,-25.0,-4.0,0.0,5.0,8.0,4.765306
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5659,TLN0C,12,ITL2025,161.0,116.384822,113.000000,97.0,105.000000,108.000000,122.000000,...,5.187600,22.0,12.432127,14.0,-7.0,4.0,9.0,16.0,18.0,5.139118
5660,TLN0D,12,ITL2025,140.0,95.948451,92.000000,85.0,86.000000,88.000000,103.000000,...,3.963826,22.0,16.589226,17.0,0.0,11.0,14.0,20.0,21.0,3.808553
5661,TLN0E,12,ITL2025,125.0,101.098678,102.000000,85.0,91.000000,94.000000,107.000000,...,3.160662,23.0,18.338487,19.0,6.0,15.0,17.0,21.0,22.0,2.993828
5662,TLN0F,12,ITL2025,158.0,114.062798,113.000000,91.0,97.000000,104.000000,123.000000,...,4.997842,21.0,11.544876,12.0,-4.0,5.0,8.0,15.0,17.0,4.768186


In [3]:
# Stack the ITL, EURO and NUTS climatology tables row-wise (in that order)
# and renumber the rows with a fresh 0..n-1 index.
climatology_parts = [WC_Climatology_ITL, WC_Climatology_EURO, WC_Climatology_NUTS]
WC_Climatology = pd.concat(climatology_parts, axis="index", ignore_index=True)
WC_Climatology

Unnamed: 0,geo,month,geo_source,prec_max,prec_mean,prec_median,prec_min,prec_p10,prec_p25,prec_p75,...,tmax_stdDev,tmin_max,tmin_mean,tmin_median,tmin_min,tmin_p10,tmin_p25,tmin_p75,tmin_p90,tmin_stdDev
0,TLC,1,ITL2021,154.0,75.289812,69.000000,53.0,57.000000,61.000000,86.000000,...,9.178151,9.0,-6.452846,-4.0,-42.0,-19.0,-13.0,0.0,3.0,8.615390
1,TLD,1,ITL2021,188.0,102.855623,101.450308,58.0,71.518972,82.724042,121.437899,...,10.781052,18.0,-2.721406,-1.0,-44.0,-17.0,-9.0,6.0,10.0,10.575961
2,TLE,1,ITL2021,168.0,75.810575,65.000000,49.0,53.000000,58.000000,84.000000,...,7.833384,13.0,2.955453,6.0,-32.0,-8.0,0.0,8.0,9.0,7.459656
3,TLF,1,ITL2021,143.0,58.487949,54.000000,45.0,49.000000,50.000000,58.000000,...,3.498544,12.0,1.027150,0.0,-20.0,-3.0,-1.0,3.0,6.0,3.495255
4,TLG,1,ITL2021,162.0,69.491206,65.000000,51.0,56.000000,59.000000,75.000000,...,6.234675,16.0,2.024078,2.0,-25.0,-4.0,0.0,5.0,8.0,4.765306
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219247,SI037,12,NUTS2024,140.0,116.959414,118.000000,94.0,103.000000,109.000000,125.000000,...,13.103196,20.0,-8.056547,-6.0,-44.0,-23.0,-14.0,-1.0,3.0,9.964836
219248,SI038,12,NUTS2024,134.0,118.166502,118.000000,103.0,112.000000,115.000000,121.000000,...,16.797606,24.0,-7.297249,-7.0,-54.0,-28.0,-19.0,4.0,14.0,15.498433
219249,SI043,12,NUTS2024,110.0,96.237323,96.000000,82.0,90.000000,94.000000,99.000000,...,28.621039,37.0,-20.031658,-21.0,-96.0,-57.0,-37.0,-1.0,19.0,27.324950
219250,SK041,12,NUTS2024,140.0,52.375534,51.000000,31.0,42.000000,47.000000,56.000000,...,10.587606,-31.0,-52.746562,-50.0,-113.0,-69.0,-61.0,-43.0,-40.0,12.453078


In [4]:
# Sanity check on the stacked climatology table: one row per region per month.
# The three region counts are the numbers used in the original arithmetic; they
# equal the WC_Bio row counts (NUTS, ITL, EURO) printed later in this notebook.
N_REGIONS_NUTS = 13517
N_REGIONS_ITL = 472
N_REGIONS_EURO = 4282
MONTHS_PER_YEAR = 12

expected_rows = (N_REGIONS_NUTS + N_REGIONS_ITL + N_REGIONS_EURO) * MONTHS_PER_YEAR

# Fail loudly if the concatenated frame does not have the expected size,
# instead of relying on a visual comparison of two cell outputs.
assert len(WC_Climatology) == expected_rows, (
    f"WC_Climatology has {len(WC_Climatology)} rows, expected {expected_rows}"
)

expected_rows

219252

In [5]:
# Identifier columns that stay fixed; every other column is unpivoted.
id_vars = ['geo', 'month', 'geo_source']

# Wide -> long: one row per (geo, month, geo_source, original column name).
Climatology_long = WC_Climatology.melt(
    id_vars=id_vars,
    var_name="variable",
    value_name="obsValue"
)

# Take the combined '<band>_<statistic>' column out of the frame and split it
# on the first underscore only, giving the band and the statistic name.
name_parts = Climatology_long.pop('variable').str.split('_', n=1, expand=True)

# Attach band / statistic (with 'stdDev' spelled 'std_dev'), stamp every row
# with obsTime 2000, and put the columns into their final order.
Climatology_long = Climatology_long.assign(
    band=name_parts[0],
    statistic=name_parts[1].str.replace('stdDev', 'std_dev', regex=False),
    obsTime=2000,
)[['geo', 'month', 'obsTime', 'geo_source', 'band', 'statistic', 'obsValue']]

# The split pieces are no longer needed once they are attached.
del name_parts

# Show the reshaped table
Climatology_long

Unnamed: 0,geo,month,obsTime,geo_source,band,statistic,obsValue
0,TLC,1,2000,ITL2021,prec,max,154.000000
1,TLD,1,2000,ITL2021,prec,max,188.000000
2,TLE,1,2000,ITL2021,prec,max,168.000000
3,TLF,1,2000,ITL2021,prec,max,143.000000
4,TLG,1,2000,ITL2021,prec,max,162.000000
...,...,...,...,...,...,...,...
7893067,SI037,12,2000,NUTS2024,tmin,std_dev,9.964836
7893068,SI038,12,2000,NUTS2024,tmin,std_dev,15.498433
7893069,SI043,12,2000,NUTS2024,tmin,std_dev,27.324950
7893070,SK041,12,2000,NUTS2024,tmin,std_dev,12.453078


In [6]:
# Short percentile codes and the long labels that replace them, pairwise.
percentile_codes = ('p10', 'p25', 'p75', 'p90')
percentile_labels = ('10th_percentile', '25th_percentile', '75th_percentile', '90th_percentile')
rename_dict = dict(zip(percentile_codes, percentile_labels))

# The table is in long format and the percentile codes live in the
# 'statistic' column; values that are not keys of rename_dict stay as they are.
relabelled_statistic = Climatology_long['statistic'].replace(rename_dict)
Climatology_long['statistic'] = relabelled_statistic

In [7]:
# List the distinct labels left in the 'statistic' column after relabelling.
statistic_labels = Climatology_long['statistic'].unique()
statistic_labels

array(['max', 'mean', 'median', 'min', '10th_percentile',
       '25th_percentile', '75th_percentile', '90th_percentile', 'std_dev'],
      dtype=object)

In [8]:
# Write the long-format climatology table to the CSV that is read back below.
# The export only runs when the file does not exist yet, so a fresh
# Restart & Run All can produce it while ordinary re-runs do not rewrite it.
# Delete the file to force a new export after changing the steps above.
climatology_csv = DATA_DIR/"GEE/WC/Output/WC_Climatology_AllMonths.csv"
if not climatology_csv.exists():
    climatology_csv.parent.mkdir(parents=True, exist_ok=True)
    Climatology_long.to_csv(climatology_csv, index=True, index_label="id")

In [9]:
# Load the exported long-format climatology CSV back into a DataFrame.
climatology_csv = DATA_DIR/"GEE/WC/Output/WC_Climatology_AllMonths.csv"
Climatology = pd.read_csv(climatology_csv)
Climatology

Unnamed: 0,id,geo,month,obsTime,geo_source,band,statistic,obsValue
0,0,TLC,1,2000,ITL2021,prec,max,154.000000
1,1,TLD,1,2000,ITL2021,prec,max,188.000000
2,2,TLE,1,2000,ITL2021,prec,max,168.000000
3,3,TLF,1,2000,ITL2021,prec,max,143.000000
4,4,TLG,1,2000,ITL2021,prec,max,162.000000
...,...,...,...,...,...,...,...,...
7893067,7893067,SI037,12,2000,NUTS2024,tmin,std_dev,9.964836
7893068,7893068,SI038,12,2000,NUTS2024,tmin,std_dev,15.498433
7893069,7893069,SI043,12,2000,NUTS2024,tmin,std_dev,27.324950
7893070,7893070,SK041,12,2000,NUTS2024,tmin,std_dev,12.453078


In [10]:
# Pass the reloaded climatology table to `validate`, imported from the
# project-local `validator` module at the top of the notebook. Its checks are
# not visible in this notebook; whatever it returns is shown as the cell output.
validate(data=Climatology)

Unnamed: 0,id,geo,month,obsTime,geo_source,band,statistic,obsValue
0,0,TLC,1,2000,ITL2021,prec,max,154.000000
1,1,TLD,1,2000,ITL2021,prec,max,188.000000
2,2,TLE,1,2000,ITL2021,prec,max,168.000000
3,3,TLF,1,2000,ITL2021,prec,max,143.000000
4,4,TLG,1,2000,ITL2021,prec,max,162.000000
...,...,...,...,...,...,...,...,...
7893067,7893067,SI037,12,2000,NUTS2024,tmin,std_dev,9.964836
7893068,7893068,SI038,12,2000,NUTS2024,tmin,std_dev,15.498433
7893069,7893069,SI043,12,2000,NUTS2024,tmin,std_dev,27.324950
7893070,7893070,SK041,12,2000,NUTS2024,tmin,std_dev,12.453078


# WC_Bio

In [11]:
# Read the three per-geography WC_Bio exports (ITL, NUTS, EURO - in that order).
WC_Bio_ITL, WC_Bio_NUTS, WC_Bio_EURO = map(
    pd.read_csv,
    (
        DATA_DIR/'GEE/WC/WC_Bio_AllBands_ITL.csv',
        DATA_DIR/'GEE/WC/WC_Bio_AllBands_NUTS.csv',
        DATA_DIR/'GEE/WC/WC_Bio_AllBands_EURO.csv',
    ),
)

# Columns to discard from each export; the lists differ per geography.
nuts_unused = ['system:index', 'CNTR_CODE', 'LEVL_CODE', 'NUTS_NAME', 'NUTS_ID', '.geo']
itl_unused = ['system:index', 'BNG_E', 'BNG_N', 'GlobalID', 'ITL_CODE', 'ITL_LEVEL', 'ITL_NAME', 'LAT', 'LONG', '.geo']
euro_unused = ['system:index', 'CNTR_CODE', 'EURO_CODE', 'EURO_LEVEL', 'EURO_NAME', '.geo']

WC_Bio_NUTS = WC_Bio_NUTS.drop(nuts_unused, axis='columns')
WC_Bio_ITL = WC_Bio_ITL.drop(itl_unused, axis='columns')
WC_Bio_EURO = WC_Bio_EURO.drop(euro_unused, axis='columns')

WC_Bio_EURO

Unnamed: 0,bio01_max,bio01_mean,bio01_median,bio01_min,bio01_p10,bio01_p25,bio01_p75,bio01_p90,bio01_stdDev,bio02_max,...,bio19_median,bio19_min,bio19_p10,bio19_p25,bio19_p75,bio19_p90,bio19_stdDev,geo,geo_source,obsTime
0,167.0,114.048609,115.328540,-2.0,68.367653,91.000000,143.000000,155.098134,33.084896,108.0,...,369.348889,210.0,301.505896,333.573657,425.506011,521.401569,89.582099,AL,EURO2021,2000
1,167.0,114.048609,115.328540,-2.0,68.367653,91.000000,143.000000,155.098134,33.084896,108.0,...,369.348889,210.0,301.505896,333.573657,425.506011,521.401569,89.582099,AL0,EURO2021,2000
2,160.0,105.164668,108.000000,-2.0,56.000000,81.000000,132.000000,152.000000,34.563004,99.0,...,368.466599,267.0,314.926123,334.435665,446.562475,552.663158,91.573271,AL01,EURO2021,2000
3,162.0,121.037974,129.000000,27.0,73.563232,100.409976,147.000000,153.000000,30.443386,101.0,...,357.000000,275.0,320.601661,334.585069,377.000000,405.000000,32.779834,AL02,EURO2021,2000
4,167.0,118.990407,119.045618,21.0,78.000000,95.443646,147.376739,159.555301,31.050343,108.0,...,373.387812,210.0,265.650541,329.549627,437.490735,541.515669,99.715921,AL03,EURO2021,2000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4277,111.0,97.077030,98.000000,60.0,85.000000,90.000000,105.000000,109.000000,9.232372,109.0,...,155.000000,135.0,142.000000,147.000000,162.000000,169.000000,10.322335,XK006,EURO2025,2000
4278,123.0,107.013977,112.000000,8.0,93.000000,104.000000,117.000000,120.000000,18.360986,99.0,...,269.000000,215.0,225.000000,242.000000,297.000000,313.000000,31.444018,XK007,EURO2025,2000
4279,125.0,91.641323,97.000000,-2.0,60.000000,84.000000,106.000000,113.000000,21.783624,109.0,...,211.000000,135.0,162.000000,178.000000,255.000000,292.000000,47.405136,XK00,EURO2025,2000
4280,125.0,91.641323,97.000000,-2.0,60.000000,84.000000,106.000000,113.000000,21.783624,109.0,...,211.000000,135.0,162.000000,178.000000,255.000000,292.000000,47.405136,XK0,EURO2025,2000


In [12]:
# Compare the three WC_Bio tables: report their sizes and check that they
# all expose the same set of column names.

dfs = dict(
    WC_Bio_NUTS=WC_Bio_NUTS,
    WC_Bio_ITL=WC_Bio_ITL,
    WC_Bio_EURO=WC_Bio_EURO,
)

# 1. Rows per table, and the overall total
row_counts = {label: len(frame) for label, frame in dfs.items()}
total_rows = sum(row_counts.values())

print("Row counts per dataframe:", row_counts)
print("Total rows across all dataframes:", total_rows)

# 2. Column sets, compared against the first table's columns
all_columns = [set(frame.columns) for frame in dfs.values()]
reference_columns = all_columns[0]
same_columns = all(columns == reference_columns for columns in all_columns)

if not same_columns:
    print("\n⚠️ Dataframes do NOT have the same columns.")
    for name, df in dfs.items():
        current_columns = set(df.columns)
        missing_cols = reference_columns - current_columns
        extra_cols = current_columns - reference_columns
        # Tables that match the reference produce no report.
        if not (missing_cols or extra_cols):
            continue
        print(f"\n{name}:")
        if missing_cols:
            print(f"  Missing columns: {sorted(missing_cols)}")
        if extra_cols:
            print(f"  Extra columns:   {sorted(extra_cols)}")
else:
    print("\n✅ All dataframes have the same columns.")


Row counts per dataframe: {'WC_Bio_NUTS': 13517, 'WC_Bio_ITL': 472, 'WC_Bio_EURO': 4282}
Total rows across all dataframes: 18271

✅ All dataframes have the same columns.


In [13]:
# Stack the NUTS, ITL and EURO WC_Bio tables row-wise (in that order)
# and renumber the rows with a fresh 0..n-1 index.
bio_parts = [WC_Bio_NUTS, WC_Bio_ITL, WC_Bio_EURO]
WC_Bio = pd.concat(bio_parts, axis="index", ignore_index=True)
WC_Bio

Unnamed: 0,bio01_max,bio01_mean,bio01_median,bio01_min,bio01_p10,bio01_p25,bio01_p75,bio01_p90,bio01_stdDev,bio02_max,...,bio19_median,bio19_min,bio19_p10,bio19_p25,bio19_p75,bio19_p90,bio19_stdDev,geo,geo_source,obsTime
0,102.0,57.532553,65.075738,-77.0,10.317962,38.256663,83.273108,91.456379,31.962754,110.0,...,178.616439,93.0,112.888820,130.475550,232.648775,276.486968,64.218941,AT,NUTS2003,2000
1,93.0,45.683650,53.012718,-77.0,-5.000000,21.000000,76.000000,85.000000,34.693861,110.0,...,216.737629,108.0,154.552540,178.530539,260.501997,296.625483,56.367467,AT3,NUTS2003,2000
2,102.0,82.065464,86.000000,-2.0,61.000000,73.000000,94.000000,98.000000,15.254315,103.0,...,117.000000,93.0,105.589316,110.104368,133.000000,156.000000,22.524612,AT1,NUTS2003,2000
3,98.0,50.587501,52.509672,-77.0,14.000000,32.368959,72.706237,84.421014,26.837527,107.0,...,194.468506,106.0,122.646568,150.951004,234.753962,276.487202,57.597071,AT2,NUTS2003,2000
4,92.0,45.438731,46.000000,-47.0,5.000000,23.000000,70.000000,88.000000,30.153037,96.0,...,214.555988,149.0,188.488020,202.610205,230.610998,274.770234,38.642606,AT34,NUTS2003,2000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18266,111.0,97.077030,98.000000,60.0,85.000000,90.000000,105.000000,109.000000,9.232372,109.0,...,155.000000,135.0,142.000000,147.000000,162.000000,169.000000,10.322335,XK006,EURO2025,2000
18267,123.0,107.013977,112.000000,8.0,93.000000,104.000000,117.000000,120.000000,18.360986,99.0,...,269.000000,215.0,225.000000,242.000000,297.000000,313.000000,31.444018,XK007,EURO2025,2000
18268,125.0,91.641323,97.000000,-2.0,60.000000,84.000000,106.000000,113.000000,21.783624,109.0,...,211.000000,135.0,162.000000,178.000000,255.000000,292.000000,47.405136,XK00,EURO2025,2000
18269,125.0,91.641323,97.000000,-2.0,60.000000,84.000000,106.000000,113.000000,21.783624,109.0,...,211.000000,135.0,162.000000,178.000000,255.000000,292.000000,47.405136,XK0,EURO2025,2000


In [14]:
# Identifier columns that stay fixed; every other column is unpivoted.
id_vars = ['geo', 'obsTime', 'geo_source']

# Wide -> long: one row per (geo, obsTime, geo_source, original column name).
WC_Bio_long = WC_Bio.melt(
    id_vars=id_vars,
    var_name="variable",
    value_name="obsValue"
)

# Take the combined '<band>_<statistic>' column out of the frame and split it
# on the last underscore only, giving the band and the statistic name.
name_parts = WC_Bio_long.pop('variable').str.rsplit('_', n=1, expand=True)

# Attach band / statistic (with 'stdDev' spelled 'std_dev') and put the
# columns into their final order.
WC_Bio_long = WC_Bio_long.assign(
    band=name_parts[0],
    statistic=name_parts[1].str.replace('stdDev', 'std_dev', regex=False),
)[['geo', 'geo_source', 'band', 'obsTime', 'statistic', 'obsValue']]

# The split pieces are no longer needed once they are attached.
del name_parts

# Show the reshaped table
WC_Bio_long

Unnamed: 0,geo,geo_source,band,obsTime,statistic,obsValue
0,AT,NUTS2003,bio01,2000,max,102.000000
1,AT3,NUTS2003,bio01,2000,max,93.000000
2,AT1,NUTS2003,bio01,2000,max,102.000000
3,AT2,NUTS2003,bio01,2000,max,98.000000
4,AT34,NUTS2003,bio01,2000,max,92.000000
...,...,...,...,...,...,...
3124336,XK006,EURO2025,bio19,2000,std_dev,10.322335
3124337,XK007,EURO2025,bio19,2000,std_dev,31.444018
3124338,XK00,EURO2025,bio19,2000,std_dev,47.405136
3124339,XK0,EURO2025,bio19,2000,std_dev,47.405136


In [16]:
# The table is in long format and the percentile codes live in the
# 'statistic' column; relabel them with the rename_dict mapping defined in
# the climatology section. Values that are not keys of rename_dict stay as they are.
relabelled_statistic = WC_Bio_long['statistic'].replace(rename_dict)
WC_Bio_long['statistic'] = relabelled_statistic

# List the distinct labels left in the column.
WC_Bio_long['statistic'].unique()

array(['max', 'mean', 'median', 'min', '10th_percentile',
       '25th_percentile', '75th_percentile', '90th_percentile', 'std_dev'],
      dtype=object)

In [18]:
# Write the long-format WC_Bio table to the CSV that is read back below.
# The export only runs when the file does not exist yet, so a fresh
# Restart & Run All can produce it while ordinary re-runs do not rewrite it.
# Delete the file to force a new export after changing the steps above.
bio_csv = DATA_DIR/"GEE/WC/Output/WC_Bio_50years.csv"
if not bio_csv.exists():
    bio_csv.parent.mkdir(parents=True, exist_ok=True)
    WC_Bio_long.to_csv(bio_csv, index=True, index_label="id")

In [19]:
# Load the exported long-format WC_Bio CSV back into a DataFrame.
bio_csv = DATA_DIR/"GEE/WC/Output/WC_Bio_50years.csv"
Bio = pd.read_csv(bio_csv)
Bio

Unnamed: 0,id,geo,geo_source,band,obsTime,statistic,obsValue
0,0,AT,NUTS2003,bio01,2000,max,102.000000
1,1,AT3,NUTS2003,bio01,2000,max,93.000000
2,2,AT1,NUTS2003,bio01,2000,max,102.000000
3,3,AT2,NUTS2003,bio01,2000,max,98.000000
4,4,AT34,NUTS2003,bio01,2000,max,92.000000
...,...,...,...,...,...,...,...
3124336,3124336,XK006,EURO2025,bio19,2000,std_dev,10.322335
3124337,3124337,XK007,EURO2025,bio19,2000,std_dev,31.444018
3124338,3124338,XK00,EURO2025,bio19,2000,std_dev,47.405136
3124339,3124339,XK0,EURO2025,bio19,2000,std_dev,47.405136


In [21]:
# Pass the reloaded WC_Bio table to `validate`, imported from the
# project-local `validator` module at the top of the notebook. Its checks are
# not visible in this notebook; whatever it returns is shown as the cell output.
validate(data=Bio)

Unnamed: 0,id,geo,geo_source,band,obsTime,statistic,obsValue
0,0,AT,NUTS2003,bio01,2000,max,102.000000
1,1,AT3,NUTS2003,bio01,2000,max,93.000000
2,2,AT1,NUTS2003,bio01,2000,max,102.000000
3,3,AT2,NUTS2003,bio01,2000,max,98.000000
4,4,AT34,NUTS2003,bio01,2000,max,92.000000
...,...,...,...,...,...,...,...
3124336,3124336,XK006,EURO2025,bio19,2000,std_dev,10.322335
3124337,3124337,XK007,EURO2025,bio19,2000,std_dev,31.444018
3124338,3124338,XK00,EURO2025,bio19,2000,std_dev,47.405136
3124339,3124339,XK0,EURO2025,bio19,2000,std_dev,47.405136
