In [1]:
import os
import gc
import rasterio
import geopandas as gpd
import numpy as np
import pandas as pd
from tqdm import tqdm
from rasterstats import zonal_stats

In [5]:
def reclass_reproject_raster(in_path, out_path, dst_crs, settlement_class):
    """Write a reprojected 0/1 mask of a single settlement class.

    Band 1 of ``in_path`` is reclassified so that cells equal to
    ``settlement_class`` become 1 and every other cell becomes 0. The mask is
    then reprojected to ``dst_crs`` with nearest-neighbour resampling and
    written to band 1 of ``out_path``.

    Parameters
    ----------
    in_path : str
        Path of the raster to read.
    out_path : str
        Path of the raster to create (overwritten if it already exists).
    dst_crs : str
        Target coordinate reference system, e.g. ``'EPSG:3035'``.
    settlement_class : int
        Cell value to keep; it is encoded as 1 in the output.

    Returns
    -------
    None
        The result is written to ``out_path``.
    """
    # Import the submodule explicitly so the function does not depend on some
    # other package having already imported `rasterio.warp` as a side effect.
    from rasterio.warp import Resampling, calculate_default_transform, reproject

    with rasterio.open(in_path) as src:
        # Grid (affine transform and size) of the source extent in dst_crs.
        transform, width, height = calculate_default_transform(
            src.crs, dst_crs, src.width, src.height, *src.bounds)

        # The output reuses the source metadata, only with the new grid.
        kwargs = src.meta.copy()
        kwargs.update({
            'crs': dst_crs,
            'transform': transform,
            'width': width,
            'height': height
        })

        # Keep only the cells labelled settlement_class: 1 where they match,
        # 0 everywhere else, in the same dtype as the source band.
        band = src.read(1)
        class_mask = (band == settlement_class).astype(band.dtype)

        with rasterio.open(out_path, 'w', **kwargs) as dst:
            reproject(
                source=class_mask,
                destination=rasterio.band(dst, 1),
                src_transform=src.transform,
                src_crs=src.crs,
                dst_transform=transform,
                dst_crs=dst_crs,
                # Nearest neighbour keeps the mask strictly 0/1.
                resampling=Resampling.nearest)


In [13]:
# Processing GHS-SMOD for NUTS
# For every settlement class:
#   1. reclassify + reproject each raw GHS-SMOD raster into a 0/1 mask, then
#   2. sum that mask inside every region of every NUTS shapefile and write one
#      CSV per (NUTS shapefile, settlement class).
settlement_dict = {30: 'URBAN CENTRE GRID CELL',
                   23: 'DENSE URBAN CLUSTER GRID CELL',
                   22: 'SEMI-DENSE URBAN CLUSTER GRID CELL',
                   21: 'SUBURBAN OR PERI-URBAN GRID CELL',
                   13: 'RURAL CLUSTER GRID CELL',
                   12: 'LOW DENSITY RURAL GRID CELL',
                   11: 'VERY LOW DENSITY RURAL GRID CELL',
                   10: 'WATER GRID CELL'}

# Configuration that does not change between settlement classes.
read_root = r'C:\1-Data\GHS_SMOD\raw data'
reclass_root = r'C:\1-Data\GHS_SMOD\reclassify_nuts'
nuts_folder = r'C:\1-Data\NUTS'
csv_folder = r'C:\2-Case studies\GHS_SMOD'
dst_crs = 'EPSG:3035'
os.makedirs(csv_folder, exist_ok=True)

# Read every NUTS shapefile once instead of once per settlement class.
# The year is taken from the second-to-last '_'-separated token of the file name.
nuts_layers = []
for nuts_file in sorted(os.listdir(nuts_folder)):
    if nuts_file.endswith('.shp'):
        nuts_path = os.path.join(nuts_folder, nuts_file)
        nuts_year = nuts_file.split('_')[-2]
        nuts_layers.append((nuts_path, nuts_year, gpd.read_file(nuts_path)[['NUTS_ID']]))

for settlement_class, indicator in tqdm(settlement_dict.items()):
    out_folder = os.path.join(reclass_root, str(settlement_class))
    os.makedirs(out_folder, exist_ok=True)

    # Reproject and reclass only the files not processed before, so that a rerun
    # after an interruption does not repeat finished work. The check is made per
    # output file because the outputs are named after the .tif files, not after
    # the raw-data folders.
    for folder in tqdm(sorted(os.listdir(read_root))):
        folder_path = os.path.join(read_root, folder)
        if not os.path.isdir(folder_path):
            continue
        for pop_file in os.listdir(folder_path):
            if not pop_file.endswith('.tif'):
                continue
            in_path = os.path.join(folder_path, pop_file)
            out_path = os.path.join(out_folder, pop_file)
            if os.path.exists(out_path):
                continue
            # Write under a temporary name and rename on success, so an
            # interrupted run never leaves a truncated .tif that would be
            # mistaken for a finished one.
            tmp_path = out_path + '.part'
            reclass_reproject_raster(in_path, tmp_path, dst_crs, settlement_class)
            os.replace(tmp_path, out_path)

    tif_files = sorted(f for f in os.listdir(out_folder) if f.endswith('.tif'))
    if not tif_files:
        print('No reclassified rasters in ' + out_folder + '; skipping ' + indicator)
        continue

    # Zonal statistics looping all years of NUTS regions
    for nuts_path, nuts_year, nuts in tqdm(nuts_layers):
        frames = []
        for tif in tif_files:
            zs_temp = pd.DataFrame(zonal_stats(nuts_path, os.path.join(out_folder, tif), stats="sum"))
            # zonal_stats returns one record per feature in file order, so the
            # positional index lines up with the rows of the shapefile.
            df_temp = pd.merge(nuts, zs_temp, left_index=True, right_index=True)
            df_temp.columns = ['geo', 'ObsValue']
            df_temp['Indicator'] = indicator
            df_temp['freq'] = '5 year'
            # Third '_'-separated token of the raster name without its first
            # character (presumably an epoch such as 'E2020' -> '2020'; confirm).
            df_temp['ObsTime'] = tif.split('_')[2][1:]
            # NOTE(review): ObsValue is a count of mask cells; 'km2' holds only
            # if each reprojected cell covers 1 km2 - confirm the output resolution.
            df_temp['unit'] = 'km2'
            frames.append(df_temp)

        # Concatenate once rather than growing a frame inside the loop.
        df_comb = pd.concat(frames)
        geo_source = 'NUTS' + nuts_year
        df_comb['geo_source'] = geo_source
        df_comb.set_index('geo').to_csv(
            os.path.join(csv_folder, 'GHS_SMOD_' + geo_source + '_' + indicator + '.csv'))


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]
  0%|                                                                                           | 0/12 [00:00<?, ?it/s][A
  8%|██████▋                                                                         | 1/12 [05:56<1:05:24, 356.81s/it][A
 17%|█████████████▋                                                                    | 2/12 [10:45<52:47, 316.75s/it][A
 25%|████████████████████▌                                                             | 3/12 [16:09<48:01, 320.13s/it][A
 33%|███████████████████████████▎                                                      | 4/12 [21:32<42:48, 321.00s/it][A
 42%|██████████████████████████████████▏                                               | 5/12 [26:50<37:19, 319.97s/it][A
 50%|█████████████████████████████████████████                                         | 6/12 [32:00<31:39, 316.63s/it][A
 58%|██████████████

In [7]:
# Processing GHS-SMOD for GADM
# For every settlement class:
#   1. reclassify + reproject each raw GHS-SMOD raster into a 0/1 mask, then
#   2. sum that mask inside every GADM feature and write one CSV per class.
settlement_dict = {30: 'URBAN CENTRE GRID CELL',
                   23: 'DENSE URBAN CLUSTER GRID CELL',
                   22: 'SEMI-DENSE URBAN CLUSTER GRID CELL',
                   21: 'SUBURBAN OR PERI-URBAN GRID CELL',
                   13: 'RURAL CLUSTER GRID CELL',
                   12: 'LOW DENSITY RURAL GRID CELL',
                   11: 'VERY LOW DENSITY RURAL GRID CELL',
                   10: 'WATER GRID CELL'}

# Configuration that does not change between settlement classes.
read_root = r'C:\1-Data\GHS_SMOD\raw data'
reclass_root = r'C:\1-Data\GHS_SMOD\reclassify_gadm'
gadm_path = r'C:\1-Data\GADM\gadm_410.gpkg'
csv_folder = r'C:\2-Case studies\GHS_SMOD'
dst_crs = 'EPSG:4326'
geo_source = 'GADM'
os.makedirs(csv_folder, exist_ok=True)

# Read the GADM layer once (it is identical for every settlement class) and
# keep only the two name columns that are attached to the statistics.
gadm = gpd.read_file(gadm_path)[['NAME_0', 'NAME_2']]

for settlement_class, indicator in tqdm(settlement_dict.items()):
    out_folder = os.path.join(reclass_root, str(settlement_class))
    os.makedirs(out_folder, exist_ok=True)

    # Reproject and reclass only the files not processed before, so that a rerun
    # after an interruption does not repeat finished work.
    for folder in tqdm(sorted(os.listdir(read_root))):
        folder_path = os.path.join(read_root, folder)
        if not os.path.isdir(folder_path):
            continue
        for pop_file in os.listdir(folder_path):
            if not pop_file.endswith('.tif'):
                continue
            in_path = os.path.join(folder_path, pop_file)
            out_path = os.path.join(out_folder, pop_file)
            if os.path.exists(out_path):
                continue
            # Write under a temporary name and rename on success, so an
            # interrupted run never leaves a truncated .tif that would be
            # mistaken for a finished one.
            tmp_path = out_path + '.part'
            reclass_reproject_raster(in_path, tmp_path, dst_crs, settlement_class)
            os.replace(tmp_path, out_path)

    # Zonal statistics looping for GADM regions
    frames = []
    for tif in tqdm(sorted(os.listdir(out_folder))):
        if not tif.endswith('.tif'):
            continue
        zs_temp = pd.DataFrame(zonal_stats(gadm_path, os.path.join(out_folder, tif), stats="sum"))
        # zonal_stats returns one record per feature in file order, so the
        # positional index lines up with the rows read from the GeoPackage.
        df_temp = pd.merge(gadm, zs_temp, left_index=True, right_index=True)
        df_temp.columns = ['geo_country', 'geo_city', 'ObsValue']
        df_temp['Indicator'] = indicator
        df_temp['freq'] = '5 year'
        # Third '_'-separated token of the raster name without its first
        # character (presumably an epoch such as 'E2020' -> '2020'; confirm).
        df_temp['ObsTime'] = tif.split('_')[2][1:]
        # NOTE(review): ObsValue is a count of mask cells of an EPSG:4326 raster;
        # the area of a cell in degrees varies with latitude, so confirm that
        # 'km2' is the right unit for this count.
        df_temp['unit'] = 'km2'
        frames.append(df_temp)

    if not frames:
        print('No reclassified rasters in ' + out_folder + '; skipping ' + indicator)
        continue

    # save the table (concatenate once rather than growing a frame in the loop)
    df_comb = pd.concat(frames)
    df_comb['geo_source'] = geo_source
    df_comb.to_csv(os.path.join(csv_folder, 'GHS_SMOD_' + geo_source + '_' + indicator + '.csv'))

  0%|                                                                                            | 0/2 [00:00<?, ?it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 11980.87it/s][A

  0%|                                                                                           | 0/12 [00:00<?, ?it/s][A
  8%|██████▌                                                                        | 1/12 [36:22<6:40:10, 2182.82s/it][A
 17%|████████████▋                                                               | 2/12 [2:08:47<11:33:21, 4160.11s/it][A
 25%|███████████████████                                                         | 3/12 [4:41:46<16:07:46, 6451.82s/it][A
 33%|█████████████████████████▎                                                  | 4/12 [7:29:53<17:31:37, 7887.13s/it][A
 42%|███████████████████████████████▎                                           | 5/12 [10:31:50<17:27:37, 8979.58s/it][A
 50%|█████████████

In [11]:
# Merge dataframes by geo-source
# For each NUTS year, stack every per-indicator CSV of that year into a single
# file written next to (not inside) read_folder.
# NOTE(review): read_folder is not a location any visible cell writes to;
# presumably the per-indicator CSVs are copied here by hand - confirm.
read_folder = r'E:\1-Case studies\GHS_SMOD\NUTS'
merged_folder = r'E:\1-Case studies\GHS_SMOD'

for year in tqdm([2003, 2006, 2010, 2013, 2016, 2021]):
    frames = []
    for file in sorted(os.listdir(read_folder)):
        name_parts = file.split('_')
        # Ignore anything that is not a CSV or is too short to carry a third
        # '_'-separated token, instead of failing with an IndexError.
        if not file.endswith('.csv') or len(name_parts) < 3:
            continue
        # The last four characters of the third token are compared to the year.
        if name_parts[2][-4:] == str(year):
            frames.append(pd.read_csv(os.path.join(read_folder, file)))

    if not frames:
        # Nothing to merge for this year: report it rather than crash on None.
        print('No input files found for NUTS' + str(year) + ' in ' + read_folder)
        continue

    # Concatenate once rather than growing a frame inside the loop.
    df_comb = pd.concat(frames)
    df_comb.to_csv(os.path.join(merged_folder, 'GHS_SMOD_NUTS' + str(year) + '.csv'))

100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:16<00:00,  2.70s/it]
