In [4]:
import os
import gc
import rasterio
import geopandas as gpd
import numpy as np
import pandas as pd
from tqdm import tqdm
from rasterstats import zonal_stats

In [35]:
def reclass_reproject_raster(in_path, out_path, dst_crs, settlement_class=23):
    """Write a reprojected 0/1 mask of one class of a raster.

    Band 1 of ``in_path`` is reclassified so that cells equal to
    ``settlement_class`` become 1 and all other cells become 0. The mask is
    then reprojected to ``dst_crs`` with nearest-neighbour resampling and
    written to ``out_path``.

    Parameters
    ----------
    in_path : str
        Path of the raster to read (only band 1 is used).
    out_path : str
        Path of the raster to write. It is opened in ``'w'`` mode, so an
        existing file is overwritten.
    dst_crs : str
        Target coordinate reference system, e.g. ``'EPSG:3035'``.
    settlement_class : int, default 23
        Cell value to keep. Callers that process several classes must pass
        it explicitly, otherwise the default is used for every call.

    Returns
    -------
    None
        The result is written to ``out_path``.
    """
    # Import the submodule explicitly: ``import rasterio`` on its own does not
    # guarantee that ``rasterio.warp`` has been loaded as an attribute.
    from rasterio.warp import Resampling, calculate_default_transform, reproject

    with rasterio.open(in_path) as src:
        transform, width, height = calculate_default_transform(
            src.crs, dst_crs, src.width, src.height, *src.bounds)

        # Output metadata: same as the source except for the target grid.
        kwargs = src.meta.copy()
        kwargs.update({
            'crs': dst_crs,
            'transform': transform,
            'width': width,
            'height': height
        })

        # Keep only the cells labelled settlement_class (1), everything else
        # becomes 0. The mask keeps the dtype of the source band.
        band = src.read(1)
        new_src = (band == settlement_class).astype(band.dtype)

        with rasterio.open(out_path, 'w', **kwargs) as dst:
            reproject(
                source=new_src,
                destination=rasterio.band(dst, 1),
                src_transform=src.transform,
                src_crs=src.crs,
                dst_transform=transform,
                dst_crs=dst_crs,
                # Nearest neighbour keeps the mask strictly 0/1.
                resampling=Resampling.nearest)


In [37]:
# Processing GHS-SMOD for NUTs
# For every settlement class, sum the reclassified 0/1 rasters over the regions
# of every NUTS shapefile and write one CSV per (NUTS shapefile, class).
settlement_dict = {30: 'URBAN CENTRE GRID CELL',
                   23: 'DENSE URBAN CLUSTER GRID CELL',
                   22: 'SEMI-DENSE URBAN CLUSTER GRID CELL',
                   21: 'SUBURBAN OR PERI-URBAN GRID CELL',
                   13: 'RURAL CLUSTER GRID CELL',
                   12: 'LOW DENSITY RURAL GRID CELL',
                   11: 'VERY LOW DENSITY RURAL GRID CELL',
                   10: 'WATER GRID CELL'}

# Loop-invariant settings.
read_root = r'C:\1-Data\GHS_SMOD\raw data'
dst_crs = 'EPSG:3035'
nuts_folder = r'C:\1-Data\NUTS'
csv_folder = r'C:\2-Case studies\GHS_SMOD'
# Create the output folder up front so a long run cannot fail at the final write.
os.makedirs(csv_folder, exist_ok=True)

for settlement_class, indicator in tqdm(settlement_dict.items()):
    out_folder = os.path.join(r'C:\1-Data\GHS_SMOD\reclassify_nuts', str(settlement_class))
    os.makedirs(out_folder, exist_ok=True)

    # Disabled step: the reclassified rasters are expected to exist in
    # out_folder already. To regenerate them, run the lines below; note that
    # settlement_class must be passed, otherwise every class folder receives
    # the mask of the function's default class.
    #
    # for folder in tqdm([i for i in os.listdir(read_root) if i not in os.listdir(out_folder)]):
    #     for pop_file in os.listdir(read_root + '\\' + folder):
    #         if pop_file.endswith('.tif'):
    #             in_path = read_root + '\\' + folder + '\\' + pop_file
    #             out_path = out_folder + '\\' + pop_file
    #             reclass_reproject_raster(in_path, out_path, dst_crs,
    #                                      settlement_class=settlement_class)

    # The raster list does not depend on the NUTS file, so list it once.
    # Sorting makes the row order of the CSV deterministic.
    tif_files = sorted(f for f in os.listdir(out_folder) if f.endswith('.tif'))

    # Zonal statistics looping all years of NUTS regions
    for nuts_file in tqdm(os.listdir(nuts_folder)):
        if not nuts_file.endswith('.shp'):
            continue
        nuts_path = os.path.join(nuts_folder, nuts_file)
        nuts = gpd.read_file(nuts_path)
        # The year is taken from the second-to-last '_'-separated token of the file name.
        nuts_year = nuts_file.split('_')[-2]
        geo_source = 'NUTS' + nuts_year

        frames = []
        for tif in tif_files:
            zs_temp = pd.DataFrame(zonal_stats(nuts_path, os.path.join(out_folder, tif), stats="sum"))
            # zonal_stats returns one record per feature in file order, so the
            # positional index lines up with the rows of `nuts`.
            df_temp = pd.merge(nuts[['NUTS_ID']], zs_temp, left_index=True, right_index=True)
            df_temp.columns = ['geo', 'ObsValue']
            df_temp['Indicator'] = indicator
            df_temp['freq'] = '5 year'
            # Third '_'-separated token of the raster name without its first character.
            df_temp['ObsTime'] = tif.split('_')[2][1:]
            # NOTE(review): ObsValue is the number of matching cells per region;
            # it equals km2 only if each reprojected cell covers 1 km2 - confirm.
            df_temp['unit'] = 'km2'
            frames.append(df_temp)

        if not frames:
            # Nothing to aggregate for this class; skip instead of failing on an empty table.
            tqdm.write('No .tif rasters found in ' + out_folder + '; skipping ' + nuts_file)
            continue

        # Concatenate once instead of growing the frame inside the loop.
        df_comb = pd.concat(frames)
        df_comb['geo_source'] = geo_source
        df_comb.set_index('geo').to_csv(
            os.path.join(csv_folder, 'GHS_SMOD_' + geo_source + '_' + indicator + '.csv'))


  0%|                                                                                            | 0/8 [00:00<?, ?it/s]
  0%|                                                                                           | 0/31 [00:00<?, ?it/s][A
 16%|█████████████▍                                                                     | 5/31 [08:10<42:31, 98.12s/it][A
 32%|██████████████████████████▏                                                      | 10/31 [19:47<42:49, 122.36s/it][A
 48%|███████████████████████████████████████▏                                         | 15/31 [28:06<29:52, 112.06s/it][A
 65%|████████████████████████████████████████████████████▎                            | 20/31 [37:31<20:37, 112.49s/it][A
 81%|█████████████████████████████████████████████████████████████████▎               | 25/31 [47:11<11:22, 113.74s/it][A
100%|█████████████████████████████████████████████████████████████████████████████████| 31/31 [57:42<00:00, 111.71s/it][A
 12%|██████████    

In [11]:
# Path of the GADM GeoPackage; the GADM processing cell also passes this path
# directly to zonal_stats.
gadm_path = r'C:\1-Data\GADM\gadm_410.gpkg'
# Load the regions once as a GeoDataFrame; the GADM processing cell reads its
# NAME_0 and NAME_2 columns to label the zonal statistics.
gadm = gpd.read_file(gadm_path)

In [13]:
# Processing GHS-SMOD for GADM
# For every settlement class: (1) reclassify/reproject the raw rasters into a
# per-class folder, (2) sum the 0/1 rasters over the GADM regions, (3) write
# one CSV per class. Requires `gadm_path` and `gadm` from the GADM loading cell
# and `reclass_reproject_raster` from the function cell.
settlement_dict = {30: 'URBAN CENTRE GRID CELL',
                   23: 'DENSE URBAN CLUSTER GRID CELL',
                   22: 'SEMI-DENSE URBAN CLUSTER GRID CELL',
                   21: 'SUBURBAN OR PERI-URBAN GRID CELL',
                   13: 'RURAL CLUSTER GRID CELL',
                   12: 'LOW DENSITY RURAL GRID CELL',
                   11: 'VERY LOW DENSITY RURAL GRID CELL',
                   10: 'WATER GRID CELL'}

# Loop-invariant settings.
read_root = r'C:\1-Data\GHS_SMOD\raw data'
dst_crs = 'EPSG:4326'
geo_source = 'GADM'
csv_folder = r'C:\Users\DemSc\Documents\GitHub\MapIneq\src\data-wrangling\Xiang\1-case studies\GHS_SMOD'
# Create the output folder up front so a long run cannot fail at the final write.
os.makedirs(csv_folder, exist_ok=True)

for settlement_class, indicator in tqdm(settlement_dict.items()):
    out_folder = os.path.join(r'C:\1-Data\GHS_SMOD\reclassify_gadm', str(settlement_class))
    os.makedirs(out_folder, exist_ok=True)

    # Reproject and reclass only the folders not processed before, to avoid
    # repeating work when the cell is interrupted and rerun. A raw-data folder
    # counts as processed when '<folder>.tif' already exists in out_folder.
    # NOTE: rasters written by earlier runs that did not pass settlement_class
    # are masks of the function's default class; delete them so they are
    # regenerated with the correct class.
    for folder in tqdm(os.listdir(read_root)):
        src_folder = os.path.join(read_root, folder)
        if not os.path.isdir(src_folder):
            # Loose files in read_root cannot be listed as folders.
            continue
        if os.path.exists(os.path.join(out_folder, folder + '.tif')):
            continue
        for pop_file in os.listdir(src_folder):
            if pop_file.endswith('.tif'):
                in_path = os.path.join(src_folder, pop_file)
                out_path = os.path.join(out_folder, pop_file)
                # Pass the class explicitly; relying on the default would
                # write the same mask into every class folder.
                reclass_reproject_raster(in_path, out_path, dst_crs,
                                         settlement_class=settlement_class)

    # Zonal statistics looping for GADM regions
    # Sorting makes the row order of the CSV deterministic.
    tif_files = sorted(f for f in os.listdir(out_folder) if f.endswith('.tif'))
    frames = []
    for tif in tqdm(tif_files):
        zs_temp = pd.DataFrame(zonal_stats(gadm_path, os.path.join(out_folder, tif), stats="sum"))
        # zonal_stats returns one record per feature in file order, so the
        # positional index lines up with the rows of `gadm`.
        df_temp = pd.merge(gadm[['NAME_0', 'NAME_2']], zs_temp, left_index=True, right_index=True)
        df_temp.columns = ['geo_country', 'geo_city', 'ObsValue']
        df_temp['Indicator'] = indicator
        df_temp['freq'] = '5 year'
        # Third '_'-separated token of the raster name without its first character.
        df_temp['ObsTime'] = tif.split('_')[2][1:]
        # NOTE(review): ObsValue is the number of matching cells per region. In a
        # geographic CRS such as EPSG:4326 a cell does not cover a fixed area,
        # so labelling the count as km2 needs confirming.
        df_temp['unit'] = 'km2'
        frames.append(df_temp)

    if not frames:
        # Nothing to aggregate for this class; skip instead of failing on an empty table.
        tqdm.write('No .tif rasters found in ' + out_folder + '; skipping class ' + str(settlement_class))
        continue

    # save the table (concatenate once instead of growing the frame inside the loop)
    df_comb = pd.concat(frames)
    df_comb['geo_source'] = geo_source
    df_comb.to_csv(os.path.join(csv_folder, 'GHS_SMOD_' + geo_source + '_' + indicator + '.csv'))

  0%|                                                                                            | 0/8 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 6959.58it/s][A

  0%|                                                                                           | 0/12 [00:00<?, ?it/s][A
  8%|██████▌                                                                        | 1/12 [47:45<8:45:16, 2865.11s/it][A
 17%|████████████▊                                                                | 2/12 [1:39:45<8:22:34, 3015.42s/it][A
 25%|███████████████████▎                                                         | 3/12 [2:52:36<8:37:48, 3452.02s/it][A
  0%|                                                                                          | 0/8 [2:52:36<?, ?it/s]


KeyboardInterrupt: 