In [1]:
import os
import gc
import tarfile
import netCDF4 as nc
import numpy as np
import pandas as pd
import geopandas as gpd
from osgeo import gdal, osr
from tqdm import tqdm
import rasterio
from rasterstats import zonal_stats

# 1. Uncompress and convert nc to tif

In [10]:
# Extract every archive whose name ends in 'gz' from the download folder
read_folder = r'C:\1-Data\GRACED'
save_folder = r'C:\1-Data\GRACED\nc'

for entry in tqdm(os.listdir(read_folder)):
    # anything that is not a *gz archive (including sub-folders) is left alone
    if not entry.endswith('gz'):
        continue
    archive_path = read_folder + '\\' + entry
    with tarfile.open(archive_path) as archive:
        # all members of the archive are unpacked into save_folder
        archive.extractall(save_folder)

100%|██████████████████████████████████████████████████████████████████████████████████| 75/75 [06:33<00:00,  5.25s/it]


In [47]:
def _cell_centre_geotransform(lat, lon):
    """Build the GDAL geotransform of a north-up grid from its 1-D coordinate axes.

    The resolution is derived as ``(max - min) / (count - 1)``, i.e. the
    coordinates are treated as cell centres, so the raster origin (outer corner
    of the top-left pixel) lies half a cell beyond the first centre.

    Raises:
        ValueError: if either axis has fewer than two coordinates, because the
            resolution cannot be derived from a single value.
    """
    num_lat, num_lon = len(lat), len(lon)
    if num_lat < 2 or num_lon < 2:
        raise ValueError(
            'latitude and longitude need at least two values each to derive '
            'the grid resolution (got %d and %d)' % (num_lat, num_lon))

    lon_min, lon_max = float(lon.min()), float(lon.max())
    lat_min, lat_max = float(lat.min()), float(lat.max())
    lat_res = (lat_max - lat_min) / (num_lat - 1)
    lon_res = (lon_max - lon_min) / (num_lon - 1)

    # shift by half a cell: GDAL wants the pixel corner, not the pixel centre
    return (lon_min - lon_res / 2.0, lon_res, 0.0,
            lat_max + lat_res / 2.0, 0.0, -lat_res)


def nc_to_geotiff(nc_file, save_path, day, dstSRS='EPSG:3035'):
    """Write one slice of the 'emission' variable to a reprojected GeoTIFF.

    The slice is first written to an in-memory EPSG:4326 GeoTIFF and then
    reprojected to ``dstSRS`` with ``gdal.Warp``; the in-memory file is removed
    again before returning.

    Args:
        nc_file: open dataset exposing ``variables['latitude']``,
            ``variables['longitude']`` and ``variables['emission']``.
        save_path: path of the GeoTIFF to create.
        day: index into the first axis of the 'emission' variable.
        dstSRS: target spatial reference passed to ``gdal.Warp``.

    Raises:
        ValueError: if the grid has fewer than two rows or columns.
        RuntimeError: if GDAL cannot create the in-memory raster or the
            warped output file.
    """
    lat = nc_file.variables['latitude'][:]
    lon = nc_file.variables['longitude'][:]
    # NOTE(review): assumes latitude/longitude are cell-centre coordinates of a
    # regular grid (the CF default) - confirm against the dataset description.
    geotransform = _cell_centre_geotransform(lat, lon)

    # Masked cells would otherwise reach GDAL as their raw fill value, so they
    # are replaced by NaN and NaN is declared as nodata further down.
    emission = np.ma.asarray(nc_file.variables['emission'][day], dtype=np.float32)
    has_gaps = np.ma.is_masked(emission)
    emission = emission.filled(np.nan)

    # GDAL rows run north -> south; flip the data when latitude is ascending
    if lat[0] <= lat[-1]:
        emission = emission[::-1]

    # create the tif file in GDAL's in-memory virtual file system
    vsimem_path = '/vsimem/emission.tif'
    driver = gdal.GetDriverByName('GTiff')
    out_tif = driver.Create(vsimem_path, len(lon), len(lat), 1, gdal.GDT_Float32)
    if out_tif is None:
        raise RuntimeError('GDAL could not create ' + vsimem_path)

    try:
        # set the spatial range and the (geographic) projection of the source
        out_tif.SetGeoTransform(geotransform)
        prj = osr.SpatialReference()
        prj.ImportFromEPSG(4326)
        out_tif.SetProjection(prj.ExportToWkt())

        band = out_tif.GetRasterBand(1)
        if has_gaps:
            band.SetNoDataValue(float('nan'))
        band.WriteArray(emission)
        # make sure every pixel is written before the raster is read by Warp
        out_tif.FlushCache()

        # transform the projection and save
        warped = gdal.Warp(save_path, out_tif, srcSRS='EPSG:4326', dstSRS=dstSRS)
        if warped is None:
            raise RuntimeError('gdal.Warp failed to write ' + str(save_path))
        warped = None  # dropping the handle closes the output file
    finally:
        # release the handles and free the in-memory file even on failure
        band = None
        out_tif = None
        gdal.Unlink(vsimem_path)

In [None]:
# convert nc to tiff for nuts data extraction
read_folder = r'C:\1-Data\GRACED\nc'
save_folder = r'C:\1-Data\GRACED\tiff'

# Number of netCDF files to convert; set to None to convert every file.
# NOTE(review): the limit of 1 reproduces the original `[:1]` slice, which
# looks like a leftover from a trial run - confirm before a production run.
MAX_FILES = 1

if __name__ == '__main__':
    # the output folder must exist before GDAL can write into it
    os.makedirs(save_folder, exist_ok=True)

    for netc in tqdm(os.listdir(read_folder)[:MAX_FILES]):
        read_path = os.path.join(read_folder, netc)

        # Output prefix built from the last two '_'-separated parts of the
        # file name (presumably year and month - verify against the real
        # file names); it is the same for every day of the file.
        name_parts = netc.split('_')
        prefix = name_parts[-2][-4:] + '_' + name_parts[-1].split('.')[0][1:]

        # the context manager closes the dataset even if a conversion fails
        with nc.Dataset(read_path) as nc_file:
            days = nc_file.variables['nday'][:]
            # NOTE(review): the *values* of 'nday' are used as indices into
            # 'emission' - confirm they are 0-based positions.
            for day in days:
                save_path = os.path.join(save_folder, prefix + '_' + str(day) + '.tif')
                nc_to_geotiff(nc_file, save_path, day)

# 2. Zonal statistics

In [None]:
# Compute daily mean/sum emission per NUTS region and save one long-format
# CSV per NUTS version.
nuts_folder = r'C:\1-Data\NUTS'
read_folder = r'C:\1-Data\GRACED\tiff'
out_folder = r'C:\2-Case studies\graced'
nuts_list = ['NUTS_RG_01M_2003_3035.shp', 'NUTS_RG_01M_2006_3035.shp', 'NUTS_RG_01M_2013_3035.shp', 
             'NUTS_RG_01M_2016_3035.shp', 'NUTS_RG_01M_2016_3035.shp', 'NUTS_RG_01M_2024_3035.shp']

# The raster list is the same for every NUTS version, so read it once.
# Only .tif files are kept (side-car files would break the raster reader) and
# the list is sorted so the row order of the output is reproducible.
tiff_files = sorted(f for f in os.listdir(read_folder) if f.lower().endswith('.tif'))
if not tiff_files:
    raise FileNotFoundError('No .tif files found in ' + read_folder)

# create the output folder up front rather than failing after hours of work
os.makedirs(out_folder, exist_ok=True)

# NOTE(review): the 2016 shapefile is listed twice in nuts_list (possibly a
# different version was intended - confirm). dict.fromkeys drops the repeat
# while keeping the order, so the same CSV is not computed and overwritten twice.
for nuts_name in dict.fromkeys(nuts_list[2:]):
    nuts_file = os.path.join(nuts_folder, nuts_name)
    nuts = gpd.read_file(nuts_file)
    geo_source = 'NUTS' + nuts_name.split('_')[-2]

    daily_frames = []
    for file in tqdm(tiff_files):
        read_file = os.path.join(read_folder, file)

        # link the zonal statistics table to the boundary index
        zs_temp = pd.DataFrame(zonal_stats(nuts_file, read_file, stats=['mean', 'sum']))
        df_temp = pd.merge(nuts[['NUTS_ID']], zs_temp, left_index=True, right_index=True)

        # "unstack" 'mean' and 'sum' into one 'obsValue' column labelled by
        # 'calculation'; all 'mean' rows come first, then all 'sum' rows
        df_daily = df_temp.melt(id_vars='NUTS_ID', value_vars=['mean', 'sum'],
                                var_name='calculation', value_name='obsValue')
        df_daily['obsTime'] = file.split('.')[0]
        daily_frames.append(df_daily[['NUTS_ID', 'obsValue', 'calculation', 'obsTime']])

    # concatenate once instead of growing the table inside the loop
    df_comb = pd.concat(daily_frames, ignore_index=True)
    df_comb.insert(0, 'id', range(len(df_comb)))
    df_comb['geo_source'] = geo_source

    # save the table
    df_comb.to_csv(os.path.join(out_folder, geo_source + '.csv'), index=False)

    # delete the large intermediates and release the RAM
    del nuts, daily_frames
    gc.collect()

  0%|▏                                                                             | 4/2251 [02:03<19:31:20, 31.28s/it]