In [18]:
import os
import gc
import tarfile
import netCDF4 as nc
import numpy as np
import pandas as pd
import geopandas as gpd
from osgeo import gdal, osr
from tqdm import tqdm
import rasterio
from rasterstats import zonal_stats

# 1. Uncompress and convert nc to tif

In [10]:
# Extract every gzip-compressed tar archive found in read_folder into save_folder.
read_folder = r'C:\1-Data\GRACED'
save_folder = r'C:\1-Data\GRACED\nc'

# Interpreters that ship tarfile extraction filters expose `tarfile.data_filter`.
# Where available, use the 'data' filter so archive members cannot be written
# outside save_folder; older interpreters fall back to the previous behaviour.
extract_kwargs = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}

for file in tqdm(os.listdir(read_folder)):
    if file.endswith('gz'):
        # os.path.join instead of a hard-coded '\\' keeps the cell portable.
        with tarfile.open(os.path.join(read_folder, file)) as tar:
            tar.extractall(save_folder, **extract_kwargs)

100%|██████████████████████████████████████████████████████████████████████████████████| 75/75 [06:33<00:00,  5.25s/it]


In [None]:
def nc_to_geotiff(read_path, save_path, dstSRS='EPSG:3035', var_name='PM25'):
    """Convert one 2-D netCDF grid into a reprojected GeoTIFF.

    The variable is read together with its ``latitude``/``longitude`` axes,
    written to an in-memory GeoTIFF georeferenced in EPSG:4326, and then
    warped to ``dstSRS`` and saved at ``save_path``.

    Parameters
    ----------
    read_path : str
        Path of the netCDF file. It must contain 1-D ``latitude`` and
        ``longitude`` variables and the variable named by ``var_name``.
    save_path : str
        Path of the GeoTIFF to create.
    dstSRS : str, optional
        Target spatial reference handed to ``gdal.Warp``.
    var_name : str, optional
        Name of the variable to export. Defaults to ``'PM25'`` so existing
        callers keep their behaviour.

    Raises
    ------
    ValueError
        If the variable is not 2-D or an axis has fewer than two coordinates
        (the grid spacing cannot be derived from a single coordinate).
    RuntimeError
        If GDAL fails to create the intermediate raster or the warped output.
    """
    # Read everything up front so the netCDF handle is closed even on error.
    with nc.Dataset(read_path) as nc_file:
        lat = nc_file.variables['latitude'][:]
        lon = nc_file.variables['longitude'][:]
        data = np.asanyarray(nc_file.variables[var_name])

    num_lat = len(lat)
    num_lon = len(lon)
    if data.ndim != 2:
        raise ValueError(
            "variable %r has %d dimensions; a 2-D (latitude, longitude) grid is required"
            % (var_name, data.ndim)
        )
    if num_lat < 2 or num_lon < 2:
        raise ValueError("at least two latitude and two longitude values are required")

    # spatial range covered by the coordinate values
    lon_min, lon_max = lon.min(), lon.max()
    lat_min, lat_max = lat.min(), lat.max()

    # grid spacing: N coordinates span N - 1 intervals
    lat_res = (lat_max - lat_min) / (float(num_lat) - 1)
    lon_res = (lon_max - lon_min) / (float(num_lon) - 1)

    # GeoTIFF rows run north -> south; flip the data if the file stores them
    # south -> north.
    if lat[0] <= lat[-1]:
        data = data[::-1]

    # create the tif file in GDAL's in-memory virtual file system
    mem_path = '/vsimem/PM25.tif'
    driver = gdal.GetDriverByName('GTiff')
    out_tif = driver.Create(mem_path, num_lon, num_lat, 1, gdal.GDT_Float32)
    if out_tif is None:
        raise RuntimeError("GDAL could not create the intermediate raster %s" % mem_path)

    try:
        # GDAL's geotransform origin is the outer corner of the top-left pixel.
        # The spacing above is centre-to-centre, so the origin is moved out by
        # half a cell; otherwise the raster is shifted by half a pixel.
        # NOTE(review): assumes the coordinate variables hold cell centres
        # (the usual netCDF convention) -- confirm for the dataset in use.
        geotransform = (
            float(lon_min - lon_res / 2.0), float(lon_res), 0.0,
            float(lat_max + lat_res / 2.0), 0.0, -float(lat_res),
        )
        out_tif.SetGeoTransform(geotransform)

        # the source grid is geographic lat/lon
        prj = osr.SpatialReference()
        prj.ImportFromEPSG(4326)
        out_tif.SetProjection(prj.ExportToWkt())

        # write the data and flush it before the raster is used as warp input
        out_tif.GetRasterBand(1).WriteArray(data)
        out_tif.FlushCache()

        # reproject to the target SRS and save
        warped = gdal.Warp(save_path, out_tif, srcSRS='EPSG:4326', dstSRS=dstSRS)
        if warped is None:
            raise RuntimeError("gdal.Warp failed to write %s" % save_path)
        warped = None  # dropping the reference closes and flushes the output
    finally:
        # release the dataset and free the in-memory scratch file
        out_tif = None
        gdal.Unlink(mem_path)

In [20]:
# Open a single GRACED monthly file (January 2019) to inspect its structure.
# The handle is intentionally left open: the following cells read from `nc_file`.
read_path = r'C:\1-Data\GRACED\nc\CarbonMonitor_total_y2019_m01.nc'
nc_file = nc.Dataset(read_path)


In [None]:
# Pull both coordinate axes, then the whole emission variable, into memory.
lat, lon = (nc_file.variables[axis][:] for axis in ('latitude', 'longitude'))
emission = np.asanyarray(nc_file.variables['emission'])

In [22]:
# Display the variables stored in the opened dataset (names, dtypes, shapes).
nc_file.variables

{'latitude': <class 'netCDF4.Variable'>
 float64 latitude(latitude)
     _FillValue: nan
 unlimited dimensions: 
 current shape = (1800,)
 filling on,
 'longitude': <class 'netCDF4.Variable'>
 float64 longitude(longitude)
     _FillValue: nan
 unlimited dimensions: 
 current shape = (3600,)
 filling on,
 'emission': <class 'netCDF4.Variable'>
 float32 emission(nday, latitude, longitude)
     _FillValue: nan
 unlimited dimensions: 
 current shape = (31, 1800, 3600)
 filling on,
 'nday': <class 'netCDF4.Variable'>
 int64 nday(nday)
     units: days since 2019-01-01 00:00:00
     calendar: proleptic_gregorian
 unlimited dimensions: 
 current shape = (31,)
 filling on, default _FillValue of -9223372036854775806 used}

In [26]:
# Display the values of the `nday` axis (day offsets within the file).
nc_file.variables['nday'][:]

masked_array(data=[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13,
                   14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
                   28, 29, 30],
             mask=False,
       fill_value=999999,
            dtype=int64)

In [None]:
# Read the coordinate axes and the 'PM25' variable from `nc_file`.
# NOTE(review): the variable listing shown earlier in this notebook for the
# GRACED file contains 'latitude', 'longitude', 'emission' and 'nday' but no
# 'PM25', so this cell raises KeyError unless `nc_file` has been re-pointed at
# a PM2.5 netCDF file first -- presumably a leftover from the PM2.5 workflow;
# confirm which dataset is intended.
lat = nc_file.variables['latitude'][:]
lon = nc_file.variables['longitude'][:]
PM25 = np.asanyarray(nc_file.variables['PM25'])

In [None]:
# Convert every netCDF file in read_folder to a GeoTIFF in save_folder;
# the rasters are the input of the NUTS zonal statistics.
read_folder = '/data/xiang/1-Data/PM2.5/netcdf'
save_folder = '/data/xiang/1-Data/PM2.5/raster'

if __name__ == '__main__':
    # Make sure the output folder exists before any raster is written into it.
    os.makedirs(save_folder, exist_ok=True)
    for netc in tqdm(os.listdir(read_folder)):
        if netc.endswith('.nc'):
            read_path = os.path.join(read_folder, netc)
            # swap the '.nc' extension for '.tif', keeping the base name
            save_path = os.path.join(save_folder, os.path.splitext(netc)[0] + '.tif')
            nc_to_geotiff(read_path, save_path)

In [None]:
# Compute the mean raster value per NUTS region for every raster in
# read_folder and write one CSV per NUTS boundary version.
nuts_folder = '/data/xiang/1-Data/NUTS/nuts version'
read_folder = '/data/xiang/1-Data/PM2.5/raster'
# for nuts_file in os.listdir(nuts_folder):
nuts_list = ['NUTS_RG_01M_2006_3035.shp', 'NUTS_RG_01M_2013_3035.shp', 'NUTS_RG_01M_2016_3035.shp', 'NUTS_RG_01M_2003_3035.shp']

# The raster list does not depend on the NUTS version, so build it once.
# Only GeoTIFFs are kept (sidecar files in the folder would make zonal_stats
# fail) and the list is sorted so the output row order is reproducible.
raster_files = sorted(
    name for name in os.listdir(read_folder)
    if name.lower().endswith(('.tif', '.tiff'))
)
if not raster_files:
    raise FileNotFoundError('no GeoTIFF rasters found in ' + read_folder)

for nuts_file in nuts_list:
    df_comb = None
    if nuts_file.endswith('.shp'):
        # keep the loop variable intact; the full path gets its own name
        nuts_path = os.path.join(nuts_folder, nuts_file)
        nuts = gpd.read_file(nuts_path)
        # collect the per-raster tables and concatenate once at the end
        # (concatenating inside the loop copies the growing frame every time)
        frames = []
        for PM_file in tqdm(raster_files):
            read_file = os.path.join(read_folder, PM_file)
            # first four characters of the third-from-last dot-separated
            # part of the file name are used as the year
            year = PM_file.split('.')[-3][:4]
            # link the zonal statistics table to the boundary index;
            # only the mean is used, so only the mean is requested
            zs_temp = pd.DataFrame(zonal_stats(nuts_path, read_file, stats=['mean']))['mean']
            df_temp = pd.merge(nuts[['NUTS_ID']], zs_temp, left_index=True, right_index=True)
            df_temp.columns = ['geo', 'ObsValue']
            df_temp['Indicator'] = 'PM2.5'
            df_temp['freq'] = '1 year'
            df_temp['ObsTime'] = year
            df_temp['unit'] = 'ug/m3'
            frames.append(df_temp)
        df_comb = pd.concat(frames)
        # save the table; the NUTS version is the second-to-last '_' token
        geo_source = 'NUTS' + nuts_file.split('_')[-2]
        df_comb['geo_source'] = geo_source
        df_comb.set_index('geo').to_csv('/data/xiang/3-case studies/2-PM2.5/PM25_' + geo_source + '.csv')
        # delete nuts and release the RAM
        del nuts
        gc.collect()