In [28]:
# EM-Earth helper functions: subset the EM-Earth files according to a lat/lon extent
import xarray as xr
import numpy as np
import os, time
import glob
import pandas as pd
import sys


def _find_single_match(file_paths, terms):
    """Return the one path that contains every term, or None if nothing matches.

    Terms are matched as literal substrings. Exits via ``sys.exit`` when more
    than one path matches, because the caller expects exactly one input file
    per variable/year-month (and ensemble member).
    """
    matches = [p for p in file_paths if all(term in p for term in terms)]
    if not matches:
        return None
    if len(matches) != 1:
        sys.exit('stop, it seems there are multiple input the year, month')
    return matches[0]


def download_EM_ERATH (latrange = [39.4, 40.5],
                       lonrange = [115.6, 116.8],
                       root_save = '../EM_Earth_v1_subset/',
                       groups = ['deterministic_hourly','deterministic_raw_daily',\
                                 'probabilistic_daily'],
                       variables = ['prcp','tdew','tmean','trange'],
                       years = ['1950','1951','1952','1953','1954',\
                                '1955','1956','1957','1958','1959',\
                                '1960','1961','1962','1963','1964',\
                                '1965','1966','1967','1968','1969',\
                                '1970','1971','1972','1973','1974',\
                                '1975','1976','1977','1978','1979',\
                                '1980','1981','1982','1983','1984',\
                                '1985','1986','1987','1988','1989',\
                                '1990','1991','1992','1993','1994',\
                                '1995','1996','1997','1998','1999',\
                                '2000','2001','2002','2003','2004',\
                                '2005','2006','2007','2008','2009',\
                                '2010','2011','2012','2013','2014',\
                                '2015','2016','2017','2018','2019'],
                       months = ['01','02','03','04','05','06',\
                                 '07','08','09','10','11','12'],
                       ensembles = ['001','002','003','004','005',\
                                    '006','007','008','009','010',\
                                    '011','012','013','014','015',\
                                    '016','017','018','019','020',\
                                    '021','022','023','024','025'],
                       root_data = '../EM_Earth_v1/'):
    """Subset every matching NetCDF file under ``root_data`` to a lat/lon box.

    The directory tree under ``root_data`` is scanned once for file names
    containing ``.nc``. For each group / variable / year / month (and, for the
    probabilistic group, each ensemble member) the single file whose path
    contains all identifying terms is passed to ``subsetting`` and written
    below ``root_save``. Combinations with no matching file are skipped.

    Parameters
    ----------
    latrange, lonrange : sequence of two numbers
        ``[min, max]`` extent. A 0.1 buffer is added on every side before
        subsetting. The sequences passed in are NOT modified.
    root_save : str
        Output root; must end with a path separator because output paths are
        built by string concatenation.
    groups : list of str
        Any of ``'deterministic_hourly'``, ``'deterministic_raw_daily'``,
        ``'probabilistic_daily'``. Other values are silently ignored.
    variables, years, months, ensembles : list of str
        Substrings used to identify the input files and to name the outputs.
        ``ensembles`` is only used for the probabilistic group.
    root_data : str
        Root directory that is walked recursively for input files.

    Raises
    ------
    SystemExit
        If more than one input file matches a single combination.
    """
    # Build padded copies instead of editing the arguments in place: the
    # defaults are shared list objects, so in-place edits would widen the
    # extent on every call and also leak into the caller's own lists.
    buffer = 0.1
    lat_bounds = [latrange[0] - buffer, latrange[1] + buffer]
    lon_bounds = [lonrange[0] - buffer, lonrange[1] + buffer]

    # create the output folder
    os.makedirs(root_save, exist_ok=True)

    # Collect every NetCDF-looking file once; all lookups below reuse this list.
    nc_paths = [os.path.join(path, name)
                for path, _subdirs, files in os.walk(root_data)
                for name in files
                if ".nc" in name]
    print(pd.DataFrame({'File_Name': nc_paths}))

    for group in groups:
        for variable in variables:
            for year in years:
                for month in months:
                    stamp = year + month

                    # Each job is (search terms, output directory, output file name).
                    if group == 'deterministic_hourly':
                        jobs = [([group+'_global', stamp, variable],
                                 root_save+'deterministic_hourly_sliced/'+variable+'/'+stamp+'/',
                                 'EM_Earth_deterministic_hourly_'+variable+'_sliced_'+stamp+'.nc')]
                    elif group == 'deterministic_raw_daily':
                        # The output name previously said "deterministic_hourly"
                        # (copy-paste slip); it now carries the correct group label.
                        jobs = [([group, stamp, variable],
                                 root_save+'deterministic_raw_daily_sliced/'+variable+'/'+stamp+'/',
                                 'EM_Earth_deterministic_raw_daily_'+variable+'_sliced_'+stamp+'.nc')]
                    elif group == 'probabilistic_daily':
                        jobs = [([group+'_global', stamp, variable, '_'+ensemble],
                                 root_save+'probabilistic_daily_global_sliced/'+variable+'/'+stamp+'/',
                                 'EM_Earth_probabilistic_daily_'+variable+'_sliced_'+stamp+'_'+ensemble+'.nc')
                                for ensemble in ensembles]
                    else:
                        jobs = []  # unknown groups are ignored

                    for terms, out_file_path, out_file_name in jobs:
                        infile = _find_single_match(nc_paths, terms)
                        if infile is None:
                            continue  # no input for this combination
                        os.makedirs(out_file_path, exist_ok=True)
                        subsetting(infile,
                                   out_file_path+out_file_name,
                                   latrange=lat_bounds,
                                   lonrange=lon_bounds)
                                
                                
                                
                            
                    
def subsetting (infile,
                outfile,
                latrange,
                lonrange):
    """Write the lat/lon subset of ``infile`` to ``outfile`` as compressed NetCDF.

    Nothing is done (and nothing is printed) when ``outfile`` already exists,
    so an interrupted run can be restarted without redoing finished files.

    Parameters
    ----------
    infile : str
        Path of the NetCDF file to read.
    outfile : str
        Path of the NetCDF file to create.
    latrange, lonrange : sequence of two numbers
        ``[min, max]`` bounds used for label-based selection on the ``lat``
        and ``lon`` coordinates.
    """
    t1 = time.time()
    if os.path.isfile(outfile):
        return

    # Open inside a context manager so the input file handle is released even
    # if selection or writing fails; this function is called once per file.
    with xr.open_dataset(infile) as ds:
        # Latitude is sliced from the upper to the lower bound.
        # NOTE(review): this presumably relies on ``lat`` being stored in
        # descending order in the input files - confirm against the data.
        subset = ds.sel(lat=slice(latrange[1], latrange[0]),
                        lon=slice(lonrange[0], lonrange[1]))
        # Apply the same zlib compression settings to every data variable.
        encoding = {vv: {'zlib': True, 'complevel': 4} for vv in subset.data_vars}
        subset.to_netcdf(outfile, unlimited_dims='time', encoding=encoding)

    print('time cost:', time.time() - t1)


# Run the subsetting with all default arguments (extent, groups, variables,
# years, months, ensembles and input/output roots).
download_EM_ERATH()


                                            File_Name
0   ../EM_Earth_v1/probabilistic_daily_global/tran...
1   ../EM_Earth_v1/probabilistic_daily_global/tran...
2   ../EM_Earth_v1/probabilistic_daily_global/tdew...
3   ../EM_Earth_v1/probabilistic_daily_global/tdew...
4   ../EM_Earth_v1/probabilistic_daily_global/prcp...
..                                                ...
74  ../EM_Earth_v1/probabilistic_daily/tmean/Europ...
75  ../EM_Earth_v1/probabilistic_daily/tmean/South...
76  ../EM_Earth_v1/probabilistic_daily/tmean/South...
77  ../EM_Earth_v1/probabilistic_daily/tmean/Afric...
78  ../EM_Earth_v1/probabilistic_daily/tmean/Afric...

[79 rows x 1 columns]
time cost: 15.105352878570557
time cost: 0.8210618495941162
time cost: 0.44207310676574707
time cost: 0.40226292610168457
time cost: 0.44332385063171387
time cost: 0.8047990798950195
time cost: 0.7735087871551514
time cost: 0.4616367816925049
time cost: 0.4380531311035156
time cost: 0.441756010055542
time cost: 0.45287990570068