# Code to download part of EM-EARTH data for a given variable(s), year(s), or region(s)


## Using this code requires a Linux system with wget available.

## Save frdr-checksums-0547.csv from the link https://drive.google.com/file/d/1HWNNCBpalAymIouburwm7flDhcjqWcqN/view?usp=share_link into the folder hash, under the name frdr-checksums-0547.csv


# In [13]:
import os
import requests
import base64
import hashlib
import glob
import xarray as xr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import re
import gdown

def _filter_by_term (df, term):
    """Return the rows of df whose 'File_Name' contains term as a literal substring."""
    # regex=False: terms are plain name fragments, so characters such as '.'
    # or '+' must not be interpreted as regular-expression syntax.
    # na=False: a row with a missing file name simply never matches instead
    # of producing a NaN mask that cannot be used for boolean indexing.
    return df[df['File_Name'].str.contains(term, regex=False, na=False)]


def _download_rows (df_slice, root_save, server):
    """Download (and checksum-verify) every file listed in df_slice."""
    for _, row in df_slice.iterrows():
        file_name, path_name, link_name = \
        prepare_file_name_path_name(row['File_Name'], root_save, server)

        downlaod (file_name, path_name, link_name, row['sha256'].strip())


def download_EM_ERATH (server = 'https://g-772fa5.cd4fe.0ec8.data.globus.org/6/published/publication_542/submitted_data/',
                       groups = ['deterministic_hourly','deterministic_raw_daily',\
                                 'probabilistic_daily'],
                       variables = ['prcp','tdew','tmean','trange'],
                       regions = ['Asia','Europe','Africa',\
                                  'NorthAmerica','Oceania','SouthAmerica'],
                       years = [str(year) for year in range(1950, 2020)],
                       months = [str(month).zfill(2) for month in range(1, 13)],
                       ensembles = [str(member).zfill(3) for member in range(1, 26)],
                       root_save = '../EM_Earth_v1/'):
    """Download the EM-Earth files matching the requested selection.

    The checksum table is fetched with gdown into ../hash/ and read with
    pandas; its 'File_Name' column is filtered by substring matching and each
    selected row is handed to downlaod() together with its 'sha256' value.

    Parameters
    ----------
    server : str
        Base URL the file links are built from.
    groups : list of str
        Data groups to fetch. Only 'deterministic_hourly',
        'deterministic_raw_daily' and 'probabilistic_daily' are handled;
        any other value selects nothing.
    variables : list of str
        Variable names. 'trange' is skipped for 'deterministic_hourly'.
    regions : list of str
        Region names; used for the hourly and probabilistic groups only.
    years : list of str
        Four-digit years (default '1950' .. '2019').
    months : list of str
        Two-digit months (default '01' .. '12').
    ensembles : list of str
        Three-digit ensemble members (default '001' .. '025'); used for
        'probabilistic_daily' only, matched as '_' + member.
    root_save : str
        Local root folder the files are saved under.

    The list defaults are only read, never modified.
    """

    # get the csv; make sure the target folder exists before gdown writes to it
    url = "https://drive.google.com/uc?id=1HWNNCBpalAymIouburwm7flDhcjqWcqN"
    output = "../hash/frdr-checksums-0547.csv"
    os.makedirs(os.path.dirname(output), exist_ok=True)
    gdown.download(url, output, quiet=False)
    df = pd.read_csv(output)

    # The selection is a conjunction of substring filters, so each filter is
    # applied once at the loop level where its term is fixed instead of
    # re-filtering the complete table for every combination.
    for group in groups:
        df_group = _filter_by_term(df, group)

        for year in years:
            for month in months:
                df_month = _filter_by_term(df_group, year+month)

                for variable in variables:
                    df_variable = _filter_by_term(df_month, variable)

                    if group == 'deterministic_raw_daily':
                        # raw daily files are not split by region
                        _download_rows(df_variable, root_save, server)

                    elif group == 'deterministic_hourly':
                        # 'trange' is not requested for the hourly group
                        if variable == 'trange':
                            continue
                        for region in regions:
                            _download_rows(_filter_by_term(df_variable, region),
                                           root_save, server)

                    elif group == 'probabilistic_daily':
                        for region in regions:
                            df_region = _filter_by_term(df_variable, region)
                            for ensemble in ensembles:
                                # the leading '_' is part of the search term
                                _download_rows(_filter_by_term(df_region, '_'+ensemble),
                                               root_save, server)

def get_hash (file_name):
    """Return the SHA-256 hex digest of the file stored at file_name."""
    digest = hashlib.sha256()
    with open(file_name, "rb") as stream:
        # Feed the file to the hash in 4K pieces so it is never loaded whole
        while True:
            chunk = stream.read(4096)
            if chunk == b"":
                break
            digest.update(chunk)
    return digest.hexdigest().strip()
    
def prepare_file_name_path_name(full_name,
                                root_save,
                                server):
    """Split a checksum-table entry into file name, local folder and URL.

    Parameters
    ----------
    full_name : str
        Path as listed in the checksum table.
    root_save : str
        Local root folder; prepended to the file's relative folder.
    server : str
        Base URL; 'EM_Earth_v1/' plus the relative path is appended to it.

    Returns
    -------
    tuple of (file_name, path_name, link_name)
        file_name : last path component, surrounding whitespace removed.
        path_name : root_save + relative folder (keeps its trailing '/').
        link_name : download URL of the file.
    """
    remote_prefix = '/globusdata/6/published/publication_542/submitted_data/EM_Earth_v1/'

    # Drop the repository prefix to obtain the path relative to EM_Earth_v1/
    file_path = full_name.strip().replace(remote_prefix, '')

    # Split on the LAST '/' only. The file name is never used as a pattern,
    # so characters such as '.', '+' or '(' in it cannot corrupt the folder
    # part, and a folder that happens to contain the file name stays intact.
    folder, separator, file_name = file_path.rpartition('/')
    file_name = file_name.strip()
    path_name = (folder + separator).strip()

    link_name = server+'EM_Earth_v1/'+path_name+file_name
    path_name = root_save+path_name

    return file_name, path_name, link_name

def downlaod (file_name, path_name, link_name, hash_value_remote, max_tries = 999):
    """Fetch link_name with wget until its SHA-256 matches hash_value_remote.

    Parameters
    ----------
    file_name : str
        Name the file is saved under.
    path_name : str
        Local folder the file is saved in; created when missing.
    link_name : str
        URL passed to wget.
    hash_value_remote : str
        Expected SHA-256 hex digest of the file.
    max_tries : int, optional
        Maximum number of wget attempts (default 999).

    Nothing is downloaded when a local copy with the expected digest already
    exists. When every attempt ends with a wrong digest a message is printed
    and the last downloaded file is left in place.
    """
    # Local import so this function does not depend on the file's import cell
    import subprocess

    # os.path.join works whether or not path_name ends with a separator
    target = os.path.join(path_name, file_name)

    # directory (an empty path_name means the current directory)
    if path_name:
        os.makedirs(path_name, exist_ok=True)

    # skip the download if a local copy already has the expected hash
    if os.path.isfile(target) and get_hash (target) == hash_value_remote:
        return

    for _ in range(max_tries):

        # Argument list instead of a shell string: spaces or shell characters
        # ('&', '?', ';') in the URL or path are passed to wget untouched.
        subprocess.run(['wget', link_name, '-O', target], check=False)

        hash_value_local = get_hash (target)

        print(hash_value_local)
        print(hash_value_remote)

        if hash_value_local == hash_value_remote:
            return

    print('download was not successful for '+link_name)

# In [14]:
# Example: all four variables for January-March of 1950 and 1951, using the
# default groups and regions; ensemble members 001 and 025 are the ones
# requested for the probabilistic_daily group.
download_EM_ERATH(
    years=['1950', '1951'],
    months=['01', '02', '03'],
    variables=['prcp', 'tdew', 'tmean', 'trange'],
    ensembles=['001', '025'],
)