# User input

In [1]:
# Climate variable to download; it is matched as '<variable>_' inside each file URL.
# Other variables still needed: hurs, rsds, sfcWind.
variable_of_interest = 'tas' # need hurs, rsds, sfcWind
# Temporal resolution; this string must appear in the URL of a file for it to be kept.
temporal_resolution = 'day'

# Range of years to download; both bounds are inclusive.
start_year = 1960 
stop_year = 2060

# Zone of interest in Mozambique (currently disabled: whole files are downloaded).
# Coordinates are in decimal degrees. Negative latitude means southern hemisphere.
# Latitude is the north-south coordinate (lines of equal latitude run horizontally);
# longitude is the east-west coordinate (lines of equal longitude run vertically).
#min_lat_zone_interest = -30
#max_lat_zone_interest = -10
#min_lon_zone_interest = 30
#max_lon_zone_interest = 45

# To choose more precise bounds for the zone, see the grid description at this link: https://ds.nccs.nasa.gov/thredds/ncss/grid/AMES/NEX/GDDP-CMIP6/ACCESS-CM2/historical/r1i1p1f1/pr/pr_day_ACCESS-CM2_historical_r1i1p1f1_gn_2014.nc/dataset.html

# Import Packages

In [2]:
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

import pandas as pd

import os
import os.path

from netCDF4 import Dataset

import numpy as np
import numpy.ma as ma

import multiprocessing as mp# to download several file in parrallel
from multiprocessing.pool import ThreadPool

# Define Functions

In [3]:
def extract_name_file(url):
    """Return the file name found at the end of a URL.

    The name is everything located after the last '/' of ``url``; when
    ``url`` contains no '/', the whole string is returned. Any suffix
    (for example '.nc') is kept.

    url: str, address of a file.
    returns: str, name of the file.
    """
    # maxsplit=1 cuts only at the right-most '/', the last piece is the name
    return url.rsplit('/', 1)[-1]

def produce_name_list(url_list):
    """Return the file names corresponding to a list of URLs.

    Each URL is passed to 'extract_name_file', so the names keep their
    suffix (for example '.nc') and come back in the same order as the URLs.

    url_list: iterable of str, the URLs of the files.
    returns: list of str, one file name per URL.
    """
    return [extract_name_file(url) for url in url_list]

In [4]:
def download_file(file, out_path=r'\\COWI.net\projects\A245000\A248363\CRVA\Datasets\NEX-GDDP-CMIP6', max_attempts=10, timeout=(30, 600)):
    """Download one file unless a readable copy is already stored in out_path.

    file: str, URL of the file to download.
    out_path: str, folder in which the file is stored (defaults to the
        project dataset folder).
    max_attempts: int, maximum number of download attempts before giving up.
    timeout: (connect, read) timeouts in seconds handed to requests, so that a
        stalled connection cannot block a worker thread forever.

    returns: the URL ``file`` when the file was downloaded and could be opened;
        None when a readable copy already existed, or when every attempt failed
        (a message is printed in both cases).
    """
    f_name = extract_name_file(file) # name of the file, with the suffix '.nc'
    target = os.path.join(out_path, f_name) # single place where the destination is built
    print('\nName of the file: '+f_name)

    # os.path.isfile really tests the presence of the file on disk
    # (the string returned by os.path.join alone is always truthy)
    if os.path.isfile(target):
        print('The file '+f_name+' exists')
        if test_open_file(target)==[]:
            # the file exists and is not corrupted: nothing to download
            print('The file '+f_name+' is not corrupted')
            return None
    print('The file '+f_name+' must be requested')

    # retry up to 10 times on connection errors; backoff_factor applies
    # increasing delays between those attempts to avoid failing again in case
    # of a periodic request quota
    retry = Retry(connect=10, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)

    # the session is closed automatically at the end of the 'with' block
    with requests.Session() as session:
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        for attempt in range(1, max_attempts+1):
            try:
                # stream=True avoids loading the whole file in memory
                with session.get(file, stream=True, timeout=timeout) as r:
                    # an HTTP error page must not be saved as if it were data
                    r.raise_for_status()
                    with open(target, 'wb') as f:
                        for chunk in r.iter_content(chunk_size=1024*1024):
                            f.write(chunk)
            except (requests.exceptions.RequestException, OSError) as err:
                print('\nAttempt '+str(attempt)+' for file '+f_name+' failed: '+repr(err))
                continue

            if test_open_file(target)==[]: # the downloaded file can be opened
                print('\nFile '+f_name+' did open')
                return file
            print('\nFile '+f_name+' did not open')

    # every attempt failed: report it instead of looping forever
    print('\nFile '+f_name+' could not be downloaded after '+str(max_attempts)+' attempts')
    return None

In [5]:
def list_name_corrupted_file(name_list,out_path):
    """Return the names of the files of name_list that cannot be opened.

    The suffix '.nc' is appended to the names that do not already end with
    it, then each file is looked up in out_path and checked with
    'test_open_file'.

    name_list: iterable of str, names of the files to check.
    out_path: str, folder containing the files.
    returns: list of str, names (with suffix '.nc') of the corrupted files.
    """
    # make sure every name carries the suffix '.nc'
    names_with_suffix = (
        candidate if candidate.endswith('.nc') else candidate + '.nc'
        for candidate in name_list
    )
    # 'test_open_file' returns [] only when the file could be opened
    return [
        candidate
        for candidate in names_with_suffix
        if test_open_file(os.path.join(out_path, candidate)) != []
    ]

In [6]:
def test_open_file(path):
    """Check that the file located at path can be opened as a netCDF Dataset.

    path: str, path of the file to check.
    returns: an empty list when the file was opened (and closed again)
        without error; otherwise the result of 'extract_name_file(path)'.
    """
    try:
        # open the file and close it immediately: only the ability to open it matters
        Dataset(path).close()
    except OSError: # IOError is the same class as OSError in Python 3
        # the file could not be opened: it is considered corrupted
        return extract_name_file(path)
    return []

In [7]:
def url_to_download_again(url_list,invalid_files):
    """List the URLs whose file must be downloaded again.

    A URL is kept when the file name extracted from it belongs to
    invalid_files. The kept URLs are also written, under the column
    ' fileUrl', to 'file_to_download_again.csv' in the folder given by the
    notebook-level variable out_path.

    url_list: iterable of str, URLs of all the files of interest.
    invalid_files: collection of str, names of the corrupted files.
    returns: list of str, URLs of the corrupted files.
    """
    url_corrupted_file = [
        candidate
        for candidate in url_list
        if extract_name_file(candidate) in invalid_files
    ]

    # keep a record on disk of what has to be requested again
    csv_path = os.path.join(out_path,'file_to_download_again.csv')
    pd.DataFrame({' fileUrl':url_corrupted_file}).to_csv(csv_path)
    return url_corrupted_file

In [8]:
## The three functions below are used to obtain the information describing a file.
## This information is contained in the name of the file, so the name is parsed to recover it.
## The information is: variable, time_aggregation, model, scenario and year of the file.

def name_next_boundary(name):
    """Split name at its first character '_'.

    Calling the function again on the returned remainder gives the next
    piece of information contained in the name of a file.

    name: str, the string to split.
    returns: tuple (word, new_name) of str, where word is the text located
        before the first '_' and new_name is the text located after it.
        Only this first 'word_' prefix is removed: later occurrences of the
        same text inside the name are left untouched. When name contains
        no '_', word is the whole name and new_name is the empty string.
    """
    # partition cuts at the first '_' only, and copes with a missing '_'
    word, _separator, new_name = name.partition('_')
    return word, new_name

def find_year(name):
    """Return the year written at the end of the name of a file.

    The year is the text located after the last character '_' of name
    (the whole name when there is no '_'), without the suffix '.nc' when
    name ends with it.

    name: str, name of the file, with or without the suffix '.nc'.
    returns: str, the year.
    """
    # text after the last '_' (rfind gives -1 when there is none, so the slice starts at 0)
    year = name[name.rfind('_') + 1:]
    if name.endswith('.nc'):
        # drop the three characters of the suffix '.nc'
        year = year[:-3]
    return year

def data_information(name):
    """Extract the information contained in the name of a file.

    'name_next_boundary' is applied four times in a row; each call gives one
    piece of information and the name without it. 'find_year' is then applied
    to what is left of the name.

    name: str, name of the file from which the information is wanted.
    returns: tuple of str (variable, time_aggregation, model, scenario, year).
    """
    pieces = []
    shorten_name = name
    for _ in range(4): # variable, time_aggregation, model, scenario
        piece, shorten_name = name_next_boundary(shorten_name)
        pieces.append(piece)
    variable, time_aggregation, model, scenario = pieces
    # the year is read from the end of the remaining string
    return variable, time_aggregation, model, scenario, find_year(shorten_name)

# Define Paths

In [9]:
# Folder (UNC path) used by the notebook: the CSV listing the file URLs is read
# from it, and the downloaded files are looked up in it when checking for corruption.
out_path=r'\\COWI.net\projects\A245000\A248363\CRVA\Datasets\NEX-GDDP-CMIP6'

# Infos for Multiprocessing

In [10]:
# Display the number of CPUs in the system (informational only: the value is
# not stored, the pool size is set by hand in the next cell).
mp.cpu_count()

8

In [11]:
# Size of the ThreadPool used for the downloads, i.e. the number of files
# downloaded at the same time (worker threads, despite the name).
NBCORES=5

# Complete list of URLs of the files that can be downloaded

In [12]:
# Load the CSV file 'gddp-cmip6-thredds-fileserver.csv' stored in out_path;
# its column ' fileUrl' (with a leading space) holds the URLs of the files.
# Former location of the CSV file, kept for reference:
#all_urls = pd.read_csv(r'C:\Users\CLMRX\OneDrive - COWI\Documents\GitHub\CRVA_tool\outputs\NEX-GDDP-CMIP6\gddp-cmip6-thredds-fileserver.csv')
csv_path = os.path.join(out_path,'gddp-cmip6-thredds-fileserver.csv')
all_urls = pd.read_csv(csv_path)

In [13]:
### Turn the column ' fileUrl' of the csv into a plain Python list

temp_list = all_urls[' fileUrl'].tolist() # values of the column, in the order of the csv
# remove every space character found in the URLs
url_list = [raw_url.replace(' ', '') for raw_url in temp_list]

# Interest in temperature files
Define the list of URLs and the corresponding file names for the variable of interest (temperature, `tas`, by default) between the start year and the stop year.

In [14]:
# Keep the URLs that contain '<variable_of_interest>_', whose year lies in
# [start_year, stop_year], and that contain 'r1i1p1f1_gn' and the temporal resolution.
# The year is read from the 4 characters located just before the last 3 characters
# of the URL (the suffix '.nc').
url_list_climate_variable = [
    url
    for url in url_list
    if (variable_of_interest+'_') in url
    and start_year <= int(url[len(url)-7:len(url)-3]) <= stop_year
    and 'r1i1p1f1_gn' in url
    and temporal_resolution in url
]

In [15]:
# Display the number of files selected for download
len(url_list_climate_variable)

3539

In [16]:
# create url that will only that zone of interest
#url_list_climate_variable_zone_of_interest = [url.replace('thredds2/fileServer','thredds/ncss') + '?var='+variable_of_interest+'&north='+str(max_lat_zone_interest)+'&west='+str(min_lon_zone_interest)+'&east='+str(max_lon_zone_interest)+'&south='+str(min_lat_zone_interest)+'&disableProjSubset=on&horizStride=1&time_start='+url[len(url)-7:len(url)-3]+'-01-01T12%3A00%3A00Z&time_end='+url[len(url)-7:len(url)-3]+'-12-31T12%3A00%3A00Z&timeStride=1&addLatLon=true' for url in url_list_climate_variable]

In [17]:
#url_list_climate_variable_zone_of_interest

# Download file of interest

In [None]:
# Maximum number of re-download rounds, so that this cell always terminates
# even when some files can never be downloaded correctly.
MAX_DOWNLOAD_ROUNDS = 10

def run_parallel_downloads(urls):
    """Download the files of urls with NBCORES threads and wait for the end.

    urls: iterable of str, URLs handed one by one to 'download_file'.
    An exception raised by a download is printed, not propagated; the
    downloads already queued are still completed before returning.
    """
    pool = ThreadPool(NBCORES)
    try:
        # imap_unordered (first input: the function, second input: an iterable)
        # hands the work to the threads and returns an iterator right away.
        # Looping over this iterator is what makes the cell wait for the
        # downloads; without the loop the next statements would run while the
        # files are still being written.
        for r in pool.imap_unordered(download_file, urls):
            print(r)
    except Exception as err:
        # report the error instead of hiding it, then let the pool finish below
        print('Error occurred: '+repr(err))
    except BaseException:
        # interruption asked by the user: stop handing out work and propagate
        pool.terminate()
        raise
    # no new task is accepted, and we wait for the downloads in progress, so
    # that the corruption check never reads a file that is still being written
    pool.close()
    pool.join()

# first pass on every file of interest
run_parallel_downloads(url_list_climate_variable)

name_list_climate_variable = produce_name_list(url_list_climate_variable)
name_list_climate_variable_corrupted = list_name_corrupted_file(name_list_climate_variable,out_path)
url_corrupted_file=url_to_download_again(url_list_climate_variable,name_list_climate_variable_corrupted)

# download again the files that are missing or corrupted, a bounded number of times
download_round = 0
while name_list_climate_variable_corrupted!=[] and download_round < MAX_DOWNLOAD_ROUNDS:
    download_round += 1
    print('\nRound '+str(download_round)+': '+str(len(name_list_climate_variable_corrupted))+' file(s) to download again')
    run_parallel_downloads(url_corrupted_file)
    # the check is always refreshed, even when a download raised an error
    name_list_climate_variable_corrupted = list_name_corrupted_file(name_list_climate_variable,out_path)
    url_corrupted_file=url_to_download_again(url_list_climate_variable,name_list_climate_variable_corrupted)

if name_list_climate_variable_corrupted!=[]:
    print('\n'+str(len(name_list_climate_variable_corrupted))+' file(s) still corrupted after '+str(download_round)+' round(s)')


Name of the file: tas_day_ACCESS-CM2_historical_r1i1p1f1_gn_1960.nc
The file tas_day_ACCESS-CM2_historical_r1i1p1f1_gn_1960.nc exists

Name of the file: tas_day_ACCESS-CM2_historical_r1i1p1f1_gn_1961.nc
The file tas_day_ACCESS-CM2_historical_r1i1p1f1_gn_1961.nc exists

Name of the file: tas_day_ACCESS-CM2_historical_r1i1p1f1_gn_1962.nc
The file tas_day_ACCESS-CM2_historical_r1i1p1f1_gn_1962.nc exists

Name of the file: tas_day_ACCESS-CM2_historical_r1i1p1f1_gn_1963.nc
The file tas_day_ACCESS-CM2_historical_r1i1p1f1_gn_1963.nc exists

Name of the file: tas_day_ACCESS-CM2_historical_r1i1p1f1_gn_1964.nc
The file tas_day_ACCESS-CM2_historical_r1i1p1f1_gn_1964.nc exists
The file tas_day_ACCESS-CM2_historical_r1i1p1f1_gn_1960.nc is not corrupted

Name of the file: tas_day_ACCESS-CM2_historical_r1i1p1f1_gn_1965.nc
The file tas_day_ACCESS-CM2_historical_r1i1p1f1_gn_1965.nc exists
None
The file tas_day_ACCESS-CM2_historical_r1i1p1f1_gn_1963.nc is not corrupted

Name of the file: tas_day_ACCESS

In [18]:
# Check again the state of the files of interest:
# names of all the files that should be present in out_path
name_list_climate_variable = produce_name_list(url_list_climate_variable)
# names of the files that cannot be opened (missing or corrupted)
name_list_climate_variable_corrupted = list_name_corrupted_file(name_list_climate_variable,out_path)
# URLs of those files (also written to 'file_to_download_again.csv')
url_corrupted_file=url_to_download_again(url_list_climate_variable,name_list_climate_variable_corrupted)
# number of files still to download, displayed as the output of the cell
len(name_list_climate_variable_corrupted)

2135