# User input

In [1]:
variable_of_interest = 'tasmax_day_' # add list of available name variable, period of time available
start_year = 1950
stop_year = 2100

# Import Packages

In [2]:
import requests
import pandas as pd

import os
import os.path

from netCDF4 import Dataset

import numpy as np
import numpy.ma as ma

import multiprocessing as mp# to download several file in parrallel
from multiprocessing.pool import ThreadPool

# Define Functions

In [3]:
# function to extract the name of the file from its url
# the imput is an url
def extract_name_file(url):
    index_before_name=url.rfind('/') # returns the highest index where the last character '/' was found, which is just before the name of the file    
    name = url[index_before_name+1:len(url)] # return the name of the file as a string, with the suffix '.nc'
    return name

# function 'produce_name_list' produce a list of files' name, with the suffix '.nc'
# 'produce_name_list' use the function 'extract_name_file' to have the name of a file from its url
# the input is a list of url, from which we want to extract the corresponding names of files
def produce_name_list(url_list):
    name_list=[] # create empty list
    for file in url_list:
        f_name = extract_name_file(file) # return the name of the file as a string, with the suffix '.nc'
        name_list.append(f_name) # add extracted name in the list
    return name_list # return the list of names in the url_list

In [4]:
# function download_file download the file in the url given as input
# the input is an url of a file
def download_file(file):
    # find the name of the file
    f_name = extract_name_file(file) # return the name of the file as a string, with the suffix '.nc' at the end of the name
    # check if the file was already downloaded
    #test = os.path.join('//COWI.net/projects/A245000/A248363/CRVA/Datasets/NEX-GDDP-CMIP6/',f_name+'.nc')
    #if os.path.isfile(test): # if the file was aleready download
        #continue # continue the for loop without executing the code after this line. The code follonwing download the file
    #    return# if we entered the if, the file was already dowloaded, no need to downloaded it again
    # the file was not downloaded, the following code will permit to download it in the servor for dataset
    
    
    connect_timeout = 1 # time in seconds, the timeout for establishing a connection to the server. This timeout value can be set by passing the "timeout" parameter to the requests.request() function and setting the "connect" key to the desired timeout value:
    read_timeout = 1000 # time in seconds, the timeout for waiting for a response from the server after the connection has been established. This timeout value can be set by passing the "timeout" parameter to the requests.request() function and setting the "read" key to the desired timeout value:
    
    
    r = requests.get(file, timeout=(connect_timeout, read_timeout)) # return the url were data need to be downloaded
    # download data in the servor for datasets
    with open(f'//COWI.net/projects/A245000/A248363/CRVA/Datasets/NEX-GDDP-CMIP6/{f_name}', 'wb') as f:
        f.write(r.content)
    return file
#results = ThreadPool(NBCORES).imap_unordered(download_file, url_list_precipitation)
#for r in results: #I don't understand why, ut without this step it does not work
#     print(r)

In [5]:
# function 'list_name_corrupted_file' aims to return a list of names, corresponding to the names of the corrupted files
# in the name_list given as input
def list_name_corrupted_file(name_list,out_path):
    ## Hypothesis: some files are corrupted
    # check if files are corrupted and count the corrupted ones
    invalid_files = [] # create an empty list to register the names of the corrupted files
    for name in name_list:
        if not name.endswith('.nc'):
            name = name + '.nc'
        if test_open_file(os.path.join(out_path,name))!=[]:
            invalid_files.append(name) # register name of the corrupted files
    return invalid_files # return a list of names, with suffix '.nc', of corrupted files

In [6]:
def test_open_file(path):
    try:
        im = Dataset(path)
        name = []
        im.close()
    except (IOError, OSError):
        # the files is corrupted
        name = extract_name_file(path) # return the name of the file as a string, with the suffix '.nc'
    return name

In [7]:
# list urls that should be downloaded again
def url_to_download_again(url_list,invalid_files):
    url_corrupted_file = []
    for url in url_list:
        f_name = extract_name_file(url) # return the name of the file as a string
        if f_name in invalid_files:
            url_corrupted_file.append(url)

    dict ={' fileUrl':url_corrupted_file}
    df = pd.DataFrame(dict)
    path = os.path.join(out_path,'file_to_download_again.csv')
    df.to_csv(path)
    return url_corrupted_file

In [8]:
## those three function are used to have the information concerning a file
## information are in the name of the file, so the name of the file is used to find its related information
## information mean variable, time_aggregation, model, scenario, year of the file

### this function permit to extract the word before the first character '_' in the input 'name'
### the input name is in format str
### returning the new_name, without the word found, will permit to re-use the function to find all 
#     the information concerning the studied file
def name_next_boundary(name):
    index_before_name=name.find('_') # returns the lowest index where the character '_' was found
    word = name[0:index_before_name] # first word in the string 'name', before the first character '_'
    new_name = name.replace(word+'_','') # delete the word found from the string 'name'
    return word, new_name # return, in string format, the word found (which is an information of the studied file), 
                    # and the string 'new_name', which is 'name' without the word found

# this function permit to extract the year of the studied file
# the year is always writen at the end of the name's file
# the input name is in format str
def find_year(name):
    index_before_name=name.rfind('_') # returns the highest index where the character '_' was found
    # the last character '_' is just before the year in the string 'name'
    # determine if the string 'name' ends with '.nc'
    if name.endswith('.nc'):
        # 'name' ends with '.nc'
        name_end = 3 # the three last character of the string name will be removed to find the year of the studied file
    else:
        # 'name' does not end with '.nc'
        name_end = 0 # no character will be removed at the end of 'name' to find the year of the studied file
    year = name[index_before_name+1:len(name)-name_end] # the year is extracted from the name of the file studied
    # based on the index_before_name (highest index where the character '_' was found) and the suffix of 'name'
    return year # the year in string format is returned

# This function use the functions 'name_next_boundary' and 'find_year' to extract the information of the file studied
# the input name is in format str, the name of the file from which we want information
def data_information(name):
    #### use of the function 'name_next_boundary': each time it is used, 
    # returns an information, and the name of the studied file without this information
    (variable, shorten_name) = name_next_boundary(name)
    (time_aggregation, shorten_name) = name_next_boundary(shorten_name)
    (model, shorten_name) = name_next_boundary(shorten_name)
    (scenario, shorten_name) = name_next_boundary(shorten_name)
    #### use the function 'find_year' to extract the information 'year' from the string 'shorten_name'
    year = find_year(shorten_name)
    # the function returns all the information of the studied file
    return variable, time_aggregation, model, scenario, year

# Projects information

In [9]:
import geopandas as gpd
import os
import os.path

data_folder=r'\\COWI.net\projects\A245000\A248363\CRVA\Datasets'
project_location_path=os.path.join(data_folder,'Mozambique_PPIS/EIB_locations_few.shp')
#study boundary (optional)
study_area_path=os.path.join(data_folder,'Mozambique_PPIS/mozambique.shp')


#projection CRS (default = 'EPSG:4326')
bCRS='EPSG:4326'

#buffer for climate/grid variables
buffer=40000 #buffer in meters, 0 = no buffer is computed

mCRS='EPSG:31983' #metric CRS for buffer in meters (find relevant metric CRS for location!)

#load shapefiles
projects = gpd.read_file(project_location_path).to_crs(bCRS)

study_area = gpd.read_file(study_area_path).to_crs(bCRS)

#calculate buffer around points/shape
if buffer != 0:
    projects_buf=projects.to_crs(mCRS)  #project to crs with metric units to get buffer in meters
    projects_buf['geometry']=projects.to_crs(mCRS).buffer(buffer) #assign the buffer as the new geometry - 
    projects_buf=projects_buf.to_crs(bCRS)#project back to orginal crs


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


# Define Paths

In [10]:
out_path=r'\\COWI.net\projects\A245000\A248363\CRVA\Datasets\NEX-GDDP-CMIP6'

# Infos for Multiprocessing

In [11]:
# count the number of CPU/cores
mp.cpu_count()

8

In [12]:
NBCORES=5

# Complete list of url with files to download

In [13]:
# register information from csv file
#all_urls = pd.read_csv(r'C:\Users\CLMRX\OneDrive - COWI\Documents\GitHub\CRVA_tool\outputs\NEX-GDDP-CMIP6\gddp-cmip6-thredds-fileserver.csv')
csv_path = os.path.join(out_path,'gddp-cmip6-thredds-fileserver.csv')
all_urls = pd.read_csv(csv_path)

In [14]:
### make all elements of the csv into a readable list

temp_list = all_urls[[' fileUrl']].T# transpose csv
temp_list=temp_list.values.tolist()
temp_list=temp_list[0]
url_list=[s.replace(' ', '') for s in temp_list]

# Interest in temperature files
Define list of url and files' names corresponding to temperature files between start day and stop day

In [15]:
url_list_climate_variable = [url for url in url_list if variable_of_interest in url and int(url[len(url)-7:len(url)-3])>=start_year and int(url[len(url)-7:len(url)-3])<=stop_year and 'r1i1p1f1_gn' in url]

In [16]:
len(url_list_climate_variable)

5468

In [17]:
name_list_climate_variable = produce_name_list(url_list_climate_variable)

In [18]:
name_list_climate_variable

['tasmax_day_ACCESS-CM2_historical_r1i1p1f1_gn_1950.nc',
 'tasmax_day_ACCESS-CM2_historical_r1i1p1f1_gn_1951.nc',
 'tasmax_day_ACCESS-CM2_historical_r1i1p1f1_gn_1952.nc',
 'tasmax_day_ACCESS-CM2_historical_r1i1p1f1_gn_1953.nc',
 'tasmax_day_ACCESS-CM2_historical_r1i1p1f1_gn_1954.nc',
 'tasmax_day_ACCESS-CM2_historical_r1i1p1f1_gn_1955.nc',
 'tasmax_day_ACCESS-CM2_historical_r1i1p1f1_gn_1956.nc',
 'tasmax_day_ACCESS-CM2_historical_r1i1p1f1_gn_1957.nc',
 'tasmax_day_ACCESS-CM2_historical_r1i1p1f1_gn_1958.nc',
 'tasmax_day_ACCESS-CM2_historical_r1i1p1f1_gn_1959.nc',
 'tasmax_day_ACCESS-CM2_historical_r1i1p1f1_gn_1960.nc',
 'tasmax_day_ACCESS-CM2_historical_r1i1p1f1_gn_1961.nc',
 'tasmax_day_ACCESS-CM2_historical_r1i1p1f1_gn_1962.nc',
 'tasmax_day_ACCESS-CM2_historical_r1i1p1f1_gn_1963.nc',
 'tasmax_day_ACCESS-CM2_historical_r1i1p1f1_gn_1964.nc',
 'tasmax_day_ACCESS-CM2_historical_r1i1p1f1_gn_1965.nc',
 'tasmax_day_ACCESS-CM2_historical_r1i1p1f1_gn_1966.nc',
 'tasmax_day_ACCESS-CM2_histori

In [19]:
for file in url_list_climate_variable: # for loop to download the file in each url
    f_name = extract_name_file(file)
    test = os.path.join('//COWI.net/projects/A245000/A248363/CRVA/Datasets/NEX-GDDP-CMIP6/',f_name)
    if os.path.isfile(test): # if the file was aleready download
        if test_open_file(test)==[]:
            # file is not corrupted
            continue # continue the for loop
    # the file was already downloaded but is corrupted
    download_file(file)

ConnectionError: HTTPSConnectionPool(host='ds.nccs.nasa.gov', port=443): Read timed out.

# List of corrupted files to download again

In [20]:
name_list_climate_variable_corrupted = list_name_corrupted_file(name_list_climate_variable,out_path)
url_corrupted_file=url_to_download_again(url_list_climate_variable,name_list_climate_variable_corrupted)
len(name_list_climate_variable_corrupted)

4082

In [21]:
name_list_climate_variable_corrupted

['tasmax_day_CanESM5_ssp245_r1i1p1f1_gn_2051.nc',
 'tasmax_day_CanESM5_ssp585_r1i1p1f1_gn_2015.nc',
 'tasmax_day_CMCC-CM2-SR5_ssp585_r1i1p1f1_gn_2018.nc',
 'tasmax_day_CMCC-CM2-SR5_ssp585_r1i1p1f1_gn_2065.nc',
 'tasmax_day_CMCC-ESM2_ssp245_r1i1p1f1_gn_2016.nc',
 'tasmax_day_CMCC-ESM2_ssp245_r1i1p1f1_gn_2018.nc',
 'tasmax_day_CMCC-ESM2_ssp245_r1i1p1f1_gn_2053.nc',
 'tasmax_day_CMCC-ESM2_ssp585_r1i1p1f1_gn_2030.nc',
 'tasmax_day_CMCC-ESM2_ssp585_r1i1p1f1_gn_2039.nc',
 'tasmax_day_CMCC-ESM2_ssp585_r1i1p1f1_gn_2074.nc',
 'tasmax_day_CMCC-ESM2_ssp585_r1i1p1f1_gn_2075.nc',
 'tasmax_day_CMCC-ESM2_ssp585_r1i1p1f1_gn_2076.nc',
 'tasmax_day_CMCC-ESM2_ssp585_r1i1p1f1_gn_2077.nc',
 'tasmax_day_CMCC-ESM2_ssp585_r1i1p1f1_gn_2078.nc',
 'tasmax_day_CMCC-ESM2_ssp585_r1i1p1f1_gn_2079.nc',
 'tasmax_day_CMCC-ESM2_ssp585_r1i1p1f1_gn_2080.nc',
 'tasmax_day_CMCC-ESM2_ssp585_r1i1p1f1_gn_2081.nc',
 'tasmax_day_CMCC-ESM2_ssp585_r1i1p1f1_gn_2082.nc',
 'tasmax_day_CMCC-ESM2_ssp585_r1i1p1f1_gn_2083.nc',
 'tasmax_d

# Download file of interest

In [None]:
# download with for loop
while len(url_corrupted_file) != 0: # while the list 'url_corrupted_file' is not empty, the following code will continue
    for file in url_corrupted_file: # for loop to download the file in each url
        f_name = extract_name_file(file)
        test = os.path.join('//COWI.net/projects/A245000/A248363/CRVA/Datasets/NEX-GDDP-CMIP6/',f_name)
        if os.path.isfile(test): # if the file was aleready download
            if test_open_file(test)==[]:
                # file is not corrupted
                continue # continue the for loop
        # the file was already downloaded but is corrupted
        download_file(file)
    # check if all downloaded files are not corrupted
    name_list_climate_variable_corrupted = list_name_corrupted_file(name_list_climate_variable,out_path)
    url_corrupted_file=url_to_download_again(url_list_climate_variable,name_list_climate_variable_corrupted) 

In [None]:
# download files with Threads
results = ThreadPool(NBCORES).imap_unordered(download_file, url_corrupted_file) 
# first input is the function, second input must be an iterable
for r in results: #I don't understand why, without this step, it does not work
     print(r)