This notebook registers Copernicus (CMIP6) data in CSV files.

Data source : https://cds.climate.copernicus.eu/cdsapp#!/dataset/projections-cmip6?tab=form

# User input

In [None]:

# Key of the climate variable inside the nc files (used to read the data from the dataset)
global_variable = 'pr'
# Name of the climate variable used in folder names, csv file names and as dataframe column
name_variable = 'precipitation'

# Temporal resolution of the data: 'daily', 'monthly' or 'fixed'
temporal_resolution = 'daily'

# First and last year (presumably the bounds of the period of interest - confirm against the cells using them)
y_start, y_end = 1950, 2014

# Note: wind is registered at 10 m

# Functions and packages

In [None]:
import pandas as pd
import numpy as np
import numpy.ma as ma
import netCDF4 as nc#not directly used but needs to be imported for some nc4 files manipulations, use for nc files
from netCDF4 import Dataset
import xarray as xr
import datetime # to have actual date
import os
import os.path
import cdsapi # for copernicus function
import shutil
from datetime import datetime

# Out path

In [None]:
# Root folder under which the nc files are searched and the csv files are registered
out_path = r'\\COWI.net\projects\A245000\A248363\CRVA\Datasets'

# Project information

In [None]:
# Names of the projects; the i-th longitude / latitude below belongs to the i-th name
name_projects = np.array([
    'WTP_Mutua_EIB',
    'Gorongosa_EIB',
    'Chimoio_WTP_EIB',
    'Pemba_EIB',
])

# Longitude of each project
lon_projects_data = np.array([
    34.5927839939706,
    34.07824286310398,
    33.47333313659342,
    40.52545156033736,
])
lon_projects = pd.Series(lon_projects_data)

# Latitude of each project
lat_projects_data = np.array([
    -19.495079648575242,
    -18.68063728746643,
    -19.125095255188334,
    -12.973942656747809,
])
lat_projects = pd.Series(lat_projects_data)

# Half-width of the box drawn around each project (same unit as the coordinates)
buffer_area_project = 2

# Bounding box of every project, list format: [lat min, lat max, lon min, lon max]
lat_min = lat_projects - buffer_area_project
lat_max = lat_projects + buffer_area_project
lon_min = lon_projects - buffer_area_project
lon_max = lon_projects + buffer_area_project
area_projects = [lat_min, lat_max, lon_min, lon_max]

# Class

# Functions

In [None]:
########################################### Register data from nc file of Copernicus ############################################
def _closest_grid_point(path_nc_file, lat, lon):
    """Return (index lat, index lon, value lat, value lon) of the grid point of the nc file closest to (lat, lon)."""
    # NOTE(review): assumes the nc file exposes one-dimensional 'lat' and 'lon' coordinates
    # (register_data selects the data with isel(lat=..., lon=...)) - confirm for every model
    with xr.open_dataset(path_nc_file) as ds:
        lat_values = np.asarray(ds['lat'].values)
        lon_values = np.asarray(ds['lon'].values)
    if lon < 0 and lon_values.max() > 180:
        # grid expressed in 0..360 degrees east while the project longitude is negative
        lon = lon + 360
    index_lat = int(np.abs(lat_values - lat).argmin())
    index_lon = int(np.abs(lon_values - lon).argmin())
    return index_lat, index_lon, float(lat_values[index_lat]), float(lon_values[index_lon])


def csv_copernicus(temporal_resolution,year_str,experiments,models,out_path, global_variable, name_variable, name_projects,area,lat_projects,lon_projects,source = 'Copernicus-CMIP6',name_area='all-Mozambique'):
    """Register in a dataframe and in csv files the data of the Copernicus nc files, for each project.

    Actions of this function
        1) Create the string indicating the period of interest
        2) Create the time vector with date_copernicus
        3) Look, with search_for_nc, for a nc file for each experiment and each model in
           out_path/name_variable/source/period/experiment/model/name_area
           (folders that do not exist are skipped; nothing is downloaded here)
        4) For each project
            4 a) the csv file does not exist: for each nc file found, register_data registers the
                 values at the grid point closest to the project; the dataframe is written in the
                 csv file. If there is no value for every experiment and model tested, the
                 dataframe is empty, no csv file is written and the user is informed
            4 b) the csv file exists: no need to register again, the csv file is read with
                 file_already_downloaded
        5) Return the dataframes of all projects concatenated

    Parameters of the function
    temporal_resolution: 'daily', 'monthly', or 'fixed'. String type
    year_str: list containing all the years under the string type and in the period of interest
    experiments: iterable of experiment names (copernicus_elements.experiments)
    models: iterable of model names (copernicus_elements.models)
    out_path: path where the outputs are registered. Defined by the user at the beginning of the code
    global_variable: global name of the climate variable of interest. Not read in this function:
        register_data reads the notebook-level global_variable
    name_variable: name of the elements downloaded from copernicus (example: 'near_surface_wind_speed')
    name_projects: names of the projects for which the data are taken
    area: list containing latitudes and longitudes around the projects. Not used in this function
    lat_projects, lon_projects: latitude and longitude of each project, in the order of name_projects
    source: name of the folder of the data source
    name_area: name of the folder of the downloaded area

    Returns a dataframe (empty if no data was found for any project).
    """
    # create string for name of folder depending on type of period
    # NOTE(review): create_file_download_path names this folder 'fixed_period' - confirm which one is on disk
    if temporal_resolution == 'fixed':
        period = 'fixed'
    else:
        period = year_str[0] + '-' + year_str[-1]

    (dates, index_dates) = date_copernicus(temporal_resolution, year_str) # create time vector depending on temporal resolution

    # the nc files do not depend on the project: look for them only once
    nc_files = []  # elements are (experiment, model, path of the nc file)
    for SSP in experiments:
        for model_simulation in models:
            path_for_file = os.path.join(out_path, name_variable, source, period, SSP, model_simulation, name_area)
            if not os.path.isdir(path_for_file):
                continue  # nothing was downloaded for this experiment and this model
            path_nc_file = search_for_nc(path_for_file)
            if path_nc_file is not None:
                nc_files.append((SSP, model_simulation, path_nc_file))

    dataframes_projects = []
    for i, name_project in enumerate(name_projects):
        print('############################### Project name: '+name_project+' ###############################')

        # modification on name_project str to ensure no problem when using this str as name of a folder
        for forbidden in ('-', '/', '\\'):
            name_project = name_project.replace(forbidden, '_')

        title_file = name_project + '_' + period + '_' + temporal_resolution + '_' + name_variable + '.csv'
        path_for_csv = os.path.join(out_path, 'csv', source, name_variable, name_project, period) # create path for csv file
        csv_file = os.path.join(path_for_csv, title_file)

        if os.path.isfile(csv_file):
            # the data were already registered: read the existing csv file
            df = file_already_downloaded(path_for_csv, title_file, name_variable)
        else:
            df = pd.DataFrame()
            for (SSP, model_simulation, path_nc_file) in nc_files:
                # the grid changes between models: the closest point is searched for each nc file
                (index_lat, index_lon, value_lat, value_lon) = _closest_grid_point(path_nc_file, lat_projects[i], lon_projects[i])
                # register_data reads index_closest_lat[i], ...: dictionaries keyed by the index of the project
                # experiment and model are given as tuples, register_data uses them to build a MultiIndex
                df = register_data(path_nc_file, name_project, name_variable, index_dates, dates,
                                   (SSP,), (model_simulation,),
                                   {i: index_lat}, {i: index_lon}, {i: value_lat}, {i: value_lon},
                                   df, i)
            if df.empty:
                print('No data were found for the project ' + name_project)
            else:
                os.makedirs(path_for_csv, exist_ok=True) # to ensure the creation of the path
                df.to_csv(csv_file)

        dataframes_projects.append(df)

    if not dataframes_projects:
        return pd.DataFrame()
    return pd.concat(dataframes_projects)

In [None]:
# Problem with the register_data function: the format of the time variable varies between files

In [None]:
def register_data(climate_variable_path,name_project,name_variable,index_dates,dates,experiment,model,index_closest_lat,index_closest_lon,closest_value_lat,closest_value_lon,df,i):
    """Register the data of one nc file concerning one project in a dataframe.

    The values are read at the grid point (index_closest_lat[i], index_closest_lon[i]) for the
    variable named by the notebook-level global_variable (it is not a parameter of this function).
    The rows of the new dataframe are indexed by name of the project, latitude, longitude,
    experiment, model and date; its only column is name_variable.

    Parameters of the function
    climate_variable_path: path of the nc file
    name_project: name of the project
    name_variable: name given to the column of the dataframe
    index_dates: indexes of the dates; only its length is used
    dates: dates of the period; must provide .month, .day and .tolist()
        (presumably a pandas DatetimeIndex - confirm against date_copernicus)
    experiment, model: iterables (for example tuples with one element), used to build the MultiIndex
    index_closest_lat, index_closest_lon: containers giving, at key i, the grid indexes of the closest point
    closest_value_lat, closest_value_lon: containers giving, at key i, the coordinates of the closest point
    df: dataframe in which the new data are concatenated
    i: index of the project

    Returns df concatenated with the new data.
    Raises ValueError if the number of values does not match the number of dates.
    """
    print('Registering the data in a dataframe')

    conversion_factor = 1
    if global_variable =='pr':
        # convert precipitation data from kg.m^(-2).s^(-1) to mm/day :  1 kg/m2/s = 86400 mm/day
        conversion_factor = 86400

    # the with statement closes the nc file even if the reading fails
    with xr.open_dataset(climate_variable_path) as ds:
        number_time_steps = len(ds.variables['time'].values)
        data_dataframe = ds.variables[global_variable].isel(lat=index_closest_lat[i],lon=index_closest_lon[i]).values*conversion_factor

    # missing 29.02 ? The file has fewer time steps than expected: a missing value is inserted
    # for every 29.02. The positions are in increasing order, so each insertion is done at the
    # final position of the date
    if number_time_steps < len(index_dates):
        for j in np.where((dates.month == 2) & (dates.day == 29))[0]:
            data_dataframe = np.insert(data_dataframe, j, np.nan)

    Date = dates.tolist()
    Name_Project = (name_project,)

    if np.size(data_dataframe) != len(Date):
        # the time axis of the file differs from the expected dates by more than the 29.02
        raise ValueError('The nc file ' + str(climate_variable_path) + ' contains ' + str(np.size(data_dataframe))
                         + ' values but ' + str(len(Date)) + ' dates are expected')

    print('\ni = '+ str(i))
    print('\nclosest_value_lat[i]'+str(closest_value_lat[i]))
    print('\ntype(closest_value_lat[i])'+str(type(closest_value_lat[i])))

    # Create the MultiIndex
    midx = pd.MultiIndex.from_product([Name_Project,(closest_value_lat[i],),(closest_value_lon[i],),experiment, model, Date],names=['Name project', 'Latitude', 'Longitude','Experiment', 'Model', 'Date'])
    # Create the Dataframe
    Variable_dataframe = pd.DataFrame(data = data_dataframe,
                                index = midx,
                                columns = [name_variable])
    # Concatenate former and new dataframe
    return pd.concat([df,Variable_dataframe]) # register information for project

In [None]:
def find_column_name(Open_path):
    """Return the name of the climate variable of an opened netCDF file.

    The netCDF files from copernicus name their variables
    ['time', 'time_bnds', 'lat', 'lat_bnds', 'lon', 'lon_bnds', name of the climate variable of interest].
    Every name describing time, coordinates, bounds or height is taken off; the first name left is returned.
    An IndexError is raised if no name is left.
    """
    # names that are not the column name of interest
    not_climate_names = ('time', 'time_bnds', 'bnds', 'lat', 'lat_bnds', 'lon', 'lon_bnds',
                         'time_bounds', 'bounds', 'lat_bounds', 'lon_bounds', 'height')
    # names of the variables of the file, in the order of the file, without the names above
    remaining_names = [name for name in Open_path.variables if name not in not_climate_names]
    return remaining_names[0]

In [None]:
def file_already_downloaded(path_for_csv,title_file,name_variable):
    """Read an already registered csv file and return it as a dataframe.

    The rows of the returned dataframe are indexed by 'Name project', 'Latitude', 'Longitude',
    'Experiment', 'Model' and 'Date'; its only column is renamed name_variable.

    Parameters of the function
    path_for_csv: folder containing the csv file
    title_file: name of the csv file
    name_variable: name given to the column of the dataframe

    Raises KeyError if one of the six index columns is missing in the csv file.
    """
    print('The file was already downloaded')
    df = pd.read_csv(os.path.join(path_for_csv,title_file)) # read the downloaded data for the analysis

    # names given to the columns that have no name in a csv file with a header on several lines
    names_legacy_columns = {'Unnamed: 0':'Experiment','Unnamed: 1':'Model','Unnamed: 2':'Date','Unnamed: 3':'Latitude'}
    if any(column in df.columns for column in names_legacy_columns):
        df = df.rename(columns=names_legacy_columns)
        # the 2 first lines are the rest of the header, not data
        df = df.drop([0,1], axis=0)
    # else: the header is on one line only, so every line is a line of data and must be kept

    # recreate multiindex
    df = df.set_index(['Name project', 'Latitude', 'Longitude','Experiment', 'Model', 'Date'])

    df.columns = [name_variable]
    return df

In [None]:
def search_for_nc(path_for_file):
    """Look in the folder path_for_file for a document in .nc format.

    The path of the first nc file met is returned, after being passed to path_length in case it
    is too long. If the whole folder has been searched and there is no nc file in it, a message
    is printed and None is returned.
    """
    print('path_for_file does exist Function copernicus search for nc')
    for name_document in os.listdir(path_for_file):
        if not name_document.endswith(".nc"):
            continue  # not a nc file, look at the next document

        final_path = os.path.join(path_for_file, name_document)
        print('The file is in the path Function copernicus search for nc\n')
        print('Before path_length, The final path for the nc file is: '+final_path)
        print('\n The final path for nc file exists ? '+str(os.path.isfile(final_path))+'\n')

        final_path = path_length(final_path) # check if length of path is too long
        print('After path_length, The final path for the nc file is: '+final_path)
        print('\n The final path for nc file exists ? '+str(os.path.isfile(final_path))+'\n')
        return final_path # the search stops at the first nc file found

    # only reached once the whole folder has been examined without finding a nc file
    print('Problem : No nc file was found Function copernicus Function copernicus search for nc')
    return None

In [None]:
def path_length(str1):
    """Test if the path str1 is too long and, if so, modify it for windows to accept it as a path.

    A path of 250 characters or less is returned unchanged. A longer path is made absolute and
    prefixed with \\\\?\\UNC\\ (path starting with two backslashes, whose two first characters are
    taken off) or with \\\\?\\ (other paths).
    """
    if len(str1) <= 250:
        return str1

    normalized_path = os.path.abspath(str1) # normalize path
    if normalized_path.startswith("\\\\"):
        return "\\\\?\\UNC\\" + normalized_path[2:]
    return "\\\\?\\" + normalized_path

In [None]:
def create_file_download_path(start_path,name_variable,name_area,SSP,model,year,temporal_resolution,source):
    """Create the path of the folder for the downloaded file.

    The path is start_path/name_variable/source/<period>/SSP/model/name_area, the name of the
    folder <period> depending on the type of period:
        - only one year: the year itself
        - several years: 'first year-last year'
        - no year and temporal_resolution == 'fixed': 'fixed_period'

    Parameters of the function
    start_path: path under which the folders are created
    name_variable: name of the climate variable
    name_area: name of the area downloaded
    SSP: name of the experiment
    model: name of the model
    year: list containing the years under the string type (a single non-empty string is accepted
        and treated as one year)
    temporal_resolution: 'daily', 'monthly', or 'fixed'. String type
    source: name of the data source

    Raises ValueError if year is empty and temporal_resolution is not 'fixed'.
    """
    if isinstance(year, str) and year:
        year = [year] # a single year given as a string, not the characters of the string

    # adapt the name of the folder for the period, depending on the type of period
    if len(year) == 1:
        # year[0] and not year: os.path.join does not accept a list
        period = year[0]
    elif len(year) > 1:
        period = year[0] + '-' + year[-1]
    elif temporal_resolution == 'fixed':
        period = 'fixed_period'
    else:
        raise ValueError("year is empty and temporal_resolution is not 'fixed': no folder can be named for the period")

    return os.path.join(start_path, name_variable, source, period, SSP, model, name_area)