## Parser for UEA's CSV-based crop data

### notes

country, ISO, model name, variable, mean, SWL, impact-tag (in filename)

* complication - I see the variable should be a mix of the model name (cultivar) + the variable in the filename (e.g. irrigation_avoided_perc_change).

* To get country name exactly as it is in the other data we should look it up from the shapefile data.

The joint country code (SCG) is replaced by the two new 3-letter codes: SRB (Serbia) and MNE (Montenegro).


In [None]:
import pandas as pd
import geopandas as gpd
from iso3166 import countries
import os
import warnings
warnings.filterwarnings('ignore')

In [None]:
def identify_netcdf_and_csv_files(path='data'):
    """Walk ``path`` recursively and collect the netCDF and CSV files under it.

    A file is classified by the text after its last '.', so 'nc' files go to
    the 'nc' list and 'csv' files go to the 'csv' list. The two country-level
    summary files 'PopAff_SWLs_Country.csv' and 'ExpDam_SWLs_Country.csv' are
    left out of the CSV list.

    Returns a dict of two lists of paths (directory + '/' + file name), e.g.
    {'nc': ['data/CNRS_data/cSoil/orchidee-giss-ecearth.SWL_15.eco.cSoil.nc'],
     'csv': [...]}
    """
    skipped_csvs = ('PopAff_SWLs_Country.csv', 'ExpDam_SWLs_Country.csv')
    found = {'nc': [], 'csv': []}
    for folder, _subdirs, names in os.walk(path):
        # Same guard as before: only handle the entry when `names` is a list.
        if not isinstance([], type(names)):
            continue
        for name in names:
            suffix = name.rpartition('.')[-1]
            if suffix == 'nc':
                found['nc'].append(folder + '/' + name)
            elif suffix == 'csv' and name not in skipped_csvs:
                found['csv'].append(folder + '/' + name)
    return found


def get_metadata_from_UEA_csv(f):
    """Derive the file-level metadata encoded in a UEA crop CSV file name.

    Only the base name of ``f`` (the text after the last '/') is inspected.
    It is split on '.', and the pieces are read as follows:

    * piece 0 is '<variable prefix>-<model taxonomy>'; everything after the
      first '-' becomes ``model_taxonomy``;
    * piece 1 is the SWL tag ('SWL_15', 'SWL_2', 'SWL_4' or 'SWL_6'), mapped
      to 1.5, 2.0, 4.0 or 6.0;
    * piece 2 is the ``impact_tag``;
    * the second-to-last piece is joined to the variable prefix with '_' to
      form ``variable``.

    Returns a dict with the keys 'swl_info', 'model_taxonomy', 'impact_tag',
    'variable', 'model_short_name', 'is_seasonal', 'season', 'is_monthly',
    'month', 'is_multi_model_summary', 'model_long_name' and 'institution'.
    Fields that the file name does not carry are None, and the three
    ``is_*`` flags are always False.

    Raises ValueError if the SWL tag is not one of the four known values
    (and IndexError if the base name has fewer than three '.' pieces).
    """
    # Take the base name from the full path. Slicing off a fixed-length
    # './data/' prefix first would eat into the file name whenever the path
    # is rooted differently (e.g. 'data/...' or a bare file name).
    basename = f.split('/')[-1]
    exploded_filename = basename.split('.')

    # The first piece merges the variable prefix with the model taxonomy;
    # separate them at the first '-'.
    variable_prefix, _, model_taxonomy = exploded_filename[0].partition('-')
    variable = '_'.join([variable_prefix, exploded_filename[-2]])

    impact_tag = exploded_filename[2]

    tmp_swl = exploded_filename[1]
    swl_lookup = {'SWL_15': 1.5, 'SWL_2': 2.0, 'SWL_4': 4.0, 'SWL_6': 6.0}
    if tmp_swl not in swl_lookup:
        raise ValueError("Unknown SWL input {}".format(tmp_swl))
    swl_info = swl_lookup[tmp_swl]

    return {'swl_info': swl_info,
            'model_taxonomy': model_taxonomy,
            'impact_tag': impact_tag,
            'variable': variable,
            'model_short_name': None,
            'is_seasonal': False,
            'season': None,
            'is_monthly': False,
            'month': None,
            'is_multi_model_summary': False,
            'model_long_name': None,
            'institution': None,
            }


def process_csv_file(f, s, verbose=False):
    """Rewrite one UEA country-level CSV into the processed admin0 layout.

    Reads ``f``, takes each row's 'ISO3' code and the value in the file's
    last column (stored as 'mean'), adds the file-level metadata parsed from
    the file name, and looks up the country name in ``s``. The result is
    written to './processed/admin0/' + f[7:], creating directories as needed.

    f       -- path to the source CSV; it must have an 'ISO3' column. The
               first 7 characters are dropped to build the output path, which
               assumes the path starts with './data/'.
    s       -- table with 'iso' and 'name_0' columns (the shapefile data),
               used to translate ISO codes into country names.
    verbose -- if True, print progress and lookup warnings.

    The joint Serbia-Montenegro code 'SCG' is not written out as such: its
    value is emitted twice instead, once as Serbia ('SRB') and once as
    Montenegro ('MNE'). ISO codes with no match in ``s`` are kept with
    'name_0' set to None. Returns None.
    """
    keys = ['name_0', 'iso', 'variable', 'swl_info',
            'count', 'max', 'min', 'mean', 'std', 'impact_tag', 'institution',
            'model_long_name', 'model_short_name', 'model_taxonomy',
            'is_multi_model_summary', 'is_seasonal', 'season', 'is_monthly',
            'month']
    df = pd.read_csv(f)
    f_level_dic = get_metadata_from_UEA_csv(f)
    # The source files only carry a single value per country, so the other
    # summary statistics are left empty.
    no_stats = {'count': None, 'min': None, 'max': None, 'std': None}
    # Iterate over the CSV rows rather than the shapefile rows, because the
    # CSV contains ISO codes that are not present in the shapefile.
    mean_key = df.keys()[-1]
    SCG_value = None
    tmp_data = []
    for csv_iso, csv_value in zip(df['ISO3'], df[mean_key]):
        if csv_iso == 'SCG':
            # Remember the joint value and emit it below as SRB and MNE.
            # Writing the row here would give it no valid country name.
            if verbose: print("Found Serbia-Montenegro")
            SCG_value = csv_value
            continue
        matches = s[s['iso'] == csv_iso]
        if len(matches) != 1:
            if verbose: print(csv_iso, ' has unexpected size', matches)
        # Use the first matching shape; fall back to None when the ISO code
        # is unknown to the shapefile instead of failing on an empty lookup.
        name_0 = matches['name_0'].values[0] if len(matches) else None
        tmp_d = {**f_level_dic, **no_stats,
                 'iso': csv_iso, 'mean': csv_value, 'name_0': name_0}
        tmp_data.append([tmp_d[key] for key in keys])
    # Hack to get Serbia (SRB) and Montenegro (MNE) data into the project.
    # Compare against None so that a legitimate value of 0 is not dropped.
    if SCG_value is not None:
        if verbose: print("Engaging Serbia-Montenegro hack")
        for name_0, iso in (('Serbia', 'SRB'), ('Montenegro', 'MNE')):
            split_d = {**f_level_dic, **no_stats,
                       'name_0': name_0, 'iso': iso, 'mean': SCG_value}
            tmp_data.append([split_d[key] for key in keys])

    file_target = '/'.join(['./processed/admin0', f[7:]])  # <-- write target
    path_check = '/'.join(file_target.split('/')[:-1])
    # Write the extracted values to a CSV in ./processed/admin0 whose relative
    # path and file name match the source file.
    os.makedirs(path_check, exist_ok=True)
    tmp_df = pd.DataFrame(tmp_data, columns=keys)
    tmp_df.to_csv(file_target, index=False)
    if verbose: print("Generated ", file_target)
    return

#### Run

In [None]:
# Index every netCDF and CSV file under ./data; fs['csv'] drives the run below.
fs = identify_netcdf_and_csv_files('./data')

In [None]:
# Sanity check: show the first five CSV paths that were discovered.
for n, f in enumerate(fs['csv'][0:5]):
    print(n, f)

In [None]:
# Load the simplified admin-0 shapefile; process_csv_file uses its 'iso' and
# 'name_0' columns to translate ISO codes into country names.
s = gpd.read_file("./data/gadm28_adm0_simplified/gadm28_adm0_simplified.shp")

In [None]:
# Convert every discovered CSV; each call writes one file under ./processed/admin0.
for f in fs['csv']:
    print('working on', f)
    process_csv_file(f=f, s=s, verbose=False)