# Scripts to check the datasheets for mis-entries

In [30]:
import os
import pandas as pd
import numpy as np

# Configuration

Run this notebook twice: once with `model_type = "Text"` and once with `model_type = "Figure"`.

In [44]:
############# CHANGE HERE FOR YOUR PURPOSE ####################
# Folder that holds the input CSV datasheets, relative to this notebook.
# Built with os.path.join so the separator is correct on every platform and
# no backslash escape sequence is involved.
data_dir = os.path.join("..", "data")
model_type = "Figure" # Model type to check: Text or Figure
###############################################################

# Load Data

In [45]:
# Resolve the path of each input datasheet inside data_dir.
location_csv = os.path.join(data_dir, 'Location.csv')
model_csv = os.path.join(data_dir, f'ModelAnalysis_{model_type}.csv')
taxonomy_csv = os.path.join(data_dir, 'ProcessHierarchyNetwork.csv')
function_type_csv = os.path.join(data_dir, 'FunctionType.csv')

# Read the datasheets into dataframes.
df_loc = pd.read_csv(location_csv)  # The lat/lon should be pre-formatted in decimal units
df_model = pd.read_csv(model_csv)
df_taxonomy = pd.read_csv(taxonomy_csv)
df_FunctionType = pd.read_csv(function_type_csv)

Parse the dataframes into tables

In [46]:
# Location table: add a 1-based id per row and an empty (NaN)
# huc_watershed_id column.
df_loc = df_loc.assign(id=df_loc.index + 1, huc_watershed_id=np.nan)

# FunctionType table: add a 1-based id per row.
df_FunctionType = df_FunctionType.assign(id=df_FunctionType.index + 1)

# Citation table
# Where no attribution_url was entered, fall back to the entry's own url.
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on a column selection is a chained assignment that pandas deprecates and
# that leaves df_model untouched when Copy-on-Write is enabled.
df_model["attribution_url"] = df_model["attribution_url"].fillna(df_model["url"])

# One citation row per model entry, with a 1-based id derived from the row index.
df_citation = df_model[["citation", "url", "attribution", "attribution_url"]].copy()
df_citation["id"] = df_citation.index + 1

# Spatial zone types: distinct spatial_property values in order of first
# appearance, re-indexed from 0 and given a 1-based id.
df_spatialZoneType = (
    df_model["spatial_property"]
    .drop_duplicates()
    .to_frame()
    .reset_index(drop=True)
)
df_spatialZoneType['id'] = df_spatialZoneType.index + 1

# Temporal zone types: same treatment for temporal_property.
df_temporalZoneType = (
    df_model["temporal_property"]
    .drop_duplicates()
    .to_frame()
    .reset_index(drop=True)
)
df_temporalZoneType['id'] = df_temporalZoneType.index + 1

# Alternative name table
# Keep the taxonomy key columns in the index, split each remaining column's
# text on commas, and explode so every comma-separated item gets its own row.
taxonomy_keys = ['process', 'function', 'identifier', 'process_level']
df_altNames0 = (
    df_taxonomy
    .set_index(taxonomy_keys)
    .apply(lambda column: column.str.split(',').explode())
    .reset_index()
)

# Trim and capitalise each alternative name, then drop rows with missing values.
df_altNames = (
    df_altNames0[['alternative_names', 'process']]
    .assign(alternative_names=lambda table: table['alternative_names'].str.strip().str.capitalize())
    .dropna(axis=0)
    .copy()
)
# The id follows the surviving row index, so dropped rows leave gaps.
df_altNames["id"] = df_altNames.index + 1

# Model table
# Give each model entry a 1-based id; later cells read df_model['id'].
df_model["id"] = df_model.index + 1

# Columns that make up the main model table, in output order.
modelmain_columns = [
    'id', 'citation', 'watershed_name',
    'spatial_property', 'num_spatial_zones',
    'temporal_property', 'num_temporal_zones',
    'vegetation_info', 'soil_info', 'geol_info',
    'topo_info', 'three_d_info', 'uncertainty_info', 'other_info',
]
df_modelmain = df_model.loc[:, modelmain_columns].copy()

# LinkProcessPerceptual table
# Each model entry has up to 14 flux slots and 10 store slots. Every slot is a
# pair of columns: the original text ("flux3") and its taxonomy name
# ("flux3_taxonomy"). The slot names are generated rather than written out by
# hand, which rules out copy-paste typos in the column names.
slot_names = [f"flux{i}" for i in range(1, 15)] + [f"store{i}" for i in range(1, 11)]

# One frame per slot, all renamed to a common schema so they can be stacked.
frames = [
    df_model[['id', slot, f'{slot}_taxonomy']].copy().rename(
        columns={"id": "entry_id", slot: "original_text", f"{slot}_taxonomy": "process"})
    for slot in slot_names
]

# Stack the slots into one long table (flux1..flux14, then store1..store10)
# and give each row a 1-based id.
df_linkProcessPerceptual0 = pd.concat(frames, axis=0, ignore_index=True)
df_linkProcessPerceptual0["id"] = df_linkProcessPerceptual0.index + 1

# Taxonomy table: the process hierarchy without its alternative_names column.
df_process0 = df_taxonomy.drop('alternative_names', axis=1)

## Check process taxonomy - flux and store names

In [47]:
# Normalise process names on both sides (lower-case, surrounding whitespace
# removed) so the join ignores case and stray spaces.
df_linkProcessPerceptual0["process_lower"] = df_linkProcessPerceptual0['process'].str.lower().str.strip()
df_process0["process_lower"] = df_process0['process'].str.lower().str.strip()

# Left-join the model processes onto the taxonomy. Both tables have a 'process'
# column, so the merge yields process_x (model datasheet) and process_y (taxonomy).
# (Check here if you want to check process miscategorization.)
df_linkProcessPerceptual1 = df_linkProcessPerceptual0.merge(df_process0, on='process_lower', how='left')

# A process entered in the model datasheet with no taxonomy match.
has_model_process = df_linkProcessPerceptual1['process_x'].notna()
lacks_taxonomy_match = df_linkProcessPerceptual1['process_y'].isna()

# Keep one row per unmatched name. drop_duplicates returns a new frame here,
# which avoids modifying a .loc slice in place (SettingWithCopyWarning).
new_process = (
    df_linkProcessPerceptual1
    .loc[has_model_process & lacks_taxonomy_match]
    .drop_duplicates(subset='process_lower')
)
# new_process.to_excel(r'..\data\text_models_workspace\newprocess_v2.xlsx')

new_process

# This should return an empty dataframe if everything matches.
# If not, there are process names in 'ModelAnalysis_{model_type}.csv' that do not exist in 'ProcessHierarchyNetwork.csv'. Check the datasheet again.

Unnamed: 0,entry_id,original_text,process_x,id,process_lower,process_y,function,identifier,process_level


## Check match with location database

In [48]:
# Attach the location record to every model entry by watershed name.
# A left join keeps every model entry, matched or not.
df_linkLocation = df_model[['id', 'citation', 'watershed_name']].merge(
    df_loc, how='left', left_on='watershed_name', right_on='name')

# Entries with no watershed name, or whose name has no location record.
missing_watershed = df_linkLocation['watershed_name'].isna()
missing_location = df_linkLocation['name'].isna()
df_linkLocation.loc[missing_watershed | missing_location]

# This should return an empty dataframe if everything matches.
# If not, there is a watershed name in 'ModelAnalysis_{model_type}.csv' that does not exist in 'Location.csv'. Check the datasheet again.

Unnamed: 0,id_x,citation,watershed_name,name,country,lat,lon,area_km2,id_y,huc_watershed_id
130,131,,,,,,,,,


## Check watershed attributes

In [49]:
# Distinct vegetation_info labels in the datasheet; scan for typos or unexpected categories.
pd.unique(df_model['vegetation_info'])

array(['N', 'Cropland described', 'Vegetation described',
       'Forest described', 'Seasonal change discussed',
       'Vegetation icons', 'Vegetation types described', nan],
      dtype=object)

In [37]:
# Distinct soil_info labels in the datasheet; scan for typos or unexpected categories.
pd.unique(df_model['soil_info'])

array(['N', 'Soil described', 'Soil types described',
       'Soil texture described', 'Soil hydraulic properties described',
       'Multiple properties described', 'Horizons described'],
      dtype=object)

In [38]:
# Distinct geol_info labels in the datasheet; scan for typos or unexpected categories.
pd.unique(df_model['geol_info'])

array(['N', 'Geology described', 'Glacier described', 'Karst described',
       'Bedrock described'], dtype=object)

In [39]:
# Distinct topo_info labels in the datasheet; scan for typos or unexpected categories.
pd.unique(df_model['topo_info'])

array(['N', 'Slopes described', 'Topography described'], dtype=object)

In [40]:
# Distinct uncertainty_info labels in the datasheet; scan for typos or unexpected categories.
pd.unique(df_model['uncertainty_info'])

array(['N', 'Multiple interpretations demonstrated',
       'Limitations discussed', 'Uncertainty described',
       'Unknown items identified'], dtype=object)

In [41]:
# Distinct other_info labels in the datasheet; scan for typos or unexpected categories.
pd.unique(df_model['other_info'])

array(['N', 'Impact of root activity described', 'salinity study'],
      dtype=object)

In [42]:
# Distinct spatial_property labels in the datasheet; scan for typos or unexpected categories.
pd.unique(df_model['spatial_property'])

array(['Hillslope position', 'N', 'Land use / Land cover',
       'Soil or Geology', 'Catchment spatial scale', 'Process',
       'Hillslope position/Catchment spatial scale', 'Topography',
       'Multiple catchments', 'Tributary',
       'Land cover / Hillslope position'], dtype=object)

In [43]:
# Distinct temporal_property labels in the datasheet; scan for typos or unexpected categories.
pd.unique(df_model['temporal_property'])

array(['N', 'Wetness and event', 'Season', 'Event', 'Season and event',
       'Rainfall intensity', 'Season and rainfall intensity', 'Wetness',
       'Season with snow', 'Interannual', 'Event and rainfall intensity',
       'Season and rainfall intensity with snow', 'Season and wetness',
       nan], dtype=object)