# Scripts to check the datasheets for mis-entries

In [19]:
import os
import pandas as pd
import numpy as np

# Configuration

In [20]:
# Folder that holds the input workbooks.
# Written as a raw string so the Windows backslashes are taken literally:
# sequences such as "\S" or "\d" are invalid escapes in a normal string literal
# and make modern Python emit a warning. No f-prefix is needed because nothing
# is interpolated into the path.
data_dir = r"G:\Shared drives\Perceptual model review\ForRyoko\data"
model_type = "Figure" # Model type to check: Text or Figure

In [21]:
# Resolve the path of each input workbook inside data_dir first, then load them.
location_xlsx = os.path.join(data_dir, 'Location_formatted.xlsx')  # The lat/lon should be pre-formatted in decimal units
model_xlsx = os.path.join(data_dir, f'ModelAnalysis_{model_type}.xlsx')  # one workbook per model_type
taxonomy_xlsx = os.path.join(data_dir, 'ProcessHierarchyNetwork.xlsx')
function_type_xlsx = os.path.join(data_dir, 'FunctionType.xlsx')

df_loc = pd.read_excel(location_xlsx)
df_model = pd.read_excel(model_xlsx)
df_taxonomy = pd.read_excel(taxonomy_xlsx)
df_FunctionType = pd.read_excel(function_type_xlsx)

# Read Data

In [22]:
## Parse dataframe into table

# Location table
# Sanity check if data can be joined
# df.set_index('key').join(other.set_index('key'))
# 1-based id derived from the row index.
df_loc["id"] = df_loc.index + 1
# Drop the 'Unnamed: 0' column (presumably an index column saved into the
# workbook -- TODO confirm). errors="ignore" keeps this cell re-runnable: on a
# second run the column is already gone and a plain drop would raise KeyError.
df_loc = df_loc.drop(columns='Unnamed: 0', errors="ignore")
# Placeholder column, filled with NaN here.
df_loc["huc_watershed_id"] = np.nan

# FunctionType table
# 1-based id derived from the row index.
df_FunctionType["id"] = df_FunctionType.index + 1

# Citation table
# Where no attribution URL was entered, fall back to the entry's own URL.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form
# emits a warning in recent pandas releases and does not update df_model at all
# when Copy-on-Write is enabled.
df_model["attribution_url"] = df_model["attribution_url"].fillna(df_model["url"])

# One citation row per model entry, with a 1-based id derived from the row index.
df_citation = df_model[["citation", "url", "attribution", "attribution_url"]].copy()
df_citation["id"] = df_citation.index + 1

# Spatial and temporal zone tables
# Each lookup table holds the distinct values of one property column of
# df_model, re-indexed from 0, plus a 1-based id derived from that new index.
df_spatialZoneType = (
    df_model["spatial_property"]
    .drop_duplicates()
    .to_frame()
    .reset_index(drop=True)
)
df_spatialZoneType['id'] = df_spatialZoneType.index + 1

df_temporalZoneType = (
    df_model["temporal_property"]
    .drop_duplicates()
    .to_frame()
    .reset_index(drop=True)
)
df_temporalZoneType['id'] = df_temporalZoneType.index + 1

# Alternative name table
# Split the comma-separated alternative names so that each name gets its own
# row, keeping the taxonomy key columns alongside it.
df_altNames0 = (
    df_taxonomy
    .set_index(['process', 'function', 'identifier', 'process_level'])
    .apply(lambda col: col.str.split(',').explode())
    .reset_index()
)

# Tidy each name (trim surrounding whitespace, then capitalize), drop rows with
# missing values, and add an id taken from the remaining row index + 1.
tidy_names = df_altNames0['alternative_names'].str.strip().str.capitalize()
df_altNames = (
    df_altNames0[['alternative_names', 'process']]
    .assign(alternative_names=tidy_names)
    .dropna(axis=0)
    .assign(id=lambda frame: frame.index + 1)
)

# Model table
# 1-based id derived from the row index; the tables built below refer to it.
df_model["id"] = df_model.index + 1

# Columns that make up the main model table, in output order.
modelmain_columns = [
    'id', 'citation', 'watershed_name',
    'spatial_property', 'num_spatial_zones',
    'temporal_property', 'num_temporal_zones',
    'vegetation_info', 'soil_info', 'geol_info',
    'topo_info', 'three_d_info', 'uncertainty_info', 'other_info',
]
df_modelmain = df_model.loc[:, modelmain_columns].copy()

# LinkProcessPerceptual table
# Get all the process original text and taxonomy name from model.
# The datasheet stores one (<slot>, <slot>_taxonomy) column pair per slot:
# flux1..flux14 followed by store1..store10. Each pair is reshaped to the
# common layout (entry_id, original_text, process) and the pieces are stacked
# in that slot order. Generating the slot names replaces 24 copy-pasted
# select/rename expressions, so a typo in a single slot cannot slip in.
N_FLUX_SLOTS = 14
N_STORE_SLOTS = 10
slot_names = (
    [f"flux{i}" for i in range(1, N_FLUX_SLOTS + 1)]
    + [f"store{i}" for i in range(1, N_STORE_SLOTS + 1)]
)

frames = [
    df_model[['id', slot, f'{slot}_taxonomy']].rename(
        columns={"id": "entry_id", slot: "original_text", f"{slot}_taxonomy": "process"})
    for slot in slot_names
]

df_linkProcessPerceptual0 = pd.concat(frames, axis=0, ignore_index=True)
# 1-based id over the stacked rows.
df_linkProcessPerceptual0["id"] = df_linkProcessPerceptual0.index + 1

# Create taxonomy table
# Same content as df_taxonomy without the 'alternative_names' column.
df_process0 = df_taxonomy.drop(columns='alternative_names')

## Check process taxonomy - flux and store names

In [23]:

# join process taxonomy and model table
# Normalise the process names on both sides (lower-case, surrounding whitespace
# removed) so that case or spacing differences alone are not reported.
df_linkProcessPerceptual0["process_lower"] = df_linkProcessPerceptual0['process'].str.lower().str.strip()
df_process0["process_lower"] = df_process0['process'].str.lower().str.strip()

# find and add some new process from model table to taxonomy table (# Check here if you want to check process miscategorization)
df_linkProcessPerceptual1 = df_linkProcessPerceptual0.merge(df_process0, on='process_lower', how='left')

# Rows where the datasheet names a process (process_x) that found no partner in
# the taxonomy (process_y is missing after the left merge).
named_in_model = df_linkProcessPerceptual1['process_x'].notna()
absent_from_taxonomy = df_linkProcessPerceptual1['process_y'].isna()

# Chain drop_duplicates instead of calling it with inplace=True on the .loc
# selection, which raises SettingWithCopyWarning.
new_process = (
    df_linkProcessPerceptual1
    .loc[named_in_model & absent_from_taxonomy]
    .drop_duplicates(subset='process_lower')
)
# new_process.to_excel(r'..\data\text_models_workspace\newprocess_v2.xlsx')

new_process

# An empty DataFrame means every process name in the datasheet matched the taxonomy.
# Otherwise, go back to the datasheet and correct the entries listed above.

Unnamed: 0,entry_id,original_text,process_x,id,process_lower,process_y,function,identifier,process_level


## Check match with location database

In [24]:
# join location and model table
# Left merge keeps every model entry; entries whose watershed_name has no
# identical 'name' in the location table come back with the location columns empty.
model_watersheds = df_model[['id', 'citation', 'watershed_name']]
df_linkLocation = model_watersheds.merge(
    df_loc, how='left', left_on='watershed_name', right_on='name')

# Show the entries with a missing watershed name or without a location match.
no_watershed_name = df_linkLocation['watershed_name'].isnull()
no_location_match = df_linkLocation['name'].isnull()
df_linkLocation.loc[no_watershed_name | no_location_match]

# An empty DataFrame means every watershed name matched an entry in the location table.
# Otherwise, go back to the datasheet and correct the entries listed above.

Unnamed: 0,id_x,citation,watershed_name,name,lat,lon,area_km2,id_y,huc_watershed_id


## Properties

In [25]:
# Distinct labels entered in vegetation_info; scan the list for typos or inconsistent wording.
df_model['vegetation_info'].unique()

array(['N', 'Cropland described', 'Vegetation described',
       'Forest described', 'Seasonal change discussed',
       'Vegetation icons', 'Vegetation types described'], dtype=object)

In [26]:
# Distinct labels entered in soil_info; scan the list for typos or inconsistent wording.
df_model['soil_info'].unique()

array(['N', 'Horizons described', 'Soil types described'], dtype=object)

In [27]:
# Distinct labels entered in geol_info; scan the list for typos or inconsistent wording.
df_model['geol_info'].unique()

array(['N', 'Geology described', 'Glacier described', 'Bedrock described'],
      dtype=object)

In [28]:
# Distinct labels entered in topo_info; scan the list for typos or inconsistent wording.
df_model['topo_info'].unique()

array(['Slopes described', 'N', 'Topography described', 'Scale bar shown'],
      dtype=object)

In [29]:
# Distinct labels entered in uncertainty_info; scan the list for typos or inconsistent wording.
df_model['uncertainty_info'].unique()

array(['N', 'Unknown items identified', 'Uncertainty described'],
      dtype=object)

In [30]:
# Distinct entries in other_info; scan the list for typos or near-duplicate notes.
df_model['other_info'].unique()

array(['Soil properties (surface sealing, cracking)',
       'This is a longitudinal profile, average transit time given, scale given',
       'Riparian area only',
       'Show cross-section of water table slopes/depths for wet and dry seasons',
       'Reaction fronts shown', nan,
       'Joints/faults show, approximate scale given in caption',
       'Soil clay content in caption, % streamflow contribution in legend, open boundary noted, extra notes shown on figures',
       'The sources (=stores) and flow paths are shown as separate pictures. The diffuse groundwater flows are shown as being from shallow and deep aquifers, but are described as being "flows through the soil matrix and macropores"',
       'Inconsistencies betweeen picture and legend, wiggly arrows meaning unknown, many unlabelled arrows',
       'Arrows show water table rise/fall; indicates which storages have variable storage',
       'Describes water sources as event water, shallow gw, deep gw',
       'Focused on 

In [31]:
# Distinct labels entered in spatial_property; scan the list for typos or inconsistent wording.
df_model['spatial_property'].unique()

array(['N', 'Hillslope position', 'Catchment spatial scale', 'Topography',
       'Aspects', 'Soil or Geology', 'Process', 'Land use / Land cover'],
      dtype=object)

In [32]:
# Distinct labels entered in temporal_property; scan the list for typos or inconsistent wording.
df_model['temporal_property'].unique()

array(['Season', 'N', 'Rainfall intensity', 'Season with snow', 'Wetness',
       'Event', 'Season and wetness', 'Interannual'], dtype=object)

**TODO:** Write code to check that the Taxonomy entries are not empty.