In [1]:
import toml
import pandas as pd
from pathlib import Path

# Data Validation

## The validation configuration

*Parameters:*

In [2]:
# Location of the TOML validation schema, resolved relative to the parent of
# the current working directory.
validation_path = Path.cwd().parent.joinpath(
    "data", "metadata_test", "_validation_schema_v2.toml"
)

*Code:*

In [3]:
# Parse the schema file into a nested dict. The bare name on the last line
# makes the notebook display the parsed result.
validation = toml.loads(validation_path.read_text(encoding="utf-8"))
validation

{'metadata_keys': ['type', 'format', 'values'],
 'hosts': {'host_id': {'type': 'string', 'format': 'AA0_00000'},
  'host_groupNumber': {'type': 'integer'},
  'host_sex': {'type': 'string', 'format': 'A', 'values': ['M', 'F']},
  'host_age': {'type': 'integer'},
  'host_death': {'type': 'integer'},
  'host_species': {'type': 'string'},
  'host_breed': {'type': 'string'}},
 'events': {'host_id': {'type': 'string', 'format': 'AA0_00000'},
  'event_day': {'type': 'integer'},
  'event_time': {'type': 'string', 'format': 'HH:MM'},
  'event_type': {'type': 'string',
   'values': ['measurement', 'inoculation', 'treatment']},
  'measurement_type': {'type': 'string'},
  'measurement_quantity': {'type': 'float'},
  'measurement_unit': {'type': 'string'},
  'inoculation_type': {'type': 'string'},
  'inoculation_pathogen': {'type': 'string'},
  'inoculation_dose': {'type': 'float'},
  'inoculation_unit': {'type': 'string'},
  'treatment_type': {'type': 'string'},
  'treatment_dose': {'type': 'float

## Verify data structure in project folder

The project folder should have the following structure:

├── data

│ ├── environment.csv

│ ├── environment_events.csv

│ ├── hosts.csv

│ └── host_events.csv

*Parameters*

In [4]:
# Folder holding the project's CSV files, resolved relative to the parent of
# the current working directory.
proj_data_path = Path.cwd().parent.joinpath("data", "synthetic_data")

*Code*

In [5]:
# The four CSV files every project data folder must contain. This order also
# fixes the order of `files`, independent of filesystem listing order.
required_files = ["environment_events.csv", "environment.csv", "host_events.csv", "hosts.csv"]

# Look only directly inside `proj_data_path`: the files are read from exactly
# that location afterwards, so a same-named file in a subfolder must neither
# count as "found" nor produce a duplicate entry.
files = [name for name in required_files if (proj_data_path / name).is_file()]

# Fail with a message that names what is missing instead of a bare assert.
missing_files = [name for name in required_files if name not in files]
assert not missing_files, f"Missing required files in {proj_data_path}: {missing_files}"
print(f"Found files: {files}")

Found files: ['environment_events.csv', 'environment.csv', 'host_events.csv', 'hosts.csv']


If all files are found, read them in as pandas dataframes:

In [6]:
# Load each discovered CSV (semicolon-separated) into a dict keyed by file name.
data = {}
for file_name in files:
    csv_path = proj_data_path / file_name
    try:
        data[file_name] = pd.read_csv(csv_path, sep=';')
    except pd.errors.EmptyDataError:
        # An empty file is reported and left out of `data`.
        print(f"File is empty: {file_name}")

# Every file must have been loaded, in the same order as `files`; an empty
# file therefore makes this assertion fail.
# NOTE(review): the original author's note below only names environment_events;
# presumably that file can be empty — TODO confirm.
loaded_names = list(data)
assert loaded_names == files # NOTE: environment_events
print(data.keys())

dict_keys(['environment_events.csv', 'environment.csv', 'host_events.csv', 'hosts.csv'])


## Check column names and types in each file

### Check individual files according to section in validation file

In [7]:
from validation_utils import *

SyntaxError: f-string: unmatched '[' (validation_utils.py, line 27)

In [None]:
# Run the column-existence check on the loaded files against the schema.
# NOTE(review): the helper is not defined in this notebook (presumably it comes
# from the validation_utils star import); judging by its name it checks that the
# columns listed in the schema are present in each file — confirm its contract.
check_column_exists(data, validation)

In [None]:
# Run the column-type check on the loaded files against the schema.
# NOTE(review): the helper is not defined in this notebook (presumably it comes
# from the validation_utils star import); judging by its name it compares each
# column with the schema's 'type' entry — confirm its contract.
check_column_types(data, validation)

### Check that identifiers `host_id` and `environment_id` are defined

### Are identifiers unique?

In [None]:
# Distinct identifier values per id column, filled in cell by cell.
ids = {}

host_id_column = data["hosts.csv"]["host_id"]
ids["host_id"] = host_id_column.unique()
# Fewer distinct values than rows means at least one id occurs more than once.
if len(ids["host_id"]) != len(host_id_column):
    print("hosts.csv: Column host_id contains duplicates.")
    assert False

In [None]:
environment_id_column = data["environment.csv"]["environment_id"]
ids["environment_id"] = environment_id_column.unique()
# Fewer distinct values than rows means at least one id occurs more than once.
if len(ids["environment_id"]) != len(environment_id_column):
    print("environment.csv: Column environment_id contains duplicates.")
    assert False

### Are there None values in the identifier columns?

In [None]:
# Count missing values on the raw column: `ids["host_id"]` holds only distinct
# values, so any number of empty cells would collapse to at most one there.
n_missing_host_ids = int(data["hosts.csv"]["host_id"].isna().sum())
if n_missing_host_ids:
    print(f"hosts.csv: There are {n_missing_host_ids} empty cells in host_id.")

In [None]:
# Count missing values on the raw column: `ids["environment_id"]` holds only
# distinct values, so any number of empty cells would collapse to at most one.
n_missing_environment_ids = int(data["environment.csv"]["environment_id"].isna().sum())
if n_missing_environment_ids:
    # The count is interpolated here; the original f-string lacked the braces
    # and printed the expression text instead of its value.
    print(f"environment.csv: There are {n_missing_environment_ids} empty cells in environment_id.")

### Are all identifiers in the other csv files defined?

In [None]:
# For every identifier collected in `ids`, each file that has a column of that
# name may only use values that appear among the collected ids.
for id_type, defined_ids in ids.items():
    for data_name, frame in data.items():
        if id_type not in frame.columns:
            continue
        # Values used in this file that are not among the defined ids.
        undefined_ids = set(frame[id_type]).difference(defined_ids)
        if undefined_ids:
            print(f"File {data_name} contains undefined ids in column {id_type}:")
            print(undefined_ids)

## Dependencies between columns

*Parameters:*

Columns that belong together: if one is set, the others also need to be set.

In [None]:
# Column clusters, keyed by cluster name; each value lists the columns that
# belong to that cluster.
cols = {
    "event": ["event_day", "event_time", "event_type"],
    "measurement": ["measurement_type", "measurement_quantity", "measurement_unit"],
    "inoculation": [
        "inoculation_type",
        "inoculation_pathogen",
        "inoculation_dose",
        "inoculation_unit",
    ],
    "treatment": ["treatment_type", "treatment_dose", "treatment_unit"],
}

### Check for None/NA values in clusters of columns

In [None]:
# Print, per file, every entry returned by check_column_clusters.
# NOTE(review): each entry is indexed below as [0] = positional row index and
# [1] = column selection; presumably the columns of the offending cluster —
# confirm against the helper's definition.
for data_name, frame in data.items():
    violations = check_column_clusters(cols, frame)
    if len(violations) == 0:
        continue
    print(f"{data_name}: Invalid entries found:")
    for violation in violations:
        row_pos, column_selection = violation[0], violation[1]
        offending_columns = frame[column_selection]
        # Double brackets keep the row as a one-row frame for display.
        print(f"\t Line {row_pos}: {offending_columns.iloc[[row_pos]]}")

## Dependencies between clusters of columns

In [None]:
# A file that carries all event columns must also carry the complete column
# set of at least one of the measurement, inoculation or treatment clusters.
for data_name, frame in data.items():
    present_columns = set(frame.columns)
    if not present_columns.issuperset(cols["event"]):
        # Not an event file: nothing to check here.
        continue
    has_detail_cluster = any(
        present_columns.issuperset(cols[cluster])
        for cluster in ("measurement", "inoculation", "treatment")
    )
    if not has_detail_cluster:
        print(f"{data_name}: Need also information on either of measurement, inoculation or treatment")