In [611]:
import tomllib
import pandas as pd

from pathlib import Path

# Data Validation

## The validation configuration

*Parameters:*

In [616]:
validation_path = Path.cwd().parent / "data" / "metadata_test" / "_validation_schema_v2.toml"

*Code:*

In [619]:
from validate import read_toml
validation = read_toml(validation_path)
validation

[93mINFO reading validation file: C:\Users\lager008\Documents\GitHub\autumn-fair\data\metadata_test\_validation_schema_v2.toml.[0m


{'metadata_keys': ['type', 'format', 'values'],
 'hosts': {'host_id': {'type': ['string', 'integer']},
  'host_groupNumber': {'type': 'integer'},
  'host_sex': {'type': 'string', 'format': 'A', 'values': ['M', 'F']},
  'host_age': {'type': 'integer'},
  'host_death': {'type': 'integer'},
  'host_species': {'type': 'string'},
  'host_breed': {'type': 'string'},
  'host_age_unit': {'type': 'string',
   'values': ['H', 'D', 'W', 'M', 'Y'],
   'format': 'A'},
  'host_round': {'type': 'integer'}},
 'events': {'host_id': {'type': ['string', 'integer']},
  'event_day': {'type': 'integer'},
  'event_time': {'type': 'string', 'format': 'HH:MM'},
  'event_type': {'type': 'string', 'values': ['M', 'I', 'T']},
  'measurement_type': {'type': 'string'},
  'measurement_quantity': {'type': 'float'},
  'measurement_unit': {'type': 'string'},
  'inoculation_type': {'type': 'string'},
  'inoculation_pathogen': {'type': 'string'},
  'inoculation_dose': {'type': 'float'},
  'inoculation_unit': {'type': 'st

## Verify data structure in project folder

The project folder should have the following structure:

├── data

│ ├── environment.csv

│ ├── environment_events.csv

│ ├── hosts.csv

│ └── host_events.csv

*Parameters*

In [624]:
proj_data_path = Path.cwd().parent / "data" / "synthetic_data_2"
sep = ","

proj_data_path = Path.cwd().parent / "data" / "example_study2"
sep = ";"

*Code*

In [627]:
from validate import read_csv_files
data = read_csv_files(proj_data_path, sep)
print(data.keys())

dict_keys(['environment.csv', 'environment_events.csv', 'hosts.csv', 'host_events.csv'])


## Check column names in each file

In [630]:
from validation_utils import check_column_exists
result = check_column_exists(data, validation)
print(f"Missing columns (file, column name):\n {result}")

Missing columns (file, column name):
 []


## Find columns which are empty

In [633]:
from validation_utils import find_empty_columns
result = find_empty_columns(data)
print(f"Empty columns (file, column name):\n {result}")

Empty columns (file, column name):
 [('host_events.csv', 'inoculation_type'), ('host_events.csv', 'inoculation_pathogen'), ('host_events.csv', 'inoculation_dose'), ('host_events.csv', 'inoculation_unit'), ('host_events.csv', 'inoculation_outcome')]


## Check column types

Check whether a column has the expected type(s) defined in the configuration file. 

In [637]:
from validation_utils import check_column_types
result = check_column_types(data, validation)
print(f"Column type check (file, column name, type, expected type):\n {result}")

Column type check (file, column name, type, expected type):
 [('host_events.csv', 'inoculation_type', 'float64', 'string'), ('host_events.csv', 'inoculation_pathogen', 'float64', 'string'), ('host_events.csv', 'inoculation_unit', 'float64', 'string'), ('host_events.csv', 'measurement_outcome', 'float64', 'integer'), ('host_events.csv', 'treatment_outcome', 'float64', 'integer'), ('host_events.csv', 'inoculation_outcome', 'float64', 'integer')]


In [None]:
#### This check gives back that boolean values are float instead of integer. Plus there is still the problem about the "missing" inoculation values. 

## Columns with categorical values

The validation file also contains information on categorical values. Check if columns for which categorical values are defined really only contain those values.

In [585]:
from validation_utils import check_column_values
result = check_column_values(data, validation)
print(f"Undefined categorical values (file, column name, undefined values):\n {result}")

Undefined categorical values (file, column name, undefined values):
 [('host_events.csv', 'measurement_outcome', {nan}), ('host_events.csv', 'treatment_outcome', {0.0, nan}), ('host_events.csv', 'inoculation_outcome', {nan})]


## Identifier columns

In the current set up we have two identifier columns which can occurr in all of the files:

In [589]:
ids = ["host_id", "environment_id"]

We expect those columns to **not contain any blank cells**.

The values for the dientifiers are defined in the respcetive columns in *host.csv* and *environment.csv*. In those files the columns must conatin unique values, i.e. the length of the column is equal to the number of unique values found in the column.

If the output of the cell below is `True`, then all checks have passed successfully.

In [592]:
from validation_utils import identifier_checks
identifier_checks(data, ids)

True

## Depedencies between columns

Some columns have dependencies between each other. E.g. if in a row a value is set in one columns, we also expect values in the other columns. Those *column dependencies* are defined in the respective section of the configuration file.

In [596]:
validation["column_dependencies"]

{'event': ['event_day', 'event_time', 'event_type'],
 'measurement': ['measurement_type',
  'measurement_quantity',
  'measurement_unit',
  'measurement_outcome'],
 'inoculation': ['inoculation_type',
  'inoculation_pathogen',
  'inoculation_dose',
  'inoculation_unit',
  'inoculation_outcome'],
 'treatment': ['treatment_type',
  'treatment_dose',
  'treatment_unit',
  'treatment_outcome']}

### Check if columns of a cluster are present and contain values

The code below checks:
When the columns defined in *event* are all present in the dataframe,
then we also expect all columns of *measurement*, *inoculation* and *treatment*
to be present in the same dataframe

In [600]:
# Only for formatting out put
dependency_names = list(validation["column_dependencies"].keys())[1:]
print(dependency_names)

['measurement', 'inoculation', 'treatment']


In [602]:
for data_name, df in data.items():
    if set(validation["column_dependencies"]["event"]).issubset(df.columns):
        measure = set(validation["column_dependencies"]["measurement"]).issubset(df.columns)
        inoc = set(validation["column_dependencies"]["inoculation"]).issubset(df.columns)
        treat = set(validation["column_dependencies"]["treatment"]).issubset(df.columns)
        res = [i for i, val in enumerate([measure, inoc, treat]) if not val]
        if len(res) > 0:
            missing_cols = [dependency_names[idx] for idx in res]
            print(f"{data_name}: Need also information on {missing_cols}")
data.items()
df.columns

environment_events.csv: Need also information on ['inoculation', 'treatment']


Index(['host_id', 'event_day', 'event_time', 'event_type', 'measurement_type',
       'measurement_quantity', 'measurement_outcome', 'measurement_unit',
       'inoculation_type', 'inoculation_pathogen', 'inoculation_dose',
       'inoculation_unit', 'inoculation_outcome', 'treatment_type',
       'treatment_dose', 'treatment_unit', 'treatment_outcome'],
      dtype='object')

### The check above is correct for the "host_event.csv" however in the "environment_event.csv" there are also columns on measurement. 
### Hence the check is needed for the first file but not for the second one. We have the change this check.  

### Check values in a cluster of columns

For each of the set of columns defined in the *dependencies* we assume that they are all empty per row or they all carry a value.

In [607]:
from validation_utils import check_column_clusters
for data_name, df in data.items():
    result = check_column_clusters(validation["column_dependencies"], df)
    if len(result) > 0:
        print(f"Missing values in {data_name}:")
        print(f"Row index, column name:\n {result}")