In [None]:
import tomllib
import pandas as pd

from pathlib import Path

# Data Validation

## The validation configuration

*Parameters:*

In [None]:
# Location of the validation schema, resolved relative to the parent of the
# current working directory.
validation_path = Path.cwd().parent.joinpath(
    "data", "metadata_test", "_validation_schema_v2.toml"
)

*Code:*

In [None]:
from validate import read_toml
# Read the validation configuration from the file referenced by `validation_path`.
validation = read_toml(validation_path)
# Bare last expression so the notebook displays the loaded configuration.
validation

## Verify data structure in project folder

The project folder should have the following structure:

├── data

│ ├── environment.csv

│ ├── environment_events.csv

│ ├── hosts.csv

│ └── host_events.csv

*Parameters*

In [None]:
# Folder containing the project's CSV files, resolved relative to the parent
# of the current working directory.
proj_data_path = Path.cwd().parent.joinpath("data", "test_data")
# Delimiter handed to the CSV reader.
sep = ","

*Code*

In [None]:
from validate import read_csv_files
# Read the project's CSV files from `proj_data_path`, passing `sep` as the delimiter.
# NOTE(review): later cells treat `data` as a mapping of file name -> dataframe
# (they call `.keys()` / `.items()` on it) — confirm against `read_csv_files`.
data = read_csv_files(proj_data_path, sep)
# Show which entries were loaded.
print(data.keys())

## Check column names in each file

In [None]:
from validation_utils import check_column_exists
# Check the column names of every loaded file against the validation configuration.
# NOTE(review): the message below presumes the helper returns (file, column name)
# entries for missing columns — confirm against `validation_utils`.
result = check_column_exists(data, validation)
print(f"Missing columns (file, column name):\n {result}")

## Find columns which are empty

In [None]:
from validation_utils import find_empty_columns
# Look for empty columns in the loaded files; this check does not use the
# validation configuration.
# NOTE(review): the message below presumes (file, column name) entries — confirm
# the return shape in `validation_utils`.
result = find_empty_columns(data)
print(f"Empty columns (file, column name):\n {result}")

## Check column types

Check whether a column has the expected type(s) defined in the configuration file. 

In [None]:
from validation_utils import check_column_types
# Compare each column's type with the expected type(s) from the validation configuration.
# NOTE(review): the message below presumes (file, column name, type, expected type)
# entries — confirm the return shape in `validation_utils`.
result = check_column_types(data, validation)
print(f"Column type check (file, column name, type, expected type):\n {result}")

## Columns with categorical values

The validation file also contains information on categorical values. Check if columns for which categorical values are defined really only contain those values.

In [None]:
from validation_utils import check_column_values
# Check columns with categorical values defined in the validation configuration
# for values outside the defined set.
# NOTE(review): the message below presumes (file, column name, undefined values)
# entries — confirm the return shape in `validation_utils`.
result = check_column_values(data, validation)
print(f"Undefined categorical values (file, column name, undefined values):\n {result}")

## Identifier columns

In the current setup, there are two identifier columns, which can occur in any of the files:

In [None]:
# Names of the identifier columns passed to the identifier checks.
ids = [
    "host_id",
    "environment_id",
]

We expect those columns to **not contain any blank cells**.

The values for the identifiers are defined in the respective columns in *hosts.csv* and *environment.csv*. In those files, the identifier columns must contain unique values, i.e. the length of the column equals the number of unique values found in it.

If the output of the cell below is `True`, then all checks have passed successfully.

In [None]:
from validation_utils import identifier_checks
# Run the identifier checks for the columns listed in `ids` on the loaded files.
# Bare last expression so the notebook displays the result; per the text above,
# `True` means all checks passed.
identifier_checks(data, ids)

## Dependencies between columns

Some columns depend on each other. For example, if a row has a value in one column, we also expect values in the related columns. Those *column dependencies* are defined in the respective section of the configuration file.

In [None]:
# Display the column-dependency section of the validation configuration.
validation["column_dependencies"]

### Check if columns of a cluster are present and contain values

The code below checks the following:
when all columns defined in *event* are present in a dataframe,
we also expect all columns of *measurement*, *inoculation* and *treatment*
to be present in the same dataframe.

In [None]:
# Only for formatting output
# Used only to label output.
# NOTE(review): dropping the first entry presumes the first section of
# `column_dependencies` is "event" — confirm against the key order in the TOML file.
dependency_names = list(validation["column_dependencies"])[1:]
print(dependency_names)

In [None]:
# When a file contains every column of the "event" cluster, it must also contain
# every column of the clusters listed here.
dependent_clusters = ("measurement", "inoculation", "treatment")
column_dependencies = validation["column_dependencies"]
event_columns = set(column_dependencies["event"])

for data_name, df in data.items():
    # Files without the complete set of event columns are not subject to this check.
    if not event_columns.issubset(df.columns):
        continue
    # Report the clusters by their own names rather than by position in the
    # configuration, so the message stays correct whatever the TOML key order is
    # and the cell does not depend on a variable built in another cell.
    missing_cols = [
        cluster
        for cluster in dependent_clusters
        if not set(column_dependencies[cluster]).issubset(df.columns)
    ]
    if missing_cols:
        print(f"{data_name}: Need also information on {missing_cols}")

### Check values in a cluster of columns

For each set of columns defined in the *column dependencies*, we assume that, within a row, either all of them are empty or all of them carry a value.

In [None]:
from validation_utils import check_column_clusters

# Run the cluster check on each loaded file and report only files with findings.
for data_name, df in data.items():
    result = check_column_clusters(validation["column_dependencies"], df)
    # Nothing to report for this file.
    if len(result) == 0:
        continue
    print(f"Missing values in {data_name}:")
    print(f"Row index, column name:\n {result}")