In [None]:
import toml
import pandas as pd
from pathlib import Path

# Data Validation

## The validation configuration

*Parameters:*

In [None]:
# Location of the TOML validation schema, resolved relative to the parent of
# the current working directory.
validation_path = Path.cwd().parent.joinpath("data", "metadata_test", "_validation_schema_v2.toml")

*Code:*

In [None]:
# Parse the TOML validation schema; the bare name on the last line makes the
# notebook display the parsed configuration.
with validation_path.open(encoding="utf-8") as schema_file:
    validation = toml.load(schema_file)
validation

## Verify data structure in project folder

The project folder should have the following structure:

├── data

│ ├── environment.csv

│ ├── environment_events.csv

│ ├── hosts.csv

│ └── host_events.csv

*Parameters*

In [None]:
# Folder that is searched for the project's CSV files, resolved relative to
# the parent of the current working directory.
proj_data_path = Path.cwd().parent.joinpath("data", "synthetic_data")

*Code*

In [None]:
# Collect the names of all CSV files found below the project data folder.
# The result is sorted because glob order depends on the file system, and the
# later cells iterate over `files` in this order.
files = sorted(f.name for f in proj_data_path.glob('**/*.csv') if f.is_file())
print(f"Found files: {files}")

# Every required file must be present. All missing files are reported at once
# instead of stopping at the first bare assertion without a message.
required_files = ["environment.csv", "environment_events.csv", "hosts.csv", "host_events.csv"]
missing_files = [name for name in required_files if name not in files]
assert not missing_files, f"Missing required files in {proj_data_path}: {missing_files}"

If all files are found, read them in as pandas dataframes:

In [None]:
# Read every discovered CSV into a DataFrame, keyed by its file name.
# `files` holds bare file names collected by a recursive search, so a file may
# live in a subfolder: look up where each name was actually found instead of
# assuming it sits directly in `proj_data_path`.
csv_locations = {p.name: p for p in proj_data_path.glob('**/*.csv') if p.is_file()}
data = {}
for f in files:
    try:
        data[f] = pd.read_csv(csv_locations.get(f, proj_data_path.joinpath(f)))
    except pd.errors.EmptyDataError:
        # An empty file is reported here and then trips the assertion below.
        print(f"File is empty: {f}")
# Every discovered file must have been loaded, in the same order.
assert list(data.keys()) == files, f"Loaded {list(data.keys())} but expected {files}"
print(data.keys())

## Check column names and types in each file
### Functions

The function below defines when types in the data frame equal the definition in the validation file:

In [None]:
def type_is_equal(col_type: str, val_type: str) -> bool:
    """Decide whether a pandas dtype name satisfies a type from the validation file.

    Args:
        col_type: String form of a column's dtype, e.g. ``"object"`` or ``"int64"``.
        val_type: Type name declared in the validation file. ``"string"``,
            ``"float"`` and any name starting with ``"int"`` are recognised.

    Returns:
        True when the dtype matches the declared type, False otherwise
        (including for any declared type that is not recognised).
    """
    # Nullable extension dtypes are capitalised ("Int64", "Float64"), so
    # compare case-insensitively.
    dtype_name = col_type.lower()
    if val_type == "string":
        # Text columns show up as "object", as "str", or as "string"/"string[...]"
        # depending on the pandas version and string backend in use.
        return dtype_name in ("object", "str") or dtype_name.startswith("string")
    if val_type == "float":
        return dtype_name.startswith("float")
    if val_type.startswith("int"):
        # "interval[...]" also starts with "int" but is not an integer dtype.
        return dtype_name.startswith("int") and not dtype_name.startswith("interval")
    return False

The following function walks over all csv files and checks the data type of every column that is defined in the validation file. Columns that are defined in the validation file but missing from the data are skipped here; they are reported by the function further below:

In [None]:
def check_column_types(data, validation):
    """Print every column whose dtype differs from the type in the validation file.

    Args:
        data: Mapping of CSV file name to the DataFrame read from that file.
        validation: Mapping of section name to column definitions; each
            definition provides the expected type under the ``"type"`` key.

    Columns that are defined in the validation file but absent from the data
    are skipped. A ``KeyError`` is raised when a file has no matching section
    in ``validation``.
    """
    for data_name, frame in data.items():
        # host_events.csv is checked against the "events" section; every other
        # file uses the section named by the text before its first dot.
        if data_name == "host_events.csv":
            section = "events"
        else:
            section = data_name.split(".")[0]
        print(f"Checking {data_name} against {section}:")
        for var_name, definition in validation[section].items():
            if var_name not in frame.columns:
                continue
            expected_type = definition["type"]
            found_type = str(frame[var_name].dtype)
            if not type_is_equal(found_type, expected_type):
                # The values are bound to locals first: reusing double quotes
                # inside a double-quoted f-string is a SyntaxError before
                # Python 3.12.
                print(f"{var_name} should be {expected_type}")
                print(f"\t Found: {found_type}")
    print("----")

The third function finds all columns which are mentioned in the validation file but cannot be found in the data:

In [None]:
def check_column_exists(data, validation):
    """Print each column that the validation file defines but the data lacks.

    Args:
        data: Mapping of CSV file name to the DataFrame read from that file.
        validation: Mapping of section name to the column definitions
            expected for that section.
    """
    for file_name, frame in data.items():
        # host_events.csv maps to the "events" section; any other file maps to
        # the section named by the text before its first dot.
        section = "events" if file_name == "host_events.csv" else file_name.split(".")[0]
        for expected_column in validation[section]:
            if expected_column in frame.columns:
                continue
            print(f"Column not found: {expected_column}")
    print("----")

### Check individual files according to section in validation file

In [None]:
# Report columns that the validation file defines but the loaded files lack.
check_column_exists(data=data, validation=validation)

In [None]:
# Report columns whose dtype differs from the type in the validation file.
check_column_types(data=data, validation=validation)