In [1]:
from pathlib import Path
import yaml
import pandas as pd
import re

In [2]:
# The working directory the kernel was started in acts as the project root.
ROOT_PATH = Path.cwd()
# Notebooks have no __file__ of their own; point it at the root directory as well.
__file__ = ROOT_PATH

# Project sub-directories, all resolved against the root.
LOG_PATH = ROOT_PATH.joinpath("log")
DATA_PATH = ROOT_PATH.joinpath('data')
SRC_PATH = ROOT_PATH.joinpath('src')
CONFIG_PATH = ROOT_PATH.joinpath('configs')

In [3]:
# Load the table-validation schema (the 'table_schema' section of configs/main.yaml).
config_path = CONFIG_PATH / 'main.yaml'

# Read with an explicit encoding so the result does not depend on the
# platform's locale default (YAML files are conventionally UTF-8).
with open(config_path, 'r', encoding='utf-8') as file:
    rules = yaml.safe_load(file)['table_schema']


In [9]:
# Read the sample workbook from the configured data directory instead of a
# hard-coded relative string, so this cell follows DATA_PATH.
# NOTE(review): the preview shows an "Unnamed: 0" column, which suggests the
# sheet carries a saved index column — confirm whether index_col=0 is wanted.
test_01 = pd.read_excel(DATA_PATH / "test_01.xlsx")
test_01.head()

Unnamed: 0,ID1,Name,Email,Signup Date,Last Purchase,Total Spent,Referral Code,Membership Level1
0,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1


In [None]:
class Doorkeeper:
    """Empty placeholder class: construction takes no arguments and sets no attributes."""

    def __init__(self) -> None:
        """Create an instance without initialising any state."""
        

In [10]:
# Validate a DataFrame's columns against a table schema
def validate_table(df: "pd.DataFrame", rules: dict) -> list:
    """Report which required columns of the schema are missing from ``df``.

    Args:
        df: Table to check; only ``df.columns`` is inspected.
        rules: Schema mapping. ``rules['required_columns']`` is a list of
            mappings that each carry a ``'name'`` key. A value of ``None``
            (what YAML produces for a key with no value) is treated as an
            empty list.

    Returns:
        A list with one ``"Missing required column: <name>"`` message per
        absent required column, in schema order. Empty when nothing is missing.

    Raises:
        KeyError: If ``rules`` has no ``'required_columns'`` key or an entry
            has no ``'name'`` key.
    """
    # A YAML key without a value loads as None; iterate over nothing in that case
    # instead of failing with "NoneType is not iterable".
    required_columns = rules['required_columns'] or []

    errors = [
        f"Missing required column: {col['name']}"
        for col in required_columns
        if col['name'] not in df.columns
    ]

    # TODO: per-column checks for required and optional columns (data type,
    #       whether empty values are allowed). Map schema type names to Python
    #       types through an explicit lookup table — do not eval() text that
    #       comes from the YAML file.
    # TODO: schema-level constraints (regex match per value, uniqueness per
    #       column) for the columns that are present.

    return errors

In [12]:
# Check the sample sheet against the schema loaded from main.yaml.
errors = validate_table(df=test_01, rules=rules)

In [13]:
# Display the validation messages collected by validate_table.
errors

['Missing required column: ID']

In [6]:
import os
from ipykernel import get_connection_file

def get_notebook_path():
    """Return the path of a notebook file stored next to the kernel connection file.

    The directory that holds the kernel's connection file is scanned for
    entries ending in ``.ipynb``.

    NOTE(review): the recorded output of this cell is ``None``, which suggests
    that directory does not contain the notebook itself — confirm whether
    this lookup location is the intended one.

    Returns:
        The joined path (str) of the alphabetically first ``.ipynb`` entry in
        that directory, or ``None`` when there is none.
    """
    runtime_dir = os.path.dirname(get_connection_file())
    # os.listdir returns entries in arbitrary order; sort so the chosen
    # notebook is the same on every run when several are present.
    notebooks = sorted(
        entry for entry in os.listdir(runtime_dir) if entry.endswith(".ipynb")
    )
    if not notebooks:
        return None
    return os.path.join(runtime_dir, notebooks[0])

# Show the detected notebook path; prints "None" when no .ipynb file was found.
print(get_notebook_path())

None


# task class

In [None]:
# Core idea: one template can be used to validate several data files.
# Each entry maps a dataset name to its YAML template ("template") and the
# list of files to check against it ("data").
# NOTE(review): "data/data_02.xlsx" does not follow the "data1_NN" naming of
# its sibling entry — possibly meant "data/data1_02.xlsx"; confirm against
# the files on disk.
datasets = {
    "data1": {
        "template": "templates/data1.yaml",
        "data": ["data/data1_01.csv", "data/data_02.xlsx"]
    },
    "data2": {
        "template": "templates/data2.yaml",
        "data": ["data/data2_01.csv", "data/data2_02.csv"]
    }
}

In [1]:
from datetime import datetime

# each task can contain multiple templates and datasets
class Task:
    """A named task together with its directory layout and datasets.

    A task can contain multiple templates and datasets.
    """

    def __init__(self, task: str, root=None) -> None:
        """Store the task name, a creation time tag and the directory layout.

        Args:
            task: Task name; also the stem of the template file
                (``<task>.yaml``) read by ``load_task_template``.
            root: Optional project root (str or Path). Defaults to the
                current working directory.
        """
        self.TASK = task
        self.TIME_TAG = self.get_current_datetime()

        # Resolve the root once and derive every directory from it, so the
        # class does not depend on a notebook-level global being defined and
        # all entries are guaranteed to share the same root.
        root_path = Path.cwd() if root is None else Path(root)
        templates_path = root_path / 'templates'

        # file paths
        self.PATH = {
            "ROOT": root_path,
            "DATA": root_path / 'data',
            "TEMPLATES": templates_path,
            "TASK_TEMPLATES": templates_path / 'tasks',
            "DATASET_TEMPLATES": templates_path / 'datasets',
        }

        # Keys are file names relative to PATH["DATA"]; load_datasets replaces
        # each value with the table read from that file.
        self.datasets = {}

    def get_current_datetime(self):
        """
        Get the current local date and time as a "YYYY-MM-DD HH:MM:SS" string
        """
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def create_paths(self):
        """
        Create the necessary directories (existing ones are left untouched)
        """
        for path in self.PATH.values():
            path.mkdir(parents=True, exist_ok=True)

    def load_task_template(self):
        """
        Load the task template

        Reads ``<TASK>.yaml`` from the task-templates directory and returns
        the parsed YAML content.
        """
        path = self.PATH["TASK_TEMPLATES"] / f"{self.TASK}.yaml"
        # Explicit encoding keeps the read independent of the locale default.
        with open(path, 'r', encoding='utf-8') as file:
            return yaml.safe_load(file)

    def load_datasets(self):
        """
        Load the datasets from the data directory

        Every key of ``self.datasets`` is treated as a file name inside the
        data directory; its value is replaced by the loaded DataFrame.
        Excel workbooks (.xlsx / .xls) are read with ``pd.read_excel``,
        everything else with ``pd.read_csv``.
        """
        # Iterate over a snapshot of the keys because values are reassigned in the loop.
        for dataset in list(self.datasets):
            path = self.PATH["DATA"] / dataset
            if path.suffix.lower() in {".xlsx", ".xls"}:
                self.datasets[dataset] = pd.read_excel(path)
            else:
                self.datasets[dataset] = pd.read_csv(path)
        

# test