In [None]:
import pandas as pd
from glob import glob
import numpy as np
from tqdm.auto import tqdm

In [None]:
# Directory containing the dataset's CSV files; every *.csv in it is loaded as one relation.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
DATA_PATH = r"D:\MA\data\Kaggle\animal-crossing-new-horizons-nookplaza-dataset"
# Directory whose *.txt files hold the discovered pINDs that run() re-validates.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
RESULT_PATH = r"D:\Programming\partial-BINDER\pBinder\results"

In [None]:
# Map each relation name (CSV file name without directory and ".csv") to its table.
# All columns are read as strings and empty cells stay '' instead of becoming NaN.
df_dict = {
    csv_path[len(DATA_PATH) + 1:-4]: pd.read_csv(csv_path, dtype=str, na_filter=False)
    for csv_path in glob(DATA_PATH + "/*.csv")
}

In [None]:
def load_att(att_string: str):
    """Parse a comma-separated list of ``relation.attribute`` entries.

    Every entry is split at its first ``.``: the text before it is the
    relation name, everything after it is the attribute name. Dots inside the
    attribute name are preserved (joining the pieces without a separator
    would turn ``rel.a.b`` into the non-existent column ``ab``).

    Parameters
    ----------
    att_string : str
        For example ``"rel.a,rel.b"``. Leading/trailing whitespace of the
        whole string is stripped before parsing.

    Returns
    -------
    tuple[str, list[str]]
        The relation name taken from the last entry and the attribute names
        in their original order. An entry without a ``.`` yields the empty
        string as its attribute name.
    """
    attributes = []
    # str.split always yields at least one entry, so `relation` is always bound.
    for entry in att_string.strip().split(','):
        relation, _, attribute = entry.partition('.')
        attributes.append(attribute)
    return relation, attributes

In [None]:
def _distinct_value_tuples(relation, attributes):
    """Return the distinct value tuples of `attributes` in `relation`, skipping rows with an empty value."""
    frame = df_dict[relation][attributes]
    # A row only takes part in the check if none of its selected values is ''.
    frame = frame[(frame != '').all(axis=1)]
    # Plain tuples keep the column boundaries, so values that themselves
    # contain a separator character can never collide with another row.
    return set(frame.itertuples(index=False, name=None))


def validate_IND(dep_relation, dep_attributes, ref_relation, ref_attributes):
    """Check whether an inclusion dependency holds on the loaded relations.

    The dependency ``dep_relation[dep_attributes] <= ref_relation[ref_attributes]``
    holds when every value combination of the dependent columns also occurs
    in the referenced columns. Rows in which any selected column is the empty
    string are ignored on both sides.

    Parameters
    ----------
    dep_relation, ref_relation : str
        Keys into the module-level ``df_dict``.
    dep_attributes, ref_attributes : list[str]
        Column names, positionally aligned between the two sides.

    Returns
    -------
    bool
        True if the dependent value set is a subset of the referenced one.

    Raises
    ------
    KeyError
        If a relation or column name is unknown.
    """
    dep_values = _distinct_value_tuples(dep_relation, dep_attributes)
    ref_values = _distinct_value_tuples(ref_relation, ref_attributes)
    return dep_values <= ref_values

In [None]:
# Cache of IND descriptions that have already been validated successfully.
known_INDs = set()
def recursive_subset_check(dep_rel, dep_att, ref_rel, ref_att):
    """Validate an IND together with all of its lower-arity projections.

    For an IND over more than one attribute, every projection obtained by
    dropping the attribute at the same position on both sides is checked
    first (recursively); the first failing projection makes the whole check
    fail. Only then is the IND itself validated with ``validate_IND``.
    Successful checks are remembered in ``known_INDs`` so they are not
    validated again.

    Returns the result of ``validate_IND`` for the IND itself, ``True`` on a
    cache hit, or ``False`` as soon as a projection is invalid.
    """
    cache_key = f'{dep_rel}.{dep_att} <= {ref_rel}.{ref_att}'
    if cache_key in known_INDs:
        return True

    arity = len(dep_att)
    # all() stops at the first invalid projection, like an early return in a loop.
    if arity > 1 and not all(
        recursive_subset_check(
            dep_rel,
            dep_att[:drop] + dep_att[drop + 1:],
            ref_rel,
            ref_att[:drop] + ref_att[drop + 1:],
        )
        for drop in range(arity)
    ):
        return False

    outcome = validate_IND(dep_rel, dep_att, ref_rel, ref_att)
    if outcome:
        known_INDs.add(cache_key)
    return outcome

In [None]:
def run(result_path=None):
    """Re-validate every pIND listed in a result directory.

    Each ``*.txt`` file in the directory is read line by line (files are
    processed in sorted name order). A line is parsed as
    ``(<dep attrs>) <= (<ref attrs>) (<ref attrs>) ...`` - one dependent side
    followed by one or more referenced sides - and every dependent/referenced
    pair is checked with ``recursive_subset_check``. Failing pairs are
    counted and printed.

    Parameters
    ----------
    result_path : str, optional
        Directory containing the result files. Defaults to the module-level
        ``RESULT_PATH`` as it is bound at call time.

    Returns
    -------
    set[str]
        The distinct ``"<dep> <= <ref>"`` strings encountered, whether or not
        they validated.
    """
    if result_path is None:
        result_path = RESULT_PATH

    e_count = 0
    distinct_pINDs = set()
    for layer in sorted(glob(result_path + "/*.txt")):
        with open(layer, mode='r', encoding='utf-8-sig') as f:
            pINDs = f.readlines()
        for pIND in tqdm(pINDs):
            # Drop stray BOMs and the line terminator *before* trimming the
            # outer parentheses, so neither a leading BOM nor a missing final
            # newline shifts what gets cut off.
            pIND = pIND.replace(u'\ufeff', '').strip()
            if not pIND:
                continue  # blank line, nothing to validate
            if pIND.startswith('('):
                pIND = pIND[1:]
            if pIND.endswith(')'):
                pIND = pIND[:-1]

            sides = pIND.split(") <= (")
            dep = sides[0]
            dep_rel, dep_att = load_att(dep)

            for ref in sides[1].split(") ("):
                ref_rel, ref_att = load_att(ref)

                if not recursive_subset_check(dep_rel, dep_att, ref_rel, ref_att):
                    e_count += 1
                    print(f'Error {e_count} for:', dep_rel, dep_att, ref_rel, ref_att)
                distinct_pINDs.add(dep + " <= " + ref)
    return distinct_pINDs

In [None]:
# Validate every pIND found in RESULT_PATH's *.txt files and keep the distinct "dep <= ref" strings.
binder_pINDs = run()

In [None]:
# run() reads the module-level RESULT_PATH when it is called, so rebinding it here
# points the second run at the other result directory.
# NOTE(review): this overwrites the earlier RESULT_PATH; re-running the previous
# cell afterwards would read this directory instead.
RESULT_PATH = r"D:\Programming\spind\results"
spind_pINDs = run()

In [None]:
# pINDs reported in binder_pINDs that are absent from spind_pINDs.
# Set operations give complete counts; stopping at the first miss would leave
# both counters partial and dependent on set iteration order.
missing_in_spind = binder_pINDs - spind_pINDs
# Print only a small, sorted sample so the output is short and reproducible.
for pind in sorted(missing_in_spind)[:5]:
    print(pind)
yes = len(binder_pINDs & spind_pINDs)
not_in = len(missing_in_spind)
yes, not_in

In [None]:
# pINDs reported in spind_pINDs that are absent from binder_pINDs.
# Set operations give complete counts; stopping at the first miss would leave
# both counters partial and dependent on set iteration order.
missing_in_binder = spind_pINDs - binder_pINDs
# Print only a small, sorted sample so the output is short and reproducible.
for pind in sorted(missing_in_binder)[:5]:
    print(pind)
yes = len(spind_pINDs & binder_pINDs)
not_in = len(missing_in_binder)
yes, not_in