In [1]:
import pandas as pd
import numpy as np
import json
import operator
import itertools 
import matplotlib.pyplot as plt
import re
import subprocess

In [44]:
# Directory holding the input CSV files, and the dataset base names (without
# the ".csv" suffix) that the discovery loop iterates over.
dataDir="./datasets/"
datasets=[
    "adult",
    "airport",
    "atom",
    "CLASSIFICATION",
    "flights",
    "food",
    "Hospital",
    "inspection",
    "ncvoter",
    "SPStock",
    "tax500k",
]

In [3]:
# Directory holding one "<algorithm>.jar" per discovery algorithm, and the
# algorithm names that the discovery loop runs.
JARDir="./algorithmJARs/"
algorithms=[
    "Hydra",
    "DCFinder",
    "ADCMiner",
    "FastADC",
]

# Code

## Dataset

In [4]:
class Dataset:
    """A CSV table loaded with per-column dtypes derived from its header.

    Every column header is matched against ``name(Type)`` / ``name Type``; the
    type token picks the dtype used when the file is read in full.

    Attributes set by ``__init__``:
        columns: the raw header labels as read by pandas.
        header:  one regex match object per column.
        names:   the name part of every header.
        types:   the Python type selected for every column.
        df:      the loaded DataFrame.
    """

    def __init__(self,file,**args):
        """Read the header of ``file`` to pick dtypes, then load the whole table.

        ``args`` are forwarded to ``pd.read_csv`` for the full read. A header
        that does not match the pattern raises ``TypeError`` and an unknown
        type token raises ``KeyError``.
        """
        # Integer-like tokens are mapped to float
        # (presumably so missing values stay representable - confirm).
        typeMap={'String':str,'Integer':float,'Double':float,'int':float,'str':str,'float':float}
        headerPattern=r'([^\(\)]*)(?:\(| )([^\(\)]*)\)?'

        self.columns=pd.read_csv(file,nrows=0).columns
        self.header=[re.match(headerPattern,label) for label in self.columns]
        self.names=[parsed[1] for parsed in self.header]
        self.types=[typeMap[parsed[2]] for parsed in self.header]

        dtypes=dict(zip(self.columns,self.types))
        self.df=pd.read_csv(file,dtype=dtypes,**args)

    def randRows(self,n):
        """Return ``n`` whole rows drawn uniformly at random, with replacement."""
        picks=np.random.randint(0,len(self.df),n)
        return self.df.iloc[picks]

    def randFields(self,n):
        """Return ``n`` synthetic rows whose columns are sampled independently.

        Each column draws its own ``n`` row positions (with replacement), so the
        values in one output row generally come from different source rows.
        """
        frame=self.df
        sampled={}
        for col in frame.columns:
            picks=list(np.random.randint(0,len(frame),n))
            sampled[col]=frame[col].iloc[picks].values
        return pd.DataFrame(sampled)

    def buildPLIs(self):
        """Store in ``self.PLI``, per column, the groupby mapping value -> row labels."""
        groupsByColumn={}
        for col in self.df:
            groupsByColumn[col]=self.df.groupby(by=col).groups
        self.PLI=groupsByColumn

    def shuffle(self):
        """Replace ``self.df`` by an equally long table of independently sampled fields."""
        rowCount=len(self.df)
        self.df=self.randFields(rowCount)

## DenialConstraintResult

In [5]:
class Operator:
    """A binary comparison paired with an estimator of how often it holds.

    ``func`` is the comparison, invoked as ``func(a, b)``. ``expFunc`` is invoked
    as ``expFunc(c1, c2)``; in this file the estimators return a probability-like
    fraction computed from a column.

    Two operators are equal (and hash alike) when they wrap the same ``func``;
    ``expFunc`` does not take part in equality.
    """

    # Logical complement of each comparison from the ``operator`` module.
    _NEGATIONS={
        operator.eq:operator.ne,
        operator.ne:operator.eq,
        operator.lt:operator.ge,
        operator.ge:operator.lt,
        operator.gt:operator.le,
        operator.le:operator.gt,
    }

    def __init__(self,func,expFunc) -> None:
        self.func=func
        self.expFunc=expFunc

    def __call__(self,a,b):
        """Apply the wrapped comparison to ``a`` and ``b``."""
        return self.func(a,b)

    def negate(self):
        """Return the operator that holds exactly when this one does not.

        The complement's estimator is ``1 - expFunc(c1, c2)`` of this operator.

        Raises:
            ValueError: if ``func`` is not one of the six ``operator`` comparisons.
        """
        negFunc=self._NEGATIONS.get(self.func)
        if negFunc is None:
            raise ValueError("no known negation for operator {!r}".format(self.func))
        # Bind the current estimator so later changes to self.expFunc do not leak in.
        expFunc=self.expFunc
        return Operator(negFunc,lambda c1,c2:1-expFunc(c1,c2))

    def expected(self,c1,c2):
        """Return the estimator's value for the columns ``c1`` and ``c2``."""
        return self.expFunc(c1,c2)

    def __repr__(self) -> str:
        return self.func.__name__

    def __eq__(self, other: object) -> bool:
        if isinstance(other, Operator):
            return self.func==other.func
        return False

    def __hash__(self):
        # Hash on the comparison only, matching __eq__.
        return hash(self.func)

def eqExp(l,r):
    """Estimate how often two rows carry the same value in column ``l``.

    Computes the sum of squared relative value frequencies and subtracts
    ``1/len(l)`` (presumably to discount a row being paired with itself).
    ``l`` must expose ``.values`` and ``len``; ``r`` is ignored.
    """
    size=len(l)
    _,counts=np.unique(l.values,return_counts=True)
    share=counts/size
    collision=np.sum(share**2)
    return collision-1/size
   

# Equality operator: operator.eq paired with its estimator eqExp.
eq=Operator(operator.eq,eqExp)

def neExp(l,r):
    """Estimate how often two rows differ in column ``l``.

    Returns one minus the sum of squared relative value frequencies.
    ``l`` must expose ``.values`` and ``len``; ``r`` is ignored.
    """
    _,counts=np.unique(l.values,return_counts=True)
    share=counts/len(l)
    collision=np.sum(share**2)
    return 1-collision
# Inequality operator: operator.ne paired with its estimator neExp.
ne=Operator(operator.ne,neExp)

def geExp(l,r):
    """Estimate how often one row's value in ``l`` is >= another row's value.

    ``np.unique`` returns the distinct values sorted, so the running sum of the
    relative frequencies is the share of rows at or below each value. The
    result is the frequency-weighted share of rows at or above each value,
    minus ``1/len(l)``. ``r`` is ignored.
    """
    size=len(l)
    _,counts=np.unique(l.values,return_counts=True)
    share=counts/size
    upTo=np.cumsum(share)
    atLeast=1-upTo+share
    return np.sum(share*atLeast)-1/size
# Greater-or-equal operator: operator.ge paired with its estimator geExp.
ge=Operator(operator.ge,geExp)

def leExp(l,r):
    """Estimate how often one row's value in ``l`` is <= another row's value.

    Weights the cumulative share of rows at or below each distinct value by
    that value's relative frequency, then subtracts ``1/len(l)``.
    ``r`` is ignored.
    """
    size=len(l)
    _,counts=np.unique(l.values,return_counts=True)
    share=counts/size
    upTo=np.cumsum(share)
    return np.sum(share*upTo)-1/size
# Less-or-equal operator: operator.le paired with its estimator leExp.
le=Operator(operator.le,leExp)

def gtExp(l,r):
    """Estimate how often one row's value in ``l`` is strictly above another's.

    Weights, for each distinct value, the share of rows strictly above it by
    that value's relative frequency. ``r`` is ignored.
    """
    _,counts=np.unique(l.values,return_counts=True)
    share=counts/len(l)
    above=1-np.cumsum(share)
    return np.sum(share*above)
# Strictly-greater operator: operator.gt paired with its estimator gtExp.
gt=Operator(operator.gt,gtExp)

def ltExp(l,r):
    """Estimate how often one row's value in ``l`` is strictly below another's.

    Weights, for each distinct value, the share of rows strictly below it by
    that value's relative frequency. ``r`` is ignored.
    """
    _,counts=np.unique(l.values,return_counts=True)
    share=counts/len(l)
    below=np.cumsum(share)-share
    return np.sum(share*below)
# Strictly-less operator: operator.lt paired with its estimator ltExp.
lt=Operator(operator.lt,ltExp)
# Lookup from upper-case operator names to the Operator instances defined above.
# NOTE(review): not referenced in the visible cells (DenialConstraintSet builds
# its own symbol-keyed map) - confirm whether it is still needed.
operatorMap={
    "EQUAL":eq,
    "UNEQUAL":ne,
    "LESS_EQUAL":le,
    "GREATER_EQUAL":ge,
    "LESS":lt,
    "GREATER":gt
}



class Predicate:
    """A comparison ``t0[l] op t1[r]`` between a column of each tuple of a pair.

    Predicates compare equal when left column, operator and right column all
    match; the hash is based on the two column names only.
    """

    def __init__(self,l:str,op:Operator,r:str) -> None:
        self.l,self.r,self.op=l,r,op
        # Cached result of expected(); filled on first use.
        self.exp=None

    def eval(self,df,t0,t1):
        """Apply the operator to column ``l`` of ``t0`` and column ``r`` of ``t1``.

        ``df`` is accepted for interface symmetry and not used here.
        """
        left=t0[self.l]
        right=t1[self.r]
        return self.op(left,right)

    def expected(self,df):
        """Return the operator's estimate for column ``l`` of ``df.df``.

        The value is computed once and reused on later calls, whatever ``df``
        is passed then.
        """
        if self.exp is not None:
            return self.exp
        self.exp=self.op.expected(df.df[self.l],None)
        return self.exp

    def __repr__(self) -> str:
        return ''.join(['"',self.l,'" ',self.op.__repr__(),' "',self.r,'"'])

    def __hash__(self):
        return hash((self.l,self.r))

    def __eq__(self, other):
        if not isinstance(other, Predicate):
            return False
        return (self.l,self.op,self.r)==(other.l,other.op,other.r)


class DenialConstraint:
    """A conjunction of predicates; a tuple pair is covered unless all of them hold."""

    def __init__(self,preds) -> None:
        self.preds=preds

    def eval(self,df,t0,t1):
        """Return, per tuple pair, the number of predicates that hold."""
        return sum([pred.eval(df,t0,t1) for pred in self.preds])

    def coverage(self,df,t0s,t1s):
        """Return the fraction of pairs for which at least one predicate fails.

        ``t0s`` and ``t1s`` are column-indexable tables of equal length whose
        i-th rows form the i-th tuple pair.
        """
        num=self.eval(df,t0s,t1s)
        dclen=len(self.preds)
        pos=(num==dclen).sum()   # pairs where every predicate holds
        neg=(num<dclen).sum()    # pairs where some predicate fails
        return neg/(pos+neg)

    def sampleCoverage(self,df,n=None):
        """Estimate coverage from ``n`` random pairs of distinct rows of ``df.df``.

        ``n`` defaults to the squared row count. Raises ``ValueError`` (from
        numpy) when the table has fewer than two rows.
        """
        nn=len(df.df)
        if n is None:
            n=nn**2
        t0s = np.random.randint(0,nn,n)
        t1s = np.random.randint(0,nn,n)
        # Where both draws picked the same row, shift the second by a non-zero
        # offset (mod row count) so every pair consists of two different rows.
        t1s=(t1s+(t1s==t0s)*np.random.randint(1,nn,n))%nn
        # coverage() indexes its arguments by column name, so turn the sampled
        # positions into row tables; resetting the index aligns the pairs.
        left=df.df.iloc[t0s].reset_index(drop=True)
        right=df.df.iloc[t1s].reset_index(drop=True)
        return self.coverage(df,left,right)

    def expCoverage(self,df):
        """Return one minus the product of the predicates' expected values."""
        return 1-np.prod([pred.expected(df) for pred in self.preds])

    def __repr__(self) -> str:
        return "!["+" & ".join([pred.__repr__() for pred in self.preds])+"]"


class DenialConstraintSet:
    """Denial constraints parsed from an algorithm's result file.

    Attributes:
        predicates: cache keyed by (left column, right column, operator symbol),
            so identical predicates are shared between constraints.
        DCs: the parsed DenialConstraint objects in file order.
    """

    def __init__(self,path,dataset,algorithm) -> None:
        """Parse ``path``, one constraint per line.

        ``dataset`` is the dataset base name that prefixes column references in
        the default line format; ``algorithm`` selects the line format
        ('ADCMiner' and 'FastADC' use the spaced ``t0.col op t1.col`` form).

        Raises:
            ValueError: if a predicate does not match the expected format.
        """
        self.predicates={}
        opmap={"==":eq,"<>":ne,">=":ge,"<=":le,">":gt,"<":lt}
        def getPred(c1,op,c2):
            key=(c1,c2,op)
            if key not in self.predicates:
                self.predicates[key]=Predicate(c1,opmap[op],c2)
            return self.predicates[key]

        # Build the predicate pattern once; raw strings throughout, and the
        # dataset name is escaped so it is matched literally.
        if algorithm in ['ADCMiner','FastADC']:
            pattern=r't0\.([^=><]*) (==|<>|>=|<=|>|<) t1\.([^=><]*)'
        else:
            prefix=re.escape(dataset)+r'\.csv\.'
            pattern=r't0\.'+prefix+r'([^=><]*)(==|<>|>=|<=|>|<)t1\.'+prefix+r'([^=><]*)'
        regex=re.compile(pattern)

        self.DCs=[]

        with open(path) as f:
            for lineNo,line in enumerate(f,1):
                line=line.strip()
                if not line:
                    continue  # tolerate blank lines, e.g. a trailing newline
                # Drop the two-character opener and the closing character
                # that wrap the predicate list (e.g. "¬(" ... ")").
                body=line[2:-1]
                preds=[]
                for text in body.split('^'):
                    match=regex.match(text)
                    if match is None:
                        raise ValueError(
                            "{}:{}: cannot parse predicate {!r}".format(path,lineNo,text))
                    preds.append(getPred(*match.groups()))
                self.DCs.append(DenialConstraint(preds))


# Result generation

## Discover DCs

In [45]:
# Run every algorithm on every dataset at both approximation levels and archive
# the "output.txt" each run leaves in the working directory under
# results/<algorithm>_<dataset>_<aprox>.
for dataset in datasets:
    for algorithm in algorithms:
        for aprox in ["0.00","0.01"]:
            tag="{}_{}_{}".format(algorithm,dataset,aprox)
            print("RUN: {}".format(tag))
            # Argument list instead of a shell string: no quoting pitfalls.
            # NOTE(review): the meaning of the trailing "10" is defined by the JARs' Main - confirm.
            command=['java','-Xmx8g','-cp',JARDir+algorithm+'.jar','Main',dataDir+dataset+".csv",aprox,'10']
            result = subprocess.run(command,stdout = subprocess.DEVNULL)
            if result.returncode!=0:
                # Do not archive: output.txt may be missing, partial, or left
                # over from an earlier run and would be stored under the wrong name.
                print("FAILED: {} (exit code {})".format(tag,result.returncode))
                continue
            print("MOVE: {}".format(tag))
            result = subprocess.run(['mv','output.txt','results/'+tag],stdout = subprocess.DEVNULL)
            if result.returncode!=0:
                print("MOVE FAILED: {} (exit code {})".format(tag,result.returncode))

RUN: Hydra_adult_0.00
MOVE: Hydra_adult_0.00
RUN: Hydra_adult_0.01
MOVE: Hydra_adult_0.01
RUN: DCFinder_adult_0.00


[main] INFO de.metanome.algorithms.dcfinder.input.Input - Time to build plis: 240
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Error threshold: 0.0.
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Discovering DCs with at most 0 violating tuple pairs.
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - First level chunks: 1
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Available processors: 12
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Building the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Finding Minimal Covers for the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Building denial constraints...


MOVE: DCFinder_adult_0.00
RUN: DCFinder_adult_0.01


[main] INFO de.metanome.algorithms.dcfinder.input.Input - Time to build plis: 189
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Error threshold: 0.01.
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Discovering DCs with at most 1 violating tuple pairs.
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - First level chunks: 1
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Available processors: 12
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Building the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Finding Minimal Covers for the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Building denial constraints...


MOVE: DCFinder_adult_0.01
RUN: ADCMiner_adult_0.00
MOVE: ADCMiner_adult_0.00
RUN: ADCMiner_adult_0.01
MOVE: ADCMiner_adult_0.01
RUN: FastADC_adult_0.00
MOVE: FastADC_adult_0.00
RUN: FastADC_adult_0.01
MOVE: FastADC_adult_0.01
RUN: Hydra_airport_0.00
MOVE: Hydra_airport_0.00
RUN: Hydra_airport_0.01
MOVE: Hydra_airport_0.01
RUN: DCFinder_airport_0.00


[main] INFO de.metanome.algorithms.dcfinder.input.Input - Time to build plis: 184
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Error threshold: 0.0.
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Discovering DCs with at most 0 violating tuple pairs.
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - First level chunks: 1
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Available processors: 12
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Building the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Finding Minimal Covers for the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Building denial constraints...


MOVE: DCFinder_airport_0.00
RUN: DCFinder_airport_0.01


[main] INFO de.metanome.algorithms.dcfinder.input.Input - Time to build plis: 230
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Error threshold: 0.01.
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Discovering DCs with at most 1 violating tuple pairs.
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - First level chunks: 1
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Available processors: 12
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Building the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Finding Minimal Covers for the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Building denial constraints...


MOVE: DCFinder_airport_0.01
RUN: ADCMiner_airport_0.00
MOVE: ADCMiner_airport_0.00
RUN: ADCMiner_airport_0.01
MOVE: ADCMiner_airport_0.01
RUN: FastADC_airport_0.00
MOVE: FastADC_airport_0.00
RUN: FastADC_airport_0.01
MOVE: FastADC_airport_0.01
RUN: Hydra_atom_0.00
MOVE: Hydra_atom_0.00
RUN: Hydra_atom_0.01
MOVE: Hydra_atom_0.01
RUN: DCFinder_atom_0.00


[main] INFO de.metanome.algorithms.dcfinder.input.Input - Time to build plis: 206
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Error threshold: 0.0.
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Discovering DCs with at most 0 violating tuple pairs.
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - First level chunks: 1
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Available processors: 12
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Building the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Finding Minimal Covers for the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Building denial constraints...


MOVE: DCFinder_atom_0.00
RUN: DCFinder_atom_0.01


[main] INFO de.metanome.algorithms.dcfinder.input.Input - Time to build plis: 205
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Error threshold: 0.01.
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Discovering DCs with at most 1 violating tuple pairs.
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - First level chunks: 1
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Available processors: 12
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Building the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Finding Minimal Covers for the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Building denial constraints...


MOVE: DCFinder_atom_0.01
RUN: ADCMiner_atom_0.00
MOVE: ADCMiner_atom_0.00
RUN: ADCMiner_atom_0.01
MOVE: ADCMiner_atom_0.01
RUN: FastADC_atom_0.00
MOVE: FastADC_atom_0.00
RUN: FastADC_atom_0.01
MOVE: FastADC_atom_0.01
RUN: Hydra_CLASSIFICATION_0.00
MOVE: Hydra_CLASSIFICATION_0.00
RUN: Hydra_CLASSIFICATION_0.01
MOVE: Hydra_CLASSIFICATION_0.01
RUN: DCFinder_CLASSIFICATION_0.00


[main] INFO de.metanome.algorithms.dcfinder.input.Input - Time to build plis: 205
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Error threshold: 0.0.
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Discovering DCs with at most 0 violating tuple pairs.
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - First level chunks: 1
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Available processors: 12
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Building the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Finding Minimal Covers for the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Building denial constraints...


MOVE: DCFinder_CLASSIFICATION_0.00
RUN: DCFinder_CLASSIFICATION_0.01


[main] INFO de.metanome.algorithms.dcfinder.input.Input - Time to build plis: 213
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Error threshold: 0.01.
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Discovering DCs with at most 1 violating tuple pairs.
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - First level chunks: 1
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Available processors: 12
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Building the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Finding Minimal Covers for the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Building denial constraints...


MOVE: DCFinder_CLASSIFICATION_0.01
RUN: ADCMiner_CLASSIFICATION_0.00
MOVE: ADCMiner_CLASSIFICATION_0.00
RUN: ADCMiner_CLASSIFICATION_0.01
MOVE: ADCMiner_CLASSIFICATION_0.01
RUN: FastADC_CLASSIFICATION_0.00
MOVE: FastADC_CLASSIFICATION_0.00
RUN: FastADC_CLASSIFICATION_0.01
MOVE: FastADC_CLASSIFICATION_0.01
RUN: Hydra_flights_0.00
MOVE: Hydra_flights_0.00
RUN: Hydra_flights_0.01
MOVE: Hydra_flights_0.01
RUN: DCFinder_flights_0.00


[main] INFO de.metanome.algorithms.dcfinder.input.Input - Time to build plis: 194
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Error threshold: 0.0.
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Discovering DCs with at most 0 violating tuple pairs.
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - First level chunks: 1
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Available processors: 12
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Building the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Finding Minimal Covers for the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Building denial constraints...


MOVE: DCFinder_flights_0.00
RUN: DCFinder_flights_0.01


[main] INFO de.metanome.algorithms.dcfinder.input.Input - Time to build plis: 197
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Error threshold: 0.01.
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Discovering DCs with at most 1 violating tuple pairs.
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - First level chunks: 1
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Available processors: 12
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Building the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Finding Minimal Covers for the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Building denial constraints...


MOVE: DCFinder_flights_0.01
RUN: ADCMiner_flights_0.00
MOVE: ADCMiner_flights_0.00
RUN: ADCMiner_flights_0.01
MOVE: ADCMiner_flights_0.01
RUN: FastADC_flights_0.00
MOVE: FastADC_flights_0.00
RUN: FastADC_flights_0.01
MOVE: FastADC_flights_0.01
RUN: Hydra_food_0.00
MOVE: Hydra_food_0.00
RUN: Hydra_food_0.01
MOVE: Hydra_food_0.01
RUN: DCFinder_food_0.00


[main] INFO de.metanome.algorithms.dcfinder.input.Input - Time to build plis: 210
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Error threshold: 0.0.
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Discovering DCs with at most 0 violating tuple pairs.
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - First level chunks: 1
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Available processors: 12
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Building the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Finding Minimal Covers for the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Building denial constraints...


MOVE: DCFinder_food_0.00
RUN: DCFinder_food_0.01


[main] INFO de.metanome.algorithms.dcfinder.input.Input - Time to build plis: 193
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Error threshold: 0.01.
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Discovering DCs with at most 1 violating tuple pairs.
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - First level chunks: 1
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Available processors: 12
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Building the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Finding Minimal Covers for the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Building denial constraints...


MOVE: DCFinder_food_0.01
RUN: ADCMiner_food_0.00
MOVE: ADCMiner_food_0.00
RUN: ADCMiner_food_0.01
MOVE: ADCMiner_food_0.01
RUN: FastADC_food_0.00
MOVE: FastADC_food_0.00
RUN: FastADC_food_0.01
MOVE: FastADC_food_0.01
RUN: Hydra_Hospital_0.00
MOVE: Hydra_Hospital_0.00
RUN: Hydra_Hospital_0.01
MOVE: Hydra_Hospital_0.01
RUN: DCFinder_Hospital_0.00


[main] INFO de.metanome.algorithms.dcfinder.input.Input - Time to build plis: 172
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Error threshold: 0.0.
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Discovering DCs with at most 0 violating tuple pairs.
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - First level chunks: 1
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Available processors: 12
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Building the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Finding Minimal Covers for the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Building denial constraints...


MOVE: DCFinder_Hospital_0.00
RUN: DCFinder_Hospital_0.01


[main] INFO de.metanome.algorithms.dcfinder.input.Input - Time to build plis: 177
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Error threshold: 0.01.
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Discovering DCs with at most 1 violating tuple pairs.
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - First level chunks: 1
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Available processors: 12
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Building the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Finding Minimal Covers for the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Building denial constraints...


MOVE: DCFinder_Hospital_0.01
RUN: ADCMiner_Hospital_0.00
MOVE: ADCMiner_Hospital_0.00
RUN: ADCMiner_Hospital_0.01
MOVE: ADCMiner_Hospital_0.01
RUN: FastADC_Hospital_0.00
MOVE: FastADC_Hospital_0.00
RUN: FastADC_Hospital_0.01
MOVE: FastADC_Hospital_0.01
RUN: Hydra_inspection_0.00
MOVE: Hydra_inspection_0.00
RUN: Hydra_inspection_0.01
MOVE: Hydra_inspection_0.01
RUN: DCFinder_inspection_0.00


[main] INFO de.metanome.algorithms.dcfinder.input.Input - Time to build plis: 202
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Error threshold: 0.0.
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Discovering DCs with at most 0 violating tuple pairs.
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - First level chunks: 1
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Available processors: 12
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Building the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Finding Minimal Covers for the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Building denial constraints...


MOVE: DCFinder_inspection_0.00
RUN: DCFinder_inspection_0.01


[main] INFO de.metanome.algorithms.dcfinder.input.Input - Time to build plis: 244
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Error threshold: 0.01.
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Discovering DCs with at most 1 violating tuple pairs.
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - First level chunks: 1
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Available processors: 12
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Building the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Finding Minimal Covers for the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Building denial constraints...


MOVE: DCFinder_inspection_0.01
RUN: ADCMiner_inspection_0.00
MOVE: ADCMiner_inspection_0.00
RUN: ADCMiner_inspection_0.01
MOVE: ADCMiner_inspection_0.01
RUN: FastADC_inspection_0.00
MOVE: FastADC_inspection_0.00
RUN: FastADC_inspection_0.01
MOVE: FastADC_inspection_0.01
RUN: Hydra_ncvoter_0.00
MOVE: Hydra_ncvoter_0.00
RUN: Hydra_ncvoter_0.01
MOVE: Hydra_ncvoter_0.01
RUN: DCFinder_ncvoter_0.00


[main] INFO de.metanome.algorithms.dcfinder.input.Input - Time to build plis: 221
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Error threshold: 0.0.
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Discovering DCs with at most 0 violating tuple pairs.
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - First level chunks: 1
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Available processors: 12
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Building the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Finding Minimal Covers for the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Building denial constraints...


MOVE: DCFinder_ncvoter_0.00
RUN: DCFinder_ncvoter_0.01


[main] INFO de.metanome.algorithms.dcfinder.input.Input - Time to build plis: 206
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Error threshold: 0.01.
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Discovering DCs with at most 1 violating tuple pairs.
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - First level chunks: 1
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Available processors: 12
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Building the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Finding Minimal Covers for the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Building denial constraints...


MOVE: DCFinder_ncvoter_0.01
RUN: ADCMiner_ncvoter_0.00
MOVE: ADCMiner_ncvoter_0.00
RUN: ADCMiner_ncvoter_0.01
MOVE: ADCMiner_ncvoter_0.01
RUN: FastADC_ncvoter_0.00
MOVE: FastADC_ncvoter_0.00
RUN: FastADC_ncvoter_0.01
MOVE: FastADC_ncvoter_0.01
RUN: Hydra_SPStock_0.00
MOVE: Hydra_SPStock_0.00
RUN: Hydra_SPStock_0.01
MOVE: Hydra_SPStock_0.01
RUN: DCFinder_SPStock_0.00


[main] INFO de.metanome.algorithms.dcfinder.input.Input - Time to build plis: 219
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Error threshold: 0.0.
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Discovering DCs with at most 0 violating tuple pairs.
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - First level chunks: 1
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Available processors: 12
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Building the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Finding Minimal Covers for the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Building denial constraints...


MOVE: DCFinder_SPStock_0.00
RUN: DCFinder_SPStock_0.01


[main] INFO de.metanome.algorithms.dcfinder.input.Input - Time to build plis: 203
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Error threshold: 0.01.
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Discovering DCs with at most 1 violating tuple pairs.
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - First level chunks: 1
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Available processors: 12
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Building the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Finding Minimal Covers for the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Building denial constraints...


MOVE: DCFinder_SPStock_0.01
RUN: ADCMiner_SPStock_0.00
MOVE: ADCMiner_SPStock_0.00
RUN: ADCMiner_SPStock_0.01
MOVE: ADCMiner_SPStock_0.01
RUN: FastADC_SPStock_0.00
MOVE: FastADC_SPStock_0.00
RUN: FastADC_SPStock_0.01
MOVE: FastADC_SPStock_0.01
RUN: Hydra_tax500k_0.00
MOVE: Hydra_tax500k_0.00
RUN: Hydra_tax500k_0.01
MOVE: Hydra_tax500k_0.01
RUN: DCFinder_tax500k_0.00


[main] INFO de.metanome.algorithms.dcfinder.input.Input - Time to build plis: 180
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Error threshold: 0.0.
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Discovering DCs with at most 0 violating tuple pairs.
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - First level chunks: 1
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Available processors: 12
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Building the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Finding Minimal Covers for the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Building denial constraints...


MOVE: DCFinder_tax500k_0.00
RUN: DCFinder_tax500k_0.01


[main] INFO de.metanome.algorithms.dcfinder.input.Input - Time to build plis: 194
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Error threshold: 0.01.
[main] INFO de.metanome.algorithms.dcfinder.DCFinder - Discovering DCs with at most 1 violating tuple pairs.
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - First level chunks: 1
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Available processors: 12
[main] INFO de.metanome.algorithms.dcfinder.evidenceset.builders.SplitReconstructEvidenceSetBuilder - Building the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Finding Minimal Covers for the Evidence Set...
[main] INFO de.metanome.algorithms.dcfinder.setcover.partial.MinimalCoverSearch - Building denial constraints...


MOVE: DCFinder_tax500k_0.01
RUN: ADCMiner_tax500k_0.00
MOVE: ADCMiner_tax500k_0.00
RUN: ADCMiner_tax500k_0.01
MOVE: ADCMiner_tax500k_0.01
RUN: FastADC_tax500k_0.00
MOVE: FastADC_tax500k_0.00
RUN: FastADC_tax500k_0.01
MOVE: FastADC_tax500k_0.01


In [178]:
# Parse output.txt as FastADC results for the Hospital dataset.
# NOTE(review): the discovery loop moves output.txt into results/ after every
# run, so this file presumably stems from a separate manual run - confirm.
dcs=DenialConstraintSet("output.txt","Hospital","FastADC")

In [40]:
# Load the adult dataset without four of its numeric columns and preview the result.
df=pd.read_csv("datasets/adult.csv").drop(
    ['capital-gain(Double)','capital-loss(Double)','Hours-per-week(Double)','Education-num(Integer)'],
    axis=1,
)
df.head(10)


Unnamed: 0,age(Integer),workclass(String),fnlwgt(Integer),education(String),Marital-status(String),occupation(String),relationship(String),race(String),sex(String),Native-country(String),class(String)
0,39,State-gov,77516,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
2,38,Private,215646,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
3,53,Private,234721,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
4,28,Private,338409,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K
5,37,Private,284582,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,United-States,<=50K
6,49,Private,160187,9th,Married-spouse-absent,Other-service,Not-in-family,Black,Female,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,>50K
8,31,Private,45781,Masters,Never-married,Prof-specialty,Not-in-family,White,Female,United-States,>50K
9,42,Private,159449,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,>50K


In [41]:
# Write the reduced table to adult.csv in the working directory, without the index column.
df.to_csv("adult.csv",index=False)

In [171]:
# Number of denial constraints parsed into dcs.
len(dcs.DCs)

92

In [170]:
# Load the Hospital dataset and build its per-column value -> rows groupings.
ds=Dataset("datasets/Hospital.csv")
ds.buildPLIs()

In [179]:
# Draw two independent tables of n field-wise sampled rows; row i of each
# table forms the i-th tuple pair.
n=10000
t0s = ds.randFields(n)
t1s = ds.randFields(n)

# Coverage of every parsed constraint on the sampled pairs.
res=[ dc.coverage(ds,t0s,t1s) for dc in dcs.DCs]


In [180]:
# Indices of constraints whose sampled coverage is below 0.9.
[i for i,r in enumerate(res) if r<0.9]

[]

In [157]:
# Display the parsed constraints.
dcs.DCs

[!["ZIP Code(String)" eq "ZIP Code(String)"],
 !["Condition(String)" ne "Condition(String)" & "Measure Code(String)" eq "Measure Code(String)"],
 !["Condition(String)" ne "Condition(String)" & "Measure Name(String)" eq "Measure Name(String)"],
 !["County Name(String)" eq "County Name(String)" & "Sample(String)" eq "Sample(String)"],
 !["County Name(String)" eq "County Name(String)" & "State(String)" ne "State(String)"],
 !["StateAvg(String)" eq "StateAvg(String)"],
 !["Measure Name(String)" eq "Measure Name(String)" & "Sample(String)" eq "Sample(String)"],
 !["County Name(String)" eq "County Name(String)" & "Measure Name(String)" eq "Measure Name(String)"],
 !["Condition(String)" eq "Condition(String)" & "County Name(String)" eq "County Name(String)"],
 !["Condition(String)" eq "Condition(String)" & "Hospital Type(String)" ne "Hospital Type(String)" & "State(String)" eq "State(String)"],
 !["Hospital Type(String)" ne "Hospital Type(String)" & "Measure Code(String)" eq "Measure Code(Str

In [137]:
# Expected coverage of the constraint at index 8804.
# NOTE(review): index 8804 is beyond the 92 constraints reported by len(dcs.DCs),
# so this output presumably comes from an earlier, different dcs - confirm.
dcs.DCs[8804].expCoverage(ds)

0.9963751279140795

In [124]:
# Sampled coverage stored at index 8804 of res.
# NOTE(review): relies on a res list with more than 8804 entries - confirm which run produced it.
res[8804]

1.0

In [142]:
# Evaluate ltExp on the 'capital-loss float' column.
# NOTE(review): this column name uses the "name type" header style and is not a
# Hospital column, so ds presumably held an adult dataset when this ran - confirm.
ltExp(ds.df['capital-loss float'],ds.df['capital-loss float'])

0.04550623851887257

In [8]:
# Manual check of the line splitting used by DenialConstraintSet on one sample result line.
line='¬(t0.adult.csv.Native-country str<>t1.adult.csv.Native-country str^t0.adult.csv.capital-gain float<>t1.adult.csv.capital-gain float^t0.adult.csv.class str==t1.adult.csv.class str^t0.adult.csv.race str==t1.adult.csv.race str)'
line=line[2:-1] # strip the leading "¬(" and the trailing ")"
preds=line.split('^')


In [89]:
# Apply the eq operator element-wise to the 'age int' column against itself.
# NOTE(review): 'age int' is not a Hospital column, so ds presumably held an
# adult dataset with "name type" headers when this ran - confirm.
eq(ds.df['age int'],ds.df['age int'])

0        True
1        True
2        True
3        True
4        True
         ... 
32556    True
32557    True
32558    True
32559    True
32560    True
Name: age int, Length: 32561, dtype: bool