In [79]:
import pandas as pd
import numpy as np
import json
import operator
import itertools 
import matplotlib.pyplot as plt
import re
import subprocess

In [75]:
# Directory the dataset CSV files are read from.
dataDir = "./datasets/"

# Base names of the available datasets; the discovery cell appends ".csv".
datasets = [
    "adult", "airport", "atom", "CLASSIFICATION",
    "flights", "food", "Hospital", "inspection",
    "ncvoter", "SPStock", "tax500k",
]

In [76]:
# Directory holding one "<algorithm>.jar" per discovery algorithm.
JARDir = "./algorithmJARs/"

# Names of the DC discovery algorithms; the discovery cell appends ".jar".
algorithms = ["Hydra", "DCFinder", "ADCMiner", "FastADC"]

# Code

## Dataset

In [77]:
class Dataset:
    """A CSV table whose column headers carry a type tag.

    Every header must look like ``"<name> <type>"`` or ``"<name>(<type>)"``,
    where ``<type>`` is one of the keys of ``_TYPE_MAP``.  The tag selects the
    dtype used when the file is loaded.

    Attributes set by ``__init__``:
      columns -- the raw header strings as read by pandas
      header  -- one regex match object per column (group 1 name, group 2 tag)
      names   -- column names with the type tag removed
      types   -- Python type chosen for each column
      df      -- the loaded DataFrame (columns keep their raw header strings)
    """

    # Type tag -> dtype handed to pandas.  Integer tags are read as float
    # (presumably so missing values can be stored as NaN -- TODO confirm).
    _TYPE_MAP={'String':str,'Integer':float,'Double':float,'int':float,'str':str,'float':float}

    # Group 1 is greedy so the split happens at the LAST "(" or " ".  Group 2
    # is lazy and the pattern is anchored so that a closing ")" is consumed by
    # the optional "\)" instead of leaking into the tag ("String)" was the
    # result with a greedy, unanchored tag group).
    _HEADER_RE=re.compile(r'(.*)(?:\(| )(.*?)\)?$')

    def __init__(self,file,**args):
        """Load ``file`` with pandas, typing each column from its header tag.

        Parameters:
          file   -- path or buffer accepted by ``pd.read_csv``
          **args -- extra keyword arguments forwarded to ``pd.read_csv``

        Raises:
          ValueError -- a header does not contain a type tag
          KeyError   -- a header carries a tag missing from ``_TYPE_MAP``
        """
        # Header-only probe.  The caller's options are forwarded so that the
        # probe sees the same columns as the real load (sep, index_col, ...).
        self.columns=pd.read_csv(file,**{**args,'nrows':0}).columns
        self.header=[self._HEADER_RE.match(col) for col in self.columns]
        for col,match in zip(self.columns,self.header):
            if match is None:
                raise ValueError('column header {!r} has no "<name> <type>" or "<name>(<type>)" type tag'.format(col))
        self.names=[match[1] for match in self.header]
        self.types=[self._TYPE_MAP[match[2]] for match in self.header]

        dtypes=dict(zip(self.columns,self.types))
        self.df=pd.read_csv(file,dtype=dtypes,**args)

    def randRows(self,n):
        """Return ``n`` rows drawn uniformly at random, with replacement."""
        ids=np.random.randint(0,len(self.df),n)
        return self.df.iloc[ids]

    def randFields(self,n):
        """Return a new ``n``-row frame where every column is sampled independently.

        Each column draws its own ``n`` row positions (with replacement), so
        values that shared a row in ``self.df`` generally no longer do.
        """
        size=len(self.df)
        return pd.DataFrame({col:self.df[col].iloc[np.random.randint(0,size,n)].values for col in self.df.columns})

    def buildPLIs(self):
        """Store in ``self.PLI`` the ``groupby(...).groups`` mapping of every column.

        For each column this maps a value to the index labels of the rows
        holding it.
        """
        self.PLI= {col:self.df.groupby(by=col).groups for col in self.df}

    def shuffle(self):
        """Replace ``self.df`` by ``randFields(len(self.df))``.

        Note this resamples each column with replacement rather than permuting
        it, so value frequencies may change.
        """
        self.df=self.randFields(len(self.df))

## Denial constraints: Operator, Predicate, DenialConstraint, DenialConstraintSet

In [78]:
class Operator:
    """A binary comparison paired with an estimator of how often it holds.

    ``func`` is the comparison itself (called as ``func(a, b)``) and
    ``expFunc`` is called as ``expFunc(c1, c2)`` by ``expected``.  Two
    operators are equal, and hash alike, when their ``func`` is the same.
    """

    # Logical complement of each comparison from the ``operator`` module.
    _NEGATIONS={
        operator.eq:operator.ne,
        operator.ne:operator.eq,
        operator.lt:operator.ge,
        operator.ge:operator.lt,
        operator.gt:operator.le,
        operator.le:operator.gt,
    }

    def __init__(self,func,expFunc) -> None:
        self.func=func
        self.expFunc=expFunc

    def __call__(self,a,b):
        """Apply the comparison to ``a`` and ``b``."""
        return self.func(a,b)

    def negate(self):
        """Return a new Operator that holds exactly when this one does not.

        The six standard comparisons map to their complement (eq <-> ne,
        lt <-> ge, gt <-> le).  Any other ``func`` is wrapped so that its
        result is passed through ``np.logical_not``.  The expectation of the
        result is one minus this operator's expectation.
        """
        negFunc=self._NEGATIONS.get(self.func)
        if negFunc is None:
            posFunc=self.func
            def negFunc(a,b):
                # logical_not works for plain booleans and element-wise results alike
                return np.logical_not(posFunc(a,b))
            # __repr__ relies on func.__name__, so give the wrapper a readable one
            negFunc.__name__='not_'+getattr(posFunc,'__name__','func')
        posExp=self.expFunc
        def negExp(c1,c2):
            return 1-posExp(c1,c2)
        return Operator(negFunc,negExp)

    def expected(self,c1,c2):
        """Return ``expFunc(c1, c2)``."""
        return self.expFunc(c1,c2)

    def __repr__(self) -> str:
        return self.func.__name__

    def __eq__(self, other: object) -> bool:
        if isinstance(other, Operator):
            return self.func==other.func
        return False

    def __hash__(self):
        # Must agree with __eq__, which compares ``func`` only.
        return hash(self.func)

def eqExp(l,r):
    """Estimator attached to the ``eq`` operator.

    Returns the sum of the squared relative frequencies of the distinct
    values in ``l``, minus ``1/len(l)``.  ``r`` is accepted but not used.
    """
    size=len(l)
    counts=np.unique(l.values,return_counts=True)[1]
    shares=counts/size
    return np.sum(shares**2)-1/size
   

# "==" comparison bundled with its estimator.
eq = Operator(func=operator.eq, expFunc=eqExp)

def neExp(l,r):
    """Estimator attached to the ``ne`` operator.

    Returns one minus the sum of the squared relative frequencies of the
    distinct values in ``l``.  ``r`` is accepted but not used.
    """
    counts=np.unique(l.values,return_counts=True)[1]
    shares=counts/len(l)
    squared=np.sum(shares**2)
    return 1-squared
# "<>" comparison bundled with its estimator.
ne = Operator(func=operator.ne, expFunc=neExp)

def geExp(l,r):
    """Estimator attached to the ``ge`` operator.

    With p_i the relative frequency of the i-th distinct value of ``l`` (in
    the sorted order produced by ``np.unique``) and c_i the running total of
    p up to and including i, returns ``sum(p_i * (1 - c_i + p_i)) - 1/len(l)``.
    ``r`` is accepted but not used.
    """
    size=len(l)
    counts=np.unique(l.values,return_counts=True)[1]
    shares=counts/size
    running=np.cumsum(shares)
    weighted=shares*(1-running+shares)
    return np.sum(weighted)-1/size
# ">=" comparison bundled with its estimator.
ge = Operator(func=operator.ge, expFunc=geExp)

def leExp(l,r):
    """Estimator attached to the ``le`` operator.

    With p_i the relative frequency of the i-th distinct value of ``l`` (in
    the sorted order produced by ``np.unique``) and c_i the running total of
    p up to and including i, returns ``sum(p_i * c_i) - 1/len(l)``.
    ``r`` is accepted but not used.
    """
    size=len(l)
    counts=np.unique(l.values,return_counts=True)[1]
    shares=counts/size
    running=np.cumsum(shares)
    weighted=shares*(running)
    return np.sum(weighted)-1/size
# "<=" comparison bundled with its estimator.
le = Operator(func=operator.le, expFunc=leExp)

def gtExp(l,r):
    """Estimator attached to the ``gt`` operator.

    With p_i the relative frequency of the i-th distinct value of ``l`` (in
    the sorted order produced by ``np.unique``) and c_i the running total of
    p up to and including i, returns ``sum(p_i * (1 - c_i))``.
    ``r`` is accepted but not used.
    """
    counts=np.unique(l.values,return_counts=True)[1]
    shares=counts/len(l)
    running=np.cumsum(shares)
    weighted=shares*(1-running)
    return np.sum(weighted)
# ">" comparison bundled with its estimator.
gt = Operator(func=operator.gt, expFunc=gtExp)

def ltExp(l,r):
    """Estimator attached to the ``lt`` operator.

    With p_i the relative frequency of the i-th distinct value of ``l`` (in
    the sorted order produced by ``np.unique``) and c_i the running total of
    p up to and including i, returns ``sum(p_i * (c_i - p_i))``.
    ``r`` is accepted but not used.
    """
    counts=np.unique(l.values,return_counts=True)[1]
    shares=counts/len(l)
    running=np.cumsum(shares)
    weighted=shares*(running-shares)
    return np.sum(weighted)
# "<" comparison bundled with its estimator.
lt = Operator(func=operator.lt, expFunc=ltExp)

# Lookup of the six operators by upper-case name.
operatorMap = dict(
    EQUAL=eq,
    UNEQUAL=ne,
    LESS_EQUAL=le,
    GREATER_EQUAL=ge,
    LESS=lt,
    GREATER=gt,
)



class Predicate:
    """One comparison ``t0[l] op t1[r]`` between a column of two rows.

    Attributes:
      l, r -- column labels read from the first and the second row
      op   -- callable comparing the two cell values; must also provide
              ``expected(c1, c2)``
      exp  -- cached result of ``expected`` (``None`` until first computed)
    """

    # The annotation on ``op`` is a string so that defining this class does not
    # require ``Operator`` to exist yet (cells may be run in any order).
    def __init__(self,l:str,op:"Operator",r:str) -> None:
        self.l=l
        self.r=r
        self.op=op
        self.exp=None
        # Frame that ``exp`` was computed from; lets ``expected`` notice when
        # the dataset has swapped in a different frame.
        self._expFrame=None

    def eval(self,df,t0,t1):
        """Compare row ``t0``'s ``l`` cell with row ``t1``'s ``r`` cell.

        ``df`` must expose the table as ``df.df``; ``t0`` and ``t1`` are index
        labels (looked up with ``.loc``).
        """
        return self.op(df.df.loc[t0,self.l],df.df.loc[t1,self.r])

    def expected(self,df):
        """Return ``op.expected`` for this predicate's columns of ``df.df``.

        The value is cached and recomputed only when ``df.df`` is a different
        frame object than the one the cache was built from (for example after
        the dataset replaced its frame).  In-place edits of the same frame are
        not detected.  Both columns are passed to the estimator.
        """
        frame=df.df
        if self.exp is None or getattr(self,'_expFrame',None) is not frame:
            self.exp=self.op.expected(frame[self.l],frame[self.r])
            self._expFrame=frame
        return self.exp

    def __repr__(self) -> str:
        return '"'+self.l +'" '+repr(self.op)+' "'+self.r+'"'

    def __hash__(self):
        # Same fields as __eq__, so predicates that differ only in their
        # operator no longer collide.
        return hash((self.l,self.op,self.r))

    def __eq__(self, other):
        if isinstance(other, Predicate):
            sFields=(self.l,self.op,self.r)
            oFields=(other.l,other.op,other.r)
            return sFields==oFields
        return False


class DenialConstraint:
    """A conjunction of predicates over a pair of rows.

    ``preds`` is a sequence of predicate objects providing ``eval(df, t0, t1)``,
    ``expected(df)`` and the attributes ``op``, ``l`` and ``r``.
    """

    def __init__(self,preds) -> None:
        self.preds=preds

    def eval(self,df,t0,t1):
        """Return how many predicates hold for the row pair ``(t0, t1)``."""
        return sum(pred.eval(df,t0,t1) for pred in self.preds)

    def coverage(self,df,t0s,t1s):
        """Evaluate the constraint on the row pairs ``zip(t0s, t1s)``.

        A pair on which every predicate holds counts as ``pos``; any other pair
        counts as ``neg`` and adds ``(holding + 1) / len(preds)`` to ``cov``.

        Returns a 3-tuple:
          * ``neg / (pos + neg)``
          * ``cov / (pos + neg)``
          * ``2 + distinct operators + distinct columns`` of the predicates

        When no pairs are supplied the first two entries are ``nan`` instead of
        raising ``ZeroDivisionError``.
        """
        pos,neg=0,0
        cov=0.0
        dclen=len(self.preds)
        for t0,t1 in zip(t0s,t1s):
            num=self.eval(df,t0,t1)
            if num==dclen:
                pos+=1
            else:
                neg+=1
                cov+=(num+1)/dclen

        ops={pred.op for pred in self.preds}
        cols={col for pred in self.preds for col in (pred.l,pred.r)}
        size=2+len(ops)+len(cols)

        total=pos+neg
        if total==0:
            return float('nan'),float('nan'),size
        return neg/total,cov/total,size

    def sampleCoverage(self,df,n=None):
        """Run ``coverage`` on ``n`` random pairs of two different rows.

        ``n`` defaults to ``len(df.df)**2``.  Row ids are drawn from
        ``range(len(df.df))`` and handed to ``coverage`` unchanged.

        Raises:
          ValueError -- the table has fewer than two rows, so no pair of
                        different rows exists
        """
        nn=len(df.df)
        if nn<2:
            raise ValueError('sampleCoverage needs at least two rows, got {}'.format(nn))
        if n is None:
            n=nn**2
        t0s = np.random.randint(0,nn,n)
        t1s = np.random.randint(0,nn,n)
        # Where both draws coincide, shift the second id by 1..nn-1 (mod nn)
        # so that it lands on a different row.
        t1s=(t1s+(t1s==t0s)*np.random.randint(1,nn,n))%nn
        return self.coverage(df,t0s,t1s)

    def expCoverage(self,df):
        """Return one minus the product of the predicates' ``expected(df)``."""
        return 1-np.prod([pred.expected(df) for pred in self.preds])

    def __repr__(self) -> str:
        return "!["+" & ".join(repr(pred) for pred in self.preds)+"]"


class DenialConstraintSet:
    """Denial constraints parsed from a text file, one constraint per line.

    Each non-blank line is expected to have a two-character prefix and a
    one-character suffix around predicates joined by ``^``, e.g.
    ``¬(t0.<dataset>.csv.<col><op>t1.<dataset>.csv.<col>^...)`` with ``<op>``
    one of ``==  <>  >=  <=  >  <``.

    Attributes:
      predicates -- ``(leftColumn, rightColumn, opSymbol) -> Predicate``; equal
                    predicates are shared between constraints
      DCs        -- the parsed ``DenialConstraint`` objects in file order
    """

    def __init__(self,path,dataset) -> None:
        """Parse ``path``.

        Parameters:
          path    -- text file to read
          dataset -- dataset base name as it appears in the predicates; it is
                     matched literally

        Raises:
          ValueError -- a predicate does not have the expected form
        """
        self.predicates={}
        opmap={"==":eq,"<>":ne,">=":ge,"<=":le,">":gt,"<":lt}
        def getPred(c1,op,c2):
            key=(c1,c2,op)
            if key not in self.predicates:
                self.predicates[key]=Predicate(c1,opmap[op],c2)
            return self.predicates[key]

        # Built once for the whole file.  Raw strings keep "\." a regex escape
        # (it is an invalid escape in a normal literal) and re.escape stops
        # characters such as "+" or "." in the dataset name acting as regex
        # syntax.
        table=re.escape(dataset)+r'\.csv\.'
        predRe=re.compile(r't0\.'+table+r'([^=><]*)(==|<>|>=|<=|>|<)t1\.'+table+r'([^=><]*)')

        self.DCs=[]

        with open(path) as f:
            for lineNo,line in enumerate(f,1):
                line=line.strip()
                if not line:
                    # tolerate blank lines, e.g. a trailing newline at end of file
                    continue
                body=line[2:-1] # drop the two-character prefix and the closing ")"
                preds=[]
                for text in body.split('^'):
                    match=predRe.match(text)
                    if match is None:
                        raise ValueError('{}:{}: cannot parse predicate {!r}'.format(path,lineNo,text))
                    preds.append(getPred(*match.groups()))
                self.DCs.append(DenialConstraint(preds))


# Result generation

## Discover DCs

In [80]:
# Run each selected discovery JAR on each selected dataset.  Only the first
# dataset and the first algorithm are used; widen the slices to run more.
for dataset in datasets[:1]:
    for algorithm in algorithms[:1]:
        # Argument list instead of a shell string: nothing is re-parsed by a
        # shell, so paths containing spaces or shell metacharacters are safe.
        # The trailing "0.01" and "10000" are passed to Main unchanged.
        command=['java','-Xmx8g','-cp',JARDir+algorithm+'.jar','Main',dataDir+dataset+'.csv','0.01','10000']
        result = subprocess.run(command)
        if result.returncode!=0:
            # Report the failure without aborting the remaining runs.
            print('{} on {} exited with status {}'.format(algorithm,dataset,result.returncode))

17:46:32.251 [main] INFO  d.h.n.dc.algorithms.hybrid.Hydra - Building approximate evidence set...
17:46:32.795 [main] INFO  d.h.n.dc.algorithms.hybrid.Hydra - Estimation size systematic sampling:21966
17:46:32.916 [main] INFO  d.h.n.d.e.b.s.ColumnAwareEvidenceSetBuilder - Sampling column age int
17:46:33.182 [main] INFO  d.h.n.d.e.b.s.ColumnAwareEvidenceSetBuilder - Sampling column workclass str
17:46:33.222 [main] INFO  d.h.n.d.e.b.s.ColumnAwareEvidenceSetBuilder - Sampling column fnlwgt int
17:46:33.279 [main] INFO  d.h.n.d.e.b.s.ColumnAwareEvidenceSetBuilder - Sampling column education str
17:46:33.309 [main] INFO  d.h.n.d.e.b.s.ColumnAwareEvidenceSetBuilder - Sampling column Education-num int
17:46:33.350 [main] INFO  d.h.n.d.e.b.s.ColumnAwareEvidenceSetBuilder - Sampling column Marital-status str
17:46:33.381 [main] INFO  d.h.n.d.e.b.s.ColumnAwareEvidenceSetBuilder - Sampling column occupation str
17:46:33.431 [main] INFO  d.h.n.d.e.b.s.ColumnAwareEvidenceSetBuilder - Sampling col

In [82]:
# Parse the constraints listed in "output.txt" for the "adult" dataset
# (presumably the file written by the discovery run above -- confirm).
dcs = DenialConstraintSet(path="output.txt", dataset="adult")

In [62]:
# Display the loaded table.
# NOTE(review): `ds` is only created by the `Dataset("datasets/adult.csv")` cell
# further down, so this cell fails on a fresh kernel run top to bottom.
ds.df

Unnamed: 0,age int,workclass str,fnlwgt int,education str,Education-num int,Marital-status str,occupation str,relationship str,race str,sex str,capital-gain float,capital-loss float,Hours-per-week float,Native-country str,class str
0,40.0,Self-emp-not-inc,250821.0,Bachelors,7.0,Never-married,Prof-specialty,Not-in-family,White,Female,0.0,0.0,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,178780.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Other-relative,White,Male,7298.0,0.0,40.0,United-States,<=50K
2,53.0,Private,29036.0,Some-college,10.0,Never-married,Sales,Wife,White,Male,0.0,0.0,40.0,United-States,<=50K
3,40.0,Private,42938.0,HS-grad,11.0,Divorced,Craft-repair,Own-child,White,Female,0.0,0.0,49.0,England,<=50K
4,20.0,Local-gov,160123.0,HS-grad,9.0,Married-civ-spouse,Prof-specialty,Not-in-family,Asian-Pac-Islander,Female,0.0,0.0,50.0,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,39.0,Private,181280.0,HS-grad,12.0,Married-civ-spouse,Tech-support,Not-in-family,White,Female,0.0,0.0,40.0,United-States,<=50K
32557,24.0,Private,357118.0,HS-grad,10.0,Never-married,?,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
32558,19.0,?,454508.0,Some-college,9.0,Married-civ-spouse,Transport-moving,Not-in-family,White,Male,0.0,0.0,40.0,Jamaica,<=50K
32559,43.0,Federal-gov,168048.0,Bachelors,9.0,Never-married,Other-service,Unmarried,White,Male,0.0,0.0,45.0,United-States,>50K


In [85]:
# Number of denial constraints parsed into `dcs`.
len(dcs.DCs)

32214

In [84]:
# Resample `ds` in place, then score every constraint on 100 random row pairs.
# `res` is only bound once all constraints have been processed.
ds.shuffle()
res = list(map(lambda dc: dc.sampleCoverage(ds, 100), dcs.DCs))


KeyboardInterrupt: 

In [70]:
# Positions in `res` whose first tuple entry is below 0.95.
[idx for idx, stats in enumerate(res) if stats[0] < 0.95]

[274, 505, 510, 768, 1673, 1729, 2230, 2231, 2467]

In [69]:
# Show one constraint by position (2231 is one of the indices listed by the filter cell above).
dcs.DCs[2231]

!["Education-num int" ne "Education-num int" & "education str" eq "education str"]

In [59]:
# Load the adult dataset and build the per-column value -> rows lookup.
ds = Dataset(file="datasets/adult.csv")
ds.buildPLIs()

In [8]:
# Sample constraint line used to try out the parsing steps by hand.
line = '¬(t0.adult.csv.Native-country str<>t1.adult.csv.Native-country str^t0.adult.csv.capital-gain float<>t1.adult.csv.capital-gain float^t0.adult.csv.class str==t1.adult.csv.class str^t0.adult.csv.race str==t1.adult.csv.race str)'
# Drop the leading "¬(" and the trailing ")".
line = line[2:len(line) - 1]
# Predicates are separated by "^".
preds = line.split('^')


In [89]:
# Apply the `eq` operator to a whole column against itself; the result is an element-wise boolean Series.
eq(ds.df['age int'],ds.df['age int'])

0        True
1        True
2        True
3        True
4        True
         ... 
32556    True
32557    True
32558    True
32559    True
32560    True
Name: age int, Length: 32561, dtype: bool