In [1]:
import os
from string import Template

import pandas as pd
import psycopg2

In [2]:
# dataname = "small_census_01"
# index = "v1"

In [4]:
# Active dataset. The database that gets opened is named "<dataname>_<index>",
# and the initial table inside it is named after `dataname`.
dataname, index = "small_adult_1", "3"

In [5]:
# dataname =  "food_small"
# index = "v1"

### Connect to the database

In [6]:
# Open the PostgreSQL connection shared by every query in this notebook.
# Host, user and password can be overridden through environment variables so
# that credentials do not have to live in the notebook; when a variable is not
# set, the value that was previously hard-coded here is used as the fallback.
conn = psycopg2.connect(host=os.environ.get("HOLOCLEAN_DB_HOST", "localhost"),
                        database="{}_{}".format(dataname, index),
                        user=os.environ.get("HOLOCLEAN_DB_USER", "holocleanuser"),
                        password=os.environ.get("HOLOCLEAN_DB_PASSWORD", "abcd1234"))

In [7]:
def query_df(q, value=None):
    """Run the SQL query `q` on the notebook-wide connection `conn`.

    Parameters
    ----------
    q : str
        SQL text handed to ``pandas.read_sql_query``.
    value : sequence or mapping, optional
        Bind parameters for the query. ``None`` or an empty container means
        the query is executed without a ``params`` argument.

    Returns
    -------
    pandas.DataFrame
        The rows returned by the query.
    """
    # `None` replaces the former mutable `[]` default. Empty containers still
    # take the parameter-less path, so existing callers behave as before.
    if value is not None and len(value) != 0:
        return pd.read_sql_query(q, conn, params=value)
    return pd.read_sql_query(q, conn)

In [8]:
def table_lists():
    """Return a DataFrame with the `table_name` of every table in schema `public`."""
    sql = """SELECT table_name FROM information_schema.tables
       WHERE table_schema = 'public'"""
    return query_df(sql)

In [9]:
table_lists()

Unnamed: 0,table_name
0,inf_values_idx
1,small_adult_1
2,dk_cells
3,cell_domain
4,pos_values
5,cell_distr
6,inf_values_dom
7,small_adult_1_repaired
8,small_adult_1_clean


In [10]:
# Load the initial table of the active dataset. The table name comes from
# `dataname` (as in the other queries of this notebook) instead of a literal,
# so switching datasets does not leave this cell pointing at a missing table.
df = query_df("select * from {}".format(dataname))

In [11]:
df.head()

Unnamed: 0,_tid_,age,workclass,education,maritalstatus,occupation,relationship,race,sex,hoursperweek,country,income
0,0,<18,private,10th,never-married,machine-op-inspct,not-in-family,white,male,40,united-states,lessthan50k
1,1,<18,_nan_,10th,never-married,_nan_,own-child,white,male,40,united-states,lessthan50k
2,2,<18,private,11th,never-married,sales,own-child,white,female,25,united-states,lessthan50k
3,3,<18,private,10th,never-married,sales,own-child,white,female,18-21,united-states,lessthan50k
4,4,<18,_nan_,11th,never-married,_nan_,own-child,white,male,40,united-states,lessthan50k


In [12]:
# get all attributes 
# Attribute names of the dataset: every column of the initial table except the
# first one (`_tid_` in the frame displayed above).
data = query_df("select * from {}".format(dataname))
attributes = data.columns[1:].values
attributes

array(['age', 'workclass', 'education', 'maritalstatus', 'occupation',
       'relationship', 'race', 'sex', 'hoursperweek', 'country', 'income'],
      dtype=object)

### Total Errors

In [13]:
# Per-attribute query: cells whose value in the initial table ($init_table)
# differs from the value stored for the same tuple/attribute in $grdt_table.
# $attr is used twice: as a quoted column name and as a string literal.
errors_template = Template(
    'SELECT t1._tid_, t2._attribute_, t1."$attr" as init, t2._value_ as gt '
    'FROM $init_table as t1, $grdt_table as t2 '
    'WHERE t1._tid_ = t2._tid_ '
    "AND t2._attribute_ = '$attr' "
    'AND t1."$attr" != t2._value_'
)

In [14]:
def get_total_errors():
    """Run `errors_template` once per attribute.

    Returns a list holding one DataFrame per entry of `attributes`; each frame
    contains the cells of that attribute whose initial value differs from the
    value stored in the `<dataname>_clean` table.
    """
    clean_table = '%s_clean' % dataname
    return [
        query_df(errors_template.substitute(init_table=dataname,
                                            grdt_table=clean_table,
                                            attr=column))
        for column in attributes
    ]

In [15]:
all_error = get_total_errors()

In [16]:
# Merge the per-attribute frames into a single table. `all_error` changes from
# a list to a DataFrame here, so the guard keeps the cell re-runnable: handing
# the already-merged DataFrame back to pd.concat would raise a TypeError.
if isinstance(all_error, list):
    all_error = pd.concat(all_error, ignore_index=True)

In [17]:
all_error

Unnamed: 0,_tid_,_attribute_,init,gt
0,1975,relationship,own-child,not-in-family
1,2651,race,white,black
2,1512,sex,male,female
3,605,income,lesjsthan50k,lessthan50k
4,609,income,lessthan5k0,lessthan50k
5,615,income,lessthan50k***,lessthan50k
6,616,income,lessthan5k0,lessthan50k
7,619,income,lnessthan50k,lessthan50k
8,621,income,lesstahn50k,lessthan50k
9,624,income,lesstahn50k,lessthan50k


In [18]:
# Query template for the cell_domain row(s) of one attribute ($attr) in one
# tuple ($tid); both placeholders are substituted inside single quotes.
vid_template = Template(
    "SELECT * from cell_domain "
    "where attribute = '$attr' and _tid_ = '$tid'"
)

In [19]:
# Look up the cell_domain entry for one (attribute, tuple id) pair.
# NOTE(review): 'detailed_household_and_family_stat' is not among the attributes
# listed for small_adult_1 earlier in this notebook, and the recorded output is
# empty. Presumably left over from another dataset; confirm the intended attribute.
query_df(vid_template.substitute(attr='detailed_household_and_family_stat', tid=632))

Unnamed: 0,_cid_,_tid_,_vid_,attribute,domain,domain_size,fixed,init_index,init_value


In [20]:
# Number of rows in `all_error`, i.e. cells whose initial value differs from
# the value in the clean table.
init_count = len(all_error)
init_count

321

### Total Repair

In [21]:
# Every cell whose inferred value (inf_values_dom.rv_value) differs from the
# initial value recorded in cell_domain, matched on tuple id and attribute.
query = (
    "SELECT t1._tid_, t1.attribute, t1.init_value as init, t2.rv_value as repair "
    "FROM %s as t1, %s as t2 "
    "WHERE t1._tid_ = t2._tid_ "
    "AND t1.attribute = t2.attribute "
    "AND t1.init_value != t2.rv_value"
) % ('cell_domain', 'inf_values_dom')
all_repair = query_df(query)

In [22]:
all_repair.head()

Unnamed: 0,_tid_,attribute,init,repair
0,0,age,<18,18-21
1,1,age,<18,18-21
2,2,age,<18,18-21
3,3,age,<18,18-21
4,4,age,<18,18-21


In [23]:
# Number of cells whose inferred value differs from the initial value.
total_repair_init_count = len(all_repair)
total_repair_init_count

703

In [24]:
# Same selection as `all_repair`, additionally joined with the
# `<dataname>_clean` table so only cells that have an entry there remain.
query = (
    "SELECT t1._tid_, t1.attribute, t1.init_value as init, t2.rv_value as repair "
    "FROM %s as t1, %s as t2, %s as t3 "
    "WHERE t1._tid_ = t2._tid_ "
    "AND t1.attribute = t2.attribute "
    "AND t1.init_value != t2.rv_value "
    "AND t1._tid_ = t3._tid_ "
    "AND t1.attribute = t3._attribute_"
) % ('cell_domain', 'inf_values_dom', '%s_clean' % dataname)
repair_gt = query_df(query)

In [25]:
repair_gt.head(15)

Unnamed: 0,_tid_,attribute,init,repair
0,0,age,<18,18-21
1,1,age,<18,18-21
2,2,age,<18,18-21
3,3,age,<18,18-21
4,4,age,<18,18-21
5,5,age,<18,18-21
6,6,age,<18,18-21
7,7,age,<18,18-21
8,8,age,<18,18-21
9,9,age,<18,18-21


In [26]:
# Number of changed cells that also have an entry in the clean table.
repair_gt_init_count = len(repair_gt)
repair_gt_init_count

703

### Correct Repair

In [27]:
# Per-attribute query for repairs that hit the clean value:
#   errors  = cells of $attr whose initial value differs from $grdt_table
#   repairs = rows of $inf_dom for the same tuple/attribute
# A row is kept only when the inferred value equals the clean value.
correct_repairs_template = Template(
    'SELECT errors._tid_, errors._attribute_, '
    ' errors.init, errors._value_ as gt, repairs.rv_value as repair FROM'
    '(SELECT t2._tid_, t2._attribute_, t2._value_, t1."$attr" as init '
    'FROM $init_table as t1, $grdt_table as t2 '
    'WHERE t1._tid_ = t2._tid_ '
    "AND t2._attribute_ = '$attr' "
    'AND t1."$attr" != t2._value_ ) as errors, $inf_dom as repairs '
    'WHERE errors._tid_ = repairs._tid_ '
    'AND errors._attribute_ = repairs.attribute '
    'AND errors._value_ = repairs.rv_value'
)

In [28]:
def get_total_repair():
    """Run `correct_repairs_template` once per attribute.

    Despite the name, the template keeps only repairs whose inferred value
    equals the value in the `<dataname>_clean` table. Returns a list with one
    DataFrame per entry of `attributes`.
    """
    clean_table = '%s_clean' % dataname
    return [
        query_df(correct_repairs_template.substitute(init_table=dataname,
                                                     grdt_table=clean_table,
                                                     inf_dom='inf_values_dom',
                                                     attr=column))
        for column in attributes
    ]

In [29]:
all_repair_gt = get_total_repair()

In [30]:
# Merge the per-attribute frames into one table. The guard keeps the cell
# re-runnable: once `all_repair_gt` is a DataFrame rather than a list, passing
# it to pd.concat again would raise a TypeError.
if isinstance(all_repair_gt, list):
    all_repair_gt = pd.concat(all_repair_gt, ignore_index=True)

In [31]:
all_repair_gt.head(30)

Unnamed: 0,_tid_,_attribute_,init,gt,repair


In [32]:
all_repair_gt.shape[0]

0

### Wrong repairs of correctly detected errors

In [33]:
# Per-attribute query for repairs that miss the clean value:
#   errors  = cells of $attr whose initial value differs from $grdt_table
#   repairs = rows of $inf_dom for the same tuple/attribute
# A row is kept only when the inferred value differs from the clean value.
wrong_repairs_template = Template(
    'SELECT errors._tid_, errors._attribute_, '
    ' errors.init, errors._value_ as gt, repairs.rv_value as repair FROM'
    '(SELECT t2._tid_, t2._attribute_, t2._value_, t1."$attr" as init '
    'FROM $init_table as t1, $grdt_table as t2 '
    'WHERE t1._tid_ = t2._tid_ '
    "AND t2._attribute_ = '$attr' "
    'AND t1."$attr" != t2._value_ ) as errors, $inf_dom as repairs '
    'WHERE errors._tid_ = repairs._tid_ '
    'AND errors._attribute_ = repairs.attribute '
    'AND errors._value_ <> repairs.rv_value'
)

In [34]:
def get_total_wrong_repair():
    """Run `wrong_repairs_template` once per attribute.

    Returns a list with one DataFrame per entry of `attributes`; each frame
    holds the erroneous cells of that attribute whose inferred value differs
    from the value in the `<dataname>_clean` table.
    """
    clean_table = '%s_clean' % dataname
    frames = []
    for column in attributes:
        sql = wrong_repairs_template.substitute(init_table=dataname,
                                                grdt_table=clean_table,
                                                inf_dom='inf_values_dom',
                                                attr=column)
        frames.append(query_df(sql))
    return frames

In [35]:
wrong_repair_gt = get_total_wrong_repair()

In [36]:
# Merge the per-attribute frames into one table. The guard keeps the cell
# re-runnable: once `wrong_repair_gt` is a DataFrame rather than a list,
# passing it to pd.concat again would raise a TypeError.
if isinstance(wrong_repair_gt, list):
    wrong_repair_gt = pd.concat(wrong_repair_gt, ignore_index=True)

In [37]:
wrong_repair_gt

Unnamed: 0,_tid_,_attribute_,init,gt,repair
0,1975,relationship,own-child,not-in-family,own-child
1,1512,sex,male,female,male


### Detected Errors

In [38]:
# Cells listed in dk_cells (joined to cell_domain on _cid_) whose initial value
# differs from the value stored in the `<dataname>_clean` table.
query = (
    "SELECT t1._tid_, t1.init_value as init, t2._value_ as gt "
    "FROM %s as t1, %s as t2, %s as t3 "
    "WHERE t1._tid_ = t2._tid_ AND t1._cid_ = t3._cid_ "
    "AND t1.attribute = t2._attribute_ "
    "AND t1.init_value != t2._value_"
) % ('cell_domain', '%s_clean' % dataname, 'dk_cells')

In [39]:
# Executes whatever SQL is currently bound to the global `query`. Several
# earlier cells also assign `query`, so this only yields the detected-errors
# result when the cell directly above was the last one to set it.
init_error = query_df(query)

In [40]:
init_error.shape[0]

109

In [41]:
init_error.head(10)

Unnamed: 0,_tid_,init,gt
0,1512,male,female
1,1975,own-child,not-in-family
2,609,lessthan5k0,lessthan50k
3,619,lnessthan50k,lessthan50k
4,628,lessthn50k,lessthan50k
5,650,leqssthan50k,lessthan50k
6,656,lessthan0k,lessthan50k
7,657,lessthan50k***,lessthan50k
8,734,lssthan50k,lessthan50k
9,735,lesstha5n0k,lessthan50k
