In [1]:
import os
from string import Template

import pandas as pd
import psycopg2

In [24]:
# dataname = "small_census_01"
# index = "v1"

In [25]:
# Dataset selector (table name + database suffix).
# NOTE(review): a later cell in this notebook assigns `dataname`/`index` again,
# so when cells run top to bottom these values are overridden.
dataname, index = "small_adult_1", "d3"

In [26]:
# dataname =  "food_small"
# index = "v1"

In [2]:
# Dataset selector: `dataname` is the initial table's name, and together with
# `index` it forms the database name "<dataname>_<index>" used when connecting.
dataname, index = "adult_1", "1"

### Connect to db

In [3]:
# Open the PostgreSQL connection shared by every query in this notebook.
# Host, user and password can be overridden through HOLOCLEAN_DB_* environment
# variables so credentials do not have to be edited into the notebook; the
# fallbacks reproduce the previous hard-coded local setup exactly.
# NOTE(review): the fallback password is still in source -- remove it once the
# environment variable is set wherever this notebook runs.
conn = psycopg2.connect(host=os.environ.get("HOLOCLEAN_DB_HOST", "localhost"),
                        database="{}_{}".format(dataname, index),
                        user=os.environ.get("HOLOCLEAN_DB_USER", "holocleanuser"),
                        password=os.environ.get("HOLOCLEAN_DB_PASSWORD", "abcd1234"))

In [4]:
def query_df(q, value=None):
    """Run SQL query `q` on the notebook-level `conn` and return a DataFrame.

    Parameters
    ----------
    q : str
        SQL text handed to ``pd.read_sql_query``.
    value : sequence or mapping, optional
        Bind parameters forwarded as ``params``. ``None`` or an empty
        container runs the query without parameters.

    Returns
    -------
    pandas.DataFrame
        The query result.
    """
    # `None` replaces the former mutable `[]` default; an empty container still
    # means "no parameters", exactly as before.
    params = value if value is not None and len(value) != 0 else None
    return pd.read_sql_query(q, conn, params=params)

In [5]:
def table_lists():
    """Return a DataFrame with the `table_name` of every table in the `public` schema."""
    tables_query = """SELECT table_name FROM information_schema.tables
       WHERE table_schema = 'public'"""
    return query_df(tables_query)

In [6]:
# Show the tables available in the public schema of the connected database.
table_lists()

Unnamed: 0,table_name
0,inf_values_idx
1,adult_1
2,dk_cells
3,cell_domain
4,pos_values
5,cell_distr
6,inf_values_dom
7,adult_1_repaired
8,adult_1_clean


In [9]:
# Load every row of the `dataname` table into a DataFrame.
df = query_df("select * from {}".format(dataname))

In [10]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,_tid_,age,workclass,education,maritalstatus,occupation,relationship,race,sex,hoursperweek,country,income
0,0,<18,private,10th,never-married,machine-op-inspct,not-in-family,white,male,40,united-states,lessthan50k
1,1,<18,_nan_,10th,never-married,_nan_,own-child,white,male,40,united-states,lessthan50k
2,2,<18,private,11th,never-married,sales,own-child,white,female,25,united-states,lessthan50k
3,3,<18,private,10th,never-married,sales,own-child,white,female,18-21,united-states,lessthan50k
4,4,<18,_nan_,11th,never-married,_nan_,own-child,white,male,40,united-states,lessthan50k


In [11]:
# Attribute names are all columns of the table except the first one
# (the first column is the tuple id `_tid_` in the preview shown earlier).
data = query_df("select * from %s" % dataname)
attributes = data.columns[1:].values
attributes

array(['age', 'workclass', 'education', 'maritalstatus', 'occupation',
       'relationship', 'race', 'sex', 'hoursperweek', 'country', 'income'],
      dtype=object)

### Total Errors

In [12]:
# Per-attribute query for erroneous cells: rows of $init_table whose value in
# column $attr differs from the value stored for that tuple and attribute in
# the ground-truth table $grdt_table (one row per (_tid_, _attribute_)).
errors_template = Template(
    'SELECT t1._tid_, t2._attribute_, t1."$attr" as init, t2._value_ as gt '
    'FROM $init_table as t1, $grdt_table as t2 '
    'WHERE t1._tid_ = t2._tid_ '
    "AND t2._attribute_ = '$attr' "
    'AND t1."$attr" != t2._value_'
)

In [13]:
def get_total_errors():
    """Query the erroneous cells of every attribute.

    For each name in `attributes`, fills `errors_template` with the initial
    table (`dataname`), its ground-truth table (`<dataname>_clean`) and the
    attribute, then runs the query.

    Returns
    -------
    list of pandas.DataFrame
        One result frame per attribute, in the order of `attributes`.
    """
    return [
        query_df(
            errors_template.substitute(
                init_table=dataname,
                grdt_table="{}_clean".format(dataname),
                attr=column,
            )
        )
        for column in attributes
    ]

In [14]:
# One DataFrame of mismatching cells per attribute (a list at this point).
all_error = get_total_errors()

In [15]:
# Merge the per-attribute frames into a single table of erroneous cells.
# Guarded so re-running this cell is a no-op: `all_error` is rebound from a
# list to a DataFrame here, and pd.concat rejects a bare DataFrame.
if not isinstance(all_error, pd.DataFrame):
    all_error = pd.concat(all_error, ignore_index=True)

In [52]:
# Peek at the combined error table (initial value vs. ground truth per cell).
all_error.head()

Unnamed: 0,_tid_,_attribute_,init,gt
0,4987,age,18-21,22-30
1,5683,age,18-21,22-30
2,6496,age,18-21,22-30
3,6983,age,18-21,22-30
4,8989,age,18-21,22-30


In [17]:
# Lookup of the cell_domain row(s) for one cell, identified by attribute name
# and tuple id. Both placeholders are substituted inside single quotes.
vid_template = Template(
    "SELECT * from cell_domain "
    "where attribute = '$attr' and _tid_ = '$tid'"
)

In [18]:
# Inspect the cell_domain entry for the `income` cell of tuple 609.
query_df(vid_template.substitute(attr='income', tid=609))

Unnamed: 0,_cid_,_tid_,_vid_,attribute,domain,domain_size,fixed,init_index,init_value,weak_label,weak_label_idx
0,6709,609,4879,income,morethan50k|||lessthan50k|||lessthan5k0,3,0,2,lessthan5k0,lessthan5k0,2


In [19]:
# Same lookup, kept in a variable for further inspection.
# NOTE(review): this rebinds `df`, which earlier held the full `dataname` table.
df = query_df(vid_template.substitute(attr='income', tid=609))

In [20]:
# Raw `domain` column of the looked-up cell (candidate values in one '|||'-separated string).
df['domain'].values

array(['morethan50k|||lessthan50k|||lessthan5k0'], dtype=object)

In [21]:
# Distribution stored in cell_distr for one variable id.
# NOTE(review): 4879 is hard-coded; it matches the `_vid_` shown by the
# cell_domain lookup for (income, tuple 609) -- confirm before reusing on other data.
query_df(
    "SELECT t2.distribution from cell_domain as t1, cell_distr as t2 "
    "where t1._vid_ = 4879 and t2._vid_ = t1._vid_ "
).values

array([['{0.10650698,0.10650698,0.78698605}']], dtype=object)

In [22]:
# Total number of erroneous cells = number of rows in the combined error table.
init_count = all_error.shape[0]
init_count

1062

### Total Repair

In [23]:
# All repairs: cells whose inferred value (inf_values_dom.rv_value) differs from
# their initial value (cell_domain.init_value), matched on tuple id and attribute.
query = (
    "SELECT t1._tid_, t1.attribute, t1.init_value as init, t2.rv_value as repair "
    "FROM {} as t1, {} as t2 "
    "WHERE t1._tid_ = t2._tid_ "
    "AND t1.attribute = t2.attribute "
    "AND t1.init_value != t2.rv_value"
).format('cell_domain', 'inf_values_dom')
all_repair = query_df(query)

In [24]:
# Preview the repaired cells (initial value vs. inferred repair).
all_repair.head()

Unnamed: 0,_tid_,attribute,init,repair
0,8,country,mexico,united-states
1,25,country,mexico,united-states
2,28,country,columbia,united-states
3,29,workclass,local-gov,private
4,38,country,poland,united-states


In [25]:
# Number of cells whose inferred value differs from the initial value.
total_repair_init_count = all_repair.shape[0]
total_repair_init_count

29093

In [26]:
# Repairs restricted to cells that also have an entry in the ground-truth
# table (`<dataname>_clean`), matched on tuple id and attribute.
query = (
    "SELECT t1._tid_, t1.attribute, t1.init_value as init, t2.rv_value as repair "
    "FROM {} as t1, {} as t2, {} as t3 "
    "WHERE t1._tid_ = t2._tid_ "
    "AND t1.attribute = t2.attribute "
    "AND t1.init_value != t2.rv_value "
    "AND t1._tid_ = t3._tid_ "
    "AND t1.attribute = t3._attribute_"
).format('cell_domain', 'inf_values_dom', '{}_clean'.format(dataname))
repair_gt = query_df(query)

In [27]:
# Preview the repairs that have a ground-truth counterpart.
repair_gt.head()

Unnamed: 0,_tid_,attribute,init,repair
0,8,country,mexico,united-states
1,25,country,mexico,united-states
2,28,country,columbia,united-states
3,29,workclass,local-gov,private
4,38,country,poland,united-states


In [28]:
# Number of repaired cells that have a ground-truth entry.
repair_gt_init_count = repair_gt.shape[0]
repair_gt_init_count

29093

### Correct Repair

In [29]:
# Per-attribute query for erroneous cells repaired to the ground truth.
# The inner SELECT (`errors`) picks cells of column $attr in $init_table whose
# value differs from $grdt_table; the outer join keeps those whose inferred
# value in $inf_dom equals the ground-truth value.
# The doubled space after "_attribute_," and the missing space in "FROM(" are
# kept so the SQL text stays unchanged.
correct_repairs_template = Template(
    'SELECT errors._tid_, errors._attribute_,  errors.init, '
    'errors._value_ as gt, repairs.rv_value as repair '
    'FROM(SELECT t2._tid_, t2._attribute_, t2._value_, t1."$attr" as init '
    'FROM $init_table as t1, $grdt_table as t2 '
    'WHERE t1._tid_ = t2._tid_ '
    "AND t2._attribute_ = '$attr' "
    'AND t1."$attr" != t2._value_ ) as errors, $inf_dom as repairs '
    'WHERE errors._tid_ = repairs._tid_ '
    'AND errors._attribute_ = repairs.attribute '
    'AND errors._value_ = repairs.rv_value'
)

In [30]:
def get_total_repair():
    """Query, per attribute, the erroneous cells that were repaired correctly.

    Despite the name, this fills `correct_repairs_template` (not a "total
    repairs" query) for each name in `attributes`, using `dataname`,
    `<dataname>_clean` and the `inf_values_dom` table.

    Returns
    -------
    list of pandas.DataFrame
        One result frame per attribute, in the order of `attributes`.
    """
    return [
        query_df(
            correct_repairs_template.substitute(
                init_table=dataname,
                grdt_table="{}_clean".format(dataname),
                inf_dom='inf_values_dom',
                attr=column,
            )
        )
        for column in attributes
    ]

In [31]:
# Per-attribute frames of correct repairs (get_total_repair uses correct_repairs_template).
all_repair_gt = get_total_repair()

In [32]:
# Merge the per-attribute frames into a single table of correct repairs.
# Guarded so re-running this cell is a no-op: `all_repair_gt` is rebound from a
# list to a DataFrame here, and pd.concat rejects a bare DataFrame.
if not isinstance(all_repair_gt, pd.DataFrame):
    all_repair_gt = pd.concat(all_repair_gt, ignore_index=True)

In [33]:
# Show up to 30 correctly repaired cells.
all_repair_gt.head(30)

Unnamed: 0,_tid_,_attribute_,init,gt,repair


In [34]:
# Number of erroneous cells whose repair equals the ground truth.
all_repair_gt.shape[0]

0

### Wrong Repairs (true errors whose repair differs from the ground truth)

In [35]:
# Per-attribute query for erroneous cells whose repair is still wrong.
# Same shape as the correct-repairs query: the inner SELECT (`errors`) picks
# cells of column $attr in $init_table that differ from $grdt_table; the outer
# join keeps those whose inferred value in $inf_dom differs from the ground truth.
# The doubled space after "_attribute_," and the missing space in "FROM(" are
# kept so the SQL text stays unchanged.
wrong_repairs_template = Template(
    'SELECT errors._tid_, errors._attribute_,  errors.init, '
    'errors._value_ as gt, repairs.rv_value as repair '
    'FROM(SELECT t2._tid_, t2._attribute_, t2._value_, t1."$attr" as init '
    'FROM $init_table as t1, $grdt_table as t2 '
    'WHERE t1._tid_ = t2._tid_ '
    "AND t2._attribute_ = '$attr' "
    'AND t1."$attr" != t2._value_ ) as errors, $inf_dom as repairs '
    'WHERE errors._tid_ = repairs._tid_ '
    'AND errors._attribute_ = repairs.attribute '
    'AND errors._value_ <> repairs.rv_value'
)

In [36]:
def get_total_wrong_repair():
    """Query, per attribute, the erroneous cells whose repair differs from ground truth.

    Fills `wrong_repairs_template` for each name in `attributes`, using
    `dataname`, `<dataname>_clean` and the `inf_values_dom` table.

    Returns
    -------
    list of pandas.DataFrame
        One result frame per attribute, in the order of `attributes`.
    """
    return [
        query_df(
            wrong_repairs_template.substitute(
                init_table=dataname,
                grdt_table="{}_clean".format(dataname),
                inf_dom='inf_values_dom',
                attr=column,
            )
        )
        for column in attributes
    ]

In [37]:
# Per-attribute frames of erroneous cells whose repair is not the ground truth.
wrong_repair_gt = get_total_wrong_repair()

In [38]:
# Merge the per-attribute frames into a single table of wrong repairs.
# Guarded so re-running this cell is a no-op: `wrong_repair_gt` is rebound from
# a list to a DataFrame here, and pd.concat rejects a bare DataFrame.
if not isinstance(wrong_repair_gt, pd.DataFrame):
    wrong_repair_gt = pd.concat(wrong_repair_gt, ignore_index=True)

In [39]:
# Display the wrong repairs (initial value, ground truth and inferred repair per cell).
# NOTE(review): this renders the whole frame; consider .head() if it grows large.
wrong_repair_gt

Unnamed: 0,_tid_,_attribute_,init,gt,repair
0,4987,age,18-21,22-30,18-21
1,7521,age,18-21,22-30,18-21
2,12308,age,18-21,22-30,18-21
3,13059,age,18-21,22-30,18-21
4,13769,age,18-21,22-30,18-21
5,14557,age,18-21,22-30,18-21
6,18898,age,18-21,31-50,18-21
7,19894,age,18-21,31-50,18-21
8,20535,age,18-21,31-50,18-21
9,21160,age,18-21,31-50,18-21


### Detected Errors

In [40]:
# Detected errors: cells present in dk_cells (joined on _cid_) whose initial
# value in cell_domain differs from the ground-truth value in `<dataname>_clean`.
query = (
    "SELECT t1._tid_, t1.init_value as init, t2._value_ as gt "
    "FROM {} as t1, {} as t2, {} as t3 "
    "WHERE t1._tid_ = t2._tid_ AND t1._cid_ = t3._cid_ "
    "AND t1.attribute = t2._attribute_ "
    "AND t1.init_value != t2._value_"
).format('cell_domain', '{}_clean'.format(dataname), 'dk_cells')

In [41]:
# Run the detected-errors query built in the previous cell.
init_error = query_df(query)

In [42]:
# Number of detected cells whose initial value is actually wrong.
init_error.shape[0]

1062

In [43]:
# Preview the first 10 detected errors (initial value vs. ground truth).
init_error.head(10)

Unnamed: 0,_tid_,init,gt
0,657,lessthan50k***,lessthan50k
1,713,lessthan50k***,lessthan50k
2,745,lessthapn50k,lessthan50k
3,828,lessthans50k,lessthan50k
4,1036,lessthna50k,lessthan50k
5,1103,lesshan50k,lessthan50k
6,1230,lestshan50k,lessthan50k
7,1567,lesstwhan50k,lessthan50k
8,1670,lesstha5n0k,lessthan50k
9,1747,lessthanr50k,lessthan50k


In [48]:
# Cells with fixed = 1 whose weak label equals the inferred value (rv_value)
# while that value differs from the ground truth (_value_).
# The ground-truth table name is derived from `dataname`, like the other
# queries in this notebook, instead of being hard-coded to adult_1_clean, so
# switching datasets keeps this check pointed at the matching table.
query_df(
    (
        "select fixed, init_value, weak_label, rv_value, _value_ "
        "from cell_domain t1, inf_values_dom t2, {}_clean t3 "
        "where t1._tid_ = t3._tid_ and t1.attribute = t3._attribute_ "
        "and t1._tid_ = t2._tid_ and t1.attribute = t2.attribute "
        "and fixed = 1 and weak_label = rv_value and rv_value != _value_;"
    ).format(dataname)
)



Unnamed: 0,fixed,init_value,weak_label,rv_value,_value_
0,1,columbia,united-states,united-states,columbia
1,1,poland,united-states,united-states,poland
2,1,puerto-rico,united-states,united-states,puerto-rico
3,1,never-worked,private,private,never-worked
4,1,philippines,united-states,united-states,philippines
5,1,guatemala,united-states,united-states,guatemala
6,1,england,united-states,united-states,england
7,1,england,united-states,united-states,england
8,1,philippines,united-states,united-states,philippines
9,1,other,white,white,other


In [49]:
# Cells with fixed = 2 whose weak label is identical to their initial value.
query_df("select fixed, init_value, weak_label from cell_domain where fixed = 2 and init_value = weak_label;")

Unnamed: 0,fixed,init_value,weak_label
0,2,private,private
1,2,united-states,united-states
2,2,white,white
3,2,united-states,united-states
4,2,united-states,united-states
5,2,united-states,united-states
6,2,united-states,united-states
7,2,united-states,united-states
8,2,united-states,united-states
9,2,white,white
