In [10]:
from presentation_utils import *
from DoD import view_4c_analysis_baseline as v4c
import glob
import pprint

In [11]:
%%javascript
// Set the notebook output area's auto-scroll threshold to 9999.
// Presumably this keeps long cell outputs (e.g. the 4C group listings) fully
// expanded instead of collapsing them into a scroll box — confirm.
// NOTE(review): depends on the global `IPython` JavaScript object; that is
// likely only available in the classic Notebook front-end, not JupyterLab — verify.
IPython.OutputArea.auto_scroll_threshold = 9999;

<IPython.core.display.Javascript object>

# Config

In [12]:
# ---- Inputs to the 4C run -------------------------------------------------
# Folder holding the candidate view files (also globbed by the pruning cell).
dir_path = "./building/"
# Upper bound on candidate-key size, handed to v4c.main.
candidate_key_size = 2

# ---- Presentation settings ------------------------------------------------
# How many sample rows to show; consumed by preprocess and the presentation call.
sample_size = 5
# Exploration vs. exploitation: the next pair to present is picked among the top_k.
top_k = 10
# Epsilon for the epsilon-greedy choice.
# NOTE(review): no random seed is set anywhere in the visible cells, so runs
# may not be repeatable — confirm whether presentation_utils seeds internally.
epsilon = 0.1

# Interaction limit forwarded to the presentation call.
max_num_interactions = 100

# Run 4C

In [13]:
# Classify the candidate views with 4C and print each resulting category.
print("Running 4C...")

# v4c.main returns five collections; they are bound here in the order returned.
(compatible_groups,
 contained_groups,
 complementary_groups,
 contradictory_groups,
 all_pair_contr_compl) = v4c.main(dir_path, candidate_key_size)

# Compatible and contained groups share the same layout: one group per line.
for heading, groups in (("Compatible groups:", compatible_groups),
                        ("Contained groups:", contained_groups)):
    print(heading)
    for group in groups:
        print(group)

# Each contradictory entry unpacks to four fields; the third one is not shown.
print("Contradictory groups:")
for path1, candidate_key_tuple, key_value_tuples, path2 in contradictory_groups:
    print(f"{list(candidate_key_tuple)}: {path1} - {path2}")

# Each complementary entry unpacks to five fields; the last two are ignored.
print("Complementary groups:")
for path1, path2, candidate_key_tuple, _, _ in complementary_groups:
    print(f"{list(candidate_key_tuple)}: {path1} - {path2}")

Running 4C...
Found 50 valid tables
View candidates classify into 1 groups based on schema

Num elements with schema 220221704124236003 is: 50


100%|██████████| 201/201 [00:14<00:00, 13.59it/s]

Compatible groups:
['./building/view_43', './building/view_45']
['./building/view_44', './building/view_42']
['./building/view_9', './building/view_23']
['./building/view_8', './building/view_22']
['./building/view_21', './building/view_19', './building/view_39', './building/view_5']
['./building/view_18', './building/view_20', './building/view_38', './building/view_4']
['./building/view_37', './building/view_41']
['./building/view_36', './building/view_40']
['./building/view_34', './building/view_30']
['./building/view_35', './building/view_31']
['./building/view_28', './building/view_24']
['./building/view_29', './building/view_25']
['./building/view_33', './building/view_17', './building/view_3', './building/view_15']
['./building/view_32', './building/view_16', './building/view_14', './building/view_2']
['./building/view_0', './building/view_10', './building/view_26', './building/view_12']
['./building/view_1', './building/view_27', './building/view_11', './building/view_13']
Conta




# Pruning 4C views
## Remove identical (compatible) views, and among contained views keep only the one with the largest cardinality

In [14]:
# Start from every file under dir_path whose name matches "view_*".
all_view_files = glob.glob(dir_path + "/view_*")
print("Number of views: ", len(all_view_files))

# Stage 1: prune using the compatible groups reported by 4C.
after_compatible_pruning = prune_compatible_views(all_view_files, compatible_groups)
print("After pruning compatible views: ", len(after_compatible_pruning))

# Stage 2: prune using the contained groups.
# The result is bound to `view_files`, the name the later cells read.
view_files = prune_contained_views(after_compatible_pruning, contained_groups)
print("After pruning contained views: ", len(view_files))

Number of views:  50
After pruning compatible views:  22
After pruning contained views:  16


# Pre-processing and generating sample rows to present

In [15]:
# Build the inputs for the presentation step from the pruned views.
print("Pre-processing...")

# preprocess returns three values, bound in order; all three are passed on
# to the presentation call in the next cell.
(contr_or_compl_view_pairs,
 non_contr_or_compl_views,
 row_to_path_dict) = preprocess(
    view_files,
    all_pair_contr_compl,
    sample_size,
)

  0%|          | 1/201 [00:00<00:20,  9.69it/s]

Pre-processing...


100%|██████████| 201/201 [00:06<00:00, 29.55it/s]


# Actual presentation

In [16]:
%gui asyncio
# Using asynchronous widgets. Requires ipykernel 4.7 or later
# pip install ipython ipykernel --upgrade

# async
task = present_async(view_files, contr_or_compl_view_pairs, non_contr_or_compl_views, row_to_path_dict,
                     top_k, epsilon, max_num_interactions, sample_size)

# sync
# final_view_scores, num_interactions = present(view_files, contr_or_compl_view_pairs, non_contr_or_compl_views, row_to_path_dict,
#                                               top_k, epsilon, max_num_interactions, sample_size)

Output()

# Final view scores

In [9]:
# async
await task
final_view_scores, num_interactions = task.result()

print("Final view scores:")
pprint.pprint(final_view_scores)

print("Number of interactions = " + str(num_interactions))

Final view scores:
[('./toytest/view_158', 10),
 ('./toytest/view_114', 7),
 ('./toytest/view_88', 6),
 ('./toytest/view_126', 6),
 ('./toytest/view_44', 4),
 ('./toytest/view_11', 4),
 ('./toytest/view_215', 0),
 ('./toytest/view_188', 0)]
Number of interactions = 6
