# Run HyUCC Algorithm - Example

This notebook shows an example of how to run HyUCC, an algorithm for discovering unique column combinations.

In [1]:
# Example input: the Socrata dataset with the identifier 'bre9-aqqr',
# loaded into the variable `df` that is used by the cells below.

from openclean.data.source.socrata import Socrata
df = (
    Socrata()
    .dataset('bre9-aqqr')
    .load()
)

In [2]:
# Display the downloaded dataset to inspect its columns and values.
df

Unnamed: 0,Report Date,FIPS,Locality,VDH Health District,Total Cases,Hospitalizations,Deaths
0,11/28/2020,51001,Accomack,Eastern Shore,1340,107,21
1,11/28/2020,51003,Albemarle,Thomas Jefferson,1896,100,27
2,11/28/2020,51005,Alleghany,Alleghany,316,19,7
3,11/28/2020,51007,Amelia,Piedmont,210,21,6
4,11/28/2020,51009,Amherst,Central Virginia,810,31,6
...,...,...,...,...,...,...,...
60510,06/14/2021,51800,Suffolk,Western Tidewater,7997,460,191
60511,06/14/2021,51810,Virginia Beach,Virginia Beach,36278,1700,412
60512,06/14/2021,51820,Waynesboro,Central Shenandoah,2395,73,38
60513,06/14/2021,51830,Williamsburg,Peninsula,769,29,13


## Run using the local Java installation

In [3]:
# Runtime configuration for the local Java installation.
#
# When the algorithms are run with the local Java installation, the
# environment variable METANOME_JARPATH has to reference the Metanome.jar
# file on your machine. Alternatively, pass the setting in a dictionary,
# as shown here.

import openclean_metanome.config as config

# To point to a specific jar file use, for example:
# env = {config.METANOME_JARPATH: '/path/to/Metanome.jar'}
env = {
    config.METANOME_JARPATH: config.JARFILE(),
}

In [4]:
# Discover unique column combinations in the downloaded dataset with
# HyUCC, using the environment settings configured above.

from openclean_metanome.algorithm.hyucc import hyucc

keys = hyucc(
    df,
    max_ucc_size=3,
    env=env,
)

Metanome Data Profiling Wrapper - Version 0.1.0

Initializing ...
Reading data and calculating plis ...
Sorting plis by number of clusters ...
Inverting plis ...
Extracting integer representations for the records ...
Investigating comparison suggestions ... 
Sorting clusters ...(188ms)
Running initial windows ...(137ms)
Moving window over clusters ... 
Inducing UCC candidates ...
Validating UCCs using plis ...
	Level 1: 4 elements; (V)(C)(G); 0 intersections; 0 validations; 0 invalid; 0 new candidates; --> 0 UCCs
	Level 2: 9 elements; (V)(C)(G); 4 intersections; 4 validations; 2 invalid; 7 new candidates; --> 2 UCCs
Investigating comparison suggestions ... 
Moving window over clusters ... 
Inducing UCC candidates ...
Validating UCCs using plis ...
	Level 3: 13 elements; (V)(-)(-); 13 intersections; 13 validations; 13 invalid; - new candidates; --> 0 UCCs
Translating UCC-tree into result format ...
... done! (2 UCCs)
Time: 815 ms



In [5]:
# List the discovered unique column combinations, one per line.
for unique_columns in keys:
    print(unique_columns)

['Report Date', 'FIPS']
['Report Date', 'Locality']


## Run using local Docker instance

In [6]:
# When running HyUCC using Docker, the worker that executes the HyUCC
# algorithm, the image identifier for the Docker container, and the
# name of the JAR file in the container need to be specified via the
# environment variables 'METANOME_WORKER', 'METANOME_CONTAINER' and
# 'METANOME_JARPATH'. The current values for these three variables
# can be accessed using the configuration helper functions WORKER,
# CONTAINER and JARFILE. These functions will return the default
# values if the environment variables are not set.

from openclean_metanome import config

# Print each setting as NAME=value for the current environment.
print(f'{config.METANOME_WORKER}={config.WORKER()}')
print(f'{config.METANOME_CONTAINER}={config.CONTAINER()}')
print(f'{config.METANOME_JARPATH}={config.JARFILE()}')

METANOME_WORKER=None
METANOME_CONTAINER=heikomueller/openclean-metanome:0.1.0
METANOME_JARPATH=/home/heiko/.cache/openclean_metanome/Metanome.jar


In [7]:
# Configuration parameters can also be set using a dictionary that
# is passed to the hyucc method. For example, when running HyUCC using
# the default container image 'heikomueller/openclean-metanome:0.1.0',
# the Jar-file path needs to be set to 'lib/Metanome.jar'.
# You can use the default worker configuration file that is provided
# with this repository to configure the Docker worker.

env = {
    # Relative path to the worker configuration file
    # (presumably resolved against the notebook's working directory).
    config.METANOME_WORKER: '../../config/docker_worker.yaml',
    config.METANOME_CONTAINER: 'heikomueller/openclean-metanome:0.1.0',
    config.METANOME_JARPATH: 'lib/Metanome.jar'
}

# Show the values that the helper functions resolve from `env`.
print(f'{config.METANOME_WORKER}={config.WORKER(env=env)}')
print(f'{config.METANOME_CONTAINER}={config.CONTAINER(env=env)}')
print(f'{config.METANOME_JARPATH}={config.JARFILE(env=env)}')

METANOME_WORKER={'name': 'docker_worker', 'type': 'docker'}
METANOME_CONTAINER=heikomueller/openclean-metanome:0.1.0
METANOME_JARPATH=lib/Metanome.jar


In [8]:
# Discover unique column combinations in the downloaded dataset with
# HyUCC, this time passing the Docker settings from the `env` dictionary
# that was defined in the preceding cell.

from openclean_metanome.algorithm.hyucc import hyucc
# NOTE(review): Docker is not used in the visible cells - confirm whether
# this import is still needed.
from openclean_metanome.config import Docker

keys = hyucc(
    df,
    max_ucc_size=3,
    env=env,
)

Metanome Data Profiling Wrapper - Version 0.1.0

Initializing ...
Reading data and calculating plis ...
Sorting plis by number of clusters ...
Inverting plis ...
Extracting integer representations for the records ...
Investigating comparison suggestions ... 
Sorting clusters ...(117ms)
Running initial windows ...(184ms)
Moving window over clusters ... 
Inducing UCC candidates ...
Validating UCCs using plis ...
	Level 1: 4 elements; (V)(C)(G); 0 intersections; 0 validations; 0 invalid; 0 new candidates; --> 0 UCCs
	Level 2: 9 elements; (V)(C)(G); 4 intersections; 4 validations; 2 invalid; 7 new candidates; --> 2 UCCs
Investigating comparison suggestions ... 
Moving window over clusters ... 
Inducing UCC candidates ...
Validating UCCs using plis ...
	Level 3: 13 elements; (V)(-)(-); 13 intersections; 13 validations; 13 invalid; - new candidates; --> 0 UCCs
Translating UCC-tree into result format ...
... done! (2 UCCs)
Time: 833 ms



In [9]:
# List the unique column combinations found by the Docker run, one per line.
for unique_columns in keys:
    print(unique_columns)

['Report Date', 'FIPS']
['Report Date', 'Locality']
