# Run HyFD Algorithm - Example

This notebook shows an example of running the functional dependency discovery algorithm HyFD, first using a local Java JRE and then using a local Docker instance.

In [1]:
# Example input: the Socrata dataset 'bre9-aqqr'. The loaded data is
# kept in 'df' and used by the cells that follow.

from openclean.data.source.socrata import Socrata
df = (
    Socrata()
    .dataset('bre9-aqqr')
    .load()
)

In [2]:
# Display the loaded dataset as the cell output.
df

Unnamed: 0,Report Date,FIPS,Locality,VDH Health District,Total Cases,Hospitalizations,Deaths
0,11/28/2020,51001,Accomack,Eastern Shore,1340,107,21
1,11/28/2020,51003,Albemarle,Thomas Jefferson,1896,100,27
2,11/28/2020,51005,Alleghany,Alleghany,316,19,7
3,11/28/2020,51007,Amelia,Piedmont,210,21,6
4,11/28/2020,51009,Amherst,Central Virginia,810,31,6
...,...,...,...,...,...,...,...
60510,06/14/2021,51800,Suffolk,Western Tidewater,7997,460,191
60511,06/14/2021,51810,Virginia Beach,Virginia Beach,36278,1700,412
60512,06/14/2021,51820,Waynesboro,Central Shenandoah,2395,73,38
60513,06/14/2021,51830,Williamsburg,Peninsula,769,29,13


## Run using local Java JRE

In [3]:
# Download the 'Metanome.jar' file if no copy exists on the
# local machine at the path that is defined by config.JARFILE().
# With verbose=True the call reports the target path and, as in the
# output below, whether the file already exists.

from openclean_metanome.download import download_jar

download_jar(verbose=True)

download jar file as /home/heiko/.cache/openclean_metanome/Metanome.jar
file exists


In [4]:
# Run the HyFD algorithm on the downloaded dataset. No 'env' argument
# is given here, so the current configuration settings apply.

from openclean_metanome.algorithm.hyfd import hyfd

# NOTE(review): max_lhs_size=3 presumably limits the number of attributes
# on the left-hand side of a discovered dependency -- confirm against the
# hyfd documentation.
fds = hyfd(df, max_lhs_size=3)

Metanome Data Profiling Wrapper - Version 0.1.0

Initializing ...
Reading data and calculating plis ...
Sorting plis by number of clusters ...
Inverting plis ...
Extracting integer representations for the records ...
Investigating comparison suggestions ... 
Sorting clusters ...(127ms)
Running initial windows ...(158ms)
Moving window over clusters ... 
Window signature: [2][2][1][1][1][1][1]
Inducing FD candidates ...
Validating FDs using plis ...
	Level 0: 1 elements; (V)(C)(G); 0 intersections; 0 validations; 0 invalid; 0 new candidates; --> 0 FDs
	Level 1: 6 elements; (V)(C)(G); 2 intersections; 2 validations; 0 invalid; 0 new candidates; --> 2 FDs
	Level 2: 9 elements; (V)(C)(G); 4 intersections; 16 validations; 6 invalid; 4 new candidates; --> 10 FDs
Investigating comparison suggestions ... 
Moving window over clusters ... 
Window signature: [2][2][2][1][1][1][1]
Inducing FD candidates ...
Validating FDs using plis ...
	Level 3: 10 elements; (V)(-)(-); 10 intersections; 26 validat

In [5]:
# Show the discovered functional dependencies, one per line.
# print() converts each dependency to its string form.

for fd in fds:
    print(fd)

[FIPS,Total Cases] -> [VDH Health District]
[Locality,Total Cases] -> [VDH Health District]
[Report Date,FIPS] -> [Total Cases]
[Report Date,FIPS] -> [Hospitalizations]
[Report Date,FIPS] -> [Deaths]
[Report Date,FIPS] -> [VDH Health District]
[Report Date,Locality] -> [Total Cases]
[Report Date,Locality] -> [Hospitalizations]
[Report Date,Locality] -> [Deaths]
[Report Date,Locality] -> [VDH Health District]
[FIPS] -> [Locality]
[Locality] -> [FIPS]


## Run using local Docker instance

In [6]:
# When running HyFD using Docker, the worker that executes the HyFD
# algorithm, the image identifier for the Docker container, and the
# name of the JAR file in the container need to be specified via the
# environment variables 'METANOME_WORKER', 'METANOME_CONTAINER' and
# 'METANOME_JARPATH'. The current values for these three variables
# can be accessed using the configuration helper functions WORKER,
# CONTAINER and JARFILE. These functions will return the default
# values if the environment variables are not set.

from openclean_metanome import config

# Print each setting as NAME=value, where NAME is the environment
# variable name stored in the corresponding config constant.
print(f'{config.METANOME_WORKER}={config.WORKER()}')
print(f'{config.METANOME_CONTAINER}={config.CONTAINER()}')
print(f'{config.METANOME_JARPATH}={config.JARFILE()}')

METANOME_WORKER=None
METANOME_CONTAINER=heikomueller/openclean-metanome:0.1.0
METANOME_JARPATH=/home/heiko/.cache/openclean_metanome/Metanome.jar


In [7]:
# Configuration parameters can also be set using a dictionary that
# is passed to the hyfd function. For example, when running HyFD using
# the default container image 'heikomueller/openclean-metanome:0.1.0',
# the Jar-file path needs to be set to 'lib/Metanome.jar'.
# The Docker() helper function is used to set the worker to be a Docker
# worker.
#
# NOTE: this cell relies on the 'config' module imported in the
# previous cell.

from openclean_metanome.config import Docker

env = {
    config.METANOME_WORKER: Docker(),
    config.METANOME_CONTAINER: 'heikomueller/openclean-metanome:0.1.0',
    config.METANOME_JARPATH: 'lib/Metanome.jar'
}

# Passing env=env to the helper functions shows the values taken from
# the dictionary (compare with the output below).
print(f'{config.METANOME_WORKER}={config.WORKER(env=env)}')
print(f'{config.METANOME_CONTAINER}={config.CONTAINER(env=env)}')
print(f'{config.METANOME_JARPATH}={config.JARFILE(env=env)}')

METANOME_WORKER={'name': '2b847d0399424550841bc3d847154b7e', 'type': 'docker', 'env': [], 'variables': []}
METANOME_CONTAINER=heikomueller/openclean-metanome:0.1.0
METANOME_JARPATH=lib/Metanome.jar


In [8]:
# Run the HyFD algorithm on the downloaded dataset using the Docker
# settings defined in the 'env' dictionary.

# hyfd was already imported for the local run; the import is repeated here.
from openclean_metanome.algorithm.hyfd import hyfd

# NOTE: this overwrites the 'fds' result of the local JRE run.
fds = hyfd(df, max_lhs_size=3, env=env)

Metanome Data Profiling Wrapper - Version 0.1.0

Initializing ...
Reading data and calculating plis ...
Sorting plis by number of clusters ...
Inverting plis ...
Extracting integer representations for the records ...
Investigating comparison suggestions ... 
Sorting clusters ...(114ms)
Running initial windows ...(166ms)
Moving window over clusters ... 
Window signature: [2][2][1][1][1][1][1]
Inducing FD candidates ...
Validating FDs using plis ...
	Level 0: 1 elements; (V)(C)(G); 0 intersections; 0 validations; 0 invalid; 0 new candidates; --> 0 FDs
	Level 1: 6 elements; (V)(C)(G); 2 intersections; 2 validations; 0 invalid; 0 new candidates; --> 2 FDs
	Level 2: 9 elements; (V)(C)(G); 4 intersections; 16 validations; 6 invalid; 4 new candidates; --> 10 FDs
Investigating comparison suggestions ... 
Moving window over clusters ... 
Window signature: [2][2][2][1][1][1][1]
Inducing FD candidates ...
Validating FDs using plis ...
	Level 3: 10 elements; (V)(-)(-); 10 intersections; 26 validat

In [9]:
# Show the functional dependencies discovered by the Docker run, one
# per line. print() converts each dependency to its string form.

for fd in fds:
    print(fd)

[FIPS,Total Cases] -> [VDH Health District]
[Locality,Total Cases] -> [VDH Health District]
[Report Date,FIPS] -> [Total Cases]
[Report Date,FIPS] -> [Hospitalizations]
[Report Date,FIPS] -> [Deaths]
[Report Date,FIPS] -> [VDH Health District]
[Report Date,Locality] -> [Total Cases]
[Report Date,Locality] -> [Hospitalizations]
[Report Date,Locality] -> [Deaths]
[Report Date,Locality] -> [VDH Health District]
[FIPS] -> [Locality]
[Locality] -> [FIPS]
