# Run HyFD Algorithm - Example

This notebook shows an example of how to run HyFD, an algorithm for discovering functional dependencies, on a data frame.

In [1]:
# Example input: load the Socrata dataset with identifier 'bre9-aqqr'
# into a data frame.

from openclean.data.source.socrata import Socrata

df = (
    Socrata()
    .dataset('bre9-aqqr')
    .load()
)

In [2]:
# Display the loaded data frame as the output of this cell.
df

Unnamed: 0,Report Date,FIPS,Locality,VDH Health District,Total Cases,Hospitalizations,Deaths
0,11/28/2020,51001,Accomack,Eastern Shore,1340,107,21
1,11/28/2020,51003,Albemarle,Thomas Jefferson,1896,100,27
2,11/28/2020,51005,Alleghany,Alleghany,316,19,7
3,11/28/2020,51007,Amelia,Piedmont,210,21,6
4,11/28/2020,51009,Amherst,Central Virginia,810,31,6
...,...,...,...,...,...,...,...
48407,03/15/2021,51800,Suffolk,Western Tidewater,7127,409,175
48408,03/15/2021,51810,Virginia Beach,Virginia Beach,31848,1342,352
48409,03/15/2021,51820,Waynesboro,Central Shenandoah,2185,69,34
48410,03/15/2021,51830,Williamsburg,Peninsula,550,24,11


## Run using a local Java Runtime Environment (JRE)

In [3]:
# Make sure a copy of 'Metanome.jar' is available locally. The file is
# only downloaded if it does not already exist at the path that is
# defined by config.JARFILE().

from openclean_metanome.download import download_jar

download_jar(
    verbose=True
)

download jar file as /home/heiko/.cache/openclean_metanome/Metanome.jar
file exists


In [4]:
# Discover functional dependencies in the downloaded dataset by running
# the HyFD algorithm on the data frame.

from openclean_metanome.algorithm.hyfd import hyfd

fds = hyfd(
    df,
    max_lhs_size=3
)

Metanome Data Profiling Wrapper - Version 0.1.0

Initializing ...
Reading data and calculating plis ...
Sorting plis by number of clusters ...
Inverting plis ...
Extracting integer representations for the records ...
Investigating comparison suggestions ... 
Sorting clusters ...(134ms)
Running initial windows ...(139ms)
Moving window over clusters ... 
Window signature: [2][2][1][1][1][1][1]
Inducing FD candidates ...
Validating FDs using plis ...
	Level 0: 1 elements; (V)(C)(G); 0 intersections; 0 validations; 0 invalid; 0 new candidates; --> 0 FDs
	Level 1: 6 elements; (V)(C)(G); 2 intersections; 2 validations; 0 invalid; 0 new candidates; --> 2 FDs
	Level 2: 9 elements; (V)(C)(G); 4 intersections; 16 validations; 6 invalid; 4 new candidates; --> 10 FDs
Investigating comparison suggestions ... 
Moving window over clusters ... 
Window signature: [2][2][2][1][1][1][1]
Inducing FD candidates ...
Validating FDs using plis ...
	Level 3: 10 elements; (V)(-)(-); 10 intersections; 26 validat

In [5]:
# Output one line per discovered functional dependency.

for fd in fds:
    # print() converts its argument via str() before writing it.
    print(fd)

[FIPS,Total Cases] -> [VDH Health District]
[Locality,Total Cases] -> [VDH Health District]
[Report Date,FIPS] -> [Total Cases]
[Report Date,FIPS] -> [Hospitalizations]
[Report Date,FIPS] -> [Deaths]
[Report Date,FIPS] -> [VDH Health District]
[Report Date,Locality] -> [Total Cases]
[Report Date,Locality] -> [Hospitalizations]
[Report Date,Locality] -> [Deaths]
[Report Date,Locality] -> [VDH Health District]
[FIPS] -> [Locality]
[Locality] -> [FIPS]


## Run using local Docker instance

In [6]:
# Set up the runtime configuration for running via Docker.
#
# Running the algorithms using Docker requires that the environment
# variable METANOME_WORKERS references a worker configuration file in
# which a docker worker is assigned to the Docker image
# 'heikomueller/openclean-metanome:0.1.0'. A different image can be
# selected via the environment variable 'METANOME_CONTAINER'.
#
# The package includes an example docker worker configuration file.

import openclean_metanome.config as config

env = {}
env[config.METANOME_WORKERS] = '../../config/docker_worker.yaml'
env[config.METANOME_CONTAINER] = 'heikomueller/openclean-metanome:0.1.0'

In [7]:
# Run the HyFD algorithm on the downloaded dataset using the Docker
# configuration.
#
# The `env` dictionary defined in the previous cell has to be passed to
# the call explicitly. Without it, this call is identical to the one in
# the local JRE section above and the Docker settings are never used.

from openclean_metanome.algorithm.hyfd import hyfd

fds = hyfd(df, max_lhs_size=3, env=env)

Metanome Data Profiling Wrapper - Version 0.1.0

Initializing ...
Reading data and calculating plis ...
Sorting plis by number of clusters ...
Inverting plis ...
Extracting integer representations for the records ...
Investigating comparison suggestions ... 
Sorting clusters ...(136ms)
Running initial windows ...(146ms)
Moving window over clusters ... 
Window signature: [2][2][1][1][1][1][1]
Inducing FD candidates ...
Validating FDs using plis ...
	Level 0: 1 elements; (V)(C)(G); 0 intersections; 0 validations; 0 invalid; 0 new candidates; --> 0 FDs
	Level 1: 6 elements; (V)(C)(G); 2 intersections; 2 validations; 0 invalid; 0 new candidates; --> 2 FDs
	Level 2: 9 elements; (V)(C)(G); 4 intersections; 16 validations; 6 invalid; 4 new candidates; --> 10 FDs
Investigating comparison suggestions ... 
Moving window over clusters ... 
Window signature: [2][2][2][1][1][1][1]
Inducing FD candidates ...
Validating FDs using plis ...
	Level 3: 10 elements; (V)(-)(-); 10 intersections; 26 validat

In [8]:
# Output one line per discovered functional dependency.

for fd in fds:
    # print() converts its argument via str() before writing it.
    print(fd)

[FIPS,Total Cases] -> [VDH Health District]
[Locality,Total Cases] -> [VDH Health District]
[Report Date,FIPS] -> [Total Cases]
[Report Date,FIPS] -> [Hospitalizations]
[Report Date,FIPS] -> [Deaths]
[Report Date,FIPS] -> [VDH Health District]
[Report Date,Locality] -> [Total Cases]
[Report Date,Locality] -> [Hospitalizations]
[Report Date,Locality] -> [Deaths]
[Report Date,Locality] -> [VDH Health District]
[FIPS] -> [Locality]
[Locality] -> [FIPS]
