# Data Discovery in Data Lakes with BLEND

### Load libraries and define paths

In [None]:
import os
import sys
from pathlib import Path
import polars as pl
from tabulate import tabulate

In [None]:
# Folder holding the Modena data, relative to the notebook's working directory
data_path = Path("..") / "data" / "modena"

# show the absolute location and whether the folder is actually there
data_path.absolute(), data_path.exists()

Add the BLEND modules to the Python module search path (`sys.path`)

In [None]:
# Location of the BLEND sources, relative to the notebook's working directory
modules_path = Path("..", "modules")
blend_module_path = modules_path.joinpath("BLEND")

# Register the BLEND directory on the module search path. The membership
# check keeps sys.path free of duplicate entries when this cell is re-run.
blend_module_dir = str(blend_module_path.resolve())
if blend_module_dir not in sys.path:
    sys.path.append(blend_module_dir)
sys.path

In [None]:
# Entries of the data folder used below: the index file handed to BLEND,
# the folder with the data-lake tables and the folder with the query tables
db_path = data_path / "index_blend.db"
data_lake_path = data_path / "data-lake"
queries_path = data_path / "queries"

In [None]:
# check that the index file is present before it is used below
db_path.exists()

In [None]:
from blend import BLEND
from blend.utils import clean

### Instantiate BLEND index

In [None]:
# create the BLEND object from the index file located at db_path
index = BLEND(db_path)

### Load the query dataset

We have some datasets in the _queries_ folder:

In [None]:
# names of the entries of the queries folder, in sorted order
queries = sorted(os.listdir(queries_path))

# one name per line, with an empty line between consecutive names
print(*queries, sep='\n\n')

In [None]:
# position, in the sorted `queries` list, of the query table to work with
query_table_idx = 1
query_table_name = queries[query_table_idx]

# read the chosen CSV file into a polars DataFrame
qdf = pl.read_csv(queries_path / query_table_name)

qdf

## Keyword Search

In many use cases, one of the simplest and most useful kinds of data discovery task is the _keyword_ search.

Basically, we want to identify those datasets whose cell values, considered as a set, have the highest overlap with a user-given query set.

We don't check for any ordering on rows/columns, just the overlap.

In [None]:
# flatten the query dataframe to the set of its distinct cell values,
# clean each of them and keep the distinct cleaned values as a list
values = list(
    {
        clean(cell)
        for cell in {raw for row in qdf.rows() for raw in row}
    }
)

len(values)

In [None]:
# run the keyword search on the cleaned query values
# (k=20: presumably the number of results to return, confirm against BLEND)
results = index.keyword_search(values, k=20)

print(f"Query table: {query_table_name}\n")

# each result row is displayed under the headers (dataset, overlap)
print(tabulate(results, headers=['dataset', 'overlap']))

**Q:** For which use cases is a keyword search actually useful?

## Single Column JOIN Search

The dataset above has a single key column, _THE\_PK\_KEY_, which is the combination of _SECTION_, _DISTRICT_ and _ELECTION_.

Such a combination might be useful to retrieve related tables using BLEND.

In [None]:
# take the key column, clean every value and discard the null ones
column = (
    qdf.get_column('THE_PK_KEY')
    .map_elements(clean, pl.String)
    .drop_nulls()
)

# plain Python list, the form handed to the search below
values = column.to_list()

column.head(5)

Execute the search with BLEND, returning the 20 columns (`k=20`) with the highest overlap with the query. If we run the query several times with the same input, we 
should always see the same results (ties may appear in a different order).

In [None]:
# search the index for columns joinable with the cleaned key values
# (k=20: presumably the number of results to return, confirm against BLEND)
results = index.single_column_join_search(values, k=20)

print(f"Query table: {query_table_name}\n")
# each result row is displayed under four headers: dataset, column index and two overlap figures
print(tabulate(results, headers=['dataset', 'column idx', 'overlap (distinct)', 'overlap (general)']))

We can now easily identify the datasets we are most interested in; we can load them by accessing the results list and check
their content.

In [None]:
# load one of the returned tables from the data lake; the first field of a
# result is used as the CSV file name (without extension)
# NOTE(review): index -1 picks the LAST entry of `results`; if the list is
# ordered by decreasing overlap this is the weakest match. Confirm it is intended.
r_df = pl.read_csv(data_lake_path.joinpath(f"{results[-1][0]}.csv"))
r_df

## Multi-Column JOIN Search - Combination of single-JOIN searches

In many cases, a single column doesn't identify every record of a dataset, and a combination of different attributes is thus required.

Suppose that we no longer have the "THE_PK_KEY" column.

In [None]:
# remove the key column from the query table; re-running this cell on the
# already reduced `qdf` will fail because the column is gone
qdf = qdf.drop("THE_PK_KEY")
# preview of the remaining columns
qdf.head()

How can we search for joinable tables on the "SECTION", "DISTRICT", and "ELECTION" columns at the same time?

To identify joinable tables on multiple columns, can we run several single-column searches? Is this a good option?

In [None]:
# cleaned values of the SECTION column, searched on their own
values = (
    qdf.get_column('SECTION')
    .map_elements(clean, pl.String)
    .to_list()
)
results_section = index.single_column_join_search(values, k=10)

In [None]:
# cleaned values of the DISTRICT column, searched on their own
values = (
    qdf.get_column('DISTRICT')
    .map_elements(clean, pl.String)
    .to_list()
)
results_district = index.single_column_join_search(values, k=10)

In [None]:
# cleaned values of the ELECTION column, searched on their own
values = (
    qdf.get_column('ELECTION')
    .map_elements(clean, pl.String)
    .to_list()
)
results_election = index.single_column_join_search(values, k=10)

In [None]:
# defaultdict stays imported in case another cell relies on it
from collections import Counter, defaultdict


# For every table, count how many times it shows up across the three
# single-column result lists (a table returned for several columns of the
# same search is counted once per returned row).
# The generator variables are local to the expression, so the notebook-level
# `results` variable is left untouched by this cell.
aggregation = Counter(
    table
    for column_results in (results_section, results_district, results_election)
    for table, _, _, _ in column_results
)

In [None]:
# rank the (table, occurrences) pairs by occurrences, highest first,
# and keep the first ten of them
results = sorted(
    aggregation.items(),
    key=lambda entry: entry[1],
    reverse=True,
)[:10]

print(tabulate(results, ['dataset', 'occurrences']))

By combining results from different single-column searches we have some drawbacks in the end: 

- the order isn't always the same, 
- can be costly, if we need to run it on a high number of different columns,
- the alignment of the rows isn't guaranteed.

## Multi-Column JOIN Search - MATE algorithm

Instead, we can use a **multi-column search** approach. 

This is based on the MATE (Multi-Attribute Table Extraction) algorithm, which allows us to search for n-ary joins without any
other intermediate step.

In [None]:
# the query columns, in the order in which they are handed to the search
rows = qdf.select(['SECTION', 'DISTRICT', 'ELECTION'])

# clean every cell so that the values are in the same format used in the index;
# the result is one list of cleaned values per query row
values = [[clean(cell) for cell in row] for row in rows.rows()]

print(tabulate(values[:10]))

In [None]:
# The second argument (presumably the number of results to return) is 10,
# the same value used by the other multi-column searches in this notebook,
# so that the result sets compared further below have the same size.
mc_results = index.multi_column_join_search(values, 10, verbose=True)

In [None]:
# show the multi-column results under the headers (dataset, columns, join_score)
print(f"Query table: {query_table_name}\n")
print(tabulate(mc_results, headers=['dataset', 'columns', 'join_score']))

The order of the columns **doesn't affect** the final results, but might impact the efficiency (see Section 6.1 of the MATE paper if you are interested).

The final order in the top-K might slightly change, but overall the top-K tables are the same.

We can swap the columns used before:

In [None]:
# same three columns as before, with the first two swapped
rows = qdf.select(['DISTRICT', 'SECTION', 'ELECTION'])

# one list of cleaned cell values per query row
values = [[clean(cell) for cell in row] for row in rows.rows()]

print(tabulate(values[:5]))

In [None]:
# second run, on the query rows with the first two columns swapped
mc_results_v2 = index.multi_column_join_search(values, 10, verbose=True)

In [None]:
# the tables listed here are expected to match those obtained above
# (the next cell compares the two result sets explicitly)
print(tabulate(mc_results_v2, headers=['dataset', 'columns', 'join_score']))

In [None]:
# table names (first field of each result) returned by the two runs
tables_from_run_1 = {hit[0] for hit in mc_results}
tables_from_run_2 = {hit[0] for hit in mc_results_v2}

# number of tables in common, then the tables found only by run 1
# and the tables found only by run 2
(
    len(tables_from_run_1 & tables_from_run_2),
    tables_from_run_1 - tables_from_run_2,
    tables_from_run_2 - tables_from_run_1,
)

We can do another test with a different combination of the same three columns:

In [None]:
# distinct values found in the ELECTION column of the query table
qdf.get_column('ELECTION').unique()

In [None]:
# third ordering of the same three columns
rows = qdf.select(['ELECTION', 'DISTRICT', 'SECTION'])

# one list of cleaned cell values per query row
values = [[clean(cell) for cell in row] for row in rows.rows()]

print(tabulate(values[:3]))

In [None]:
# third run, this time with verbose output switched off
mc_results_v3 = index.multi_column_join_search(values, 10, verbose=False)

In [None]:
# table names (first field of each result) returned by each of the three runs
tables_from_run_1 = {hit[0] for hit in mc_results}
tables_from_run_2 = {hit[0] for hit in mc_results_v2}
tables_from_run_3 = {hit[0] for hit in mc_results_v3}

# how many tables were returned by all three runs
len(tables_from_run_1 & tables_from_run_2 & tables_from_run_3)

Another desideratum when working with multi-column join discovery is the possibility to easily add or discard columns.

For instance, we can add the "YEAR" column to our query:

In [None]:
# query rows extended with the YEAR column
rows = qdf.select(['SECTION', 'DISTRICT', 'YEAR', 'ELECTION']).rows()

# one list of cleaned cell values per query row
values = [[clean(cell) for cell in row] for row in rows]

results = index.multi_column_join_search(values, 10)

print(tabulate(results, ['dataset', 'columns', 'join_score']))

In [None]:
# query rows extended further with the DATA_TYPE column
rows = qdf.select(['SECTION', 'DISTRICT', 'YEAR', 'ELECTION', 'DATA_TYPE']).rows()

# one list of cleaned cell values per query row
values = [[clean(cell) for cell in row] for row in rows]

results = index.multi_column_join_search(values, 10)

print(tabulate(results, ['dataset', 'columns', 'join_score']))

# Example: Parkings and Shops

A user needs to check if shops in Modena have enough parking for people with disabilities: take the "Archi stradali" dataset, and from there identify datasets about parkings for people with disabilities and shops and join them to create a unified view of these data.

In [None]:
# switch to the first query table of the sorted `queries` list
query_table_idx = 0
query_table_name = queries[query_table_idx]
print(f"Query dataset: {query_table_name}")

# read the chosen CSV file into a polars DataFrame
qdf = pl.read_csv(queries_path / query_table_name)

qdf.head()

In [None]:
# distinct values of the DENOMINAZI column, each passed through clean()
road_names = [clean(name) for name in qdf.get_column('DENOMINAZI').unique()]

road_names[:3]

In [None]:
# search for columns joinable with the road names, asking for 50 results
results = index.single_column_join_search(road_names, 50)

# put the position of each result in front of its fields, so that the
# printed table shows the index to use when picking a dataset below
results = [(rank, *hit) for rank, hit in enumerate(results)]

print(
    tabulate(
        results,
        headers=['rank', 'dataset', 'column_idx', 'overlap(distinct)', 'overlap'],
    )
)

In [None]:
# load the table ranked 15 in the results printed above; field 1 of a ranked
# result is the dataset name, used as the CSV file name
# NOTE(review): the rank is hard-coded; presumably it is the disabled-parking
# dataset. Verify against the printed table, as the ranking may change.
parcheggi = pl.read_csv(data_lake_path.joinpath(f"{results[15][1]}.csv"), infer_schema_length=10_000)
parcheggi.head(1)

In [None]:
# load the table ranked 9 in the results printed above; field 1 of a ranked
# result is the dataset name, used as the CSV file name
# NOTE(review): the rank is hard-coded; presumably it is the shops dataset.
# Verify against the printed table, as the ranking may change.
attivita = pl.read_csv(data_lake_path.joinpath(f"{results[9][1]}.csv"), infer_schema_length=10_000)
attivita.head(1)

In [None]:
# get the join attribute from the first dataset
parcheggi_join_column = parcheggi.columns[results[15][2]]
parcheggi_join_column

In [None]:
# get the join attribute from the second dataset
attivita_join_column = attivita.columns[results[9][2]]
attivita_join_column

In [None]:
# perform the join between the two datasets
join = parcheggi.join(attivita, left_on=parcheggi_join_column, right_on=attivita_join_column)
join.head()

# Example: Correlated Expense

We have a query dataset about the average monthly household expense across a wide list of categories.

You have to identify datasets that join on its key attributes and that are also correlated with the expense information.

- Does any dataset appear more than once, with a different key attribute for the join-correlation search?

- Is there any significant shift in the mean in the dataset obtained after the join?

In [None]:
# switch to the third query table of the sorted `queries` list
query_table_idx = 2
query_table_name = queries[query_table_idx]
print(f"Query dataset: {query_table_name}")

# build the file path with pathlib, like every other cell of this notebook
qdf = pl.read_csv(queries_path.joinpath(query_table_name))

qdf

In [None]:
# using different key columns generates different results
join_key = 'YEAR'
# join_key = 'CATEGORY'

# group the dataframe by the key column and compute the mean on each group
values = qdf.group_by(join_key).agg(pl.col('AVG MONTH FAMILIAR EXPENSE (€)').mean())

values

In [None]:
# prepare the key and target columns: the keys are cast to strings and
# cleaned, the targets are the per-group means computed in the previous cell
keys = values.get_column(join_key).cast(pl.String).map_elements(clean, pl.String).to_list()
targets = values.get_column('AVG MONTH FAMILIAR EXPENSE (€)').to_list()

# the order of the results might change, but the overall result set is always the same
results = index.correlation_search(keys, targets, 10)

# each result row is displayed under the headers (dataset, join_idx, target_idx, QCR)
print(tabulate(results, ['dataset', 'join_idx', 'target_idx', 'QCR']))