In [126]:
!pip install duckdb==0.7.0




[notice] A new release of pip available: 22.3.1 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [127]:
import duckdb
import time
import pandas as pd

In [128]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [129]:
from py_duckdb.similarity_join import tokenizers
from py_duckdb.similarity_join import jaccard_join, jaccard_join_brute_force
from py_duckdb.similarity_join import evaluate

In [130]:
def join_fn_exec_time(n, join_fn, *args, **kwargs):
    exec_time = []
    for i in range(0, n):
        start_time = time.time()
        join_fn(*args, **kwargs)
        end_time = time.time()
        exec_time.append(end_time - start_time)
        print(end_time - start_time, 's')
    return exec_time

In [131]:
import numpy as np

def test_vs_brute_force(
        con: duckdb.DuckDBPyConnection,
        l_table: str,
        r_table: str,
        l_key_attr: str,
        r_key_attr: str,
        l_join_attr: str,
        r_join_attr: str,
        tokenizer: tokenizers.Tokenizer,
        threshold: float,
        out_table: str,
        n=1
):

    print("FILTERED EXECUTIONS")
    exec_times = join_fn_exec_time(
        n, jaccard_join,
        con, l_table, r_table, l_key_attr, r_key_attr, l_join_attr, r_join_attr, tokenizer, threshold, out_table
    )
    print("Average execution time:", np.average(exec_times))

    print()
    print("BRUTE FORCE EXECUTIONS")
    exec_times_bf = join_fn_exec_time(
        n, jaccard_join_brute_force,
        con, l_table, r_table, l_key_attr, r_key_attr, l_join_attr, r_join_attr, tokenizer, threshold, "bf_" + out_table
    )
    print("Average execution time:", np.average(exec_times_bf))

    print()
    cmp_join = con.execute(
        "select * "
        f"from {out_table} m "
        f"full outer join bf_{out_table} b "
        f"on (b.l_{l_key_attr} = m.l_{l_key_attr} and b.r_{r_key_attr} = m.r_{r_key_attr}) "
        f"or (b.l_{l_key_attr} = m.r_{r_key_attr} and b.r_{r_key_attr} = m.l_{l_key_attr}) "
        f"where m.l_{l_key_attr} is null "
        f"or b.l_{l_key_attr} is null"
    ).fetchall()
    if len(cmp_join) == 0:
        print("SUCCESS! Filtered join and Brute force join returned the same result")
    else:
        print("ERROR! There are mismatches between Filtered and Brute force joins:", cmp_join)

    return {
        'exec_time': exec_times,
        'exec_time_bf': exec_times_bf
    }

In [132]:
def test_confusion_mtx(
        con: duckdb.DuckDBPyConnection,
        l_table: str,
        r_table: str,
        l_key_attr: str,
        r_key_attr: str,
        l_join_attr: str,
        r_join_attr: str,
        tokenizer: tokenizers.Tokenizer,
        threshold: float,
        out_table: str,
        ground_truth_table: str,
        n=1
):
    exec_time = join_fn_exec_time(
        n, jaccard_join,
        con, l_table, r_table, l_key_attr, r_key_attr, l_join_attr, r_join_attr, tokenizer, threshold, out_table
    )
    print("Average execution time:", np.average(exec_time))
    print(evaluate(con, ground_truth_table, out_table))
    print()
    return exec_time

In [133]:
con = duckdb.connect(database=':memory:')

# Dummy dataset

In [134]:
con.execute("drop table if exists data").execute(
    "create table data as "
    "select * "
    "from 'data/purchases.csv'"
).execute("select * from data").fetchall()

[(1, 'Susan', 'Bible charger pan whisk'),
 (2, 'James', 'colander comb razor tuner whisk'),
 (3, 'Paul', 'charger colander comb pan razor'),
 (4, 'Robert', 'Bible headphones tuner'),
 (5, 'Mary', 'headphones charger colander comb pan'),
 (6, 'David', 'razor tuner whisk')]

In [135]:
l_key_attr = 'id'
r_key_attr = 'id'
l_join_attr = 'purchases'
r_join_attr = 'purchases'
tokenizer = tokenizers.WhitespaceTokzr()
threshold = 0.5
out_table = 'matches'
tokenizer.query('data', l_key_attr, l_join_attr)

"select id, len(tks) as len, lower(unnest(tks)) as token from ( select id, list_distinct(list_filter(str_split_regex(val, '[\t\n\r ]'), x -> trim(x) != '')) as tks from input ) "

In [136]:
jaccard_join(
    con, 'data', '', l_key_attr, r_key_attr, l_join_attr, r_join_attr, tokenizer, threshold, out_table
).execute(f"select * from {out_table}").fetchall()

[(3, 5), (6, 2)]

In [137]:
jaccard_join_brute_force(
    con, 'data', '', l_key_attr, r_key_attr, l_join_attr, r_join_attr, tokenizer, threshold, out_table
).execute(f"select * from {out_table}").fetchall()

[(2, 6), (3, 5)]

# 10K profiles

In [138]:
df10 = pd.read_json("data/10Kprofiles.json", lines=True, orient='records', typ='frame')

In [139]:
df10.head()

Unnamed: 0,realProfileID,date_of_birth,surname,address_1,street_number,postcode,soc_sec_id,suburb,phone_number,state,given_name,age,address_2
0,0,19390609.0,bishop,daley crescent,41.0,6050,4676841,batlow,08 29028996,qld,molly,31.0,
1,1,19041109.0,aidon,nambucca street,7.0,2002,3414163,devonort,08 75629459,vkf,whkt,,
2,2,19910711.0,anns,,12.0,2287,7844876,ivanhoe,02 11684110,vic,andrew,31.0,
3,3,19390709.0,whitrlsy,robson street,34.0,4065,2418360,christie downs,04 00323207,qld,shsne,,
4,4,19340328.0,roche,rankin street,9.0,3644,7577436,frenchs forest,08 53227250,qld,sophie,30.0,


In [140]:
to_concat = ", ' ', ".join(["date_of_birth", "surname", "address_1", "street_number", "postcode", "soc_sec_id", "suburb", "phone_number", "state", "given_name", "age", "address_2"])
to_concat

"date_of_birth, ' ', surname, ' ', address_1, ' ', street_number, ' ', postcode, ' ', soc_sec_id, ' ', suburb, ' ', phone_number, ' ', state, ' ', given_name, ' ', age, ' ', address_2"

In [141]:
con.execute("drop table if exists db10").execute(
    "CREATE TABLE db10 AS "
    f"SELECT realProfileID as id, concat ({to_concat}) as val "
    "FROM df10"
).execute("select * from db10").fetchall()

[(0,
  '19390609.0 bishop daley crescent 41.0 6050 4676841 batlow 08 29028996 qld molly 31.0 '),
 (1,
  '19041109.0 aidon nambucca street 7.0 2002 3414163 devonort 08 75629459 vkf whkt  '),
 (2,
  '19910711.0 anns  12.0 2287 7844876 ivanhoe 02 11684110 vic andrew 31.0 '),
 (3,
  '19390709.0 whitrlsy robson street 34.0 4065 2418360 christie downs 04 00323207 qld shsne  '),
 (4,
  '19340328.0 roche rankin street 9.0 3644 7577436 frenchs forest 08 53227250 qld sophie 30.0 '),
 (5,
  '19320811.0 fullgrabe beeston street 29.0 3131 6494586 broken hill 04 80080021 nsw emma 29.0 '),
 (6,
  '19601013.0 lodge mason street 48.0 5254 6098877 orchard hills 08 48143359 vic rourke 32.0 '),
 (7,
  '19340921.0 coleman edman close 116.0 2397 8858237 castlecrag 03 58778382  harry 26.0 '),
 (8,
  '19671108.0 bishop cromwell circuit 12.0 2226 1718686 harris park 04 01707833 vic jamie 28.0 homestead caravan park'),
 (9,
  '19331228.0 grcn lutana street 11.0 2074 8364236 newstead 02 57700508 ws lauren  '),
 

In [142]:
df10gt = pd.read_json("data/10KIdDuplicates.json", lines=True, orient='records', typ='frame')
df10gt.head()

Unnamed: 0,d1Id,d2Id
0,101,8933
1,4101,4614
2,7213,8466
3,4856,7485
4,4829,9725


In [143]:
con.execute("drop table if exists db10gt").execute(
    "CREATE TABLE db10gt AS "
    "SELECT d1Id as l_id, d2Id as r_id "
    "FROM df10gt "
).execute("select count(*) from db10gt").fetchall()

[(8705,)]

In [144]:
l_key_attr = 'id'
r_key_attr = 'id'
l_join_attr = 'val'
r_join_attr = 'val'
out_table = 'matches'

In [145]:
con.execute("drop table if exists db10_sample").execute(
    "CREATE TABLE db10_sample AS "
    "SELECT * "
    "FROM db10 "
    "using sample 5000"
).execute("select * from db10_sample").fetchall()

[(8714,
  '19900824.0 matthews kennerley street 27.0 6155 6410395 belmont 07 75697875 vic isabella 21.0 '),
 (1,
  '19041109.0 aidon nambucca street 7.0 2002 3414163 devonort 08 75629459 vkf whkt  '),
 (2,
  '19910711.0 anns  12.0 2287 7844876 ivanhoe 02 11684110 vic andrew 31.0 '),
 (8185,
  ' george fimister circuit 81.0 3944 6757147 summer hill   riley 38.0 mirrimer'),
 (4,
  '19340328.0 roche rankin street 9.0 3644 7577436 frenchs forest 08 53227250 qld sophie 30.0 '),
 (5,
  '19320811.0 fullgrabe beeston street 29.0 3131 6494586 broken hill 04 80080021 nsw emma 29.0 '),
 (7217,
  '19721113.0 de angelis lett place 61.0 3178 2171834 willetton 03 78093149 wa chloe 22.0 '),
 (5118,
  ' cameron lachlam dtreet 2.0 4700 3004860 mannering park 08 94351555 act lavsi 20.0 '),
 (8,
  '19671108.0 bishop cromwell circuit 12.0 2226 1718686 harris park 04 01707833 vic jamie 28.0 homestead caravan park'),
 (9,
  '19331228.0 grcn lutana street 11.0 2074 8364236 newstead 02 57700508 ws lauren  '),


In [146]:
con.execute("drop view if exists db10gt_sample").execute(
    "create view db10gt_sample as "
    "select gt.* "
    "from db10_sample s1, db10_sample s2, db10gt gt "
    "where s1.id = gt.l_id "
    "and s2.id = gt.r_id"
).execute("select count(*) from db10gt_sample").fetchall()

[(2144,)]

In [147]:
test_vs_brute_force(
    con, 'db10_sample', '', l_key_attr, r_key_attr, l_join_attr, r_join_attr,
    tokenizers.WhitespaceTokzr(),
    0.5,
    out_table, 1
)
evaluate(con, 'db10gt_sample', out_table)

FILTERED EXECUTIONS
0.16982388496398926 s
Average execution time: 0.16982388496398926

BRUTE FORCE EXECUTIONS
2.513153076171875 s
Average execution time: 2.513153076171875

SUCCESS! Filtered join and Brute force join returned the same result


{'tp': 315,
 'fp': 0,
 'fn': 1829,
 'pr': 1.0,
 'rc': 0.14692164179104478,
 'fm': 0.25620170801138675}

In [148]:
test_confusion_mtx(
    con, 'db10_sample', '', l_key_attr, r_key_attr, l_join_attr, r_join_attr,
    tokenizers.QGramsTokzr(5),
    0.5,
    out_table, 'db10gt_sample', 5
)

0.6451566219329834 s
0.6250758171081543 s
0.6551463603973389 s
0.6413712501525879 s
0.6258466243743896 s
Average execution time: 0.6385193347930909
{'tp': 453, 'fp': 0, 'fn': 1691, 'pr': 1.0, 'rc': 0.21128731343283583, 'fm': 0.34886407393145935}



[0.6451566219329834,
 0.6250758171081543,
 0.6551463603973389,
 0.6413712501525879,
 0.6258466243743896]

In [149]:
for t in [0.6, 0.5, 0.4, 0.3, 0.2]:
    print("threshold =", t)
    test_confusion_mtx(
        con, 'db10_sample', '', l_key_attr, r_key_attr, l_join_attr, r_join_attr,
        tokenizers.QGramsTokzr(5),
        # tokenizers.WordsTokzr("' '"),
        t, out_table, "db10gt_sample", 1
    )

threshold = 0.6
0.5980720520019531 s
Average execution time: 0.5980720520019531
{'tp': 108, 'fp': 0, 'fn': 2036, 'pr': 1.0, 'rc': 0.05037313432835821, 'fm': 0.09591474245115453}

threshold = 0.5
0.6299095153808594 s
Average execution time: 0.6299095153808594
{'tp': 453, 'fp': 0, 'fn': 1691, 'pr': 1.0, 'rc': 0.21128731343283583, 'fm': 0.34886407393145935}

threshold = 0.4
2.9553420543670654 s
Average execution time: 2.9553420543670654
{'tp': 946, 'fp': 0, 'fn': 1198, 'pr': 1.0, 'rc': 0.4412313432835821, 'fm': 0.6122977346278317}

threshold = 0.3
3.959946632385254 s
Average execution time: 3.959946632385254
{'tp': 1553, 'fp': 0, 'fn': 591, 'pr': 1.0, 'rc': 0.7243470149253731, 'fm': 0.8401406545847985}

threshold = 0.2
4.636536121368408 s
Average execution time: 4.636536121368408
{'tp': 2030, 'fp': 2, 'fn': 116, 'pr': 0.9990157480314961, 'rc': 0.9459459459459459, 'fm': 0.9717568214456679}



In [150]:
# test brute force vs filtered join performances
for t in [0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2]:
    print("threshold =", t)
    test_vs_brute_force(
        con, 'db10', '', l_key_attr, r_key_attr, l_join_attr, r_join_attr,
        tokenizers.WhitespaceTokzr(),
        t,
        out_table, 5
    )
    print(evaluate(con, 'db10gt', out_table))

threshold = 0.8
FILTERED EXECUTIONS
0.2740962505340576 s
0.257932186126709 s
0.249267578125 s
0.24943017959594727 s
0.25060415267944336 s
Average execution time: 0.25626606941223146

BRUTE FORCE EXECUTIONS
9.1779305934906 s
9.029018640518188 s
9.256737232208252 s
8.941887855529785 s
9.25169587135315 s
Average execution time: 9.131454038619996

SUCCESS! Filtered join and Brute force join returned the same result
{'tp': 1, 'fp': 0, 'fn': 8704, 'pr': 1.0, 'rc': 0.00011487650775416428, 'fm': 0.00022972662531587414}
threshold = 0.7
FILTERED EXECUTIONS
0.30730271339416504 s
0.2852025032043457 s
0.2891700267791748 s
0.2812349796295166 s
0.29786086082458496 s
Average execution time: 0.29215421676635744

BRUTE FORCE EXECUTIONS
11.598785400390625 s
13.404203414916992 s
9.627988815307617 s
9.066170930862427 s
9.325268745422363 s
Average execution time: 10.604483461380005

SUCCESS! Filtered join and Brute force join returned the same result
{'tp': 36, 'fp': 0, 'fn': 8669, 'pr': 1.0, 'rc': 0.004135

In [151]:
# test filtered join performances
for t in [0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2]:
    print("threshold =", t)
    test_confusion_mtx(
        con, 'db10', '', l_key_attr, r_key_attr, l_join_attr, r_join_attr,
        tokenizers.QGramsTokzr(5),
        t, out_table, "db10gt", 1
    )

threshold = 0.8
1.1847941875457764 s
Average execution time: 1.1847941875457764
{'tp': 3, 'fp': 0, 'fn': 8702, 'pr': 1.0, 'rc': 0.0003446295232624928, 'fm': 0.0006890215893431326}

threshold = 0.7
1.4650497436523438 s
Average execution time: 1.4650497436523438
{'tp': 48, 'fp': 0, 'fn': 8657, 'pr': 1.0, 'rc': 0.005514072372199885, 'fm': 0.010967668228036102}

threshold = 0.6
1.5715103149414062 s
Average execution time: 1.5715103149414062
{'tp': 487, 'fp': 0, 'fn': 8218, 'pr': 1.0, 'rc': 0.055944859276278, 'fm': 0.10596170583115753}

threshold = 0.5
1.737363338470459 s
Average execution time: 1.737363338470459
{'tp': 1928, 'fp': 0, 'fn': 6777, 'pr': 1.0, 'rc': 0.22148190695002873, 'fm': 0.3626445970093107}

threshold = 0.4
14.59551477432251 s
Average execution time: 14.59551477432251
{'tp': 3942, 'fp': 0, 'fn': 4763, 'pr': 1.0, 'rc': 0.45284319356691555, 'fm': 0.6233889459950976}

threshold = 0.3
14.710598468780518 s
Average execution time: 14.710598468780518
{'tp': 6343, 'fp': 0, 'fn': 

In [152]:
con.execute("drop table if exists db10")
con.execute("drop table if exists db10_sample")
con.execute("drop table if exists db10gt")
con.execute("drop view if exists db10gt_sample")
con.execute(f"drop table if exists {out_table}")
con.execute(f"drop table if exists bf_{out_table}")
con.execute("show tables").fetchall()

[('data',)]

# 10K Profiles: inner join 5K x 5K

In [153]:
df10 = pd.read_json("data/10Kprofiles.json", lines=True, orient='records', typ='frame')
# df10 = pd.read_json("data/test.json", lines=True, orient='records', typ='frame')
df10.head()

Unnamed: 0,realProfileID,date_of_birth,surname,address_1,street_number,postcode,soc_sec_id,suburb,phone_number,state,given_name,age,address_2
0,0,19390609.0,bishop,daley crescent,41.0,6050,4676841,batlow,08 29028996,qld,molly,31.0,
1,1,19041109.0,aidon,nambucca street,7.0,2002,3414163,devonort,08 75629459,vkf,whkt,,
2,2,19910711.0,anns,,12.0,2287,7844876,ivanhoe,02 11684110,vic,andrew,31.0,
3,3,19390709.0,whitrlsy,robson street,34.0,4065,2418360,christie downs,04 00323207,qld,shsne,,
4,4,19340328.0,roche,rankin street,9.0,3644,7577436,frenchs forest,08 53227250,qld,sophie,30.0,


In [154]:
head = 0
split_at = int(df10.shape[0] / 2)
tail = int(df10.shape[0])
df10_1 = df10.iloc[head:split_at, :]
df10_2 = df10.iloc[split_at:tail, :]
df10_1.shape

(5000, 13)

In [155]:
df10_2.shape

(5000, 13)

In [156]:
to_concat = ", ' ', ".join(
    ["date_of_birth", "surname", "address_1", "street_number", "postcode", "soc_sec_id", "suburb", "phone_number",
     "state", "given_name", "age", "address_2"])

In [157]:
con.execute("drop table if exists db10_1").execute(
    "CREATE TABLE db10_1 AS "
    f"SELECT realProfileID as id, concat ({to_concat}) as val "
    "FROM df10_1"
).execute("select * from db10_1").fetchall()

[(0,
  '19390609.0 bishop daley crescent 41.0 6050 4676841 batlow 08 29028996 qld molly 31.0 '),
 (1,
  '19041109.0 aidon nambucca street 7.0 2002 3414163 devonort 08 75629459 vkf whkt  '),
 (2,
  '19910711.0 anns  12.0 2287 7844876 ivanhoe 02 11684110 vic andrew 31.0 '),
 (3,
  '19390709.0 whitrlsy robson street 34.0 4065 2418360 christie downs 04 00323207 qld shsne  '),
 (4,
  '19340328.0 roche rankin street 9.0 3644 7577436 frenchs forest 08 53227250 qld sophie 30.0 '),
 (5,
  '19320811.0 fullgrabe beeston street 29.0 3131 6494586 broken hill 04 80080021 nsw emma 29.0 '),
 (6,
  '19601013.0 lodge mason street 48.0 5254 6098877 orchard hills 08 48143359 vic rourke 32.0 '),
 (7,
  '19340921.0 coleman edman close 116.0 2397 8858237 castlecrag 03 58778382  harry 26.0 '),
 (8,
  '19671108.0 bishop cromwell circuit 12.0 2226 1718686 harris park 04 01707833 vic jamie 28.0 homestead caravan park'),
 (9,
  '19331228.0 grcn lutana street 11.0 2074 8364236 newstead 02 57700508 ws lauren  '),
 

In [158]:
con.execute("drop table if exists db10_2").execute(
    "CREATE TABLE db10_2 AS "
    f"SELECT realProfileID as id, concat ({to_concat}) as val "
    "FROM df10_2"
).execute("select * from db10_2").fetchall()

[(5000,
  '19700823.0 wang  8.0 2875 2216347 blue haven 07 17987225  ameli 32.0 '),
 (5001,
  '19280314.0 scaror warramoo crescent 7.0 5752 4933901 moama 08 76672148 vl kiara 23.0 '),
 (5002,
  '19290324.0 colis alarmon crescent 79.0 3204 5111085 warren 03 66459383 act georgia 28.0 arthella'),
 (5003,
  ' wilkins learmonth drive 24.0 2050 5694132 miami 07 04918885 qld benjamin 21.0 hopeview'),
 (5004,
  '19240216.0 fraenkel launceston street 44.0 2088 3703441 dapto 08 57499031  timothy 30.0 '),
 (5005,
  '19120121.0 rees achernar close 61.0 7011 6346462 dapto 02 97724353 nsw kane 36.0 retirement village'),
 (5006,
  '19051490.0 binns wenholz treet  6006 2604157 dapto 03 68821645 sa lucinda 29.0 '),
 (5007,
  ' mccarthy millhouse crescent 35.0 6027 8045670 northmead 08 76825732 qld sophie 30.0 highgate'),
 (5008,
  '19250109.0 hearn stapylton street 1.0 6027 2267622 malvern east 03 64393123  tynan 29.0 '),
 (5009,
  '19791026.0 stuber saville close 75.0 4350 3716864 hay 02 96356289 nsw 

In [159]:
l_key_attr = 'id'
r_key_attr = 'id'
l_join_attr = 'val'
r_join_attr = 'val'
out_table = 'matches'

In [160]:
test_vs_brute_force(
    con, 'db10_1', 'db10_2', l_key_attr, r_key_attr, l_join_attr, r_join_attr,
    tokenizers.WhitespaceTokzr(),
    0.5,
    out_table,
    1
)

FILTERED EXECUTIONS
1.418555498123169 s
Average execution time: 1.418555498123169

BRUTE FORCE EXECUTIONS
4.715775728225708 s
Average execution time: 4.715775728225708

SUCCESS! Filtered join and Brute force join returned the same result


{'exec_time': [1.418555498123169], 'exec_time_bf': [4.715775728225708]}

In [161]:
df10gt = pd.read_json("data/10KIdDuplicates.json", lines=True, orient='records', typ='frame')
con.execute("drop table if exists db10gt").execute(
    "CREATE TABLE db10gt AS "
    "SELECT d1Id as l_id, d2Id as r_id "
    "FROM df10gt "
    f"where d1Id >= {head} and d1Id < {split_at} "
    f"and d2Id >= {split_at} and d2Id < {tail} "
).execute("select * from db10gt").fetchall()

[(101, 8933),
 (4856, 7485),
 (4829, 9725),
 (1289, 8634),
 (3059, 9179),
 (1884, 8409),
 (1295, 8153),
 (2498, 6623),
 (3670, 7648),
 (3064, 8794),
 (1689, 8239),
 (3699, 5250),
 (3875, 7035),
 (2704, 5934),
 (4040, 9733),
 (895, 8626),
 (1328, 5463),
 (96, 9418),
 (4486, 5499),
 (3252, 9615),
 (4858, 7396),
 (1083, 9433),
 (3267, 8384),
 (3105, 5454),
 (1921, 5428),
 (1483, 9012),
 (1282, 9311),
 (303, 8653),
 (4884, 5268),
 (687, 9557),
 (90, 9958),
 (4631, 9892),
 (1488, 8627),
 (1488, 8625),
 (3302, 5519),
 (4058, 8307),
 (2297, 7018),
 (701, 8420),
 (1299, 7941),
 (3065, 8829),
 (3454, 9318),
 (1502, 7488),
 (2860, 9462),
 (3279, 7464),
 (516, 7416),
 (3488, 6510),
 (3690, 6123),
 (3273, 7966),
 (2092, 7690),
 (2502, 6433),
 (2909, 5433),
 (1477, 9595),
 (684, 9881),
 (880, 9997),
 (4492, 5118),
 (3065, 8883),
 (2107, 6474),
 (4066, 7719),
 (511, 7876),
 (2118, 5569),
 (3850, 9273),
 (2878, 8024),
 (3087, 7062),
 (1510, 6894),
 (1890, 8133),
 (2117, 5676),
 (3282, 7290),
 (3676, 

In [162]:
test_confusion_mtx(
    con, 'db10_1', 'db10_2', l_key_attr, r_key_attr, l_join_attr, r_join_attr,
    tokenizers.WhitespaceTokzr(),
    0.5,
    out_table, 'db10gt',
    1
)

1.1871764659881592 s
Average execution time: 1.1871764659881592
{'tp': 662, 'fp': 0, 'fn': 3717, 'pr': 1.0, 'rc': 0.1511760675953414, 'fm': 0.26264630033723474}



[1.1871764659881592]

In [163]:
# test filtered join performances
for t in [0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2]:
    print("threshold =", t)
    test_confusion_mtx(
        con, 'db10_1', 'db10_2', l_key_attr, r_key_attr, l_join_attr, r_join_attr,
        tokenizers.WhitespaceTokzr(),
        t, out_table, "db10gt", 5
    )

threshold = 0.8
0.2402191162109375 s
0.2465498447418213 s
0.2373199462890625 s
0.24481868743896484 s
0.23926615715026855 s
Average execution time: 0.24163475036621093
{'tp': 0, 'fp': 0, 'fn': 4379, 'pr': 0, 'rc': 0, 'fm': 0}

threshold = 0.7
0.2737607955932617 s
0.35392332077026367 s
0.30709218978881836 s
0.2949237823486328 s
0.2816040515899658 s
Average execution time: 0.3022608280181885
{'tp': 15, 'fp': 0, 'fn': 4364, 'pr': 1.0, 'rc': 0.003425439598081754, 'fm': 0.006827492034592627}

threshold = 0.6
0.33925795555114746 s
0.34383082389831543 s
0.35071468353271484 s
0.3353126049041748 s
0.32963991165161133 s
Average execution time: 0.3397511959075928
{'tp': 127, 'fp': 0, 'fn': 4252, 'pr': 1.0, 'rc': 0.02900205526375885, 'fm': 0.056369285397248114}

threshold = 0.5
1.2534074783325195 s
1.1875863075256348 s
1.19337797164917 s
1.1873340606689453 s
1.1599750518798828 s
Average execution time: 1.1963361740112304
{'tp': 662, 'fp': 0, 'fn': 3717, 'pr': 1.0, 'rc': 0.1511760675953414, 'fm': 0.

In [164]:
con.execute("drop table if exists db10")
con.execute("drop table if exists db10_1")
con.execute("drop table if exists db10_2")
con.execute("drop table if exists db10gt")
con.execute(f"drop table if exists {out_table}")
con.execute(f"drop table if exists bf_{out_table}")
con.execute("show tables").fetchall()

[('data',)]

# Test case: larger Profiles datasets

In [165]:
df50 = pd.read_json("data/50Kprofiles.json", lines=True, orient='records', typ='frame')

In [166]:
con.execute("drop table if exists db50").execute(
    "CREATE TABLE db50 AS "
    f"SELECT realProfileID as id, concat ({to_concat}) as val "
    "FROM df50"
).execute("select * from db50").fetchall()

[(0,
  '19551130.0 reid jerrabomberra avenue 191.0 2115 7907036 tylden 02 57675204 nsw nicholas  gold tyne'),
 (1,
  '19821020.0 porra meyers place 2.0 3130 2705215 kingaroy 03 64681506 wa elle 34.0 '),
 (2,
  '19351010.0 pochec duterrau crescent 46.0 2785 2350343 lalor park 03 93693261 vic mitchell 29.0 '),
 (3,
  ' beaumaris ellenborough street 11.0 2285 6735143 innaloo 04 02935530 qld marianne 24.0 '),
 (4, ' ransin ahernuplace 2.0 4496 6721672 armiake  qld christian 38.0 '),
 (5,
  '19740410.0 forsbaw norman fisher circuit 5.0 2707 7456433 bayview hdeights  qph lesa 29.0 warrawong'),
 (6,
  '19960201.0 nguyen hansen circuit 44.0 5244 7748525  04 10399694 nsw caitlin 29.0 '),
 (7,
  '19607722.0 eichimnrger stockdale street 11.0 4224 8900629 cherrbqrook 04 99393660 qld isabrlla 36.0 st john of godzhospital'),
 (8,
  '19280419.0 hawes giles street 3.0 2197 4044861 brandon 04 54963323 vic michael 28.0 '),
 (9,
  '19010611.0 palecek goldner circuit 31.0 6102 8225306 latrobe 07 75127812 

In [167]:
df50gt = pd.read_json("data/50KIdDuplicates.json", lines=True, orient='records', typ='frame')
con.execute("drop table if exists db50gt").execute(
    "CREATE TABLE db50gt AS "
    "SELECT d1Id as l_id, d2Id as r_id "
    "FROM df50gt "
).execute("select count(*) from db50gt").fetchall()

[(43071,)]

In [168]:
test_confusion_mtx(
    con, 'db50', '', l_key_attr, r_key_attr, l_join_attr, r_join_attr,
    tokenizers.WhitespaceTokzr(),
    0.5,
    out_table, 'db50gt',
    5
)

1.1877882480621338 s
1.3156251907348633 s
1.4545297622680664 s
1.4198644161224365 s
1.450831651687622 s
Average execution time: 1.3657278537750244
{'tp': 6964, 'fp': 0, 'fn': 36107, 'pr': 1.0, 'rc': 0.16168651761045716, 'fm': 0.27836514439892074}



[1.1877882480621338,
 1.3156251907348633,
 1.4545297622680664,
 1.4198644161224365,
 1.450831651687622]

In [169]:
# test filtered join performances
for t in [0.8, 0.7, 0.6, 0.5, 0.4, 0.3]:
    print("threshold =", t)
    test_confusion_mtx(
        con, 'db50', '', l_key_attr, r_key_attr, l_join_attr, r_join_attr,
        tokenizers.WhitespaceTokzr(),
        t, out_table, "db50gt", 5
    )

threshold = 0.8
0.8631250858306885 s
0.9266903400421143 s
0.9174621105194092 s
0.9291422367095947 s
0.9507360458374023 s
Average execution time: 0.9174311637878418
{'tp': 8, 'fp': 0, 'fn': 43063, 'pr': 1.0, 'rc': 0.000185739824940215, 'fm': 0.00037141066412869374}

threshold = 0.7
1.0158491134643555 s
0.9884328842163086 s
0.9827101230621338 s
0.9677655696868896 s
0.9439880847930908 s
Average execution time: 0.9797491550445556
{'tp': 144, 'fp': 0, 'fn': 42927, 'pr': 1.0, 'rc': 0.00334331684892387, 'fm': 0.006664352655328011}

threshold = 0.6
1.271540641784668 s
1.0523426532745361 s
1.1075313091278076 s
1.0617046356201172 s
1.0438823699951172 s
Average execution time: 1.1074003219604491
{'tp': 1455, 'fp': 0, 'fn': 41616, 'pr': 1.0, 'rc': 0.0337814306610016, 'fm': 0.06535507344023717}

threshold = 0.5
1.2229657173156738 s
1.3875885009765625 s
1.4955265522003174 s
1.4247288703918457 s
1.378736972808838 s
Average execution time: 1.3819093227386474
{'tp': 6964, 'fp': 0, 'fn': 36107, 'pr': 1.

In [170]:
head = 0
split_at = 10000 # int(df50.shape[0] / 4)
tail = 20000 # int(df50.shape[0] / 2)
df50_1 = df50.iloc[head:split_at, :]
df50_2 = df50.iloc[split_at:tail, :]
to_concat = ", ' ', ".join(
    ["date_of_birth", "surname", "address_1", "street_number", "postcode", "soc_sec_id", "suburb", "phone_number",
     "state", "given_name", "age", "address_2"])
con.execute("drop table if exists db50_1").execute(
    "CREATE TABLE db50_1 AS "
    f"SELECT realProfileID as id, concat ({to_concat}) as val "
    "FROM df50_1"
).execute("select * from db50_1").fetchall()
con.execute("drop table if exists db50_2").execute(
    "CREATE TABLE db50_2 AS "
    f"SELECT realProfileID as id, concat ({to_concat}) as val "
    "FROM df50_2"
).execute("select * from db50_2").fetchall()

[(10000,
  '19905728.0 campbwll couchman crescent 627.0 2444 7022181 rowvillle 02 59770880 qld jessiea 35.0 '),
 (10001,
  '19771228.0 roche fenton street 8.0  7599960 lansvale 04 36866825 nsw kieren 27.0 '),
 (10002,
  '19320528.0  lalor street 8.0 2086 8634473 st ives 07 77381617 vic nicholas 34.0 '),
 (10003,
  '19970224.0 noble  39.0 3146 3747885 avoca 08 65210614 nsw jacob 9.0 kojonolokan hills'),
 (10004,
  '19591011.0 shepgemr d cloncurry street 80.0 5731 9650365 bayviewhehbghts 03 51619632 nsw amy 29.0 '),
 (10005,
  ' melhado  72.0 2450 5548429 eastern heights 03 01147279  fraser  st francis village'),
 (10006,
  '19660512.0 wilczek novar street 11.0 7301 4593954 apsley 02 85463355 vic lucy 31.0 '),
 (10007,
  '19929807.0 vincendt bauhinasyreet 12.0 4650 1355880 shoalxater 02 14140417 qld nikki  '),
 (10008,
  ' lovellok tauss place 8.0 3550 3131570 keilor east 08 13339663 nsw keeley 12.0 '),
 (10009,
  '19951202.0 rosan were skzreet 2.0 2719 1538013 mill park 02 03960620 nsw 

In [171]:
test_vs_brute_force(
    con, 'db50_1', 'db50_2', l_key_attr, r_key_attr, l_join_attr, r_join_attr,
    tokenizers.WhitespaceTokzr(),
    0.5,
    out_table,
    1
)

FILTERED EXECUTIONS
4.527768850326538 s
Average execution time: 4.527768850326538

BRUTE FORCE EXECUTIONS
22.97003746032715 s
Average execution time: 22.97003746032715

SUCCESS! Filtered join and Brute force join returned the same result


{'exec_time': [4.527768850326538], 'exec_time_bf': [22.97003746032715]}

In [172]:
con.execute("drop table if exists db50gt").execute(
    "CREATE TABLE db50gt AS "
    "SELECT d1Id as l_id, d2Id as r_id "
    "FROM df50gt "
    f"where d1Id >= {head} and d1Id < {split_at} "
    f"and d2Id >= {split_at} and d2Id < {tail} "
).execute("select * from db50gt").fetchall()

[(71, 11420),
 (825, 14420),
 (7978, 10560),
 (821, 14780),
 (7924, 15048),
 (3205, 13526),
 (7184, 10973),
 (2405, 14415),
 (4770, 14723),
 (3207, 13408),
 (3980, 14783),
 (6314, 17684),
 (7984, 10151),
 (4756, 15946),
 (849, 12552),
 (7178, 11542),
 (2389, 15840),
 (809, 15928),
 (9478, 17374),
 (3942, 18095),
 (2449, 10988),
 (3167, 16937),
 (1591, 16672),
 (8708, 15856),
 (5611, 10777),
 (5575, 13787),
 (7891, 18159),
 (7153, 13942),
 (5533, 17337),
 (3968, 16184),
 (5572, 14123),
 (7180, 11787),
 (3991, 14326),
 (7958, 12797),
 (7881, 19191),
 (2359, 18863),
 (7930, 15237),
 (2377, 17410),
 (3228, 12316),
 (7909, 17050),
 (57, 13409),
 (4031, 11263),
 (1671, 10569),
 (4722, 19497),
 (9486, 17311),
 (2379, 17368),
 (3946, 18415),
 (9528, 13894),
 (792, 18082),
 (7205, 10140),
 (2384, 17064),
 (7895, 18412),
 (8679, 18899),
 (9538, 13132),
 (2452, 11473),
 (771, 19963),
 (3221, 13219),
 (2357, 19408),
 (879, 11013),
 (8771, 11340),
 (9577, 10000),
 (9477, 18296),
 (7135, 16072),
 (4

In [173]:
test_confusion_mtx(
    con, 'db50_1', 'db50_2', l_key_attr, r_key_attr, l_join_attr, r_join_attr,
    tokenizers.WhitespaceTokzr(),
    0.2,
    out_table, 'db50gt',
    5
)

63.8821234703064 s
49.77408456802368 s
52.23413395881653 s
56.542542934417725 s
59.47773456573486 s
Average execution time: 56.38212389945984
{'tp': 5070, 'fp': 2133, 'fn': 499, 'pr': 0.7038733860891295, 'rc': 0.9103968396480517, 'fm': 0.7939242092076417}



[63.8821234703064,
 49.77408456802368,
 52.23413395881653,
 56.542542934417725,
 59.47773456573486]

In [174]:
# test filtered join performances
for t in [0.8, 0.7, 0.6, 0.5, 0.4, 0.3]:
    print("threshold =", t)
    test_confusion_mtx(
        con, 'db50_1', 'db50_2', l_key_attr, r_key_attr, l_join_attr, r_join_attr,
        tokenizers.WhitespaceTokzr(),
        t, out_table, "db50gt", 5
    )

threshold = 0.8
0.939723014831543 s
0.7036986351013184 s
0.7032003402709961 s
0.8241255283355713 s
0.7331748008728027 s
Average execution time: 0.7807844638824463
{'tp': 2, 'fp': 0, 'fn': 3434, 'pr': 1.0, 'rc': 0.0005820721769499418, 'fm': 0.0011634671320535197}

threshold = 0.7
0.7208728790283203 s
0.8122119903564453 s
0.8339519500732422 s
0.8139219284057617 s
0.8254051208496094 s
Average execution time: 0.8012727737426758
{'tp': 12, 'fp': 0, 'fn': 3424, 'pr': 1.0, 'rc': 0.0034924330616996507, 'fm': 0.0069605568445475635}

threshold = 0.6
4.353283882141113 s
4.0267393589019775 s
3.544285297393799 s
3.6083998680114746 s
3.5732038021087646 s
Average execution time: 3.821182441711426
{'tp': 113, 'fp': 0, 'fn': 3323, 'pr': 1.0, 'rc': 0.03288707799767171, 'fm': 0.06367990983375599}

threshold = 0.5
5.308811902999878 s
4.636166095733643 s
4.589712858200073 s
4.538482666015625 s
4.587726354598999 s
Average execution time: 4.732179975509643
{'tp': 580, 'fp': 0, 'fn': 2856, 'pr': 1.0, 'rc': 0.

In [175]:
con.execute("drop table if exists db50")
con.execute("drop table if exists db50_1")
con.execute("drop table if exists db50_2")
con.execute("drop table if exists db50gt")
con.execute(f"drop table if exists {out_table}")
con.execute(f"drop table if exists bf_{out_table}")
con.execute("show tables").fetchall()

[('data',)]

In [176]:
df100 = pd.read_json("data/100Kprofiles.json", lines=True, orient='records', typ='frame')

In [177]:
con.execute("drop table if exists db100").execute(
    "CREATE TABLE db100 AS "
    f"SELECT realProfileID as id, concat ({to_concat}) as val "
    "FROM df100"
).execute("select * from db100").fetchall()

[(0,
  '19190809.0 chandler pudney street 376.0 3764 3423802 hemmant 02 24496575 qld luke 35.0 arlington'),
 (1,
  '19300528.0 hawes  8.0 2168 2180735 keilor east 04 87308830 qld millane 36.0 northern tablelands tennis academy'),
 (2,
  '19300112.0 novadk rfreet rankin 1.0 3218 4272019 beulah park 03 39054375 qld jatmike 26.0 '),
 (3,
  '19950727.0 redmond davenport street 12.0 3030 5381275 kangaroo flat 07 69720733 qld madeleine 33.0 vinery'),
 (4,
  '19631123.0 zimmermann corona place 7.0 4030 2894006 greenacres 03 48393238 nsw kayden 31.0 glen elgin'),
 (5,
  '19350925.0 canbkl macfarland crescent 61.0 3164 5282937 chishmolm 02 01482240 nsw sammy 72.0 '),
 (6,
  '19650417.0 psorakis owen crescent 77.0 2452 2052783 mannum 07 48547017 wa laura  '),
 (7,
  '19230205.0 agius arthur circle 2.0 6125 8456442 marsden 02 54492364 qld indiana 32.0 '),
 (8,
  '19830325.0 hegger arabana street 2.0 4178 9175025 wellington point 04 21430300 qld jaykob  '),
 (9, '19930213.0 weetfa  6.0 3106 541836

In [178]:
df100gt = pd.read_json("data/100KIdDuplicates.json", lines=True, orient='records', typ='frame')
con.execute("drop table if exists db100gt").execute(
    "CREATE TABLE db100gt AS "
    "SELECT d1Id as l_id, d2Id as r_id "
    "FROM df100gt "
).execute("select count(*) from db100gt").fetchall()

[(85497,)]

In [179]:
test_confusion_mtx(
    con, 'db100', '', l_key_attr, r_key_attr, l_join_attr, r_join_attr,
    tokenizers.WhitespaceTokzr(),
    0.5,
    out_table, 'db100gt',
    1
)

2.9065659046173096 s
Average execution time: 2.9065659046173096
{'tp': 14024, 'fp': 0, 'fn': 71473, 'pr': 1.0, 'rc': 0.16402914722153994, 'fm': 0.2818299655349122}



[2.9065659046173096]

In [180]:
# test filtered join performances
for t in [0.8, 0.7, 0.6, 0.5, 0.4]:
    print("threshold =", t)
    test_confusion_mtx(
        con, 'db100', '', l_key_attr, r_key_attr, l_join_attr, r_join_attr,
        tokenizers.WhitespaceTokzr(),
        t, out_table, "db100gt", 1
    )

threshold = 0.8
2.1905157566070557 s
Average execution time: 2.1905157566070557
{'tp': 25, 'fp': 0, 'fn': 85472, 'pr': 1.0, 'rc': 0.00029240792074575715, 'fm': 0.0005846448866958209}

threshold = 0.7
2.027505874633789 s
Average execution time: 2.027505874633789
{'tp': 301, 'fp': 0, 'fn': 85196, 'pr': 1.0, 'rc': 0.0035205913657789163, 'fm': 0.007016480570642672}

threshold = 0.6
2.8833913803100586 s
Average execution time: 2.8833913803100586
{'tp': 2760, 'fp': 0, 'fn': 82737, 'pr': 1.0, 'rc': 0.032281834450331594, 'fm': 0.06254461402494987}

threshold = 0.5
3.385175943374634 s
Average execution time: 3.385175943374634
{'tp': 14024, 'fp': 0, 'fn': 71473, 'pr': 1.0, 'rc': 0.16402914722153994, 'fm': 0.2818299655349122}

threshold = 0.4
140.23859548568726 s
Average execution time: 140.23859548568726
{'tp': 30678, 'fp': 0, 'fn': 54819, 'pr': 1.0, 'rc': 0.3588196077055335, 'fm': 0.5281342801807618}



In [181]:
con.execute("drop table if exists db100")
con.execute("drop table if exists db100gt")
con.execute(f"drop table if exists {out_table}")
con.execute("show tables").fetchall()

[('data',)]