In [1]:
!pip install duckdb




[notice] A new release of pip available: 22.3.1 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import duckdb
import time
import pandas as pd

In [3]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to imported code are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
from py_duckdb.similarity_join import tokenizers
from py_duckdb.similarity_join import jaccard_join, jaccard_join_brute_force
from py_duckdb.similarity_join import evaluate

In [5]:
def join_fn_exec_time(n, join_fn, *args, **kwargs):
    """Call ``join_fn`` ``n`` times and measure how long each call takes.

    Args:
        n: Number of times to invoke ``join_fn``. Values below 1 run nothing.
        join_fn: Callable to benchmark.
        *args: Positional arguments forwarded unchanged to every call.
        **kwargs: Keyword arguments forwarded unchanged to every call.

    Returns:
        A list with one elapsed duration (seconds, float) per call, in
        execution order. Each duration is also printed as it is measured.
    """
    exec_time = []
    for _ in range(n):
        # perf_counter() is monotonic and uses the highest-resolution clock
        # available; time.time() follows the wall clock, which can be adjusted
        # mid-run and is too coarse for millisecond-scale joins on some systems.
        start_time = time.perf_counter()
        join_fn(*args, **kwargs)
        elapsed = time.perf_counter() - start_time
        exec_time.append(elapsed)
        print(elapsed, 's')
    return exec_time

In [6]:
import numpy as np

def test_vs_brute_force(
        con: duckdb.DuckDBPyConnection,
        l_table: str,
        r_table: str,
        l_key_attr: str,
        r_key_attr: str,
        l_join_attr: str,
        r_join_attr: str,
        tokenizer: tokenizers.Tokenizer,
        threshold: float,
        out_table_name: str,
        n=1
):
    """Time the filtered Jaccard join against the brute-force join and compare them.

    ``jaccard_join`` and then ``jaccard_join_brute_force`` are each executed
    ``n`` times with the same arguments; every run time and the per-variant
    average are printed. The filtered join receives ``out_table_name`` as its
    output table name and the brute-force join ``"bf_" + out_table_name``.
    Afterwards the two tables are compared on their ``l_rid`` / ``r_rid``
    columns, treating ``(a, b)`` and ``(b, a)`` as the same pair, and a
    SUCCESS or ERROR line is printed.

    Args:
        con: DuckDB connection used for the joins and for the comparison query.
        l_table: Left input table, forwarded to both join functions.
        r_table: Right input table, forwarded to both join functions.
        l_key_attr: Key column of the left table, forwarded to both joins.
        r_key_attr: Key column of the right table, forwarded to both joins.
        l_join_attr: Join column of the left table, forwarded to both joins.
        r_join_attr: Join column of the right table, forwarded to both joins.
        tokenizer: Tokenizer instance forwarded to both joins.
        threshold: Similarity threshold forwarded to both joins.
        out_table_name: Output table name for the filtered join; the
            brute-force output uses the same name prefixed with ``bf_``.
        n: Number of timed runs per variant; must be at least 1.

    Returns:
        Dict with keys ``'exec_time'`` (filtered join) and ``'exec_time_bf'``
        (brute-force join), each a list of ``n`` run times.

    Raises:
        ValueError: If ``n`` is less than 1.
    """
    if n < 1:
        # Without at least one run neither output table is (re)built, so the
        # comparison below would either fail or silently compare stale tables
        # left behind by an earlier call and report a misleading SUCCESS.
        raise ValueError("n must be at least 1")

    print("FILTERED EXECUTIONS")
    exec_times = join_fn_exec_time(
        n, jaccard_join,
        con, l_table, r_table, l_key_attr, r_key_attr, l_join_attr, r_join_attr, tokenizer, threshold, out_table_name
    )
    print("Average execution time:", np.average(exec_times))

    print()
    print("BRUTE FORCE EXECUTIONS")
    exec_times_bf = join_fn_exec_time(
        n, jaccard_join_brute_force,
        con, l_table, r_table, l_key_attr, r_key_attr, l_join_attr, r_join_attr, tokenizer, threshold, "bf_" + out_table_name
    )
    print("Average execution time:", np.average(exec_times_bf))

    print()
    # Full outer join of the two result tables, matching a pair in either
    # orientation. Any row left with a NULL side exists in only one of the
    # results, i.e. it is a mismatch.
    cmp_join = con.execute(
        "select * "
        f"from {out_table_name} m "
        f"full outer join bf_{out_table_name} b "
        "on (b.l_rid = m.l_rid and b.r_rid = m.r_rid) "
        "or (b.l_rid = m.r_rid and b.r_rid = m.l_rid) "
        "where m.l_rid is null "
        "or b.l_rid is null"
    ).fetchall()
    if len(cmp_join) == 0:
        print("SUCCESS! Filtered join and Brute force join returned the same result")
    else:
        print("ERROR! There are mismatches between Filtered and Brute force joins:", cmp_join)

    return {
        'exec_time': exec_times,
        'exec_time_bf': exec_times_bf
    }

In [7]:
def test_confusion_mtx(
        con: duckdb.DuckDBPyConnection,
        l_table: str,
        r_table: str,
        l_key_attr: str,
        r_key_attr: str,
        l_join_attr: str,
        r_join_attr: str,
        tokenizer: tokenizers.Tokenizer,
        threshold: float,
        out_table_name: str,
        ground_truth_table: str,
        n=1
):
    """Time the filtered Jaccard join and score its output against a ground truth.

    ``jaccard_join`` is executed ``n`` times with the given arguments; each
    run time and the average are printed. The result of
    ``evaluate(con, ground_truth_table, out_table_name)`` is then printed.

    Args:
        con: DuckDB connection used for the join and the evaluation.
        l_table: Left input table, forwarded to ``jaccard_join``.
        r_table: Right input table, forwarded to ``jaccard_join``.
        l_key_attr: Key column of the left table.
        r_key_attr: Key column of the right table.
        l_join_attr: Join column of the left table.
        r_join_attr: Join column of the right table.
        tokenizer: Tokenizer instance forwarded to ``jaccard_join``.
        threshold: Similarity threshold forwarded to ``jaccard_join``.
        out_table_name: Output table name given to ``jaccard_join`` and then
            passed to ``evaluate``.
        ground_truth_table: Table of expected matches passed to ``evaluate``.
        n: Number of timed runs; must be at least 1.

    Returns:
        List of ``n`` run times, one per execution of the join.

    Raises:
        ValueError: If ``n`` is less than 1.
    """
    if n < 1:
        # With zero runs the join never writes out_table_name, so evaluate()
        # would score whatever table an earlier call left behind (or fail).
        raise ValueError("n must be at least 1")

    exec_time = join_fn_exec_time(
        n, jaccard_join,
        con, l_table, r_table, l_key_attr, r_key_attr, l_join_attr, r_join_attr, tokenizer, threshold, out_table_name
    )
    print("Average execution time:", np.average(exec_time))
    print(evaluate(con, ground_truth_table, out_table_name))
    print()
    return exec_time

In [8]:
# Single in-memory DuckDB connection used by every cell below.
con = duckdb.connect(database=':memory:')

# Test case: Actors

In [22]:
# (Re)build src1 from the first Actors CSV: `rid` is the record id and `val`
# is given name, surname and date of birth joined with single spaces.
con.execute("drop table if exists src1")
con.execute(
    "CREATE TABLE src1 AS "
    "SELECT id as rid, concat(given_name, ' ', surname, ' ', date_of_birth) as val "
    "FROM 'data/S1_clean_.csv'"
)
# Last expression: show the loaded rows as the cell output.
con.execute("select * from src1").fetchall()

[('S1_0', 'joshua morrison 19101123'),
 ('S1_1', 'jordan white 19371126'),
 ('S1_2', 'emmerson lock 19211129'),
 ('S1_3', 'alexandra grosser 19720305'),
 ('S1_4', 'michael wuchatsch 19190110'),
 ('S1_5', 'emmerson loyck 19211129'),
 ('S1_6', 'rhys schuetz 19440909'),
 ('S1_7', 'joshua greenj 19790110'),
 ('S1_8', 'olivia hobson 19760812'),
 ('S1_9', 'michael lierach 19360816'),
 ('S1_10', 'elisabett domiten 19081008'),
 ('S1_11', 'genoveffa hylander 19071008')]

In [23]:
# (Re)build src2 from the second Actors CSV: `rid` is the record id and `val`
# is given name, surname and date of birth joined with single spaces.
con.execute("drop table if exists src2")
con.execute(
    "CREATE TABLE src2 AS "
    "SELECT id as rid, concat(given_name, ' ', surname, ' ', date_of_birth) as val "
    "FROM 'data/S2_clean_.csv'"
)
# Last expression: show the loaded rows as the cell output.
con.execute("select * from src2").fetchall()

[('S2_0', 'braecon schuetz 19440909'),
 ('S2_1', 'alexandra grosvenor 19930305'),
 ('S2_2', 'michael liersch 19360816'),
 ('S2_3', 'emmeron loyk 19321129'),
 ('S2_4', 'olivia hobson 19760812'),
 ('S2_5', 'joshua green 19010219'),
 ('S2_6', 'charlotte hyland 19340909'),
 ('S2_7', 'elisabet domitienn 19071008')]

In [24]:
# (Re)build src3 from the third Actors CSV: `rid` is the record id and `val`
# is given name, surname and date of birth joined with single spaces.
con.execute("drop table if exists src3")
con.execute(
    "CREATE TABLE src3 AS "
    "SELECT id as rid, concat(given_name, ' ', surname, ' ', date_of_birth) as val "
    "FROM 'data/S3_clean_.csv'"
)
# Last expression: show the loaded rows as the cell output.
con.execute("select * from src3").fetchall()

[('S3_0', 'emmerson loyck 19211129'),
 ('S3_1', 'michel wuchatsch 19190110'),
 ('S3_3', 'liersch michael 19360816'),
 ('S3_4', 'charlotte hyland 19460401'),
 ('S3_5', 'braedon schuetz 19440909'),
 ('S3_6', 'olivia hobson 19760812'),
 ('S3_7', 'joshua green 19790110'),
 ('S3_8', 'keely clarke 19050410'),
 ('S3_9', 'joshua morriosn 19101123'),
 ('S3_11', 'genovefa hyllande 19071008')]

In [25]:
# (Re)create the srcall view as the union of the three source tables, so a
# single relation can be passed to the join functions.
con.execute("drop view if exists srcall")
con.execute(
    "create view srcall as "
    "select * from src1 "
    "union "
    "select * from src2 "
    "union "
    "select * from src3 "
)
# Last expression: show the combined rows as the cell output.
con.execute("select * from srcall").fetchall()

[('S3_0', 'emmerson loyck 19211129'),
 ('S3_1', 'michel wuchatsch 19190110'),
 ('S3_3', 'liersch michael 19360816'),
 ('S3_4', 'charlotte hyland 19460401'),
 ('S3_5', 'braedon schuetz 19440909'),
 ('S3_6', 'olivia hobson 19760812'),
 ('S3_7', 'joshua green 19790110'),
 ('S3_8', 'keely clarke 19050410'),
 ('S3_9', 'joshua morriosn 19101123'),
 ('S3_11', 'genovefa hyllande 19071008'),
 ('S2_0', 'braecon schuetz 19440909'),
 ('S2_1', 'alexandra grosvenor 19930305'),
 ('S2_2', 'michael liersch 19360816'),
 ('S2_3', 'emmeron loyk 19321129'),
 ('S2_4', 'olivia hobson 19760812'),
 ('S2_5', 'joshua green 19010219'),
 ('S2_6', 'charlotte hyland 19340909'),
 ('S2_7', 'elisabet domitienn 19071008'),
 ('S1_0', 'joshua morrison 19101123'),
 ('S1_1', 'jordan white 19371126'),
 ('S1_2', 'emmerson lock 19211129'),
 ('S1_3', 'alexandra grosser 19720305'),
 ('S1_4', 'michael wuchatsch 19190110'),
 ('S1_5', 'emmerson loyck 19211129'),
 ('S1_6', 'rhys schuetz 19440909'),
 ('S1_7', 'joshua greenj 19790110'

In [12]:
# Arguments shared by the join calls of the Actors test case.
l_table, r_table = 'src1', 'src2'

# Both sides use the same key and join column names.
l_key_attr = r_key_attr = 'rid'
l_join_attr = r_join_attr = 'val'

# Alternative tokenizer: tokenizers.WhitespaceTokzr()
tokenizer = tokenizers.QGramsTokzr(3)
threshold = 0.5
out_table_name = 'matches'

In [29]:
# Actors, all sources in one relation, 10 timed runs per variant.
# NOTE(review): r_table='' presumably requests a self-join of l_table --
# confirm against jaccard_join.
test_vs_brute_force(
    con=con,
    l_table='srcall',
    r_table='',
    l_key_attr=l_key_attr,
    r_key_attr=r_key_attr,
    l_join_attr=l_join_attr,
    r_join_attr=r_join_attr,
    tokenizer=tokenizer,
    threshold=threshold,
    out_table_name=out_table_name,
    n=10,
)

FILTERED EXECUTIONS
0.01784062385559082 s
0.013273000717163086 s
0.014365196228027344 s
0.0134429931640625 s
0.013105630874633789 s
0.009041786193847656 s
0.010432004928588867 s
0.01109933853149414 s
0.009489774703979492 s
0.012964725494384766 s
Average execution time: 0.012505507469177246

BRUTE FORCE EXECUTIONS
0.0075550079345703125 s
0.007014751434326172 s
0.005369424819946289 s
0.004504680633544922 s
0.00493168830871582 s
0.005156278610229492 s
0.004375457763671875 s
0.0049860477447509766 s
0.004986286163330078 s
0.005168914794921875 s
Average execution time: 0.005404853820800781

SUCCESS! Filtered join and Brute force join returned the same result


{'exec_time': [0.01784062385559082,
  0.013273000717163086,
  0.014365196228027344,
  0.0134429931640625,
  0.013105630874633789,
  0.009041786193847656,
  0.010432004928588867,
  0.01109933853149414,
  0.009489774703979492,
  0.012964725494384766],
 'exec_time_bf': [0.0075550079345703125,
  0.007014751434326172,
  0.005369424819946289,
  0.004504680633544922,
  0.00493168830871582,
  0.005156278610229492,
  0.004375457763671875,
  0.0049860477447509766,
  0.004986286163330078,
  0.005168914794921875]}

In [30]:
# Inspect the schema of the filtered join's output table.
con.execute("describe matches").fetchall()

[('l_rid', 'VARCHAR', 'YES', None, None, None),
 ('r_rid', 'VARCHAR', 'YES', None, None, None)]

In [33]:
# Actors, src1 joined with src2, 10 timed runs per variant.
test_vs_brute_force(
    con=con,
    l_table='src1',
    r_table='src2',
    l_key_attr=l_key_attr,
    r_key_attr=r_key_attr,
    l_join_attr=l_join_attr,
    r_join_attr=r_join_attr,
    tokenizer=tokenizer,
    threshold=threshold,
    out_table_name=out_table_name,
    n=10,
)

FILTERED EXECUTIONS
0.030547618865966797 s
0.015850067138671875 s
0.020978212356567383 s
0.02318263053894043 s
0.015573501586914062 s
0.016284465789794922 s
0.01708698272705078 s
0.015262365341186523 s
0.01876521110534668 s
0.016921520233154297 s
Average execution time: 0.019045257568359376

BRUTE FORCE EXECUTIONS
0.0060024261474609375 s
0.004806041717529297 s
0.004278421401977539 s
0.004548311233520508 s
0.0044994354248046875 s
0.005831718444824219 s
0.005620002746582031 s
0.006848573684692383 s
0.004189252853393555 s
0.00578761100769043 s
Average execution time: 0.005241179466247558

SUCCESS! Filtered join and Brute force join returned the same result


{'exec_time': [0.030547618865966797,
  0.015850067138671875,
  0.020978212356567383,
  0.02318263053894043,
  0.015573501586914062,
  0.016284465789794922,
  0.01708698272705078,
  0.015262365341186523,
  0.01876521110534668,
  0.016921520233154297],
 'exec_time_bf': [0.0060024261474609375,
  0.004806041717529297,
  0.004278421401977539,
  0.004548311233520508,
  0.0044994354248046875,
  0.005831718444824219,
  0.005620002746582031,
  0.006848573684692383,
  0.004189252853393555,
  0.00578761100769043]}

In [34]:
# Inspect the schema of the brute-force join's output table.
con.execute("describe bf_matches").fetchall()

[('l_rid', 'VARCHAR', 'YES', None, None, None),
 ('r_rid', 'VARCHAR', 'YES', None, None, None)]

# Test case: NCVR

In [59]:
# Argument list for SQL concat(): the NCVR columns with a literal ' '
# between each pair, so the values end up space-separated in one string.
to_concat = ", ' ', ".join((
    "entity",
    "rec_id",
    "first_name",
    "last_name",
    "sex",
    "age",
    "birth_place",
    "house_num",
    "county_desc",
    "street_name",
    "zip_code",
    "phone_num",
))
to_concat

"entity, ' ', rec_id, ' ', first_name, ' ', last_name, ' ', sex, ' ', age, ' ', birth_place, ' ', house_num, ' ', county_desc, ' ', street_name, ' ', zip_code, ' ', phone_num"

In [60]:
# (Re)build src1 from the NCVR "AF" CSV; `val` concatenates the columns
# listed in to_concat.
con.execute("drop table if exists src1")
con.execute(
    "CREATE TABLE src1 AS "
    f"SELECT id as rid, concat({to_concat}) as val "
    "FROM 'data/NCVR_AF_clean.csv'"
)
# Last expression: show the loaded rows as the cell output.
con.execute("select * from src1").fetchall()

[('0_22_9865350',
  '22 9865350 whitney baker female 29 in 400 orange poplar 27510  '),
 ('0_40_12768214',
  '40 12768214 abbington pope female 23 nc 1221 wake westview 27605  '),
 ('0_122_9112102',
  '122 9112102 rebecca wilkins female 49 nc 811 new hanover magnolia 28428 8120512'),
 ('0_140_9704280',
  '140 9704280 justin brown male 34 ca 3225 orange us hwy 70 27243  '),
 ('0_222_3198122',
  '222 3198122 stephanie eissens female 38 nc 100 durham village circle 27713  '),
 ('0_240_94472',
  '240 94472 danielle peschon female 28 tn 1049 alamance kelso 27215 6758354'),
 ('0_251_6640272',
  '251 6640272 michelle hinnant female 45 nc 130 johnston braswell 27577 2024737'),
 ('0_340_10179287',
  '340 10179287 mark caccio male 64 ct 213 pender scottsdale 28411 6008095'),
 ('0_351_6376559',
  '351 6376559 nancy hoover female 55 ny 618 iredell isle of pines 28117  '),
 ('0_451_1422321',
  '451 1422321 thomas johnson male 29 nc 501 caswell cherry grove 27379 2696260'),
 ('0_522_12340214',
  '52

In [61]:
# (Re)build src2 from the NCVR "BF" CSV; `val` concatenates the columns
# listed in to_concat.
con.execute("drop table if exists src2")
con.execute(
    "CREATE TABLE src2 AS "
    f"SELECT id as rid, concat({to_concat}) as val "
    "FROM 'data/NCVR_BF_clean.csv'"
)
# Last expression: show the loaded rows as the cell output.
con.execute("select * from src2").fetchall()

[('1_40_13913995',
  '40 13913995 abbington pope female 23 nc 104 wayne breezewood 27534  '),
 ('1_41_520429',
  '41 520429 angelon smith female 40 nc 119 buncombe flint 28801  '),
 ('1_122_9000404',
  '122 9000404 rebecca wilkins female 49   1501 nash lafayette 27803  '),
 ('1_140_9350108',
  '140 9350108 justin brown male 34 ca 211 new hanover queen 28401 7417817'),
 ('1_222_13226442',
  '222 13226442 stephanie eissens meacomes female 38 nc 4033 wake enfield ridge 27519  '),
 ('1_240_116265',
  '240 116265 danielle peschon female 28   730 alamance boone station 27215 2789980'),
 ('1_241_8517073',
  '241 8517073 lovie matthews female 26 tn 5634 mecklenburg via romano 28270  '),
 ('1_322_6614729',
  '322 6614729 sandra creech female 52 nc 1347 johnston crocker 27577 4645707'),
 ('1_340_9418808',
  '340 9418808 mark caccio male 64 ct 605 onslow windsong north 28584 6008095'),
 ('1_341_6619987',
  '341 6619987 amanda spencer female 37 nc 607 johnston preston 27576  '),
 ('1_422_10444451'

In [62]:
# (Re)build src3 from the NCVR "CF" CSV; `val` concatenates the columns
# listed in to_concat.
con.execute("drop table if exists src3")
con.execute(
    "CREATE TABLE src3 AS "
    f"SELECT id as rid, concat({to_concat}) as val "
    "FROM 'data/NCVR_CF_clean.csv'"
)
# Last expression: show the loaded rows as the cell output.
con.execute("select * from src3").fetchall()

[('2_22_3326652',
  '22 3326652 whitney baker female 29 in 709 durham green 27701  '),
 ('2_31_3904957',
  '31 3904957 latonya mciver female 40 va 30 forsyth glenwood 27106 9166714'),
 ('2_131_12315539',
  '131 12315539 alice feldbusch female 29 nc 119 wake fawn 27587  '),
 ('2_140_12924542',
  '140 12924542 justin brown male 34 ca 8510 wake silhouette 27613  '),
 ('2_222_12748029',
  '222 12748029 stephanie eissens female 38 nc 133 wake wards ridge 27513  '),
 ('2_231_9728052',
  '231 9728052 william mcinerney male 29 nc 265 orange severin 27516  '),
 ('2_240_13265262',
  '240 13265262 danielle peschon female 28 tn 2319 wake hinton 27612  '),
 ('2_331_13927418',
  '331 13927418 james byrd male 53 nc 610 wayne park 27530 9152245'),
 ('2_431_2134645',
  '431 2134645 ainslie guion female 57 ky 323 craven trenton 28523 8144405'),
 ('2_440_12811452',
  '440 12811452 heather spence female 33 nc 6009 wake splitrock 27539  '),
 ('2_522_12715739',
  '522 12715739 judy parker female 56 nc 249 w

In [63]:
# (Re)create the srcall view as the union of the three NCVR source tables.
con.execute("drop view if exists srcall")
con.execute(
    "create view srcall as "
    "select * from src1 "
    "union "
    "select * from src2 "
    "union "
    "select * from src3 "
)
# Last expression: show the combined rows as the cell output.
con.execute("select * from srcall").fetchall()

[('2_22_3326652',
  '22 3326652 whitney baker female 29 in 709 durham green 27701  '),
 ('2_31_3904957',
  '31 3904957 latonya mciver female 40 va 30 forsyth glenwood 27106 9166714'),
 ('2_131_12315539',
  '131 12315539 alice feldbusch female 29 nc 119 wake fawn 27587  '),
 ('2_140_12924542',
  '140 12924542 justin brown male 34 ca 8510 wake silhouette 27613  '),
 ('2_222_12748029',
  '222 12748029 stephanie eissens female 38 nc 133 wake wards ridge 27513  '),
 ('2_231_9728052',
  '231 9728052 william mcinerney male 29 nc 265 orange severin 27516  '),
 ('2_240_13265262',
  '240 13265262 danielle peschon female 28 tn 2319 wake hinton 27612  '),
 ('2_331_13927418',
  '331 13927418 james byrd male 53 nc 610 wayne park 27530 9152245'),
 ('2_431_2134645',
  '431 2134645 ainslie guion female 57 ky 323 craven trenton 28523 8144405'),
 ('2_440_12811452',
  '440 12811452 heather spence female 33 nc 6009 wake splitrock 27539  '),
 ('2_522_12715739',
  '522 12715739 judy parker female 56 nc 249 w

In [64]:
# Arguments shared by the join calls of the NCVR test case.
# Both sides use the same key and join column names.
l_key_attr = r_key_attr = 'rid'
l_join_attr = r_join_attr = 'val'

# Alternative tokenizers:
#   tokenizers.DelimiterTokzr('|')
#   tokenizers.QGramsTokzr(3)
tokenizer = tokenizers.WhitespaceTokzr()
threshold = 0.5
out_table_name = 'matches'

In [65]:
# NCVR, all sources in one relation, 10 timed runs per variant.
# NOTE(review): r_table='' presumably requests a self-join of l_table --
# confirm against jaccard_join.
test_vs_brute_force(
    con=con,
    l_table='srcall',
    r_table='',
    l_key_attr=l_key_attr,
    r_key_attr=r_key_attr,
    l_join_attr=l_join_attr,
    r_join_attr=r_join_attr,
    tokenizer=tokenizer,
    threshold=threshold,
    out_table_name=out_table_name,
    n=10,
)

FILTERED EXECUTIONS
0.02042555809020996 s
0.01827406883239746 s
0.011156558990478516 s
0.014172792434692383 s
0.011934280395507812 s
0.012381553649902344 s
0.010427713394165039 s
0.010811090469360352 s
0.010911703109741211 s
0.010053396224975586 s
Average execution time: 0.013054871559143066

BRUTE FORCE EXECUTIONS
0.0060155391693115234 s
0.006018161773681641 s
0.005948781967163086 s
0.005490779876708984 s
0.005956172943115234 s
0.005358695983886719 s
0.005842924118041992 s
0.004953861236572266 s
0.005448102951049805 s
0.0060214996337890625 s
Average execution time: 0.005705451965332032

SUCCESS! Filtered join and Brute force join returned the same result


{'exec_time': [0.02042555809020996,
  0.01827406883239746,
  0.011156558990478516,
  0.014172792434692383,
  0.011934280395507812,
  0.012381553649902344,
  0.010427713394165039,
  0.010811090469360352,
  0.010911703109741211,
  0.010053396224975586],
 'exec_time_bf': [0.0060155391693115234,
  0.006018161773681641,
  0.005948781967163086,
  0.005490779876708984,
  0.005956172943115234,
  0.005358695983886719,
  0.005842924118041992,
  0.004953861236572266,
  0.005448102951049805,
  0.0060214996337890625]}

In [66]:
# NCVR, src1 joined with src2, 10 timed runs per variant.
test_vs_brute_force(
    con=con,
    l_table='src1',
    r_table='src2',
    l_key_attr=l_key_attr,
    r_key_attr=r_key_attr,
    l_join_attr=l_join_attr,
    r_join_attr=r_join_attr,
    tokenizer=tokenizer,
    threshold=threshold,
    out_table_name=out_table_name,
    n=10,
)

FILTERED EXECUTIONS
0.030386924743652344 s
0.021141767501831055 s
0.01941704750061035 s
0.01765584945678711 s
0.019254446029663086 s
0.014946460723876953 s
0.018185138702392578 s
0.015486717224121094 s
0.015673398971557617 s
0.014964818954467773 s
Average execution time: 0.018711256980895995

BRUTE FORCE EXECUTIONS
0.004911661148071289 s
0.0042116641998291016 s
0.005642890930175781 s
0.005702972412109375 s
0.0052831172943115234 s
0.004955768585205078 s
0.005891323089599609 s
0.006124019622802734 s
0.007002115249633789 s
0.007067441940307617 s
Average execution time: 0.00567929744720459

SUCCESS! Filtered join and Brute force join returned the same result


{'exec_time': [0.030386924743652344,
  0.021141767501831055,
  0.01941704750061035,
  0.01765584945678711,
  0.019254446029663086,
  0.014946460723876953,
  0.018185138702392578,
  0.015486717224121094,
  0.015673398971557617,
  0.014964818954467773],
 'exec_time_bf': [0.004911661148071289,
  0.0042116641998291016,
  0.005642890930175781,
  0.005702972412109375,
  0.0052831172943115234,
  0.004955768585205078,
  0.005891323089599609,
  0.006124019622802734,
  0.007002115249633789,
  0.007067441940307617]}

In [67]:
# Remove the NCVR source tables and the union view built over them.
# The chain still evaluates to the connection object, which the cell displays.
con.execute(
    "drop table if exists src1"
).execute(
    "drop table if exists src2"
).execute(
    "drop table if exists src3"
).execute(
    "drop view if exists srcall"
)

<duckdb.DuckDBPyConnection at 0x2b851a82270>

## Test case: Profiles

In [26]:
# Load the 10K profiles file as a DataFrame; lines=True reads one JSON
# record per line.
df10 = pd.read_json("data/10Kprofiles.json", lines=True, orient='records', typ='frame')

In [27]:
# Preview the first rows to check the available columns.
df10.head()

Unnamed: 0,realProfileID,date_of_birth,surname,address_1,street_number,postcode,soc_sec_id,suburb,phone_number,state,given_name,age,address_2
0,0,19390609.0,bishop,daley crescent,41.0,6050,4676841,batlow,08 29028996,qld,molly,31.0,
1,1,19041109.0,aidon,nambucca street,7.0,2002,3414163,devonort,08 75629459,vkf,whkt,,
2,2,19910711.0,anns,,12.0,2287,7844876,ivanhoe,02 11684110,vic,andrew,31.0,
3,3,19390709.0,whitrlsy,robson street,34.0,4065,2418360,christie downs,04 00323207,qld,shsne,,
4,4,19340328.0,roche,rankin street,9.0,3644,7577436,frenchs forest,08 53227250,qld,sophie,30.0,


In [28]:
# Argument list for SQL concat(): the profile columns with a literal ' '
# between each pair, so the values end up space-separated in one string.
to_concat = ", ' ', ".join((
    "date_of_birth",
    "surname",
    "address_1",
    "street_number",
    "postcode",
    "soc_sec_id",
    "suburb",
    "phone_number",
    "state",
    "given_name",
    "age",
    "address_2",
))
to_concat

"date_of_birth, ' ', surname, ' ', address_1, ' ', street_number, ' ', postcode, ' ', soc_sec_id, ' ', suburb, ' ', phone_number, ' ', state, ' ', given_name, ' ', age, ' ', address_2"

In [29]:
# (Re)build db10 from the df10 DataFrame: `rid` is realProfileID and `val`
# concatenates the columns listed in to_concat.
con.execute("drop table if exists db10")
con.execute(
    "CREATE TABLE db10 AS "
    f"SELECT realProfileID as rid, concat ({to_concat}) as val "
    "FROM df10"
)
# Last expression: show the loaded rows as the cell output.
con.execute("select * from db10").fetchall()

[(0,
  '19390609.0 bishop daley crescent 41.0 6050 4676841 batlow 08 29028996 qld molly 31.0 '),
 (1,
  '19041109.0 aidon nambucca street 7.0 2002 3414163 devonort 08 75629459 vkf whkt  '),
 (2,
  '19910711.0 anns  12.0 2287 7844876 ivanhoe 02 11684110 vic andrew 31.0 '),
 (3,
  '19390709.0 whitrlsy robson street 34.0 4065 2418360 christie downs 04 00323207 qld shsne  '),
 (4,
  '19340328.0 roche rankin street 9.0 3644 7577436 frenchs forest 08 53227250 qld sophie 30.0 '),
 (5,
  '19320811.0 fullgrabe beeston street 29.0 3131 6494586 broken hill 04 80080021 nsw emma 29.0 '),
 (6,
  '19601013.0 lodge mason street 48.0 5254 6098877 orchard hills 08 48143359 vic rourke 32.0 '),
 (7,
  '19340921.0 coleman edman close 116.0 2397 8858237 castlecrag 03 58778382  harry 26.0 '),
 (8,
  '19671108.0 bishop cromwell circuit 12.0 2226 1718686 harris park 04 01707833 vic jamie 28.0 homestead caravan park'),
 (9,
  '19331228.0 grcn lutana street 11.0 2074 8364236 newstead 02 57700508 ws lauren  '),
 

In [30]:
# Load the duplicate-id pairs (d1Id, d2Id); later cells turn these into the
# ground-truth table passed to the evaluation.
df10gt = pd.read_json("data/10KIdDuplicates.json", lines=True, orient='records', typ='frame')
df10gt.head()

Unnamed: 0,d1Id,d2Id
0,101,8933
1,4101,4614
2,7213,8466
3,4856,7485
4,4829,9725


In [31]:
# (Re)build the ground-truth table from df10gt, renaming the id pair to the
# l_rid / r_rid column names used by the join output tables.
con.execute("drop table if exists db10gt")
con.execute(
    "CREATE TABLE db10gt AS "
    "SELECT d1Id as l_rid, d2Id as r_rid "
    "FROM df10gt "
)
# Last expression: number of ground-truth pairs.
con.execute("select count(*) from db10gt").fetchall()

[(8705,)]

In [32]:
# Arguments shared by the join calls of the Profiles test case.
# Both sides use the same key and join column names.
l_key_attr = r_key_attr = 'rid'
l_join_attr = r_join_attr = 'val'
out_table_name = 'matches'

In [33]:
# Draw a fixed-size 2000-row sample of db10 for the faster experiments below.
# REPEATABLE pins the sampling seed: without it every execution of this cell
# produces a different sample, so db10gt_sample and every metric computed on
# the sample change from run to run and cannot be reproduced.
# NOTE(review): DuckDB documents seeded samples as repeatable only when
# execution is single-threaded -- consider `SET threads = 1` if the sample
# must be bit-for-bit identical across runs.
con.execute("drop table if exists db10_sample").execute(
    "CREATE TABLE db10_sample AS "
    "SELECT * "
    "FROM db10 "
    "using sample reservoir(2000 rows) repeatable (42)"
).execute("select * from db10_sample").fetchall()

[(5549,
  '19291007.0 nlhle moonbi frrescent 596.0 4053 6815384 pelverata 03 80597624 wxu kaela  '),
 (1,
  '19041109.0 aidon nambucca street 7.0 2002 3414163 devonort 08 75629459 vkf whkt  '),
 (5990,
  '19480926.0 bitmeda bingley zurescent 69.0 2460 5367480 padso 03 86356584 vic jye  '),
 (4602,
  '  hammett place 81.0 2564 1390664 greenwood 03 42933651 vic keziah 25.0 marretah'),
 (4731,
  ' fetrsi stromlo crfscent 83.0 5660 4459039 asplry 03 50105737  zck 23.0 '),
 (9298,
  '19730618.0 wight pockley close 71.0 4059 5631166 ringwooe east 02 72441102 qlc aidym 9.0 '),
 (7287,
  '19070525.0 gibb  18.0 6053 2979403 wyndham 04 27164006 wa rupert 32.0 '),
 (7,
  '19340921.0 coleman edman close 116.0 2397 8858237 castlecrag 03 58778382  harry 26.0 '),
 (3416,
  '19221101.0 hazell toomey place 39.0 2070 2506925 adelaide 08 31857623 sa tiahnee 32.0 '),
 (9,
  '19331228.0 grcn lutana street 11.0 2074 8364236 newstead 02 57700508 ws lauren  '),
 (10,
  '19260421.0 mildren dodd place 41.0 3636

In [34]:
# (Re)create the ground truth restricted to the sample: keep only the pairs
# whose left AND right ids are both present in db10_sample.
con.execute("drop view if exists db10gt_sample")
con.execute(
    "create view db10gt_sample as "
    "select gt.* "
    "from db10_sample s1, db10_sample s2, db10gt gt "
    "where s1.rid = gt.l_rid "
    "and s2.rid = gt.r_rid"
)
# Last expression: number of ground-truth pairs covered by the sample.
con.execute("select count(*) from db10gt_sample").fetchall()

[(335,)]

In [35]:
# Profiles sample, word tokenizer, threshold 0.2, 5 timed runs per variant.
# NOTE(review): r_table='' presumably requests a self-join of l_table --
# confirm against jaccard_join.
test_vs_brute_force(
    con=con,
    l_table='db10_sample',
    r_table='',
    l_key_attr=l_key_attr,
    r_key_attr=r_key_attr,
    l_join_attr=l_join_attr,
    r_join_attr=r_join_attr,
    tokenizer=tokenizers.WordsTokzr("' '"),
    threshold=0.2,
    out_table_name=out_table_name,
    n=5,
)
# Last expression: score the filtered join's output against the sampled
# ground truth and display the result.
evaluate(con, 'db10gt_sample', out_table_name)

FILTERED EXECUTIONS
0.2563135623931885 s
0.23417401313781738 s
0.3459172248840332 s
0.2341442108154297 s
0.2220611572265625 s
Average execution time: 0.25852203369140625

BRUTE FORCE EXECUTIONS
0.5650081634521484 s
0.6438949108123779 s
1.168952465057373 s
0.3944573402404785 s
0.483445405960083 s
Average execution time: 0.6511516571044922

SUCCESS! Filtered join and Brute force join returned the same result


{'tp': 337,
 'fp': 48,
 'fn': 46,
 'pr': 0.8753246753246753,
 'rc': 0.8798955613577023,
 'fm': 0.8776041666666665}

In [36]:
# Profiles sample, 5-gram tokenizer, threshold 0.2, 5 timed runs, scored
# against the sampled ground truth.
test_confusion_mtx(
    con=con,
    l_table='db10_sample',
    r_table='',
    l_key_attr=l_key_attr,
    r_key_attr=r_key_attr,
    l_join_attr=l_join_attr,
    r_join_attr=r_join_attr,
    tokenizer=tokenizers.QGramsTokzr(5),
    threshold=0.2,
    out_table_name=out_table_name,
    ground_truth_table='db10gt_sample',
    n=5,
)

2.468515634536743 s
1.3657004833221436 s
1.3196666240692139 s
1.356858253479004 s
1.3960342407226562 s
Average execution time: 1.581355047225952
{'tp': 317, 'fp': 2, 'fn': 20, 'pr': 0.9937304075235109, 'rc': 0.9406528189910979, 'fm': 0.9664634146341463}



[2.468515634536743,
 1.3657004833221436,
 1.3196666240692139,
 1.356858253479004,
 1.3960342407226562]

In [37]:
# Sweep the threshold on the Profiles sample with the 5-gram tokenizer,
# 5 timed runs per threshold.
for t in (0.6, 0.5, 0.4, 0.3, 0.2):
    print("threshold =", t)
    test_confusion_mtx(
        con=con,
        l_table='db10_sample',
        r_table='',
        l_key_attr=l_key_attr,
        r_key_attr=r_key_attr,
        l_join_attr=l_join_attr,
        r_join_attr=r_join_attr,
        # Alternative tokenizer: tokenizers.WordsTokzr("' '")
        tokenizer=tokenizers.QGramsTokzr(5),
        threshold=t,
        out_table_name=out_table_name,
        ground_truth_table="db10gt_sample",
        n=5,
    )

threshold = 0.6
0.3318514823913574 s
0.3403890132904053 s
0.3118736743927002 s
0.37316155433654785 s
0.3085751533508301 s
Average execution time: 0.33317017555236816
{'tp': 22, 'fp': 0, 'fn': 313, 'pr': 1.0, 'rc': 0.06567164179104477, 'fm': 0.12324929971988795}

threshold = 0.5
0.31127142906188965 s
0.32298779487609863 s
0.3307149410247803 s
0.34503626823425293 s
0.3351612091064453 s
Average execution time: 0.32903432846069336
{'tp': 77, 'fp': 0, 'fn': 258, 'pr': 1.0, 'rc': 0.2298507462686567, 'fm': 0.37378640776699024}

threshold = 0.4
0.36742639541625977 s
0.3846776485443115 s
0.40102696418762207 s
0.4333770275115967 s
0.6122121810913086 s
Average execution time: 0.4397440433502197
{'tp': 152, 'fp': 0, 'fn': 183, 'pr': 1.0, 'rc': 0.4537313432835821, 'fm': 0.6242299794661191}

threshold = 0.3
1.7782714366912842 s
1.292816162109375 s
1.1836347579956055 s
1.1250834465026855 s
1.162306785583496 s
Average execution time: 1.3084225177764892
{'tp': 236, 'fp': 0, 'fn': 99, 'pr': 1.0, 'rc': 0

In [39]:
# Sweep the threshold on the full db10 table with the word tokenizer,
# 5 timed runs per threshold, scored against the full ground truth.
for t in (0.6, 0.5, 0.4, 0.3, 0.2):
    print("threshold =", t)
    test_confusion_mtx(
        con=con,
        l_table='db10',
        r_table='',
        l_key_attr=l_key_attr,
        r_key_attr=r_key_attr,
        l_join_attr=l_join_attr,
        r_join_attr=r_join_attr,
        # Alternative tokenizer: tokenizers.QGramsTokzr(5)
        tokenizer=tokenizers.WordsTokzr("' '"),
        threshold=t,
        out_table_name=out_table_name,
        ground_truth_table="db10gt",
        n=5,
    )

threshold = 0.6
0.4004068374633789 s
0.4106111526489258 s
0.4132263660430908 s
0.4024820327758789 s
0.3557465076446533 s
Average execution time: 0.39649457931518556
{'tp': 277, 'fp': 0, 'fn': 8428, 'pr': 1.0, 'rc': 0.0318207926479035, 'fm': 0.06167891338232019}

threshold = 0.5
0.4031212329864502 s
0.5489144325256348 s
0.5344328880310059 s
0.6598386764526367 s
0.5164644718170166 s
Average execution time: 0.5325543403625488
{'tp': 1341, 'fp': 0, 'fn': 7364, 'pr': 1.0, 'rc': 0.1540493968983343, 'fm': 0.2669719291260203}

threshold = 0.4
0.5690248012542725 s
0.557499885559082 s
0.48482203483581543 s
0.4047112464904785 s
0.3845939636230469 s
Average execution time: 0.4801303863525391
{'tp': 3023, 'fp': 0, 'fn': 5682, 'pr': 1.0, 'rc': 0.3472716829408386, 'fm': 0.515518417462483}

threshold = 0.3
4.468564987182617 s
4.138373374938965 s
3.793401002883911 s
3.5204217433929443 s
4.069042444229126 s
Average execution time: 3.9979607105255126
{'tp': 5251, 'fp': 4, 'fn': 3458, 'pr': 0.999238820171

## Test case: Profiles, non-self join

In [40]:
# Reload the 10K profiles for the two-table (non-self) join; lines=True reads
# one JSON record per line.
df10 = pd.read_json("data/10Kprofiles.json", lines=True, orient='records', typ='frame')
# Alternative input kept for reference: a "data/test.json" file.
# df10 = pd.read_json("data/test.json", lines=True, orient='records', typ='frame')
df10.head()

Unnamed: 0,realProfileID,date_of_birth,surname,address_1,street_number,postcode,soc_sec_id,suburb,phone_number,state,given_name,age,address_2
0,0,19390609.0,bishop,daley crescent,41.0,6050,4676841,batlow,08 29028996,qld,molly,31.0,
1,1,19041109.0,aidon,nambucca street,7.0,2002,3414163,devonort,08 75629459,vkf,whkt,,
2,2,19910711.0,anns,,12.0,2287,7844876,ivanhoe,02 11684110,vic,andrew,31.0,
3,3,19390709.0,whitrlsy,robson street,34.0,4065,2418360,christie downs,04 00323207,qld,shsne,,
4,4,19340328.0,roche,rankin street,9.0,3644,7577436,frenchs forest,08 53227250,qld,sophie,30.0,


In [41]:
# Cut the profiles in two by row position: rows before split_at go to the
# left side, the remaining rows to the right side.
split_at = df10.shape[0] // 2
df10_1 = df10.iloc[:split_at]
df10_2 = df10.iloc[split_at:]
df10_1.shape

(5000, 13)

In [42]:
# Size of the second half, to compare against the first half's shape.
df10_2.shape

(5000, 13)

In [13]:
# Argument list for SQL concat(): the profile columns with a literal ' '
# between each pair, so the values end up space-separated in one string.
to_concat = ", ' ', ".join((
    "date_of_birth",
    "surname",
    "address_1",
    "street_number",
    "postcode",
    "soc_sec_id",
    "suburb",
    "phone_number",
    "state",
    "given_name",
    "age",
    "address_2",
))

In [44]:
# (Re)build db10_1 from the first half of the profiles (df10_1); `val`
# concatenates the columns listed in to_concat.
con.execute("drop table if exists db10_1")
con.execute(
    "CREATE TABLE db10_1 AS "
    f"SELECT realProfileID as rid, concat ({to_concat}) as val "
    "FROM df10_1"
)
# Last expression: show the loaded rows as the cell output.
con.execute("select * from db10_1").fetchall()

[(0,
  '19390609.0 bishop daley crescent 41.0 6050 4676841 batlow 08 29028996 qld molly 31.0 '),
 (1,
  '19041109.0 aidon nambucca street 7.0 2002 3414163 devonort 08 75629459 vkf whkt  '),
 (2,
  '19910711.0 anns  12.0 2287 7844876 ivanhoe 02 11684110 vic andrew 31.0 '),
 (3,
  '19390709.0 whitrlsy robson street 34.0 4065 2418360 christie downs 04 00323207 qld shsne  '),
 (4,
  '19340328.0 roche rankin street 9.0 3644 7577436 frenchs forest 08 53227250 qld sophie 30.0 '),
 (5,
  '19320811.0 fullgrabe beeston street 29.0 3131 6494586 broken hill 04 80080021 nsw emma 29.0 '),
 (6,
  '19601013.0 lodge mason street 48.0 5254 6098877 orchard hills 08 48143359 vic rourke 32.0 '),
 (7,
  '19340921.0 coleman edman close 116.0 2397 8858237 castlecrag 03 58778382  harry 26.0 '),
 (8,
  '19671108.0 bishop cromwell circuit 12.0 2226 1718686 harris park 04 01707833 vic jamie 28.0 homestead caravan park'),
 (9,
  '19331228.0 grcn lutana street 11.0 2074 8364236 newstead 02 57700508 ws lauren  '),
 

In [45]:
# (Re)build db10_2 from the second half of the profiles (df10_2); `val`
# concatenates the columns listed in to_concat.
con.execute("drop table if exists db10_2")
con.execute(
    "CREATE TABLE db10_2 AS "
    f"SELECT realProfileID as rid, concat ({to_concat}) as val "
    "FROM df10_2"
)
# Last expression: show the loaded rows as the cell output.
con.execute("select * from db10_2").fetchall()

[(5000,
  '19700823.0 wang  8.0 2875 2216347 blue haven 07 17987225  ameli 32.0 '),
 (5001,
  '19280314.0 scaror warramoo crescent 7.0 5752 4933901 moama 08 76672148 vl kiara 23.0 '),
 (5002,
  '19290324.0 colis alarmon crescent 79.0 3204 5111085 warren 03 66459383 act georgia 28.0 arthella'),
 (5003,
  ' wilkins learmonth drive 24.0 2050 5694132 miami 07 04918885 qld benjamin 21.0 hopeview'),
 (5004,
  '19240216.0 fraenkel launceston street 44.0 2088 3703441 dapto 08 57499031  timothy 30.0 '),
 (5005,
  '19120121.0 rees achernar close 61.0 7011 6346462 dapto 02 97724353 nsw kane 36.0 retirement village'),
 (5006,
  '19051490.0 binns wenholz treet  6006 2604157 dapto 03 68821645 sa lucinda 29.0 '),
 (5007,
  ' mccarthy millhouse crescent 35.0 6027 8045670 northmead 08 76825732 qld sophie 30.0 highgate'),
 (5008,
  '19250109.0 hearn stapylton street 1.0 6027 2267622 malvern east 03 64393123  tynan 29.0 '),
 (5009,
  '19791026.0 stuber saville close 75.0 4350 3716864 hay 02 96356289 nsw 

In [14]:
# Arguments shared by the join calls of the two-table Profiles test case.
# Both sides use the same key and join column names.
l_key_attr = r_key_attr = 'rid'
l_join_attr = r_join_attr = 'val'
out_table_name = 'matches'

In [47]:
# Profiles, first half joined with second half, word tokenizer,
# threshold 0.2, 5 timed runs per variant.
test_vs_brute_force(
    con=con,
    l_table='db10_1',
    r_table='db10_2',
    l_key_attr=l_key_attr,
    r_key_attr=r_key_attr,
    l_join_attr=l_join_attr,
    r_join_attr=r_join_attr,
    tokenizer=tokenizers.WordsTokzr("' '"),
    threshold=0.2,
    out_table_name=out_table_name,
    n=5,
)

FILTERED EXECUTIONS
21.128831148147583 s
20.257566690444946 s
21.9219651222229 s
23.73932647705078 s
22.196144819259644 s
Average execution time: 21.84876685142517

BRUTE FORCE EXECUTIONS
4.906538486480713 s
6.2715277671813965 s
5.722679376602173 s
6.072279930114746 s
6.269169569015503 s
Average execution time: 5.848439025878906

SUCCESS! Filtered join and Brute force join returned the same result


{'exec_time': [21.128831148147583,
  20.257566690444946,
  21.9219651222229,
  23.73932647705078,
  22.196144819259644],
 'exec_time_bf': [4.906538486480713,
  6.2715277671813965,
  5.722679376602173,
  6.072279930114746,
  6.269169569015503]}

In [48]:
# Reload the duplicate-id pairs and rebuild db10gt with only the pairs that
# cross the split: left id in the first half, right id in the second half.
# NOTE(review): a pair stored as (id >= split_at, id < split_at) would be
# dropped by this filter -- confirm the file always lists the smaller id first.
df10gt = pd.read_json("data/10KIdDuplicates.json", lines=True, orient='records', typ='frame')
con.execute("drop table if exists db10gt")
con.execute(
    "CREATE TABLE db10gt AS "
    "SELECT d1Id as l_rid, d2Id as r_rid "
    "FROM df10gt "
    f"where d1Id < {split_at} and d2Id >= {split_at} "
)
# Last expression: show the retained ground-truth pairs.
con.execute("select * from db10gt").fetchall()

[(101, 8933),
 (4856, 7485),
 (4829, 9725),
 (1289, 8634),
 (3059, 9179),
 (1884, 8409),
 (1295, 8153),
 (2498, 6623),
 (3670, 7648),
 (3064, 8794),
 (1689, 8239),
 (3699, 5250),
 (3875, 7035),
 (2704, 5934),
 (4040, 9733),
 (895, 8626),
 (1328, 5463),
 (96, 9418),
 (4486, 5499),
 (3252, 9615),
 (4858, 7396),
 (1083, 9433),
 (3267, 8384),
 (3105, 5454),
 (1921, 5428),
 (1483, 9012),
 (1282, 9311),
 (303, 8653),
 (4884, 5268),
 (687, 9557),
 (90, 9958),
 (4631, 9892),
 (1488, 8627),
 (1488, 8625),
 (3302, 5519),
 (4058, 8307),
 (2297, 7018),
 (701, 8420),
 (1299, 7941),
 (3065, 8829),
 (3454, 9318),
 (1502, 7488),
 (2860, 9462),
 (3279, 7464),
 (516, 7416),
 (3488, 6510),
 (3690, 6123),
 (3273, 7966),
 (2092, 7690),
 (2502, 6433),
 (2909, 5433),
 (1477, 9595),
 (684, 9881),
 (880, 9997),
 (4492, 5118),
 (3065, 8883),
 (2107, 6474),
 (4066, 7719),
 (511, 7876),
 (2118, 5569),
 (3850, 9273),
 (2878, 8024),
 (3087, 7062),
 (1510, 6894),
 (1890, 8133),
 (2117, 5676),
 (3282, 7290),
 (3676, 

In [49]:
# Run test_confusion_mtx on db10_1 / db10_2 with the words tokenizer and a 0.2 threshold.
# NOTE(review): test_confusion_mtx is defined outside this cell. The two trailing
# positional arguments are presumably the ground-truth table ('db10gt') and the
# number of timed runs (5) — the recorded output shows five timings followed by
# a tp/fp/fn summary — confirm against the function definition.
test_confusion_mtx(
    con, 'db10_1', 'db10_2', l_key_attr, r_key_attr, l_join_attr, r_join_attr,
    tokenizers.WordsTokzr("' '"),
    0.2,
    out_table_name, 'db10gt',
    5
)

30.850841999053955 s
23.131867170333862 s
25.995912790298462 s
30.115511178970337 s
23.578468799591064 s
Average execution time: 26.734520387649535
{'tp': 4362, 'fp': 582, 'fn': 599, 'pr': 0.8822815533980582, 'rc': 0.879258214069744, 'fm': 0.8807672892478546}



[30.850841999053955,
 23.131867170333862,
 25.995912790298462,
 30.115511178970337,
 23.578468799591064]

In [50]:
# List the tables currently present in the DuckDB database behind `con`.
con.execute("show tables").fetchall()

[('bf_matches',),
 ('db10',),
 ('db10_1',),
 ('db10_2',),
 ('db10_sample',),
 ('db10gt',),
 ('db10gt_sample',),
 ('matches',)]

## Test case: larger Profiles datasets

In [24]:
# Read the 50K profiles file (JSON Lines: one record per line) into a DataFrame.
df50 = pd.read_json(
    "data/50Kprofiles.json",
    orient="records",
    lines=True,
    typ="frame",
)

In [25]:
# Rebuild db50 from scratch so the cell can be re-run safely. DuckDB resolves
# `df50` in the FROM clause to the in-scope pandas DataFrame of that name.
con.execute("drop table if exists db50")
con.execute(
    "CREATE TABLE db50 AS "
    # One row per profile: its id plus the concatenation of the `to_concat` expressions.
    f"SELECT realProfileID as rid, concat ({to_concat}) as val "
    "FROM df50"
)

# Show a short, deterministic preview instead of dumping the whole table
# into the cell output.
con.execute("select * from db50 order by rid limit 10").fetchall()

[(0,
  '19551130.0 reid jerrabomberra avenue 191.0 2115 7907036 tylden 02 57675204 nsw nicholas  gold tyne'),
 (1,
  '19821020.0 porra meyers place 2.0 3130 2705215 kingaroy 03 64681506 wa elle 34.0 '),
 (2,
  '19351010.0 pochec duterrau crescent 46.0 2785 2350343 lalor park 03 93693261 vic mitchell 29.0 '),
 (3,
  ' beaumaris ellenborough street 11.0 2285 6735143 innaloo 04 02935530 qld marianne 24.0 '),
 (4, ' ransin ahernuplace 2.0 4496 6721672 armiake  qld christian 38.0 '),
 (5,
  '19740410.0 forsbaw norman fisher circuit 5.0 2707 7456433 bayview hdeights  qph lesa 29.0 warrawong'),
 (6,
  '19960201.0 nguyen hansen circuit 44.0 5244 7748525  04 10399694 nsw caitlin 29.0 '),
 (7,
  '19607722.0 eichimnrger stockdale street 11.0 4224 8900629 cherrbqrook 04 99393660 qld isabrlla 36.0 st john of godzhospital'),
 (8,
  '19280419.0 hawes giles street 3.0 2197 4044861 brandon 04 54963323 vic michael 28.0 '),
 (9,
  '19010611.0 palecek goldner circuit 31.0 6102 8225306 latrobe 07 75127812 

In [28]:
# Duplicate-ID pairs for the 50K dataset (JSON Lines: one record per line).
df50gt = pd.read_json(
    "data/50KIdDuplicates.json",
    orient="records",
    lines=True,
    typ="frame",
)

# Recreate db50gt from the DataFrame above, renaming the id columns to l_rid / r_rid.
con.execute("drop table if exists db50gt")
con.execute(
    "CREATE TABLE db50gt AS "
    "SELECT d1Id as l_rid, d2Id as r_rid "
    "FROM df50gt "
)

# Report only the number of pairs loaded.
con.execute("select count(*) from db50gt").fetchall()

[(43071,)]

In [29]:
# Run test_confusion_mtx on db50 with the whitespace tokenizer and a 0.5 threshold.
# NOTE(review): the right-table argument is an empty string — presumably this
# requests a self-join of db50; confirm against test_confusion_mtx / jaccard_join.
# NOTE(review): the two trailing positional arguments are presumably the
# ground-truth table ('db50gt') and the number of timed runs (5) — confirm.
test_confusion_mtx(
    con, 'db50', '', l_key_attr, r_key_attr, l_join_attr, r_join_attr,
    tokenizers.WhitespaceTokzr(),
    0.5,
    out_table_name, 'db50gt',
    5
)

1.8928577899932861 s
1.2876501083374023 s
1.2809226512908936 s
1.2467806339263916 s
1.290759563446045 s
Average execution time: 1.3997941493988038
{'tp': 6964, 'fp': 0, 'fn': 36107, 'pr': 1.0, 'rc': 0.16168651761045716, 'fm': 0.27836514439892074}



[1.8928577899932861,
 1.2876501083374023,
 1.2809226512908936,
 1.2467806339263916,
 1.290759563446045]

In [40]:
# Drop the 50K tables and the join output table, then list whatever tables remain.
(
    con.execute("drop table if exists db50")
    .execute("drop table if exists db50gt")
    .execute(f"drop table if exists {out_table_name}")
    .execute("show tables")
    .fetchall()
)

[]

In [34]:
# Read the 100K profiles file (JSON Lines: one record per line) into a DataFrame.
df100 = pd.read_json(
    "data/100Kprofiles.json",
    orient="records",
    lines=True,
    typ="frame",
)

In [35]:
# Rebuild db100 from scratch so the cell can be re-run safely. DuckDB resolves
# `df100` in the FROM clause to the in-scope pandas DataFrame of that name.
con.execute("drop table if exists db100")
con.execute(
    "CREATE TABLE db100 AS "
    # One row per profile: its id plus the concatenation of the `to_concat` expressions.
    f"SELECT realProfileID as rid, concat ({to_concat}) as val "
    "FROM df100"
)

# Show a short, deterministic preview instead of dumping the whole table
# into the cell output.
con.execute("select * from db100 order by rid limit 10").fetchall()

[(0,
  '19190809.0 chandler pudney street 376.0 3764 3423802 hemmant 02 24496575 qld luke 35.0 arlington'),
 (1,
  '19300528.0 hawes  8.0 2168 2180735 keilor east 04 87308830 qld millane 36.0 northern tablelands tennis academy'),
 (2,
  '19300112.0 novadk rfreet rankin 1.0 3218 4272019 beulah park 03 39054375 qld jatmike 26.0 '),
 (3,
  '19950727.0 redmond davenport street 12.0 3030 5381275 kangaroo flat 07 69720733 qld madeleine 33.0 vinery'),
 (4,
  '19631123.0 zimmermann corona place 7.0 4030 2894006 greenacres 03 48393238 nsw kayden 31.0 glen elgin'),
 (5,
  '19350925.0 canbkl macfarland crescent 61.0 3164 5282937 chishmolm 02 01482240 nsw sammy 72.0 '),
 (6,
  '19650417.0 psorakis owen crescent 77.0 2452 2052783 mannum 07 48547017 wa laura  '),
 (7,
  '19230205.0 agius arthur circle 2.0 6125 8456442 marsden 02 54492364 qld indiana 32.0 '),
 (8,
  '19830325.0 hegger arabana street 2.0 4178 9175025 wellington point 04 21430300 qld jaykob  '),
 (9, '19930213.0 weetfa  6.0 3106 541836

In [36]:
# Duplicate-ID pairs for the 100K dataset (JSON Lines: one record per line).
df100gt = pd.read_json(
    "data/100KIdDuplicates.json",
    orient="records",
    lines=True,
    typ="frame",
)

# Recreate db100gt from the DataFrame above, renaming the id columns to l_rid / r_rid.
con.execute("drop table if exists db100gt")
con.execute(
    "CREATE TABLE db100gt AS "
    "SELECT d1Id as l_rid, d2Id as r_rid "
    "FROM df100gt "
)

# Report only the number of pairs loaded.
con.execute("select count(*) from db100gt").fetchall()

[(85497,)]

In [37]:
# Run test_confusion_mtx on db100 with the whitespace tokenizer and a 0.5 threshold.
# NOTE(review): the right-table argument is an empty string — presumably this
# requests a self-join of db100; confirm against test_confusion_mtx / jaccard_join.
# NOTE(review): the two trailing positional arguments are presumably the
# ground-truth table ('db100gt') and the number of timed runs (5) — confirm.
test_confusion_mtx(
    con, 'db100', '', l_key_attr, r_key_attr, l_join_attr, r_join_attr,
    tokenizers.WhitespaceTokzr(),
    0.5,
    out_table_name, 'db100gt',
    5
)

3.6827125549316406 s
3.0261011123657227 s
5.46365761756897 s
5.887436151504517 s
3.6066043376922607 s
Average execution time: 4.333302354812622
{'tp': 14024, 'fp': 0, 'fn': 71473, 'pr': 1.0, 'rc': 0.16402914722153994, 'fm': 0.2818299655349122}



[3.6827125549316406,
 3.0261011123657227,
 5.46365761756897,
 5.887436151504517,
 3.6066043376922607]

In [39]:
# Drop the 100K tables and the join output table, then list whatever tables remain.
(
    con.execute("drop table if exists db100")
    .execute("drop table if exists db100gt")
    .execute(f"drop table if exists {out_table_name}")
    .execute("show tables")
    .fetchall()
)

[]