In [1]:
!pip install duckdb



In [2]:
import duckdb
import time
import pandas as pd

In [3]:
# Enable IPython's autoreload extension in mode 2: all imported modules are
# reloaded before each cell executes, so edits to local source files take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
from py_duckdb.similarity_join import tokenizers
from py_duckdb.similarity_join import jaccard_join, jaccard_join_brute_force
from py_duckdb.similarity_join import evaluate

In [5]:
import string
import numpy as np

def test(
        con: duckdb.DuckDBPyConnection,
        l_table: str,
        r_table: str,
        l_key_attr: str,
        r_key_attr: str,
        l_join_attr: str,
        r_join_attr: str,
        tokenizer: tokenizers.Tokenizer,
        threshold: float,
        out_table_name: str,
        n=1
):
    """Time the filtered Jaccard join against the brute-force join and compare their results.

    Runs ``jaccard_join`` ``n`` times and then ``jaccard_join_brute_force``
    ``n`` times with the same arguments, printing the duration of every run.
    Afterwards the two result tables are compared with a full outer join on
    ``(rid1, rid2)``; any pair present in only one of them is reported as a
    mismatch.

    Args:
        con: Open DuckDB connection that both joins run on.
        l_table: Name of the left input table.
        r_table: Name of the right input table. Callers in this notebook pass
            ``''`` together with a single table - presumably a self-join;
            confirm against ``jaccard_join``.
        l_key_attr: Key column of the left table.
        r_key_attr: Key column of the right table.
        l_join_attr: Column of the left table that is tokenized and compared.
        r_join_attr: Column of the right table that is tokenized and compared.
        tokenizer: Tokenizer passed through to both join implementations.
        threshold: Jaccard threshold passed through to both join implementations.
        out_table_name: Table the filtered join is expected to write to; the
            brute-force result is written to ``"bf_" + out_table_name``. Both
            tables must expose ``rid1`` and ``rid2`` columns for the comparison.
        n: Number of timed repetitions per implementation (at least 1).

    Returns:
        dict with ``'exec_time'`` and ``'exec_time_bf'``: the per-run durations
        in seconds of the filtered and the brute-force join respectively.

    Raises:
        ValueError: If ``n`` is smaller than 1. Without any run there are no
            timings to average and the comparison would read stale tables.
    """
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")

    # Arguments shared by both implementations; only the output table differs.
    join_args = (
        con, l_table, r_table, l_key_attr, r_key_attr, l_join_attr, r_join_attr, tokenizer, threshold
    )
    exec_times = []
    exec_times_bf = []

    print("FILTERED EXECUTIONS")
    for _ in range(n):
        # perf_counter is monotonic and high-resolution, unlike wall-clock time.time().
        start_time = time.perf_counter()
        jaccard_join(*join_args, out_table_name)
        elapsed = time.perf_counter() - start_time
        exec_times.append(elapsed)
        print(elapsed, 's')

    print()
    print("BRUTE FORCE EXECUTIONS")
    for _ in range(n):
        start_time = time.perf_counter()
        jaccard_join_brute_force(*join_args, "bf_" + out_table_name)
        elapsed = time.perf_counter() - start_time
        exec_times_bf.append(elapsed)
        print(elapsed, 's')

    print()
    # Rows surviving the filter are pairs found by exactly one of the two joins.
    cmp_join = con.execute(
        "select * "
        f"from {out_table_name} m "
        f"full outer join bf_{out_table_name} b on b.rid1 = m.rid1 and b.rid2 = m.rid2 "
        "where m.rid1 is null "
        "or b.rid1 is null"
    ).fetchall()
    if len(cmp_join) == 0:
        print("SUCCESS! Filtered join and Brute force join returned the same result")
    else:
        print("ERROR! There are mismatches between Filtered and Brute force joins:", cmp_join)
    print("Average execution time for filtered join:", np.average(exec_times))
    print("Average execution time for brute force join:", np.average(exec_times_bf))

    return {
        'exec_time': exec_times,
        'exec_time_bf': exec_times_bf
    }

In [6]:
# Single in-memory DuckDB database used by every cell below.
con = duckdb.connect(':memory:')

# Test case: Actors

In [43]:
# (Re)create src1 from the first Actors CSV. Each row is (rid, val), where val
# is "<given_name> <surname> <date_of_birth>". The last line displays the table.
con.execute("drop table if exists src1")
con.execute(
    "CREATE TABLE src1 AS "
    "SELECT id as rid, concat(given_name, ' ', surname, ' ', date_of_birth) as val "
    "FROM 'data/S1_clean_.csv'"
)
con.execute("select * from src1").fetchall()

[('S1_0', 'joshua morrison 19101123'),
 ('S1_1', 'jordan white 19371126'),
 ('S1_2', 'emmerson lock 19211129'),
 ('S1_3', 'alexandra grosser 19720305'),
 ('S1_4', 'michael wuchatsch 19190110'),
 ('S1_5', 'emmerson loyck 19211129'),
 ('S1_6', 'rhys schuetz 19440909'),
 ('S1_7', 'joshua greenj 19790110'),
 ('S1_8', 'olivia hobson 19760812'),
 ('S1_9', 'michael lierach 19360816'),
 ('S1_10', 'elisabett domiten 19081008'),
 ('S1_11', 'genoveffa hylander 19071008')]

In [44]:
# (Re)create src2 from the second Actors CSV with the same (rid, val) layout:
# val is "<given_name> <surname> <date_of_birth>". The last line displays the table.
con.execute("drop table if exists src2")
con.execute(
    "CREATE TABLE src2 AS "
    "SELECT id as rid, concat(given_name, ' ', surname, ' ', date_of_birth) as val "
    "FROM 'data/S2_clean_.csv'"
)
con.execute("select * from src2").fetchall()

[('S2_0', 'braecon schuetz 19440909'),
 ('S2_1', 'alexandra grosvenor 19930305'),
 ('S2_2', 'michael liersch 19360816'),
 ('S2_3', 'emmeron loyk 19321129'),
 ('S2_4', 'olivia hobson 19760812'),
 ('S2_5', 'joshua green 19010219'),
 ('S2_6', 'charlotte hyland 19340909'),
 ('S2_7', 'elisabet domitienn 19071008')]

In [45]:
# (Re)create src3 from the third Actors CSV with the same (rid, val) layout:
# val is "<given_name> <surname> <date_of_birth>". The last line displays the table.
con.execute("drop table if exists src3")
con.execute(
    "CREATE TABLE src3 AS "
    "SELECT id as rid, concat(given_name, ' ', surname, ' ', date_of_birth) as val "
    "FROM 'data/S3_clean_.csv'"
)
con.execute("select * from src3").fetchall()

[('S3_0', 'emmerson loyck 19211129'),
 ('S3_1', 'michel wuchatsch 19190110'),
 ('S3_3', 'liersch michael 19360816'),
 ('S3_4', 'charlotte hyland 19460401'),
 ('S3_5', 'braedon schuetz 19440909'),
 ('S3_6', 'olivia hobson 19760812'),
 ('S3_7', 'joshua green 19790110'),
 ('S3_8', 'keely clarke 19050410'),
 ('S3_9', 'joshua morriosn 19101123'),
 ('S3_11', 'genovefa hyllande 19071008')]

In [46]:
# (Re)create the srcall view as the union of the three source tables.
# UNION (not UNION ALL) removes rows that are identical in both rid and val.
con.execute("drop view if exists srcall")
con.execute(
    "create view srcall as "
    "select * from src1 "
    "union "
    "select * from src2 "
    "union "
    "select * from src3 "
)
con.execute("select * from srcall").fetchall()

[('S3_0', 'emmerson loyck 19211129'),
 ('S3_1', 'michel wuchatsch 19190110'),
 ('S3_3', 'liersch michael 19360816'),
 ('S3_4', 'charlotte hyland 19460401'),
 ('S3_5', 'braedon schuetz 19440909'),
 ('S3_6', 'olivia hobson 19760812'),
 ('S3_7', 'joshua green 19790110'),
 ('S3_8', 'keely clarke 19050410'),
 ('S3_9', 'joshua morriosn 19101123'),
 ('S3_11', 'genovefa hyllande 19071008'),
 ('S1_0', 'joshua morrison 19101123'),
 ('S1_1', 'jordan white 19371126'),
 ('S1_2', 'emmerson lock 19211129'),
 ('S1_3', 'alexandra grosser 19720305'),
 ('S1_4', 'michael wuchatsch 19190110'),
 ('S1_5', 'emmerson loyck 19211129'),
 ('S1_6', 'rhys schuetz 19440909'),
 ('S1_7', 'joshua greenj 19790110'),
 ('S1_8', 'olivia hobson 19760812'),
 ('S1_9', 'michael lierach 19360816'),
 ('S1_10', 'elisabett domiten 19081008'),
 ('S1_11', 'genoveffa hylander 19071008'),
 ('S2_0', 'braecon schuetz 19440909'),
 ('S2_1', 'alexandra grosvenor 19930305'),
 ('S2_2', 'michael liersch 19360816'),
 ('S2_3', 'emmeron loyk 193

In [47]:
# Arguments shared by the Actors test runs below.
l_table, r_table = 'src1', 'src2'

# Both sides use the same key column and the same text column.
l_key_attr = r_key_attr = 'rid'
l_join_attr = r_join_attr = 'val'

# Alternative tokenizer: tokenizers.WordsTokzr(r"' '")
tokenizer = tokenizers.QGramsTokzr(3)
threshold = 0.5
out_table_name = 'matches'

In [48]:
# Actors: join srcall with an empty right table name (presumably a self-join -
# confirm against jaccard_join), 10 timed runs per implementation.
test(
    con=con,
    l_table='srcall',
    r_table='',
    l_key_attr=l_key_attr,
    r_key_attr=r_key_attr,
    l_join_attr=l_join_attr,
    r_join_attr=r_join_attr,
    tokenizer=tokenizer,
    threshold=threshold,
    out_table_name=out_table_name,
    n=10,
)

FILTERED EXECUTIONS
[(25,)] 0.00598597526550293
0.022942066192626953 s
[(25,)] 0.0044100284576416016
0.015324115753173828 s
[(25,)] 0.004149913787841797
0.014997243881225586 s
[(25,)] 0.003513336181640625
0.017217159271240234 s
[(25,)] 0.005988597869873047
0.016370773315429688 s
[(25,)] 0.004504680633544922
0.015601158142089844 s
[(25,)] 0.003433704376220703
0.012436151504516602 s
[(25,)] 0.003414154052734375
0.010949850082397461 s
[(25,)] 0.0029222965240478516
0.010078907012939453 s
[(25,)] 0.002862215042114258
0.01046299934387207 s

BRUTE FORCE EXECUTIONS
0.00566554069519043 s
0.004985809326171875 s
0.0061092376708984375 s
0.03780674934387207 s
0.007559061050415039 s
0.0074520111083984375 s
0.008259057998657227 s
0.0070154666900634766 s
0.007467031478881836 s
0.0060045719146728516 s

SUCCESS! Filtered join and Brute force join returned the same result
Average execution time for filtered join: 0.014638042449951172
Average execution time for brute force join: 0.009832453727722169


{'exec_time': [0.022942066192626953,
  0.015324115753173828,
  0.014997243881225586,
  0.017217159271240234,
  0.016370773315429688,
  0.015601158142089844,
  0.012436151504516602,
  0.010949850082397461,
  0.010078907012939453,
  0.01046299934387207],
 'exec_time_bf': [0.00566554069519043,
  0.004985809326171875,
  0.0061092376708984375,
  0.03780674934387207,
  0.007559061050415039,
  0.0074520111083984375,
  0.008259057998657227,
  0.0070154666900634766,
  0.007467031478881836,
  0.0060045719146728516]}

In [49]:
# Actors: join src1 against src2, 10 timed runs per implementation.
test(
    con=con,
    l_table='src1',
    r_table='src2',
    l_key_attr=l_key_attr,
    r_key_attr=r_key_attr,
    l_join_attr=l_join_attr,
    r_join_attr=r_join_attr,
    tokenizer=tokenizer,
    threshold=threshold,
    out_table_name=out_table_name,
    n=10,
)

FILTERED EXECUTIONS
[(16,)] 0.004450321197509766
0.026930809020996094 s
[(16,)] 0.003988742828369141
0.02407383918762207 s
[(16,)] 0.004379749298095703
0.025432109832763672 s
[(16,)] 0.004529476165771484
0.024120330810546875 s
[(16,)] 0.003487825393676758
0.017495155334472656 s
[(16,)] 0.005059957504272461
0.035196781158447266 s
[(16,)] 0.011186838150024414
0.03503751754760742 s
[(16,)] 0.003485441207885742
0.013888120651245117 s
[(16,)] 0.003724813461303711
0.014583587646484375 s
[(16,)] 0.0033833980560302734
0.016271591186523438 s

BRUTE FORCE EXECUTIONS
0.004751920700073242 s
0.007367372512817383 s
0.007603168487548828 s
0.008343935012817383 s
0.007978677749633789 s
0.00797891616821289 s
0.008592605590820312 s
0.008536100387573242 s
0.008090019226074219 s
0.0051457881927490234 s

SUCCESS! Filtered join and Brute force join returned the same result
Average execution time for filtered join: 0.0233029842376709
Average execution time for brute force join: 0.007438850402832031


{'exec_time': [0.026930809020996094,
  0.02407383918762207,
  0.025432109832763672,
  0.024120330810546875,
  0.017495155334472656,
  0.035196781158447266,
  0.03503751754760742,
  0.013888120651245117,
  0.014583587646484375,
  0.016271591186523438],
 'exec_time_bf': [0.004751920700073242,
  0.007367372512817383,
  0.007603168487548828,
  0.008343935012817383,
  0.007978677749633789,
  0.00797891616821289,
  0.008592605590820312,
  0.008536100387573242,
  0.008090019226074219,
  0.0051457881927490234]}

# Test case: NCVR

In [50]:
# Argument list for SQL concat(): the NCVR columns, with a single-space string
# literal between every two consecutive columns.
to_concat = ", ' ', ".join((
    "entity",
    "rec_id",
    "first_name",
    "last_name",
    "sex",
    "age",
    "birth_place",
    "house_num",
    "county_desc",
    "street_name",
    "zip_code",
    "phone_num",
))
to_concat

"entity, ' ', rec_id, ' ', first_name, ' ', last_name, ' ', sex, ' ', age, ' ', birth_place, ' ', house_num, ' ', county_desc, ' ', street_name, ' ', zip_code, ' ', phone_num"

In [51]:
# (Re)create src1 from the NCVR "AF" CSV. val is the space-separated
# concatenation of the columns listed in to_concat. The last line displays the table.
con.execute("drop table if exists src1")
con.execute(
    "CREATE TABLE src1 AS "
    f"SELECT id as rid, concat({to_concat}) as val "
    "FROM 'data/NCVR_AF_clean.csv'"
)
con.execute("select * from src1").fetchall()

[('0_22_9865350',
  '22 9865350 whitney baker female 29 in 400 orange poplar 27510  '),
 ('0_40_12768214',
  '40 12768214 abbington pope female 23 nc 1221 wake westview 27605  '),
 ('0_122_9112102',
  '122 9112102 rebecca wilkins female 49 nc 811 new hanover magnolia 28428 8120512'),
 ('0_140_9704280',
  '140 9704280 justin brown male 34 ca 3225 orange us hwy 70 27243  '),
 ('0_222_3198122',
  '222 3198122 stephanie eissens female 38 nc 100 durham village circle 27713  '),
 ('0_240_94472',
  '240 94472 danielle peschon female 28 tn 1049 alamance kelso 27215 6758354'),
 ('0_251_6640272',
  '251 6640272 michelle hinnant female 45 nc 130 johnston braswell 27577 2024737'),
 ('0_340_10179287',
  '340 10179287 mark caccio male 64 ct 213 pender scottsdale 28411 6008095'),
 ('0_351_6376559',
  '351 6376559 nancy hoover female 55 ny 618 iredell isle of pines 28117  '),
 ('0_451_1422321',
  '451 1422321 thomas johnson male 29 nc 501 caswell cherry grove 27379 2696260'),
 ('0_522_12340214',
  '52

In [52]:
# (Re)create src2 from the NCVR "BF" CSV. val is the space-separated
# concatenation of the columns listed in to_concat. The last line displays the table.
con.execute("drop table if exists src2")
con.execute(
    "CREATE TABLE src2 AS "
    f"SELECT id as rid, concat({to_concat}) as val "
    "FROM 'data/NCVR_BF_clean.csv'"
)
con.execute("select * from src2").fetchall()

[('1_40_13913995',
  '40 13913995 abbington pope female 23 nc 104 wayne breezewood 27534  '),
 ('1_41_520429',
  '41 520429 angelon smith female 40 nc 119 buncombe flint 28801  '),
 ('1_122_9000404',
  '122 9000404 rebecca wilkins female 49   1501 nash lafayette 27803  '),
 ('1_140_9350108',
  '140 9350108 justin brown male 34 ca 211 new hanover queen 28401 7417817'),
 ('1_222_13226442',
  '222 13226442 stephanie eissens meacomes female 38 nc 4033 wake enfield ridge 27519  '),
 ('1_240_116265',
  '240 116265 danielle peschon female 28   730 alamance boone station 27215 2789980'),
 ('1_241_8517073',
  '241 8517073 lovie matthews female 26 tn 5634 mecklenburg via romano 28270  '),
 ('1_322_6614729',
  '322 6614729 sandra creech female 52 nc 1347 johnston crocker 27577 4645707'),
 ('1_340_9418808',
  '340 9418808 mark caccio male 64 ct 605 onslow windsong north 28584 6008095'),
 ('1_341_6619987',
  '341 6619987 amanda spencer female 37 nc 607 johnston preston 27576  '),
 ('1_422_10444451'

In [53]:
# (Re)create src3 from the NCVR "CF" CSV. val is the space-separated
# concatenation of the columns listed in to_concat. The last line displays the table.
con.execute("drop table if exists src3")
con.execute(
    "CREATE TABLE src3 AS "
    f"SELECT id as rid, concat({to_concat}) as val "
    "FROM 'data/NCVR_CF_clean.csv'"
)
con.execute("select * from src3").fetchall()

[('2_22_3326652',
  '22 3326652 whitney baker female 29 in 709 durham green 27701  '),
 ('2_31_3904957',
  '31 3904957 latonya mciver female 40 va 30 forsyth glenwood 27106 9166714'),
 ('2_131_12315539',
  '131 12315539 alice feldbusch female 29 nc 119 wake fawn 27587  '),
 ('2_140_12924542',
  '140 12924542 justin brown male 34 ca 8510 wake silhouette 27613  '),
 ('2_222_12748029',
  '222 12748029 stephanie eissens female 38 nc 133 wake wards ridge 27513  '),
 ('2_231_9728052',
  '231 9728052 william mcinerney male 29 nc 265 orange severin 27516  '),
 ('2_240_13265262',
  '240 13265262 danielle peschon female 28 tn 2319 wake hinton 27612  '),
 ('2_331_13927418',
  '331 13927418 james byrd male 53 nc 610 wayne park 27530 9152245'),
 ('2_431_2134645',
  '431 2134645 ainslie guion female 57 ky 323 craven trenton 28523 8144405'),
 ('2_440_12811452',
  '440 12811452 heather spence female 33 nc 6009 wake splitrock 27539  '),
 ('2_522_12715739',
  '522 12715739 judy parker female 56 nc 249 w

In [54]:
# (Re)create the srcall view over the NCVR tables as the union of src1, src2
# and src3. UNION (not UNION ALL) removes rows identical in both rid and val.
con.execute("drop view if exists srcall")
con.execute(
    "create view srcall as "
    "select * from src1 "
    "union "
    "select * from src2 "
    "union "
    "select * from src3 "
)
con.execute("select * from srcall").fetchall()

[('2_22_3326652',
  '22 3326652 whitney baker female 29 in 709 durham green 27701  '),
 ('2_31_3904957',
  '31 3904957 latonya mciver female 40 va 30 forsyth glenwood 27106 9166714'),
 ('2_131_12315539',
  '131 12315539 alice feldbusch female 29 nc 119 wake fawn 27587  '),
 ('2_140_12924542',
  '140 12924542 justin brown male 34 ca 8510 wake silhouette 27613  '),
 ('2_222_12748029',
  '222 12748029 stephanie eissens female 38 nc 133 wake wards ridge 27513  '),
 ('2_231_9728052',
  '231 9728052 william mcinerney male 29 nc 265 orange severin 27516  '),
 ('2_240_13265262',
  '240 13265262 danielle peschon female 28 tn 2319 wake hinton 27612  '),
 ('2_331_13927418',
  '331 13927418 james byrd male 53 nc 610 wayne park 27530 9152245'),
 ('2_431_2134645',
  '431 2134645 ainslie guion female 57 ky 323 craven trenton 28523 8144405'),
 ('2_440_12811452',
  '440 12811452 heather spence female 33 nc 6009 wake splitrock 27539  '),
 ('2_522_12715739',
  '522 12715739 judy parker female 56 nc 249 w

In [55]:
# Arguments shared by the NCVR test runs below; both sides use the same key
# column and the same text column.
l_key_attr = r_key_attr = 'rid'
l_join_attr = r_join_attr = 'val'

# Alternative tokenizer: tokenizers.QGramsTokzr(3)
tokenizer = tokenizers.WordsTokzr("' '")
threshold = 0.5
out_table_name = 'matches'

In [56]:
# NCVR: join srcall with an empty right table name (presumably a self-join -
# confirm against jaccard_join), 10 timed runs per implementation.
test(
    con=con,
    l_table='srcall',
    r_table='',
    l_key_attr=l_key_attr,
    r_key_attr=r_key_attr,
    l_join_attr=l_join_attr,
    r_join_attr=r_join_attr,
    tokenizer=tokenizer,
    threshold=threshold,
    out_table_name=out_table_name,
    n=10,
)

FILTERED EXECUTIONS
[(51,)] 0.005589485168457031
0.03141498565673828 s
[(51,)] 0.009368896484375
0.03299570083618164 s
[(51,)] 0.005789995193481445
0.03207707405090332 s
[(51,)] 0.005128383636474609
0.024257421493530273 s
[(51,)] 0.005983114242553711
0.026927709579467773 s
[(51,)] 0.005984067916870117
0.02892279624938965 s
[(51,)] 0.009325027465820312
0.06735086441040039 s
[(51,)] 0.007447004318237305
0.03266429901123047 s
[(51,)] 0.00797414779663086
0.030572175979614258 s
[(51,)] 0.004671812057495117
0.025855541229248047 s

BRUTE FORCE EXECUTIONS
0.04556155204772949 s
0.030078887939453125 s
0.03303861618041992 s
0.031168460845947266 s
0.03575468063354492 s
0.033571720123291016 s
0.03629589080810547 s
0.037735700607299805 s
0.03346133232116699 s
0.03157925605773926 s

SUCCESS! Filtered join and Brute force join returned the same result
Average execution time for filtered join: 0.03330385684967041
Average execution time for brute force join: 0.03482460975646973


{'exec_time': [0.03141498565673828,
  0.03299570083618164,
  0.03207707405090332,
  0.024257421493530273,
  0.026927709579467773,
  0.02892279624938965,
  0.06735086441040039,
  0.03266429901123047,
  0.030572175979614258,
  0.025855541229248047],
 'exec_time_bf': [0.04556155204772949,
  0.030078887939453125,
  0.03303861618041992,
  0.031168460845947266,
  0.03575468063354492,
  0.033571720123291016,
  0.03629589080810547,
  0.037735700607299805,
  0.03346133232116699,
  0.03157925605773926]}

In [57]:
# NCVR: join src1 against src2, 10 timed runs per implementation.
test(
    con=con,
    l_table='src1',
    r_table='src2',
    l_key_attr=l_key_attr,
    r_key_attr=r_key_attr,
    l_join_attr=l_join_attr,
    r_join_attr=r_join_attr,
    tokenizer=tokenizer,
    threshold=threshold,
    out_table_name=out_table_name,
    n=10,
)

FILTERED EXECUTIONS
[(2084,)] 0.0039882659912109375
0.036042213439941406 s
[(2084,)] 0.00399327278137207
0.027648448944091797 s
[(2084,)] 0.0045928955078125
0.03495478630065918 s
[(2084,)] 0.028844356536865234
0.06159853935241699 s
[(2084,)] 0.0058650970458984375
0.030590295791625977 s
[(2084,)] 0.005052089691162109
0.03192615509033203 s
[(2084,)] 0.008127212524414062
0.0398862361907959 s
[(2084,)] 0.005283832550048828
0.036449432373046875 s
[(2084,)] 0.0055730342864990234
0.03365445137023926 s
[(2084,)] 0.003453969955444336
0.031200408935546875 s

BRUTE FORCE EXECUTIONS
0.011962652206420898 s
0.009701728820800781 s
0.009755611419677734 s
0.009587526321411133 s
0.010030984878540039 s
0.010613679885864258 s
0.009662628173828125 s
0.009447813034057617 s
0.01187896728515625 s
0.011062860488891602 s

SUCCESS! Filtered join and Brute force join returned the same result
Average execution time for filtered join: 0.03639509677886963
Average execution time for brute force join: 0.01037044525146

{'exec_time': [0.036042213439941406,
  0.027648448944091797,
  0.03495478630065918,
  0.06159853935241699,
  0.030590295791625977,
  0.03192615509033203,
  0.0398862361907959,
  0.036449432373046875,
  0.03365445137023926,
  0.031200408935546875],
 'exec_time_bf': [0.011962652206420898,
  0.009701728820800781,
  0.009755611419677734,
  0.009587526321411133,
  0.010030984878540039,
  0.010613679885864258,
  0.009662628173828125,
  0.009447813034057617,
  0.01187896728515625,
  0.011062860488891602]}

In [58]:
# Remove the source tables and the union view before the next test case.
# execute() returns the connection, so the four drops can be chained.
(
    con.execute("drop table if exists src1")
    .execute("drop table if exists src2")
    .execute("drop table if exists src3")
    .execute("drop view if exists srcall")
)

<duckdb.DuckDBPyConnection at 0x22641f20cb0>

## Test case: Profiles

In [7]:
# Load the 10K profiles dataset: JSON Lines, one record per line.
df10 = pd.read_json(
    "data/10Kprofiles.json",
    orient='records',
    typ='frame',
    lines=True,
)

In [8]:
# Preview the first five profile rows.
df10.head(5)

Unnamed: 0,realProfileID,date_of_birth,surname,address_1,street_number,postcode,soc_sec_id,suburb,phone_number,state,given_name,age,address_2
0,0,19390609.0,bishop,daley crescent,41.0,6050,4676841,batlow,08 29028996,qld,molly,31.0,
1,1,19041109.0,aidon,nambucca street,7.0,2002,3414163,devonort,08 75629459,vkf,whkt,,
2,2,19910711.0,anns,,12.0,2287,7844876,ivanhoe,02 11684110,vic,andrew,31.0,
3,3,19390709.0,whitrlsy,robson street,34.0,4065,2418360,christie downs,04 00323207,qld,shsne,,
4,4,19340328.0,roche,rankin street,9.0,3644,7577436,frenchs forest,08 53227250,qld,sophie,30.0,


In [9]:
# Argument list for SQL concat(): the profile columns, with a single-space
# string literal between every two consecutive columns.
to_concat = ", ' ', ".join((
    "date_of_birth",
    "surname",
    "address_1",
    "street_number",
    "postcode",
    "soc_sec_id",
    "suburb",
    "phone_number",
    "state",
    "given_name",
    "age",
    "address_2",
))
to_concat

"date_of_birth, ' ', surname, ' ', address_1, ' ', street_number, ' ', postcode, ' ', soc_sec_id, ' ', suburb, ' ', phone_number, ' ', state, ' ', given_name, ' ', age, ' ', address_2"

In [10]:
# (Re)create db10 from the df10 DataFrame, which the SQL references by its
# Python variable name. val is the space-separated concatenation of the columns
# listed in to_concat. The last line displays the table.
con.execute("drop table if exists db10")
con.execute(
    "CREATE TABLE db10 AS "
    f"SELECT realProfileID as rid, concat ({to_concat}) as val "
    "FROM df10"
)
con.execute("select * from db10").fetchall()

[(0,
  '19390609.0 bishop daley crescent 41.0 6050 4676841 batlow 08 29028996 qld molly 31.0 '),
 (1,
  '19041109.0 aidon nambucca street 7.0 2002 3414163 devonort 08 75629459 vkf whkt  '),
 (2,
  '19910711.0 anns  12.0 2287 7844876 ivanhoe 02 11684110 vic andrew 31.0 '),
 (3,
  '19390709.0 whitrlsy robson street 34.0 4065 2418360 christie downs 04 00323207 qld shsne  '),
 (4,
  '19340328.0 roche rankin street 9.0 3644 7577436 frenchs forest 08 53227250 qld sophie 30.0 '),
 (5,
  '19320811.0 fullgrabe beeston street 29.0 3131 6494586 broken hill 04 80080021 nsw emma 29.0 '),
 (6,
  '19601013.0 lodge mason street 48.0 5254 6098877 orchard hills 08 48143359 vic rourke 32.0 '),
 (7,
  '19340921.0 coleman edman close 116.0 2397 8858237 castlecrag 03 58778382  harry 26.0 '),
 (8,
  '19671108.0 bishop cromwell circuit 12.0 2226 1718686 harris park 04 01707833 vic jamie 28.0 homestead caravan park'),
 (9,
  '19331228.0 grcn lutana street 11.0 2074 8364236 newstead 02 57700508 ws lauren  '),
 

In [11]:
# Load the duplicate-pair list for the 10K profiles (JSON Lines) and preview it.
df10gt = pd.read_json(
    "data/10KIdDuplicates.json",
    orient='records',
    typ='frame',
    lines=True,
)
df10gt.head(5)

Unnamed: 0,d1Id,d2Id
0,101,8933
1,4101,4614
2,7213,8466
3,4856,7485
4,4829,9725


In [12]:
# (Re)create db10gt from the df10gt DataFrame, renaming the id columns to
# rid1/rid2, then display the number of pairs.
con.execute("drop table if exists db10gt")
con.execute(
    "CREATE TABLE db10gt AS "
    "SELECT d1Id as rid1, d2Id as rid2 "
    "FROM df10gt "
)
con.execute("select count(*) from db10gt").fetchall()

[(8705,)]

In [13]:
# Arguments shared by the Profiles runs below; both sides use the same key
# column and the same text column.
l_key_attr = r_key_attr = 'rid'
l_join_attr = r_join_attr = 'val'
out_table_name = 'matches'

In [14]:
# (Re)create db10_sample as a random sample of 2000 rows of db10.
# The sample is seeded (REPEATABLE) so the dependent ground-truth view, timings
# and quality metrics can be reproduced on a rerun; an unseeded sample changes
# every time this cell executes.
# NOTE(review): confirm in the DuckDB docs whether REPEATABLE stays fully
# deterministic when multi-threading is enabled.
con.execute("drop table if exists db10_sample").execute(
    "CREATE TABLE db10_sample AS "
    "SELECT * "
    "FROM db10 "
    "using sample reservoir(2000 ROWS) repeatable (42)"
).execute("select * from db10_sample").fetchall()

[(3787,
  '19009229.0 nguuen  3.0 4161 3603062 north bondi 04 84333603 qld  35.0 '),
 (5136,
  '19940607.0 croker power street 17.0 2540 2762256 launceston south 08 11683765 wa karli 32.0 '),
 (8190,
  ' bulkpck couchmanvlfescent 404.0 4285 6830797 greenwith 04 19408554 wa madison 27.0 '),
 (5649,
  '19180210.0 priest barrington crescent 4.0 2560 5181941 beecher 08 80497243 nsw montana 29.0 loccn 7229'),
 (4670,
  '19011115.0 hodgens twelve trees crescent 25.0 5039 7725932 surfers paradise  vic hamish 25.0 pinecroft'),
 (7362,
  ' selth oxley street 18.0 2770 9254363 nicholls 03 39187251  logan 29.0 '),
 (5262,
  ' glass dashwood retreat 15.0 4121 2323091 gosford east 04 21145972 nsw kristo 28.0 '),
 (7,
  '19340921.0 coleman edman close 116.0 2397 8858237 castlecrag 03 58778382  harry 26.0 '),
 (3018,
  '19570312.0 millqr street ikins 6.0 3015 4132382 bundaberg 08 02566322 nsw tayah 24.0 '),
 (3191,
  '19181006.0 melenzed de little circuit 9.0 6430 1299036 moss vale 03 77810786 nsw ba

In [15]:
# (Re)create db10gt_sample: the pairs of db10gt whose two ids are both present
# in db10_sample. The last line displays how many pairs remain.
con.execute("drop view if exists db10gt_sample")
con.execute(
    "create view db10gt_sample as "
    "select gt.* "
    "from db10_sample s1, db10_sample s2, db10gt gt "
    "where s1.rid = gt.rid1 "
    "and s2.rid = gt.rid2"
)
con.execute("select count(*) from db10gt_sample").fetchall()

[(346,)]

In [16]:
# Profiles sample: one timed run per implementation (n defaults to 1) with a
# QGramsTokzr(5) tokenizer at threshold 0.5, then evaluate the filtered result
# against db10gt_sample.
test(
    con=con,
    l_table='db10_sample',
    r_table='',
    l_key_attr=l_key_attr,
    r_key_attr=r_key_attr,
    l_join_attr=l_join_attr,
    r_join_attr=r_join_attr,
    tokenizer=tokenizers.QGramsTokzr(5),
    threshold=0.5,
    out_table_name=out_table_name,
)
evaluate(con, 'db10gt_sample', out_table_name)

FILTERED EXECUTIONS
3.5006356239318848 s

BRUTE FORCE EXECUTIONS
4.1718127727508545 s

SUCCESS! Filtered join and Brute force join returned the same result
Average execution time for filtered join: 3.5006356239318848
Average execution time for brute force join: 4.1718127727508545


{'tp': 82,
 'fp': 0,
 'fn': 264,
 'pr': 1.0,
 'rc': 0.23699421965317918,
 'fm': 0.38317757009345793}

In [37]:
# Time one filtered join on the sample at threshold 0.2, print the elapsed
# seconds, then evaluate the result against db10gt_sample.
start_time = time.time()
jaccard_join(
    con,
    'db10_sample',
    '',
    l_key_attr,
    r_key_attr,
    l_join_attr,
    r_join_attr,
    tokenizers.QGramsTokzr(5),
    0.2,
    out_table_name,
)
print(time.time() - start_time)
evaluate(con, "db10gt_sample", out_table_name)

[(60009,)] 0.200667142868042
1.9039320945739746


{'tp': 326,
 'fp': 0,
 'fn': 20,
 'pr': 1.0,
 'rc': 0.9421965317919075,
 'fm': 0.9702380952380953}

In [38]:
# Time one filtered join on the full db10 table at threshold 0.2, print the
# elapsed seconds, then evaluate the result against db10gt.
start_time = time.time()
jaccard_join(
    con,
    'db10',
    '',
    l_key_attr,
    r_key_attr,
    l_join_attr,
    r_join_attr,
    tokenizers.QGramsTokzr(5),
    0.2,
    out_table_name,
)
print(time.time() - start_time)
evaluate(con, "db10gt", out_table_name)

[(1540046,)] 0.9052159786224365
13.40549373626709


{'tp': 8240,
 'fp': 18,
 'fn': 483,
 'pr': 0.9978202954710583,
 'rc': 0.9446291413504528,
 'fm': 0.9704964371945114}

In [39]:
def test2(
        con: duckdb.DuckDBPyConnection,
        l_table: str,
        r_table: str,
        l_key_attr: str,
        r_key_attr: str,
        l_join_attr: str,
        r_join_attr: str,
        tokenizer: tokenizers.Tokenizer,
        thresholds: list,
        out_table_name: str,
        ground_truth_table: str
):
    """Run ``test`` once per threshold and print the evaluation of each result.

    For every value in ``thresholds`` this prints the threshold, calls
    ``test`` with the default single repetition, and then prints whatever
    ``evaluate`` returns for ``out_table_name`` against ``ground_truth_table``.

    Args:
        con: Open DuckDB connection passed to ``test`` and ``evaluate``.
        l_table: Name of the left input table.
        r_table: Name of the right input table.
        l_key_attr: Key column of the left table.
        r_key_attr: Key column of the right table.
        l_join_attr: Compared column of the left table.
        r_join_attr: Compared column of the right table.
        tokenizer: Tokenizer passed through to ``test``.
        thresholds: Thresholds to try, processed in the given order.
        out_table_name: Result table name; rewritten by ``test`` on each
            iteration, so only the last threshold's result remains afterwards.
        ground_truth_table: Table or view of expected pairs handed to ``evaluate``.

    Returns:
        None. All results are printed.
    """
    for t in thresholds:
        print("threshold =", t)
        test(con, l_table, r_table, l_key_attr, r_key_attr, l_join_attr, r_join_attr, tokenizer, t, out_table_name)
        print(evaluate(con, ground_truth_table, out_table_name))
        print()

In [40]:
# Tokenizer used by the threshold sweeps below (presumably q-grams with q = 5 -
# confirm against QGramsTokzr).
tokenizer = tokenizers.QGramsTokzr(5)

In [41]:
# Threshold sweep on the 2000-row sample, evaluated against db10gt_sample.
# Alternative tokenizer: tokenizers.WordsTokzr("' '")
test2(
    con=con,
    l_table='db10_sample',
    r_table='',
    l_key_attr=l_key_attr,
    r_key_attr=r_key_attr,
    l_join_attr=l_join_attr,
    r_join_attr=r_join_attr,
    tokenizer=tokenizer,
    thresholds=[0.6, 0.5, 0.4, 0.3, 0.2],
    out_table_name=out_table_name,
    ground_truth_table="db10gt_sample",
)

threshold = 0.6
FILTERED EXECUTIONS
[(317,)] 0.039620161056518555
0.35471630096435547 s

BRUTE FORCE EXECUTIONS
2.263545513153076 s

SUCCESS! Filtered join and Brute force join returned the same result
Average execution time for filtered join: 0.35471630096435547
Average execution time for brute force join: 2.263545513153076
{'tp': 17, 'fp': 0, 'fn': 329, 'pr': 1.0, 'rc': 0.049132947976878616, 'fm': 0.09366391184573004}

threshold = 0.5
FILTERED EXECUTIONS
[(1971,)] 0.04826855659484863
0.30413389205932617 s

BRUTE FORCE EXECUTIONS
2.360861301422119 s

SUCCESS! Filtered join and Brute force join returned the same result
Average execution time for filtered join: 0.30413389205932617
Average execution time for brute force join: 2.360861301422119
{'tp': 82, 'fp': 0, 'fn': 264, 'pr': 1.0, 'rc': 0.23699421965317918, 'fm': 0.38317757009345793}

threshold = 0.4
FILTERED EXECUTIONS
[(6875,)] 0.06397199630737305
0.3836019039154053 s

BRUTE FORCE EXECUTIONS
2.1389076709747314 s

SUCCESS! Filtered 

In [42]:
# Threshold sweep on the full db10 table, evaluated against db10gt.
# Alternative tokenizer: tokenizers.WordsTokzr("' '")
test2(
    con=con,
    l_table='db10',
    r_table='',
    l_key_attr=l_key_attr,
    r_key_attr=r_key_attr,
    l_join_attr=l_join_attr,
    r_join_attr=r_join_attr,
    tokenizer=tokenizer,
    thresholds=[0.6, 0.5, 0.4, 0.3, 0.2],
    out_table_name=out_table_name,
    ground_truth_table="db10gt",
)

threshold = 0.6
FILTERED EXECUTIONS
[(39241,)] 0.11374020576477051
1.3156731128692627 s

BRUTE FORCE EXECUTIONS
36.75761890411377 s

SUCCESS! Filtered join and Brute force join returned the same result
Average execution time for filtered join: 1.3156731128692627
Average execution time for brute force join: 36.75761890411377
{'tp': 487, 'fp': 0, 'fn': 8218, 'pr': 1.0, 'rc': 0.055944859276278, 'fm': 0.10596170583115753}

threshold = 0.5
FILTERED EXECUTIONS
[(103154,)] 0.2797229290008545
1.8946571350097656 s

BRUTE FORCE EXECUTIONS
38.27973198890686 s

SUCCESS! Filtered join and Brute force join returned the same result
Average execution time for filtered join: 1.8946571350097656
Average execution time for brute force join: 38.27973198890686
{'tp': 1928, 'fp': 0, 'fn': 6777, 'pr': 1.0, 'rc': 0.22148190695002873, 'fm': 0.3626445970093107}

threshold = 0.4
FILTERED EXECUTIONS
[(237514,)] 0.35709071159362793
13.972155332565308 s

BRUTE FORCE EXECUTIONS
38.37903046607971 s

SUCCESS! Filtered 

## Test case: larger Profiles datasets

In [34]:
# Load the 50K profiles dataset: JSON Lines, one record per line.
df50 = pd.read_json(
    "data/50Kprofiles.json",
    orient='records',
    typ='frame',
    lines=True,
)

In [35]:
# (Re)create src2 from the df50 DataFrame, which the SQL references by its
# Python variable name. val is the space-separated concatenation of the columns
# listed in to_concat. The last line displays the table.
con.execute("drop table if exists src2")
con.execute(
    "CREATE TABLE src2 AS "
    f"SELECT realProfileID as rid, concat ({to_concat}) as val "
    "FROM df50"
)
con.execute("select * from src2").fetchall()

[(0,
  '19551130.0 reid jerrabomberra avenue 191.0 2115 7907036 tylden 02 57675204 nsw nicholas  gold tyne'),
 (1,
  '19821020.0 porra meyers place 2.0 3130 2705215 kingaroy 03 64681506 wa elle 34.0 '),
 (2,
  '19351010.0 pochec duterrau crescent 46.0 2785 2350343 lalor park 03 93693261 vic mitchell 29.0 '),
 (3,
  ' beaumaris ellenborough street 11.0 2285 6735143 innaloo 04 02935530 qld marianne 24.0 '),
 (4, ' ransin ahernuplace 2.0 4496 6721672 armiake  qld christian 38.0 '),
 (5,
  '19740410.0 forsbaw norman fisher circuit 5.0 2707 7456433 bayview hdeights  qph lesa 29.0 warrawong'),
 (6,
  '19960201.0 nguyen hansen circuit 44.0 5244 7748525  04 10399694 nsw caitlin 29.0 '),
 (7,
  '19607722.0 eichimnrger stockdale street 11.0 4224 8900629 cherrbqrook 04 99393660 qld isabrlla 36.0 st john of godzhospital'),
 (8,
  '19280419.0 hawes giles street 3.0 2197 4044861 brandon 04 54963323 vic michael 28.0 '),
 (9,
  '19010611.0 palecek goldner circuit 31.0 6102 8225306 latrobe 07 75127812 

In [40]:
# Load the 100K profiles dataset: JSON Lines, one record per line.
df100 = pd.read_json(
    "data/100Kprofiles.json",
    orient='records',
    typ='frame',
    lines=True,
)

In [41]:
# (Re)create src3 from the df100 DataFrame, which the SQL references by its
# Python variable name. val is the space-separated concatenation of the columns
# listed in to_concat. The last line displays the table.
con.execute("drop table if exists src3")
con.execute(
    "CREATE TABLE src3 AS "
    f"SELECT realProfileID as rid, concat ({to_concat}) as val "
    "FROM df100"
)
con.execute("select * from src3").fetchall()

[(0,
  '19190809.0 chandler pudney street 376.0 3764 3423802 hemmant 02 24496575 qld luke 35.0 arlington'),
 (1,
  '19300528.0 hawes  8.0 2168 2180735 keilor east 04 87308830 qld millane 36.0 northern tablelands tennis academy'),
 (2,
  '19300112.0 novadk rfreet rankin 1.0 3218 4272019 beulah park 03 39054375 qld jatmike 26.0 '),
 (3,
  '19950727.0 redmond davenport street 12.0 3030 5381275 kangaroo flat 07 69720733 qld madeleine 33.0 vinery'),
 (4,
  '19631123.0 zimmermann corona place 7.0 4030 2894006 greenacres 03 48393238 nsw kayden 31.0 glen elgin'),
 (5,
  '19350925.0 canbkl macfarland crescent 61.0 3164 5282937 chishmolm 02 01482240 nsw sammy 72.0 '),
 (6,
  '19650417.0 psorakis owen crescent 77.0 2452 2052783 mannum 07 48547017 wa laura  '),
 (7,
  '19230205.0 agius arthur circle 2.0 6125 8456442 marsden 02 54492364 qld indiana 32.0 '),
 (8,
  '19830325.0 hegger arabana street 2.0 4178 9175025 wellington point 04 21430300 qld jaykob  '),
 (9, '19930213.0 weetfa  6.0 3106 541836

In [38]:
# Time one filtered join on src2 (the 50K profiles) at threshold 0.5.
# The elapsed seconds are the cell's output.
start_time = time.time()
jaccard_join(
    con,
    'src2',
    '',
    l_key_attr,
    r_key_attr,
    l_join_attr,
    r_join_attr,
    tokenizers.QGramsTokzr(5),
    0.5,
    out_table_name,
)
time.time() - start_time

330.26074504852295

In [None]:
# Time one filtered join on src3 (the 100K profiles) at threshold 0.5.
# The elapsed seconds are the cell's output.
start_time = time.time()
jaccard_join(
    con,
    'src3',
    '',
    l_key_attr,
    r_key_attr,
    l_join_attr,
    r_join_attr,
    tokenizers.QGramsTokzr(5),
    0.5,
    out_table_name,
)
time.time() - start_time