In [1]:
!pip install duckdb



In [2]:
import duckdb
import time
import pandas as pd

In [3]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
from py_duckdb.similarity_join import tokenizers
from py_duckdb.similarity_join import jaccard_join
from py_duckdb.similarity_join import jaccard_join_brute_force

In [5]:
import string
import numpy as np

def test(
        con: duckdb.DuckDBPyConnection,
        l_table: str,
        r_table: str,
        l_key_attr: str,
        r_key_attr: str,
        l_join_attr: str,
        r_join_attr: str,
        tokenizer: tokenizers.Tokenizer,
        threshold: float,
        out_table_name: str,
        n=1
):
    """Time the filtered Jaccard join against the brute-force join and compare their outputs.

    ``jaccard_join`` and ``jaccard_join_brute_force`` are each called ``n`` times
    with the same arguments, except that the brute-force variant receives
    ``"bf_" + out_table_name`` as its output name. Afterwards the relations
    ``out_table_name`` and ``bf_<out_table_name>`` are compared on
    ``(rid1, rid2)`` in BOTH directions, and the mean wall-clock time of each
    variant is printed.

    Args:
        con: DuckDB connection handed to both join functions and used for the
            comparison query.
        l_table: Forwarded unchanged to both join functions.
        r_table: Forwarded unchanged to both join functions.
        l_key_attr: Forwarded unchanged to both join functions.
        r_key_attr: Forwarded unchanged to both join functions.
        l_join_attr: Forwarded unchanged to both join functions.
        r_join_attr: Forwarded unchanged to both join functions.
        tokenizer: Forwarded unchanged to both join functions.
        threshold: Forwarded unchanged to both join functions.
        out_table_name: Name of the relation the filtered join is asked to
            produce; the brute-force result uses the same name prefixed with
            ``bf_``. It is interpolated into SQL, so it must be a trusted identifier.
        n: Number of timed repetitions (at least 1).

    Raises:
        ValueError: If ``n`` is smaller than 1 (nothing would be run or timed).
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    bf_table_name = "bf_" + out_table_name
    exec_times = []
    exec_times_bf = []
    for _ in range(n):
        # perf_counter is the clock intended for measuring short durations.
        started = time.perf_counter()
        jaccard_join(
            con, l_table, r_table, l_key_attr, r_key_attr, l_join_attr, r_join_attr, tokenizer, threshold, out_table_name
        )
        exec_times.append(time.perf_counter() - started)

        started = time.perf_counter()
        jaccard_join_brute_force(
            con, l_table, r_table, l_key_attr, r_key_attr, l_join_attr, r_join_attr, tokenizer, threshold, bf_table_name
        )
        exec_times_bf.append(time.perf_counter() - started)

    # A full outer join leaves NULLs on whichever side lacks the pair. Checking
    # both sides reports pairs missing from the filtered join AND pairs that
    # only the filtered join produced.
    cmp_join = con.execute(
        "select * "
        f"from {out_table_name} m "
        f"full outer join {bf_table_name} b on b.rid1 = m.rid1 and b.rid2 = m.rid2 "
        "where m.rid1 is null or b.rid1 is null "
    ).fetchall()
    if len(cmp_join) == 0:
        print("SUCCESS! Filtered join and Brute force join returned the same result")
    else:
        print("ERROR! There are mismatches between Filtered and Brute force joins:", cmp_join)
    print("Average execution time for filtered join:", np.average(exec_times))
    print("Average execution time for brute force join:", np.average(exec_times_bf))

In [6]:
# In-memory DuckDB database shared by every cell below.
con = duckdb.connect(":memory:")

# Test case: Actors

In [7]:
# Rebuild src1 from the first Actors CSV: `rid` is the CSV id, `val` is the
# space-separated given name, surname and date of birth.
con.execute("drop table if exists src1")
con.execute(
    "CREATE TABLE src1 AS "
    "SELECT id as rid, concat(given_name, ' ', surname, ' ', date_of_birth) as val "
    "FROM 'data/S1_clean_.csv'"
)
con.execute("select * from src1").fetchall()

[('S1_0', 'joshua morrison 19101123'),
 ('S1_1', 'jordan white 19371126'),
 ('S1_2', 'emmerson lock 19211129'),
 ('S1_3', 'alexandra grosser 19720305'),
 ('S1_4', 'michael wuchatsch 19190110'),
 ('S1_5', 'emmerson loyck 19211129'),
 ('S1_6', 'rhys schuetz 19440909'),
 ('S1_7', 'joshua greenj 19790110'),
 ('S1_8', 'olivia hobson 19760812'),
 ('S1_9', 'michael lierach 19360816'),
 ('S1_10', 'elisabett domiten 19081008'),
 ('S1_11', 'genoveffa hylander 19071008')]

In [8]:
# Rebuild src2 from the second Actors CSV with the same (rid, val) layout.
con.execute("drop table if exists src2")
con.execute(
    "CREATE TABLE src2 AS "
    "SELECT id as rid, concat(given_name, ' ', surname, ' ', date_of_birth) as val "
    "FROM 'data/S2_clean_.csv'"
)
con.execute("select * from src2").fetchall()

[('S2_0', 'braecon schuetz 19440909'),
 ('S2_1', 'alexandra grosvenor 19930305'),
 ('S2_2', 'michael liersch 19360816'),
 ('S2_3', 'emmeron loyk 19321129'),
 ('S2_4', 'olivia hobson 19760812'),
 ('S2_5', 'joshua green 19010219'),
 ('S2_6', 'charlotte hyland 19340909'),
 ('S2_7', 'elisabet domitienn 19071008')]

In [9]:
# Rebuild src3 from the third Actors CSV with the same (rid, val) layout.
con.execute("drop table if exists src3")
con.execute(
    "CREATE TABLE src3 AS "
    "SELECT id as rid, concat(given_name, ' ', surname, ' ', date_of_birth) as val "
    "FROM 'data/S3_clean_.csv'"
)
con.execute("select * from src3").fetchall()

[('S3_0', 'emmerson loyck 19211129'),
 ('S3_1', 'michel wuchatsch 19190110'),
 ('S3_3', 'liersch michael 19360816'),
 ('S3_4', 'charlotte hyland 19460401'),
 ('S3_5', 'braedon schuetz 19440909'),
 ('S3_6', 'olivia hobson 19760812'),
 ('S3_7', 'joshua green 19790110'),
 ('S3_8', 'keely clarke 19050410'),
 ('S3_9', 'joshua morriosn 19101123'),
 ('S3_11', 'genovefa hyllande 19071008')]

In [10]:
# Recreate the srcall view as the (de-duplicating) UNION of the three sources.
con.execute("drop view if exists srcall")
con.execute(
    "create view srcall as "
    "select * from src1 "
    "union "
    "select * from src2 "
    "union "
    "select * from src3 "
)
con.execute("select * from srcall").fetchall()

[('S3_0', 'emmerson loyck 19211129'),
 ('S3_1', 'michel wuchatsch 19190110'),
 ('S3_3', 'liersch michael 19360816'),
 ('S3_4', 'charlotte hyland 19460401'),
 ('S3_5', 'braedon schuetz 19440909'),
 ('S3_6', 'olivia hobson 19760812'),
 ('S3_7', 'joshua green 19790110'),
 ('S3_8', 'keely clarke 19050410'),
 ('S3_9', 'joshua morriosn 19101123'),
 ('S3_11', 'genovefa hyllande 19071008'),
 ('S2_0', 'braecon schuetz 19440909'),
 ('S2_1', 'alexandra grosvenor 19930305'),
 ('S2_2', 'michael liersch 19360816'),
 ('S2_3', 'emmeron loyk 19321129'),
 ('S2_4', 'olivia hobson 19760812'),
 ('S2_5', 'joshua green 19010219'),
 ('S2_6', 'charlotte hyland 19340909'),
 ('S2_7', 'elisabet domitienn 19071008'),
 ('S1_0', 'joshua morrison 19101123'),
 ('S1_1', 'jordan white 19371126'),
 ('S1_2', 'emmerson lock 19211129'),
 ('S1_3', 'alexandra grosser 19720305'),
 ('S1_4', 'michael wuchatsch 19190110'),
 ('S1_5', 'emmerson loyck 19211129'),
 ('S1_6', 'rhys schuetz 19440909'),
 ('S1_7', 'joshua greenj 19790110'

In [11]:
# Join arguments for the Actors test case.
l_table, r_table = 'src1', 'src2'

# Both sides expose the same key and join columns.
l_key_attr = r_key_attr = 'rid'
l_join_attr = r_join_attr = 'val'

# Alternative tokenizer: tokenizers.WordsTokzr(r"' '")
tokenizer = tokenizers.QGramsTokzr(3)
threshold = 0.5
out_table_name = 'matches'

In [12]:
# Join srcall with an empty right table name, 10 timed repetitions.
# NOTE(review): '' presumably requests a self-join — confirm against jaccard_join.
test(
    con=con,
    l_table='srcall',
    r_table='',
    l_key_attr=l_key_attr,
    r_key_attr=r_key_attr,
    l_join_attr=l_join_attr,
    r_join_attr=r_join_attr,
    tokenizer=tokenizer,
    threshold=threshold,
    out_table_name=out_table_name,
    n=10,
)

SUCCESS! Filtered join and Brute force join returned the same result
Average execution time for filtered join: 0.010431957244873048
Average execution time for brute force join: 0.005044388771057129


In [13]:
# Join src1 against src2, 10 timed repetitions.
test(
    con=con,
    l_table='src1',
    r_table='src2',
    l_key_attr=l_key_attr,
    r_key_attr=r_key_attr,
    l_join_attr=l_join_attr,
    r_join_attr=r_join_attr,
    tokenizer=tokenizer,
    threshold=threshold,
    out_table_name=out_table_name,
    n=10,
)

SUCCESS! Filtered join and Brute force join returned the same result
Average execution time for filtered join: 0.013778972625732421
Average execution time for brute force join: 0.0044021129608154295


# Test case: NCVR

In [14]:
# Rebuild src1 from data/NCVR_AF_clean.csv: `rid` is the CSV id and `val`
# concatenates the listed columns with '|' between them.
# NOTE(review): `sex,age` has no '|' between the two columns, so they fuse into
# one field (e.g. 'female29' in the output below) — confirm this is intended.
# Any change must be applied to all three NCVR source cells at once.
con.execute("drop table if exists src1").execute(
    "CREATE TABLE src1 AS "
    "SELECT id as rid, concat(entity, '|', rec_id, '|', first_name, '|', last_name, '|', sex,age, '|', birth_place, '|', house_num, '|', county_desc, '|', street_name, '|', zip_code, '|', phone_num) as val "
    "FROM 'data/NCVR_AF_clean.csv'"
).execute("select * from src1").fetchall()

[('0_22_9865350',
  '22|9865350|whitney|baker|female29|in|400|orange|poplar|27510| '),
 ('0_40_12768214',
  '40|12768214|abbington|pope|female23|nc|1221|wake|westview|27605| '),
 ('0_122_9112102',
  '122|9112102|rebecca|wilkins|female49|nc|811|new hanover|magnolia|28428|8120512'),
 ('0_140_9704280',
  '140|9704280|justin|brown|male34|ca|3225|orange|us hwy 70|27243| '),
 ('0_222_3198122',
  '222|3198122|stephanie|eissens|female38|nc|100|durham|village circle|27713| '),
 ('0_240_94472',
  '240|94472|danielle|peschon|female28|tn|1049|alamance|kelso|27215|6758354'),
 ('0_251_6640272',
  '251|6640272|michelle|hinnant|female45|nc|130|johnston|braswell|27577|2024737'),
 ('0_340_10179287',
  '340|10179287|mark|caccio|male64|ct|213|pender|scottsdale|28411|6008095'),
 ('0_351_6376559',
  '351|6376559|nancy|hoover|female55|ny|618|iredell|isle of pines|28117| '),
 ('0_451_1422321',
  '451|1422321|thomas|johnson|male29|nc|501|caswell|cherry grove|27379|2696260'),
 ('0_522_12340214',
  '522|12340214

In [28]:
# Rebuild src2 from data/NCVR_BF_clean.csv: `rid` is the CSV id and `val`
# concatenates the listed columns with '|' between them.
# NOTE(review): `sex,age` has no '|' between the two columns, so they fuse into
# one field (e.g. 'female23' in the output below) — confirm this is intended.
# Any change must be applied to all three NCVR source cells at once.
con.execute("drop table if exists src2").execute(
    "CREATE TABLE src2 AS "
    "SELECT id as rid, concat(entity, '|', rec_id, '|', first_name, '|', last_name, '|', sex,age, '|', birth_place, '|', house_num, '|', county_desc, '|', street_name, '|', zip_code, '|', phone_num) as val "
    "FROM 'data/NCVR_BF_clean.csv'"
).execute("select * from src2").fetchall()

[('1_40_13913995',
  '40|13913995|abbington|pope|female23|nc|104|wayne|breezewood|27534| '),
 ('1_41_520429',
  '41|520429|angelon|smith|female40|nc|119|buncombe|flint|28801| '),
 ('1_122_9000404',
  '122|9000404|rebecca|wilkins|female49| |1501|nash|lafayette|27803| '),
 ('1_140_9350108',
  '140|9350108|justin|brown|male34|ca|211|new hanover|queen|28401|7417817'),
 ('1_222_13226442',
  '222|13226442|stephanie|eissens meacomes|female38|nc|4033|wake|enfield ridge|27519| '),
 ('1_240_116265',
  '240|116265|danielle|peschon|female28| |730|alamance|boone station|27215|2789980'),
 ('1_241_8517073',
  '241|8517073|lovie|matthews|female26|tn|5634|mecklenburg|via romano|28270| '),
 ('1_322_6614729',
  '322|6614729|sandra|creech|female52|nc|1347|johnston|crocker|27577|4645707'),
 ('1_340_9418808',
  '340|9418808|mark|caccio|male64|ct|605|onslow|windsong north|28584|6008095'),
 ('1_341_6619987',
  '341|6619987|amanda|spencer|female37|nc|607|johnston|preston|27576| '),
 ('1_422_10444451',
  '422|1

In [16]:
# Rebuild src3 from data/NCVR_CF_clean.csv: `rid` is the CSV id and `val`
# concatenates the listed columns with '|' between them.
# NOTE(review): `sex,age` has no '|' between the two columns, so they fuse into
# one field (e.g. 'female29' in the output below) — confirm this is intended.
# Any change must be applied to all three NCVR source cells at once.
con.execute("drop table if exists src3").execute(
    "CREATE TABLE src3 AS "
    "SELECT id as rid, concat(entity, '|', rec_id, '|', first_name, '|', last_name, '|', sex,age, '|', birth_place, '|', house_num, '|', county_desc, '|', street_name, '|', zip_code, '|', phone_num) as val "
    "FROM 'data/NCVR_CF_clean.csv'"
).execute("select * from src3").fetchall()

[('2_22_3326652',
  '22|3326652|whitney|baker|female29|in|709|durham|green|27701| '),
 ('2_31_3904957',
  '31|3904957|latonya|mciver|female40|va|30|forsyth|glenwood|27106|9166714'),
 ('2_131_12315539',
  '131|12315539|alice|feldbusch|female29|nc|119|wake|fawn|27587| '),
 ('2_140_12924542',
  '140|12924542|justin|brown|male34|ca|8510|wake|silhouette|27613| '),
 ('2_222_12748029',
  '222|12748029|stephanie|eissens|female38|nc|133|wake|wards ridge|27513| '),
 ('2_231_9728052',
  '231|9728052|william|mcinerney|male29|nc|265|orange|severin|27516| '),
 ('2_240_13265262',
  '240|13265262|danielle|peschon|female28|tn|2319|wake|hinton|27612| '),
 ('2_331_13927418',
  '331|13927418|james|byrd|male53|nc|610|wayne|park|27530|9152245'),
 ('2_431_2134645',
  '431|2134645|ainslie|guion|female57|ky|323|craven|trenton|28523|8144405'),
 ('2_440_12811452',
  '440|12811452|heather|spence|female33|nc|6009|wake|splitrock|27539| '),
 ('2_522_12715739',
  '522|12715739|judy|parker|female56|nc|249|wake|money t

In [17]:
# Recreate the srcall view as the (de-duplicating) UNION of the three NCVR sources.
con.execute("drop view if exists srcall")
con.execute(
    "create view srcall as "
    "select * from src1 "
    "union "
    "select * from src2 "
    "union "
    "select * from src3 "
)
con.execute("select * from srcall").fetchall()

[('2_22_3326652',
  '22|3326652|whitney|baker|female29|in|709|durham|green|27701| '),
 ('2_31_3904957',
  '31|3904957|latonya|mciver|female40|va|30|forsyth|glenwood|27106|9166714'),
 ('2_131_12315539',
  '131|12315539|alice|feldbusch|female29|nc|119|wake|fawn|27587| '),
 ('2_140_12924542',
  '140|12924542|justin|brown|male34|ca|8510|wake|silhouette|27613| '),
 ('2_222_12748029',
  '222|12748029|stephanie|eissens|female38|nc|133|wake|wards ridge|27513| '),
 ('2_231_9728052',
  '231|9728052|william|mcinerney|male29|nc|265|orange|severin|27516| '),
 ('2_240_13265262',
  '240|13265262|danielle|peschon|female28|tn|2319|wake|hinton|27612| '),
 ('2_331_13927418',
  '331|13927418|james|byrd|male53|nc|610|wayne|park|27530|9152245'),
 ('2_431_2134645',
  '431|2134645|ainslie|guion|female57|ky|323|craven|trenton|28523|8144405'),
 ('2_440_12811452',
  '440|12811452|heather|spence|female33|nc|6009|wake|splitrock|27539| '),
 ('2_522_12715739',
  '522|12715739|judy|parker|female56|nc|249|wake|money t

In [18]:
# Join arguments for the NCVR test case.
l_key_attr = 'rid'
r_key_attr = 'rid'
l_join_attr = 'val'
r_join_attr = 'val'
# Raw string: "\|" is not a valid Python escape sequence and triggers a
# warning in a normal string literal. The value passed on is unchanged
# (quote, backslash, pipe, quote).
tokenizer = tokenizers.WordsTokzr(r"'\|'")
# tokenizer = tokenizers.QGramsTokzr(3)
threshold = 0.5
out_table_name = 'matches'

In [19]:
# Join srcall with an empty right table name, 10 timed repetitions.
# NOTE(review): '' presumably requests a self-join — confirm against jaccard_join.
test(
    con=con,
    l_table='srcall',
    r_table='',
    l_key_attr=l_key_attr,
    r_key_attr=r_key_attr,
    l_join_attr=l_join_attr,
    r_join_attr=r_join_attr,
    tokenizer=tokenizer,
    threshold=threshold,
    out_table_name=out_table_name,
    n=10,
)

SUCCESS! Filtered join and Brute force join returned the same result
Average execution time for filtered join: 0.03225562572479248
Average execution time for brute force join: 0.025445032119750976


In [20]:
# Join src1 against src2, 10 timed repetitions.
test(
    con=con,
    l_table='src1',
    r_table='src2',
    l_key_attr=l_key_attr,
    r_key_attr=r_key_attr,
    l_join_attr=l_join_attr,
    r_join_attr=r_join_attr,
    tokenizer=tokenizer,
    threshold=threshold,
    out_table_name=out_table_name,
    n=10,
)

SUCCESS! Filtered join and Brute force join returned the same result
Average execution time for filtered join: 0.022009873390197755
Average execution time for brute force join: 0.007908082008361817


# Test case: Profiles

In [21]:
# Load the 10K profiles file (one JSON record per line) into a DataFrame.
df10 = pd.read_json(
    "data/10Kprofiles.json",
    orient='records',
    lines=True,
    typ='frame',
)

In [22]:
# Preview the first five profile records.
df10.head(n=5)

Unnamed: 0,realProfileID,date_of_birth,surname,address_1,street_number,postcode,soc_sec_id,suburb,phone_number,state,given_name,age,address_2
0,0,19390609.0,bishop,daley crescent,41.0,6050,4676841,batlow,08 29028996,qld,molly,31.0,
1,1,19041109.0,aidon,nambucca street,7.0,2002,3414163,devonort,08 75629459,vkf,whkt,,
2,2,19910711.0,anns,,12.0,2287,7844876,ivanhoe,02 11684110,vic,andrew,31.0,
3,3,19390709.0,whitrlsy,robson street,34.0,4065,2418360,christie downs,04 00323207,qld,shsne,,
4,4,19340328.0,roche,rankin street,9.0,3644,7577436,frenchs forest,08 53227250,qld,sophie,30.0,


In [23]:
# Rebuild src1 from the df10 DataFrame (referenced by name in the FROM clause):
# `rid` is realProfileID, `val` is the '|'-separated profile attributes.
con.execute("drop table if exists src1")
con.execute(
    "CREATE TABLE src1 AS "
    "SELECT realProfileID as rid, concat (date_of_birth, '|', surname, '|', address_1, '|', street_number, '|', postcode, '|',soc_sec_id, '|',"
    "suburb, '|',phone_number, '|', state, '|', given_name, '|',age, '|',address_2 ) as val "
    "FROM df10"
)
con.execute("select * from src1").fetchall()

[(0,
  '19390609.0|bishop|daley crescent|41.0|6050|4676841|batlow|08 29028996|qld|molly|31.0|'),
 (1,
  '19041109.0|aidon|nambucca street|7.0|2002|3414163|devonort|08 75629459|vkf|whkt||'),
 (2,
  '19910711.0|anns||12.0|2287|7844876|ivanhoe|02 11684110|vic|andrew|31.0|'),
 (3,
  '19390709.0|whitrlsy|robson street|34.0|4065|2418360|christie downs|04 00323207|qld|shsne||'),
 (4,
  '19340328.0|roche|rankin street|9.0|3644|7577436|frenchs forest|08 53227250|qld|sophie|30.0|'),
 (5,
  '19320811.0|fullgrabe|beeston street|29.0|3131|6494586|broken hill|04 80080021|nsw|emma|29.0|'),
 (6,
  '19601013.0|lodge|mason street|48.0|5254|6098877|orchard hills|08 48143359|vic|rourke|32.0|'),
 (7,
  '19340921.0|coleman|edman close|116.0|2397|8858237|castlecrag|03 58778382||harry|26.0|'),
 (8,
  '19671108.0|bishop|cromwell circuit|12.0|2226|1718686|harris park|04 01707833|vic|jamie|28.0|homestead caravan park'),
 (9,
  '19331228.0|grcn|lutana street|11.0|2074|8364236|newstead|02 57700508|ws|lauren||'),
 

In [30]:
# Load the 50K profiles file (one JSON record per line) into a DataFrame.
df50 = pd.read_json(
    "data/50Kprofiles.json",
    orient='records',
    lines=True,
    typ='frame',
)

In [31]:
# Rebuild src2 from the df50 DataFrame (referenced by name in the FROM clause):
# `rid` is realProfileID, `val` is the '|'-separated profile attributes.
con.execute("drop table if exists src2")
con.execute(
    "CREATE TABLE src2 AS "
    "SELECT realProfileID as rid, concat (date_of_birth, '|', surname, '|', address_1, '|', street_number, '|', postcode, '|',soc_sec_id, '|',"
    "suburb, '|',phone_number, '|', state, '|', given_name, '|',age, '|',address_2 ) as val "
    "FROM df50"
)
con.execute("select * from src2").fetchall()

[(0,
  '19551130.0|reid|jerrabomberra avenue|191.0|2115|7907036|tylden|02 57675204|nsw|nicholas||gold tyne'),
 (1,
  '19821020.0|porra|meyers place|2.0|3130|2705215|kingaroy|03 64681506|wa|elle|34.0|'),
 (2,
  '19351010.0|pochec|duterrau crescent|46.0|2785|2350343|lalor park|03 93693261|vic|mitchell|29.0|'),
 (3,
  '|beaumaris|ellenborough street|11.0|2285|6735143|innaloo|04 02935530|qld|marianne|24.0|'),
 (4, '|ransin|ahernuplace|2.0|4496|6721672|armiake||qld|christian|38.0|'),
 (5,
  '19740410.0|forsbaw|norman fisher circuit|5.0|2707|7456433|bayview hdeights||qph|lesa|29.0|warrawong'),
 (6,
  '19960201.0|nguyen|hansen circuit|44.0|5244|7748525||04 10399694|nsw|caitlin|29.0|'),
 (7,
  '19607722.0|eichimnrger|stockdale street|11.0|4224|8900629|cherrbqrook|04 99393660|qld|isabrlla|36.0|st john of godzhospital'),
 (8,
  '19280419.0|hawes|giles street|3.0|2197|4044861|brandon|04 54963323|vic|michael|28.0|'),
 (9,
  '19010611.0|palecek|goldner circuit|31.0|6102|8225306|latrobe|07 75127812|

In [35]:
# Load the 100K profiles file (one JSON record per line) into a DataFrame.
df100 = pd.read_json(
    "data/100Kprofiles.json",
    orient='records',
    lines=True,
    typ='frame',
)

In [36]:
# Rebuild src3 from the df100 DataFrame (referenced by name in the FROM clause):
# `rid` is realProfileID, `val` is the '|'-separated profile attributes.
con.execute("drop table if exists src3")
con.execute(
    "CREATE TABLE src3 AS "
    "SELECT realProfileID as rid, concat (date_of_birth, '|', surname, '|', address_1, '|', street_number, '|', postcode, '|',soc_sec_id, '|',"
    "suburb, '|',phone_number, '|', state, '|', given_name, '|',age, '|',address_2 ) as val "
    "FROM df100"
)
con.execute("select * from src3").fetchall()

[(0,
  '19190809.0|chandler|pudney street|376.0|3764|3423802|hemmant|02 24496575|qld|luke|35.0|arlington'),
 (1,
  '19300528.0|hawes||8.0|2168|2180735|keilor east|04 87308830|qld|millane|36.0|northern tablelands tennis academy'),
 (2,
  '19300112.0|novadk|rfreet rankin|1.0|3218|4272019|beulah park|03 39054375|qld|jatmike|26.0|'),
 (3,
  '19950727.0|redmond|davenport street|12.0|3030|5381275|kangaroo flat|07 69720733|qld|madeleine|33.0|vinery'),
 (4,
  '19631123.0|zimmermann|corona place|7.0|4030|2894006|greenacres|03 48393238|nsw|kayden|31.0|glen elgin'),
 (5,
  '19350925.0|canbkl|macfarland crescent|61.0|3164|5282937|chishmolm|02 01482240|nsw|sammy|72.0|'),
 (6,
  '19650417.0|psorakis|owen crescent|77.0|2452|2052783|mannum|07 48547017|wa|laura||'),
 (7,
  '19230205.0|agius|arthur circle|2.0|6125|8456442|marsden|02 54492364|qld|indiana|32.0|'),
 (8,
  '19830325.0|hegger|arabana street|2.0|4178|9175025|wellington point|04 21430300|qld|jaykob||'),
 (9, '19930213.0|weetfa||6.0|3106|541836

In [24]:
# Join arguments for the Profiles test case.
l_key_attr = 'rid'
r_key_attr = 'rid'
l_join_attr = 'val'
r_join_attr = 'val'
# Raw string: "\|" is not a valid Python escape sequence and triggers a
# warning in a normal string literal. The value passed on is unchanged
# (quote, backslash, pipe, quote).
tokenizer = tokenizers.WordsTokzr(r"'\|'")
# tokenizer = tokenizers.QGramsTokzr(3)
threshold = 0.5
out_table_name = 'matches'
# Show the SQL the tokenizer generates.
tokenizer.query()

"select src, rid, len(tks) as rlen, lower(unnest(tks)) as token from ( select src, rid, list_distinct(list_filter(str_split_regex(val, '\\|'), x -> trim(x) != '')) as tks from input ) "

In [25]:
# Draw a 2000-row sample of src1 with a fixed seed so that re-running the
# notebook benchmarks the same rows.
# NOTE(review): DuckDB may only guarantee a repeatable sample when running
# single-threaded — confirm against the SAMPLE documentation for the installed version.
con.execute("drop table if exists src1_sample").execute(
    "CREATE TABLE src1_sample AS "
    "SELECT * "
    "FROM src1 "
    "using sample reservoir(2000 rows) repeatable (42)"
).execute("select * from src1_sample").fetchall()

[(2513,
  '19881013.0|leslie|doyle terrace||3340|1183674|yarloop|04 72304060|nsw|hunter|23.0|'),
 (8122,
  '19530222.0|hingston|crisp circuit|9.0|4170|3996760|cressy|02 59932276|nsw|lachlan|28.0|'),
 (5038,
  '19690119.0|mcgreror|leahy close|7.0|2430|8229328|frenche forest|03 17036920|nsw|howly||'),
 (3,
  '19390709.0|whitrlsy|robson street|34.0|4065|2418360|christie downs|04 00323207|qld|shsne||'),
 (4,
  '19340328.0|roche|rankin street|9.0|3644|7577436|frenchs forest|08 53227250|qld|sophie|30.0|'),
 (8860, '|ctosty mac||42.0|5031|9650428|craigyr|08 65206978|nseq|ownd||'),
 (5754,
  '19740714.0|hillary|cracknell street|21.0|6173|5747986|fairlight|07 62485658||levi|33.0|'),
 (7,
  '19340921.0|coleman|edman close|116.0|2397|8858237|castlecrag|03 58778382||harry|26.0|'),
 (3514,
  '19740102.0|braithwaite|palmer street||4221|5585216|scarness|04 26147307|vic|alex|33.0|'),
 (9124,
  '19911130.0|ludzioweit|whiteman street|12.0|7303|1669170|yarraville|02 75406204|nsw|rachel|33.0|glenhurst'),


In [26]:
# Join the 2000-row sample with an empty right table name, single run (default n).
# NOTE(review): '' presumably requests a self-join — confirm against jaccard_join.
test(
    con=con,
    l_table='src1_sample',
    r_table='',
    l_key_attr=l_key_attr,
    r_key_attr=r_key_attr,
    l_join_attr=l_join_attr,
    r_join_attr=r_join_attr,
    tokenizer=tokenizer,
    threshold=threshold,
    out_table_name=out_table_name,
)

SUCCESS! Filtered join and Brute force join returned the same result
Average execution time for filtered join: 0.08862972259521484
Average execution time for brute force join: 0.18835902214050293


In [27]:
# Join the full src1 table with an empty right table name, single run (default n).
# NOTE(review): '' presumably requests a self-join — confirm against jaccard_join.
test(
    con=con,
    l_table='src1',
    r_table='',
    l_key_attr=l_key_attr,
    r_key_attr=r_key_attr,
    l_join_attr=l_join_attr,
    r_join_attr=r_join_attr,
    tokenizer=tokenizer,
    threshold=threshold,
    out_table_name=out_table_name,
)

SUCCESS! Filtered join and Brute force join returned the same result
Average execution time for filtered join: 0.29428815841674805
Average execution time for brute force join: 4.044426202774048


In [34]:
# Filtered join only (no brute-force comparison) on src2.
jaccard_join(
    con,
    'src2',
    '',
    l_key_attr,
    r_key_attr,
    l_join_attr,
    r_join_attr,
    tokenizer,
    threshold,
    out_table_name,
)

<duckdb.DuckDBPyConnection at 0x1d3087018f0>

In [37]:
# Filtered join only (no brute-force comparison) on src3.
jaccard_join(
    con,
    'src3',
    '',
    l_key_attr,
    r_key_attr,
    l_join_attr,
    r_join_attr,
    tokenizer,
    threshold,
    out_table_name,
)

<duckdb.DuckDBPyConnection at 0x1d3087018f0>

In [39]:
# Fetch every row currently stored under out_table_name.
con.execute(
    f"select * from {out_table_name}"
).fetchall()

[(51645, 65302),
 (3951, 64720),
 (40676, 66765),
 (45316, 66244),
 (67276, 68039),
 (19498, 68350),
 (52816, 67681),
 (44417, 68436),
 (9800, 71697),
 (8489, 74287),
 (58061, 75084),
 (52672, 74084),
 (26007, 75042),
 (74157, 74365),
 (73592, 75848),
 (9964, 76044),
 (3991, 77427),
 (43373, 77759),
 (44455, 80083),
 (68104, 79671),
 (40959, 79100),
 (59648, 81882),
 (54678, 80396),
 (38004, 60716),
 (27767, 63096),
 (38895, 82106),
 (13673, 82897),
 (46374, 83015),
 (77026, 81977),
 (76217, 84248),
 (34552, 85338),
 (7837, 85381),
 (72003, 86355),
 (49574, 85016),
 (6572, 85841),
 (36750, 85187),
 (57422, 86072),
 (38861, 85476),
 (32024, 86634),
 (6017, 86680),
 (84594, 88726),
 (12002, 88789),
 (42221, 83064),
 (24536, 90618),
 (65383, 90370),
 (57122, 90635),
 (62872, 91425),
 (79887, 88461),
 (36433, 90635),
 (26701, 91913),
 (54091, 65695),
 (68942, 79671),
 (19354, 79740),
 (74640, 79019),
 (73617, 82089),
 (68801, 82262),
 (64789, 82770),
 (39698, 89371),
 (34860, 92075),
 (564