In [188]:
!pip install duckdb



In [189]:
import duckdb

In [190]:
# Enable autoreload in mode 2 so that edits to imported modules (e.g. the local
# py_duckdb package) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [191]:
from py_duckdb.similarity_join import tokenizers

In [192]:
# In-memory DuckDB database: the connection used by every query in this notebook.
con = duckdb.connect(database=':memory:')

In [193]:
# q-gram length; the tokenizer cell reassigns it from tokenizer.get_info()
# when a QGramsTokzr is selected.
q = 3
# Similarity threshold interpolated into the join and filter conditions of the
# SQL views created below.
t = 0.5

## Self join

In [194]:
# The later cells append with INSERT, so `input` has to be a real table rather
# than a view. Each record becomes (rid, val) with val = "given surname dob".
con.execute("drop table if exists input")
con.execute(
    "CREATE TABLE input AS"
    " SELECT id as rid, concat(given_name, ' ', surname, ' ', date_of_birth) as val"
    " FROM 'data/S1_clean_.csv'"
)

<duckdb.DuckDBPyConnection at 0x1a681a5d970>

In [195]:
# Append the records of the second source file, building the value string the
# same way as for the first file.
con.execute(
    "insert into input"
    " select id, concat(given_name, ' ', surname, ' ', date_of_birth)"
    " from 'data/S2_clean_.csv'"
)

<duckdb.DuckDBPyConnection at 0x1a681a5d970>

In [196]:
# Append the records of the third source file, building the value string the
# same way as for the first file.
con.execute(
    "insert into input"
    " select id, concat(given_name, ' ', surname, ' ', date_of_birth)"
    " from 'data/S3_clean_.csv'"
)

<duckdb.DuckDBPyConnection at 0x1a681a5d970>

In [197]:
# Inspect the combined input table: one (rid, val) row per loaded record.
con.execute("select * from input").fetchall()

[('S1_0', 'joshua morrison 19101123'),
 ('S1_1', 'jordan white 19371126'),
 ('S1_2', 'emmerson lock 19211129'),
 ('S1_3', 'alexandra grosser 19720305'),
 ('S1_4', 'michael wuchatsch 19190110'),
 ('S1_5', 'emmerson loyck 19211129'),
 ('S1_6', 'rhys schuetz 19440909'),
 ('S1_7', 'joshua greenj 19790110'),
 ('S1_8', 'olivia hobson 19760812'),
 ('S1_9', 'michael lierach 19360816'),
 ('S1_10', 'elisabett domiten 19081008'),
 ('S1_11', 'genoveffa hylander 19071008'),
 ('S2_0', 'braecon schuetz 19440909'),
 ('S2_1', 'alexandra grosvenor 19930305'),
 ('S2_2', 'michael liersch 19360816'),
 ('S2_3', 'emmeron loyk 19321129'),
 ('S2_4', 'olivia hobson 19760812'),
 ('S2_5', 'joshua green 19010219'),
 ('S2_6', 'charlotte hyland 19340909'),
 ('S2_7', 'elisabet domitienn 19071008'),
 ('S3_0', 'emmerson loyck 19211129'),
 ('S3_1', 'michel wuchatsch 19190110'),
 ('S3_3', 'liersch michael 19360816'),
 ('S3_4', 'charlotte hyland 19460401'),
 ('S3_5', 'braedon schuetz 19440909'),
 ('S3_6', 'olivia hobson 1

In [198]:
# Pick the tokenizer; swap the two lines below to switch between word tokens
# and q-grams.
tokenizer = tokenizers.WordsTokzr()
#tokenizer = tokenizers.QGramsTokzr(3)
tokzr_query = ''

# Source relation and the columns holding the record key and the string to join on.
table = "input"
key_attr = "rid"
join_attr = "val"

# Build the SQL that turns every record of `table` into (key, rlen, token) rows.
if isinstance(tokenizer, tokenizers.WordsTokzr):
    # get_info() supplies the separator pattern; it is interpolated as-is, so it
    # must already be a quoted SQL string literal (see the printed query).
    separators = tokenizer.get_info()
    # Split the value on the separators, then unnest the list into one
    # lower-cased token per row; rlen is the length of the split list.
    tokzr_query = (f"select {key_attr}, len(tks) as rlen, lower(unnest(tks)) as token "
                   "from ( "
                   f"select distinct {key_attr}, str_split_regex({join_attr}, {separators}) as tks "
                   f"from {table} "
                   ") ")
elif isinstance(tokenizer, tokenizers.QGramsTokzr):
    # get_info() supplies the q-gram length (this overwrites the notebook-level q).
    q = tokenizer.get_info()
    # Pad the lower-cased value with q-1 '#' characters on both sides and take
    # the substring of length q at every start position x in 1..rlen,
    # where rlen = len(value) + q - 1.
    tokzr_query = (f"select distinct {key_attr}, rlen "
                   f", substring(concat(repeat('#', {q} - 1), "
                   f"lower({join_attr}), "
                   f"repeat('#',{q} - 1)),"
                   f"x, {q}) as token "
                   "from ("
                   f"select *, len({join_attr}) + {q} - 1 as rlen, unnest(generate_series(1, rlen)) as x "
                   f"from {table} "
                   ")")
else:
    # Fail here with a clear message instead of letting the next cell run
    # "create view tokens as " with an empty query.
    raise TypeError(f"unsupported tokenizer type: {type(tokenizer).__name__}")

tokzr_query

'select rid, len(tks) as rlen, lower(unnest(tks)) as token from ( select distinct rid, str_split_regex(val, \'[!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c]\') as tks from input ) '

In [199]:
# (Re)create the tokens view from the tokenizer query built above.
con.execute("drop view if exists tokens")
con.execute(f"create view tokens as {tokzr_query}")

<duckdb.DuckDBPyConnection at 0x1a681a5d970>

In [200]:
# Inspect the tokens view: one (rid, rlen, token) row per token of each record.
con.execute("select * from tokens").fetchall()

[('S1_0', 3, 'joshua'),
 ('S1_0', 3, 'morrison'),
 ('S1_0', 3, '19101123'),
 ('S1_1', 3, 'jordan'),
 ('S1_1', 3, 'white'),
 ('S1_1', 3, '19371126'),
 ('S1_2', 3, 'emmerson'),
 ('S1_2', 3, 'lock'),
 ('S1_2', 3, '19211129'),
 ('S1_3', 3, 'alexandra'),
 ('S1_3', 3, 'grosser'),
 ('S1_3', 3, '19720305'),
 ('S1_4', 3, 'michael'),
 ('S1_4', 3, 'wuchatsch'),
 ('S1_4', 3, '19190110'),
 ('S1_5', 3, 'emmerson'),
 ('S1_5', 3, 'loyck'),
 ('S1_5', 3, '19211129'),
 ('S1_6', 3, 'rhys'),
 ('S1_6', 3, 'schuetz'),
 ('S1_6', 3, '19440909'),
 ('S1_7', 3, 'joshua'),
 ('S1_7', 3, 'greenj'),
 ('S1_7', 3, '19790110'),
 ('S1_8', 3, 'olivia'),
 ('S1_8', 3, 'hobson'),
 ('S1_8', 3, '19760812'),
 ('S1_9', 3, 'michael'),
 ('S1_9', 3, 'lierach'),
 ('S1_9', 3, '19360816'),
 ('S1_10', 3, 'elisabett'),
 ('S1_10', 3, 'domiten'),
 ('S1_10', 3, '19081008'),
 ('S1_11', 3, 'genoveffa'),
 ('S1_11', 3, 'hylander'),
 ('S1_11', 3, '19071008'),
 ('S2_0', 3, 'braecon'),
 ('S2_0', 3, 'schuetz'),
 ('S2_0', 3, '19440909'),
 ('S2_1', 

In [201]:
# Gold standard: brute-force self join over all token pairs. For every pair of
# records with rid1 < rid2 it counts the shared tokens and keeps the pair when
# overlap >= ceil(t / (1 + t) * (rlen1 + rlen2)); for records with distinct
# tokens this is the overlap bound equivalent to Jaccard similarity >= t.
con.execute("drop view if exists bfjoin")
con.execute(
    "create view bfjoin as"
    " select r1.rid as rid1, r2.rid as rid2, count(*) as overlap"
    " from tokens as r1, tokens as r2"
    " where r1.token = r2.token"
    " and r1.rid < r2.rid"
    " group by r1.rid, r1.rlen, r2.rid, r2.rlen"
    f" having count(*) >= ceil({t} / (1+{t}) * (r1.rlen + r2.rlen))"
)

<duckdb.DuckDBPyConnection at 0x1a681a5d970>

In [202]:
# Gold-standard result: (rid1, rid2, overlap) for every pair accepted by the brute-force join.
con.execute("select * from bfjoin").fetchall()

[('S1_0', 'S3_9', 2),
 ('S1_2', 'S3_0', 2),
 ('S1_2', 'S1_5', 2),
 ('S1_4', 'S3_1', 2),
 ('S1_5', 'S3_0', 3),
 ('S1_6', 'S3_5', 2),
 ('S1_6', 'S2_0', 2),
 ('S1_7', 'S3_7', 2),
 ('S1_8', 'S3_6', 3),
 ('S1_8', 'S2_4', 3),
 ('S1_9', 'S3_3', 2),
 ('S1_9', 'S2_2', 2),
 ('S2_0', 'S3_5', 2),
 ('S2_2', 'S3_3', 3),
 ('S2_4', 'S3_6', 3),
 ('S2_5', 'S3_7', 2),
 ('S2_6', 'S3_4', 2)]

## PrefixR

In [203]:
# Count how many rows of the tokens view carry each token. The view is left
# unordered on purpose: the ordering by df is applied where positions are assigned.
con.execute("drop view if exists doc_frequency")
con.execute(
    "CREATE VIEW doc_frequency AS"
    " SELECT token, count(*) AS df"
    " FROM tokens"
    " GROUP BY token "
)

<duckdb.DuckDBPyConnection at 0x1a681a5d970>

In [204]:
# Inspect the per-token counts as (token, df) rows.
con.execute("SELECT * FROM doc_frequency").fetchall()

[('joshua', 5),
 ('morrison', 1),
 ('19101123', 2),
 ('jordan', 1),
 ('white', 1),
 ('19371126', 1),
 ('emmerson', 3),
 ('lock', 1),
 ('19211129', 3),
 ('alexandra', 2),
 ('grosser', 1),
 ('19720305', 1),
 ('michael', 4),
 ('wuchatsch', 2),
 ('19190110', 2),
 ('loyck', 2),
 ('rhys', 1),
 ('schuetz', 3),
 ('19440909', 3),
 ('greenj', 1),
 ('19790110', 2),
 ('olivia', 3),
 ('hobson', 3),
 ('19760812', 3),
 ('lierach', 1),
 ('19360816', 3),
 ('elisabett', 1),
 ('domiten', 1),
 ('19081008', 1),
 ('genoveffa', 1),
 ('hylander', 1),
 ('19071008', 3),
 ('braecon', 1),
 ('grosvenor', 1),
 ('19930305', 1),
 ('liersch', 2),
 ('emmeron', 1),
 ('loyk', 1),
 ('19321129', 1),
 ('green', 2),
 ('19010219', 1),
 ('charlotte', 2),
 ('hyland', 2),
 ('19340909', 1),
 ('elisabet', 1),
 ('domitienn', 1),
 ('michel', 1),
 ('19460401', 1),
 ('braedon', 1),
 ('keely', 1),
 ('clarke', 1),
 ('19050410', 1),
 ('morriosn', 1),
 ('genovefa', 1),
 ('hyllande', 1)]

In [205]:
# Attach the token frequency to every token row and number the tokens of each
# record by ascending (df, token): pos = 1 is the rarest token of the record.
con.execute("drop view if exists R")
con.execute(
    "CREATE VIEW R AS ("
    "select rid, rlen, token , row_number() OVER (PARTITION BY rid ORDER BY df, token) as pos "
    "from (SELECT tt.*, df.df FROM tokens AS tt JOIN doc_frequency AS df ON tt.token = df.token )"
    ")"
)

<duckdb.DuckDBPyConnection at 0x1a681a5d970>

In [206]:
# Inspect R: the row count followed by the (rid, rlen, token, pos) rows.
x = con.execute("select * from R").fetchall()
len(x), x

(90,
 [('S1_0', 3, 'morrison', 1),
  ('S1_0', 3, '19101123', 2),
  ('S1_0', 3, 'joshua', 3),
  ('S1_1', 3, '19371126', 1),
  ('S1_1', 3, 'jordan', 2),
  ('S1_1', 3, 'white', 3),
  ('S1_10', 3, '19081008', 1),
  ('S1_10', 3, 'domiten', 2),
  ('S1_10', 3, 'elisabett', 3),
  ('S1_11', 3, 'genoveffa', 1),
  ('S1_11', 3, 'hylander', 2),
  ('S1_11', 3, '19071008', 3),
  ('S1_2', 3, 'lock', 1),
  ('S1_2', 3, '19211129', 2),
  ('S1_2', 3, 'emmerson', 3),
  ('S1_3', 3, '19720305', 1),
  ('S1_3', 3, 'grosser', 2),
  ('S1_3', 3, 'alexandra', 3),
  ('S1_4', 3, '19190110', 1),
  ('S1_4', 3, 'wuchatsch', 2),
  ('S1_4', 3, 'michael', 3),
  ('S1_5', 3, 'loyck', 1),
  ('S1_5', 3, '19211129', 2),
  ('S1_5', 3, 'emmerson', 3),
  ('S1_6', 3, 'rhys', 1),
  ('S1_6', 3, '19440909', 2),
  ('S1_6', 3, 'schuetz', 3),
  ('S1_7', 3, 'greenj', 1),
  ('S1_7', 3, '19790110', 2),
  ('S1_7', 3, 'joshua', 3),
  ('S1_8', 3, '19760812', 1),
  ('S1_8', 3, 'hobson', 2),
  ('S1_8', 3, 'olivia', 3),
  ('S1_9', 3, 'lierach', 

In [207]:
# Keep only the prefix of each record: the tokens whose position satisfies
# rlen - pos + 1 >= ceil(rlen * t), i.e. pos <= rlen - ceil(rlen * t) + 1.
con.execute("drop view if exists PrefixR")
con.execute(
    "create view PrefixR as"
    " select rid, rlen, token, pos"
    " from R"
    f" where rlen - pos + 1 >= ceil(rlen * {t}) "
)

<duckdb.DuckDBPyConnection at 0x1a681a5d970>

In [208]:
# Inspect PrefixR: the row count followed by the (rid, rlen, token, pos) rows.
x = con.execute("select * from PrefixR").fetchall()
len(x), x

(60,
 [('S1_0', 3, 'morrison', 1),
  ('S1_0', 3, '19101123', 2),
  ('S1_1', 3, '19371126', 1),
  ('S1_1', 3, 'jordan', 2),
  ('S1_10', 3, '19081008', 1),
  ('S1_10', 3, 'domiten', 2),
  ('S1_11', 3, 'genoveffa', 1),
  ('S1_11', 3, 'hylander', 2),
  ('S1_2', 3, 'lock', 1),
  ('S1_2', 3, '19211129', 2),
  ('S1_3', 3, '19720305', 1),
  ('S1_3', 3, 'grosser', 2),
  ('S1_4', 3, '19190110', 1),
  ('S1_4', 3, 'wuchatsch', 2),
  ('S1_5', 3, 'loyck', 1),
  ('S1_5', 3, '19211129', 2),
  ('S1_6', 3, 'rhys', 1),
  ('S1_6', 3, '19440909', 2),
  ('S1_7', 3, 'greenj', 1),
  ('S1_7', 3, '19790110', 2),
  ('S1_8', 3, '19760812', 1),
  ('S1_8', 3, 'hobson', 2),
  ('S1_9', 3, 'lierach', 1),
  ('S1_9', 3, '19360816', 2),
  ('S2_0', 3, 'braecon', 1),
  ('S2_0', 3, '19440909', 2),
  ('S2_1', 3, '19930305', 1),
  ('S2_1', 3, 'grosvenor', 2),
  ('S2_2', 3, 'liersch', 1),
  ('S2_2', 3, '19360816', 2),
  ('S2_3', 3, '19321129', 1),
  ('S2_3', 3, 'emmeron', 2),
  ('S2_4', 3, '19760812', 1),
  ('S2_4', 3, 'hobson

In [209]:
# Candidate generation: join the prefixes of all record pairs (rid1 < rid2) on
# equal tokens. Per pair the view keeps the highest matching position on each
# side (maxPos1, maxPos2) and the number of matching prefix tokens that passed
# the filters (prOverlap); the verification step continues from those positions.
# NOTE(review): the length and prefix filters constrain R1 only, while R1 is
# chosen by rid order and not by record length - confirm this is intended.
con.execute("drop view if exists candset").execute(
    "CREATE VIEW candset AS ("
    "SELECT R1.rid AS rid1, R2.rid AS rid2 "
    ", MAX(R1.pos) as maxPos1, MAX(R2.pos) as maxPos2, count(*) as prOverlap "
    "FROM PrefixR R1, PrefixR R2 "
    "WHERE R1.rid < R2.rid "
    "AND R1.token = R2.token "
    # length filter (trailing space added so the next clause is not fused onto ")")
    f"AND R1.rlen >= ceil({t} * R2.rlen) "
    # prefix filter
    f"AND R1.rlen - R1.pos + 1 >= CEIL(R1.rlen * 2 * {t} / (1+{t})) "
    # positional filter: the tokens remaining from the matching position onward
    # must still be able to reach the required overlap
    "AND LEAST((R1.rlen - R1.pos + 1), (R2.rlen - R2.pos + 1)) >= "
    f"CEIL((R1.rlen + R2.rlen) * {t} / (1 + {t})) "
    "GROUP BY R1.rid, R2.rid "
    ")"
)

<duckdb.DuckDBPyConnection at 0x1a681a5d970>

In [210]:
# Inspect the candidate pairs as (rid1, rid2, maxPos1, maxPos2, prOverlap) rows.
con.execute("select * from candset").fetchall()

[('S1_0', 'S3_9', 2, 2, 1),
 ('S1_2', 'S3_0', 2, 2, 1),
 ('S1_4', 'S3_1', 1, 2, 1),
 ('S1_5', 'S3_0', 2, 2, 2),
 ('S1_6', 'S3_5', 2, 2, 1),
 ('S1_7', 'S3_7', 2, 1, 1),
 ('S1_8', 'S3_6', 2, 2, 2),
 ('S1_9', 'S3_3', 2, 2, 1),
 ('S2_0', 'S3_5', 2, 2, 1),
 ('S2_2', 'S3_3', 2, 2, 2),
 ('S2_4', 'S3_6', 2, 2, 2),
 ('S2_5', 'S3_7', 2, 2, 1),
 ('S2_6', 'S3_4', 2, 2, 1),
 ('S1_2', 'S1_5', 2, 2, 1),
 ('S1_6', 'S2_0', 2, 2, 1),
 ('S1_8', 'S2_4', 2, 2, 2),
 ('S1_9', 'S2_2', 2, 2, 1)]

In [211]:
# Verification: for every candidate pair, add the number of common tokens that
# lie beyond the last matching prefix position on both sides to prOverlap, and
# keep the pair when the total reaches t / (1 + t) * (rlen1 + rlen2).
#
# The suffix condition is evaluated inside a conditional count rather than in
# WHERE. With the condition in WHERE, a candidate with no common token beyond
# (maxPos1, maxPos2) produced no rows at all and was dropped, even when
# prOverlap alone already met the threshold. Every candidate shares at least
# one token in R, so each candidate now always forms a group.
con.execute("drop view if exists matches").execute(
    "create view matches as "
    "select r1.rid as rid1, r2.rid as rid2 "
    "from R r1, R r2, candset c "
    "where c.rid1 = r1.rid "
    "and c.rid2 = r2.rid "
    "and r1.token = r2.token "
    "group by r1.rid, r2.rid, r1.rlen, r2.rlen, prOverlap "
    "having sum(case when r1.pos > c.maxPos1 and r2.pos > c.maxPos2 then 1 else 0 end) "
    f"+ prOverlap >= (r1.rlen + r2.rlen) * {t} / (1+{t})"
)

<duckdb.DuckDBPyConnection at 0x1a681a5d970>

In [212]:
# Inspect the verified matches as (rid1, rid2) pairs.
con.execute("select * from matches").fetchall()

[('S1_0', 'S3_9'),
 ('S1_2', 'S3_0'),
 ('S1_4', 'S3_1'),
 ('S1_5', 'S3_0'),
 ('S1_6', 'S3_5'),
 ('S1_7', 'S3_7'),
 ('S1_8', 'S3_6'),
 ('S1_9', 'S3_3'),
 ('S2_0', 'S3_5'),
 ('S2_2', 'S3_3'),
 ('S2_4', 'S3_6'),
 ('S2_5', 'S3_7'),
 ('S2_6', 'S3_4'),
 ('S1_2', 'S1_5'),
 ('S1_6', 'S2_0'),
 ('S1_8', 'S2_4'),
 ('S1_9', 'S2_2')]

In [213]:
# Resolve both sides of every match back to the original strings so the pairs
# can be checked by eye.
con.execute(
    "select i1.val, i2.val"
    # uncomment to also return the record ids
    # " , i1.rid, i2.rid"
    " from matches m"
    " join input i1 on i1.rid = m.rid1"
    " join input i2 on i2.rid = m.rid2"
).fetchall()

[('joshua morrison 19101123', 'joshua morriosn 19101123'),
 ('emmerson lock 19211129', 'emmerson loyck 19211129'),
 ('michael wuchatsch 19190110', 'michel wuchatsch 19190110'),
 ('emmerson loyck 19211129', 'emmerson loyck 19211129'),
 ('rhys schuetz 19440909', 'braedon schuetz 19440909'),
 ('joshua greenj 19790110', 'joshua green 19790110'),
 ('olivia hobson 19760812', 'olivia hobson 19760812'),
 ('michael lierach 19360816', 'liersch michael 19360816'),
 ('braecon schuetz 19440909', 'braedon schuetz 19440909'),
 ('michael liersch 19360816', 'liersch michael 19360816'),
 ('olivia hobson 19760812', 'olivia hobson 19760812'),
 ('joshua green 19010219', 'joshua green 19790110'),
 ('charlotte hyland 19340909', 'charlotte hyland 19460401'),
 ('emmerson lock 19211129', 'emmerson loyck 19211129'),
 ('rhys schuetz 19440909', 'braecon schuetz 19440909'),
 ('olivia hobson 19760812', 'olivia hobson 19760812'),
 ('michael lierach 19360816', 'michael liersch 19360816')]

# Debug

In [214]:
# Compare the prefix-filter result with the brute-force gold standard. The full
# outer join keeps unmatched rows from both sides, so the filter checks both:
#   m.rid1 is null -> pair found by bfjoin but missing from matches
#   b.rid1 is null -> pair in matches that bfjoin does not contain
# An empty result means the two views hold exactly the same pairs.
con.execute(
    "select * "
    "from matches m "
    "full outer join bfjoin b on b.rid1 = m.rid1 and b.rid2 = m.rid2 "
    "where m.rid1 is null "
    "or b.rid1 is null "
).fetchall()

[]