In [1]:
!pip install duckdb



In [2]:
import duckdb
import pandas as pd

In [3]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to local packages are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
from py_duckdb.similarity_join import tokenizers
from py_duckdb.similarity_join import jaccard_join
from py_duckdb.similarity_join import jaccard_join_brute_force
from py_duckdb.similarity_join.default_names import *

In [9]:
# Single in-memory DuckDB connection shared by every cell below.
_con = duckdb.connect(":memory:")

In [10]:
# Join configuration: the left and right sides use the same key and text columns.
l_key_attr = r_key_attr = 'rid'
l_join_attr = r_join_attr = 'val'

# Word tokenizer splitting on a single space. Alternative tokenizer to try:
#   tokenizer = tokenizers.QGramsTokzr(3)
tokenizer = tokenizers.WordsTokzr("' '")

# Threshold handed to the join functions and name of the table read back afterwards.
threshold = 0.2
out_table_name = 'matches'

# Display the SQL generated by the tokenizer.
tokenizer.query()

"select src, rid, len(tks) as rlen, lower(unnest(tks)) as token from ( select src, rid, list_distinct(list_filter(str_split_regex(val, ' '), x -> trim(x) != '')) as tks from input ) "

In [11]:
# Load the line-delimited JSON sample into a DataFrame and preview the first rows.
# The name `df` is referenced from SQL in a later cell, so it must stay as is.
df = pd.read_json(
    "data/test.json",
    orient='records',
    lines=True,
    typ='frame',
)
df.head()

Unnamed: 0,address_1,date_of_birth,address_2,postcode,soc_sec_id,given_name,realProfileID,surname,street_number,suburb,phone_number,state,age
0,collings street,19910319,orange hill,3995,9244976,sophie,6880,campbell,3,slacks creek,02 26906292,vic,10
1,giles street,19920522,,5540,7083881,jasmine,9221,fitzpatrick,10,cecil hills,02 26372297,vic,3


In [12]:
# Argument list for SQL concat(): the profile columns, in this order,
# interleaved with a single-space literal.
to_concat = ", ' ', ".join(
    (
        "date_of_birth",
        "surname",
        "address_1",
        "street_number",
        "postcode",
        "soc_sec_id",
        "suburb",
        "phone_number",
        "state",
        "given_name",
        "age",
        "address_2",
    )
)
to_concat

"date_of_birth, ' ', surname, ' ', address_1, ' ', street_number, ' ', postcode, ' ', soc_sec_id, ' ', suburb, ' ', phone_number, ' ', state, ' ', given_name, ' ', age, ' ', address_2"

In [13]:
# Rebuild the `data` table: one row per profile, keyed by realProfileID, with all
# selected columns concatenated into a single text value `val`.
# `df` in the FROM clause is the pandas DataFrame loaded above.
_con.execute("drop table if exists data")
_con.execute(
    "CREATE TABLE data AS "
    f"SELECT realProfileID as rid, concat ({to_concat}) as val "
    "FROM df"
)

# Show the table contents.
_con.execute("select * from data").fetchall()

[(6880,
  '19910319 campbell collings street 3 3995 9244976 slacks creek 02 26906292 vic sophie 10 orange hill'),
 (9221,
  '19920522 fitzpatrick giles street 10 5540 7083881 cecil hills 02 26372297 vic jasmine 3 ')]

In [14]:
# Run the library join on `data`, then read back the table named by out_table_name
# (presumably where jaccard_join stores its matches — verify against the library).
jaccard_join(
    _con,
    'data',
    '',
    l_key_attr,
    r_key_attr,
    l_join_attr,
    r_join_attr,
    tokenizer,
    threshold,
    out_table_name,
).execute(f"select * from {out_table_name}").fetchall()

[(6880, 9221)]

In [15]:
# Same inputs as the jaccard_join call, but through the brute-force variant, so the
# two results can be compared.
jaccard_join_brute_force(
    _con,
    'data',
    '',
    l_key_attr,
    r_key_attr,
    l_join_attr,
    r_join_attr,
    tokenizer,
    threshold,
    out_table_name,
).execute(f"select * from {out_table_name}").fetchall()

[]

In [17]:
# Source table used by the step-by-step cells that follow.
l_table = "data"

In [18]:
# Rebuild the input table as (src, rid, val): the source table name as a constant
# column, plus the key and text columns taken from l_table.
_con.execute(f"drop table if exists {INPUT_TABLE}")
_con.execute(
    f"create table {INPUT_TABLE} as ("
    f"select '{l_table}' as src, {l_key_attr} as rid, {l_join_attr} as val "
    f"from '{l_table}' )"
)

# Show the result.
_con.execute(f"select * from {INPUT_TABLE}").fetchall()

[('data',
  6880,
  '19910319 campbell collings street 3 3995 9244976 slacks creek 02 26906292 vic sophie 10 orange hill'),
 ('data',
  9221,
  '19920522 fitzpatrick giles street 10 5540 7083881 cecil hills 02 26372297 vic jasmine 3 ')]

In [19]:
# Materialize the tokenizer's query as a table and display its rows.
_con.execute(f"drop table if exists {TOKENS_VIEW}")
_con.execute(f"create table {TOKENS_VIEW} as " + tokenizer.query())
_con.execute(f"select * from {TOKENS_VIEW}").fetchall()

[('data', 6880, 16, 'street'),
 ('data', 6880, 16, 'collings'),
 ('data', 6880, 16, '19910319'),
 ('data', 6880, 16, 'campbell'),
 ('data', 6880, 16, '3'),
 ('data', 6880, 16, '3995'),
 ('data', 6880, 16, 'slacks'),
 ('data', 6880, 16, '9244976'),
 ('data', 6880, 16, 'creek'),
 ('data', 6880, 16, '02'),
 ('data', 6880, 16, '26906292'),
 ('data', 6880, 16, 'vic'),
 ('data', 6880, 16, 'sophie'),
 ('data', 6880, 16, '10'),
 ('data', 6880, 16, 'orange'),
 ('data', 6880, 16, 'hill'),
 ('data', 9221, 14, 'cecil'),
 ('data', 9221, 14, '19920522'),
 ('data', 9221, 14, '10'),
 ('data', 9221, 14, 'fitzpatrick'),
 ('data', 9221, 14, 'giles'),
 ('data', 9221, 14, '5540'),
 ('data', 9221, 14, 'street'),
 ('data', 9221, 14, '7083881'),
 ('data', 9221, 14, 'hills'),
 ('data', 9221, 14, '02'),
 ('data', 9221, 14, '26372297'),
 ('data', 9221, 14, 'vic'),
 ('data', 9221, 14, 'jasmine'),
 ('data', 9221, 14, '3')]

In [20]:
# Per-token frequency: the number of rows of the tokens table carrying each token.
_con.execute(f"drop table if exists {DOC_FREQ_VIEW}")
_con.execute(
    f"CREATE table {DOC_FREQ_VIEW} AS "
    "SELECT token, count(*) AS df "
    f"FROM {TOKENS_VIEW} "
    "GROUP BY token "
)

# Attach df to every token row and number the tokens of each record (pos) by
# ascending df, with ties broken by the token text.
_con.execute(f"drop table if exists {TOKENS_DOC_FREQ_VIEW}")
_con.execute(
    f"CREATE table {TOKENS_DOC_FREQ_VIEW} AS "
    f"select rid, rlen, {TOKENS_VIEW}.token, df "
    f", row_number() OVER (PARTITION BY rid ORDER BY df, {TOKENS_VIEW}.token) as pos "
    f"from {TOKENS_VIEW}, {DOC_FREQ_VIEW} "
    f"where {TOKENS_VIEW}.token = {DOC_FREQ_VIEW}.token"
)

<duckdb.DuckDBPyConnection at 0x249e19d74b0>

In [21]:
# Inspect the ranked tokens; columns are (rid, rlen, token, df, pos).
_con.execute(f"select * from {TOKENS_DOC_FREQ_VIEW}").fetchall()

[(6880, 16, '19910319', 1, 1),
 (6880, 16, '26906292', 1, 2),
 (6880, 16, '3995', 1, 3),
 (6880, 16, '9244976', 1, 4),
 (6880, 16, 'campbell', 1, 5),
 (6880, 16, 'collings', 1, 6),
 (6880, 16, 'creek', 1, 7),
 (6880, 16, 'hill', 1, 8),
 (6880, 16, 'orange', 1, 9),
 (6880, 16, 'slacks', 1, 10),
 (6880, 16, 'sophie', 1, 11),
 (6880, 16, '02', 2, 12),
 (6880, 16, '10', 2, 13),
 (6880, 16, '3', 2, 14),
 (6880, 16, 'street', 2, 15),
 (6880, 16, 'vic', 2, 16),
 (9221, 14, '19920522', 1, 1),
 (9221, 14, '26372297', 1, 2),
 (9221, 14, '5540', 1, 3),
 (9221, 14, '7083881', 1, 4),
 (9221, 14, 'cecil', 1, 5),
 (9221, 14, 'fitzpatrick', 1, 6),
 (9221, 14, 'giles', 1, 7),
 (9221, 14, 'hills', 1, 8),
 (9221, 14, 'jasmine', 1, 9),
 (9221, 14, '02', 2, 10),
 (9221, 14, '10', 2, 11),
 (9221, 14, '3', 2, 12),
 (9221, 14, 'street', 2, 13),
 (9221, 14, 'vic', 2, 14)]

In [22]:
# Short alias for the threshold, interpolated into the SQL of the following cells.
t = threshold

In [23]:
# Prefix selection: keep, for each record, the tokens whose position still leaves
# at least ceil(rlen * t) tokens from that position to the end of the record.
_con.execute(f"drop table if exists {PREFIXES_VIEW}")
_con.execute(
    f"create table {PREFIXES_VIEW} as "
    "select rid, rlen, token, pos, df "
    f"from {TOKENS_DOC_FREQ_VIEW} "
    f"where rlen - pos + 1 >= ceil(rlen * {t}) "
)

# Show the selected prefixes.
_con.execute(f"select * from {PREFIXES_VIEW}").fetchall()

[(6880, 16, '19910319', 1, 1),
 (6880, 16, '26906292', 2, 1),
 (6880, 16, '3995', 3, 1),
 (6880, 16, '9244976', 4, 1),
 (6880, 16, 'campbell', 5, 1),
 (6880, 16, 'collings', 6, 1),
 (6880, 16, 'creek', 7, 1),
 (6880, 16, 'hill', 8, 1),
 (6880, 16, 'orange', 9, 1),
 (6880, 16, 'slacks', 10, 1),
 (6880, 16, 'sophie', 11, 1),
 (6880, 16, '02', 12, 2),
 (6880, 16, '10', 13, 2),
 (9221, 14, '19920522', 1, 1),
 (9221, 14, '26372297', 2, 1),
 (9221, 14, '5540', 3, 1),
 (9221, 14, '7083881', 4, 1),
 (9221, 14, 'cecil', 5, 1),
 (9221, 14, 'fitzpatrick', 6, 1),
 (9221, 14, 'giles', 7, 1),
 (9221, 14, 'hills', 8, 1),
 (9221, 14, 'jasmine', 9, 1),
 (9221, 14, '02', 10, 2),
 (9221, 14, '10', 11, 2),
 (9221, 14, '3', 12, 2)]

In [24]:
# Candidate generation: self-join the prefixes on equal tokens (rid1 < rid2 so each
# pair appears once). For every surviving pair record the highest matching prefix
# position on each side and the number of matching prefix tokens (prOverlap).
_con.execute(f"drop table if exists {CANDIDATE_SET_VIEW}")
_con.execute(
    f"CREATE table {CANDIDATE_SET_VIEW} AS "
    "SELECT pr1.rid AS rid1, pr2.rid AS rid2 "
    ", MAX(pr1.pos) as maxPos1, MAX(pr2.pos) as maxPos2, count(*) as prOverlap "
    f"FROM {PREFIXES_VIEW} pr1, {PREFIXES_VIEW} pr2 "
    "WHERE pr1.rid < pr2.rid "
    "AND pr1.token = pr2.token "
    # length filter
    # (trailing space added: the fragments are concatenated, and without it the SQL
    # read "...pr2.rlen)AND LEAST(...", which parsed only because ")" is a delimiter)
    f"AND pr1.rlen >= ceil({t} * pr2.rlen) "
    # prefix filter (disabled)
    # f"AND pr1.rlen - pr1.pos + 1 >= CEIL(pr1.rlen * 2 * {t} / (1+{t})) "
    # positional filter: the tokens remaining on the shorter side, counted from the
    # matching position, must be able to reach the required overlap
    "AND LEAST((pr1.rlen - pr1.pos + 1), (pr2.rlen - pr2.pos + 1)) >= "
    f"CEIL((pr1.rlen + pr2.rlen) * {t} / (1 + {t})) "
    "GROUP BY pr1.rid, pr2.rid "
)

# Show the candidate pairs.
_con.execute(f"select * from {CANDIDATE_SET_VIEW}").fetchall()

[(6880, 9221, 12, 10, 1)]

In [41]:
# Verification step: for each candidate pair, count the tokens shared from the last
# matching prefix position onward and keep the pairs whose total overlap reaches
# (rlen1 + rlen2) * t / (1 + t).
_con.execute(
    # Start from the last prefix match itself (>= instead of >) so that pairs whose
    # prefixes match entirely but whose suffixes do not match at all still yield a row.
    "select r1.rid as rid1, r2.rid as rid2 "
    # Debug columns (actual overlap, required overlap):
    # f", count(*) + prOverlap - 1, (r1.rlen + r2.rlen) * {t} / (1+{t}) "
    f"from {TOKENS_DOC_FREQ_VIEW} r1, {TOKENS_DOC_FREQ_VIEW} r2, {CANDIDATE_SET_VIEW} c "
    "where c.rid1 = r1.rid "
    "and c.rid2 = r2.rid "
    "and r1.token = r2.token "
    "and r1.pos >= maxPos1 "# "and r1.pos > maxPos1 "
    "and r2.pos >= maxPos2 "# "and r2.pos > maxPos2 "
    "group by r1.rid, r2.rid, r1.rlen, r2.rlen, prOverlap "
    # The last prefix match is counted both by count(*) and by prOverlap, hence the -1.
    f"having count(*) + prOverlap - 1 >= (r1.rlen + r2.rlen) * {t} / (1+{t})"
).fetchall()

[(6880, 9221, 5, 5.0)]

In [42]:
# Brute-force reference: self-join every token row (no prefix or candidate filtering)
# and keep the pairs whose shared-token count reaches (rlen1 + rlen2) * t / (1 + t).
_con.execute(
    "select r1.rid as rid1, r2.rid as rid2 "
    # Debug columns (actual overlap, required overlap):
    # f", count(*) as overlap "
    # f", ceil({t} / (1+{t}) * (r1.rlen + r2.rlen)) "
    f"from {TOKENS_VIEW} as r1, {TOKENS_VIEW} as r2 "
    "where r1.token = r2.token "
    "and r1.rid < r2.rid "
    "group by r1.rid, r1.rlen, r2.rid, r2.rlen "
    # Operand order matters for floating-point rounding; the first form rejected a
    # pair sitting exactly on the bound:
    # f"having count(*) >= ({t} / (1+{t}) * (r1.rlen + r2.rlen))" # -> 5 >= 5.000000001 KO
    f"having count(*) >= ((r1.rlen + r2.rlen) * {t} / (1+{t}))" # -> 5 >= 5.0 OK
).fetchall()

[(6880, 9221, 5)]

### Calcolo "a mano" la similarità delle coppie presenti nel Ground Truth

In [7]:
# Ground-truth duplicate pairs: load the line-delimited JSON, store the two id
# columns as table db10id (rid1, rid2), and report the number of pairs.
# `df10id` in the FROM clause is the pandas DataFrame loaded here.
df10id = pd.read_json(
    "data/10KIdDuplicates.json",
    orient='records',
    lines=True,
    typ='frame',
)
_con.execute("drop table if exists db10id")
_con.execute(
    "CREATE TABLE db10id AS "
    "SELECT d1Id as rid1, d2Id as rid2 "
    "FROM df10id "
)
_con.execute("select count(*) from db10id").fetchall()

[(8705,)]

In [8]:
# 10K profiles: load the line-delimited JSON and store each profile in table db10 as
# (rid, val), where val is the '|'-separated concatenation of the profile columns.
# `df10` in the FROM clause is the pandas DataFrame loaded here.
df10 = pd.read_json(
    "data/10Kprofiles.json",
    orient='records',
    lines=True,
    typ='frame',
)
_con.execute("drop table if exists db10")
_con.execute(
    "CREATE TABLE db10 AS "
    "SELECT realProfileID as rid, concat (date_of_birth, '|', surname, '|', address_1, '|', street_number, '|', postcode, '|',soc_sec_id, '|',"
    "suburb, '|',phone_number, '|', state, '|', given_name, '|',age, '|',address_2 ) as val "
    "FROM df10"
)
_con.execute("select * from db10").fetchall()

[(0,
  '19390609.0|bishop|daley crescent|41.0|6050|4676841|batlow|08 29028996|qld|molly|31.0|'),
 (1,
  '19041109.0|aidon|nambucca street|7.0|2002|3414163|devonort|08 75629459|vkf|whkt||'),
 (2,
  '19910711.0|anns||12.0|2287|7844876|ivanhoe|02 11684110|vic|andrew|31.0|'),
 (3,
  '19390709.0|whitrlsy|robson street|34.0|4065|2418360|christie downs|04 00323207|qld|shsne||'),
 (4,
  '19340328.0|roche|rankin street|9.0|3644|7577436|frenchs forest|08 53227250|qld|sophie|30.0|'),
 (5,
  '19320811.0|fullgrabe|beeston street|29.0|3131|6494586|broken hill|04 80080021|nsw|emma|29.0|'),
 (6,
  '19601013.0|lodge|mason street|48.0|5254|6098877|orchard hills|08 48143359|vic|rourke|32.0|'),
 (7,
  '19340921.0|coleman|edman close|116.0|2397|8858237|castlecrag|03 58778382||harry|26.0|'),
 (8,
  '19671108.0|bishop|cromwell circuit|12.0|2226|1718686|harris park|04 01707833|vic|jamie|28.0|homestead caravan park'),
 (9,
  '19331228.0|grcn|lutana street|11.0|2074|8364236|newstead|02 57700508|ws|lauren||'),
 

In [10]:
# Tokenize the 10K-profile table, splitting each value on the '|' field separator.
l_table = "db10"
# Raw string: "\|" inside a normal literal is an invalid escape sequence and triggers
# a warning on recent Python versions. The value passed to the tokenizer is unchanged.
tokenizer = tokenizers.WordsTokzr(r"'\|'")

# Rebuild the input table as (src, rid, val) from l_table. Its rows are not fetched
# here: nothing in this cell uses them, and the table holds every profile.
_con.execute(f"drop table if exists {INPUT_TABLE}")
_con.execute(
    f"create table {INPUT_TABLE} as ("
    f"select '{l_table}' as src, {l_key_attr} as rid, {l_join_attr} as val "
    f"from '{l_table}' )"
)

# Materialize the tokenizer's query as a table and display its rows.
_con.execute(f"drop table if exists {TOKENS_VIEW}")
_con.execute(f"create table {TOKENS_VIEW} as " + tokenizer.query())
_con.execute(f"select * from {TOKENS_VIEW}").fetchall()

[('db10', 0, 11, '19390609.0'),
 ('db10', 0, 11, '41.0'),
 ('db10', 0, 11, 'bishop'),
 ('db10', 0, 11, 'daley crescent'),
 ('db10', 0, 11, '6050'),
 ('db10', 0, 11, '4676841'),
 ('db10', 0, 11, 'batlow'),
 ('db10', 0, 11, '08 29028996'),
 ('db10', 0, 11, 'qld'),
 ('db10', 0, 11, 'molly'),
 ('db10', 0, 11, '31.0'),
 ('db10', 1, 10, 'aidon'),
 ('db10', 1, 10, '19041109.0'),
 ('db10', 1, 10, 'nambucca street'),
 ('db10', 1, 10, '08 75629459'),
 ('db10', 1, 10, 'devonort'),
 ('db10', 1, 10, '7.0'),
 ('db10', 1, 10, '3414163'),
 ('db10', 1, 10, '2002'),
 ('db10', 1, 10, 'vkf'),
 ('db10', 1, 10, 'whkt'),
 ('db10', 2, 10, '12.0'),
 ('db10', 2, 10, '19910711.0'),
 ('db10', 2, 10, 'vic'),
 ('db10', 2, 10, 'anns'),
 ('db10', 2, 10, '2287'),
 ('db10', 2, 10, '7844876'),
 ('db10', 2, 10, '02 11684110'),
 ('db10', 2, 10, 'ivanhoe'),
 ('db10', 2, 10, 'andrew'),
 ('db10', 2, 10, '31.0'),
 ('db10', 3, 10, 'christie downs'),
 ('db10', 3, 10, 'robson street'),
 ('db10', 3, 10, '19390709.0'),
 ('db10', 3

In [15]:
# For every ground-truth pair, count the tokens the two records have in common.
# The count comes from a correlated scalar subquery, so a pair sharing no token
# still appears in the result, with an overlap of 0.
_con.execute(
    "select * , ("
    "select count(*) as overlap "
    f"from {TOKENS_VIEW} t1, {TOKENS_VIEW} t2 "
    "where t1.rid = id.rid1 "
    "and t2.rid = id.rid2 "
    "and t1.token = t2.token "
    ")"
    f"from db10id id "
).fetchall()

[(101, 8933, 8),
 (4101, 4614, 5),
 (7213, 8466, 4),
 (4856, 7485, 2),
 (4829, 9725, 6),
 (1289, 8634, 5),
 (3059, 9179, 3),
 (6802, 9806, 3),
 (6218, 9126, 4),
 (5849, 6986, 9),
 (3119, 4205, 4),
 (1884, 8409, 6),
 (5663, 6060, 4),
 (9371, 9596, 6),
 (8195, 8902, 9),
 (1295, 8153, 5),
 (4301, 4419, 4),
 (5059, 7040, 5),
 (2498, 6623, 2),
 (2734, 3420, 2),
 (3670, 7648, 7),
 (358, 4020, 7),
 (3064, 8794, 4),
 (772, 2425, 7),
 (389, 1449, 6),
 (7025, 7724, 4),
 (1689, 8239, 4),
 (557, 3895, 5),
 (1554, 3063, 3),
 (3699, 5250, 3),
 (2924, 4049, 7),
 (5656, 6673, 5),
 (8975, 9725, 3),
 (3875, 7035, 9),
 (2704, 5934, 5),
 (4040, 9733, 6),
 (895, 8626, 3),
 (3317, 4217, 6),
 (1328, 5463, 5),
 (184, 2109, 4),
 (96, 9418, 7),
 (781, 1715, 4),
 (4486, 5499, 2),
 (3252, 9615, 3),
 (4858, 7396, 4),
 (2129, 4535, 4),
 (1083, 9433, 4),
 (3267, 8384, 5),
 (8392, 8991, 1),
 (3105, 5454, 3),
 (1921, 5428, 4),
 (6033, 8204, 3),
 (1483, 9012, 7),
 (1282, 9311, 6),
 (6822, 8240, 2),
 (303, 8653, 7),
 (4

In [24]:
# Overlap and Jaccard similarity (scaled by 100) for each ground-truth pair, sorted
# from least to most similar: jaccard = overlap * 100 / (rlen1 + rlen2 - overlap).
# This is an inner join on the token, so pairs sharing no token produce no row here.
# NOTE(review): whether `/` truncates to an integer depends on the DuckDB version — confirm.
_con.execute(
    "select id.rid1, id.rid2, count(*) as overlap, overlap * 100 / (any_value(t1.rlen) + (any_value(t2.rlen) - overlap)) as jaccard "
    f"from db10id id, {TOKENS_VIEW} t1, {TOKENS_VIEW} t2 "
    "where t1.rid = id.rid1 "
    "and t2.rid = id.rid2 "
    "and t1.token = t2.token "
    "group by id.rid1, id.rid2 "
    "order by jaccard"
).fetchall()

[(14, 7100, 1, 4),
 (1116, 2239, 1, 4),
 (1598, 6100, 1, 4),
 (2024, 2905, 1, 4),
 (2149, 4857, 1, 4),
 (2521, 5886, 1, 4),
 (2521, 5597, 1, 4),
 (2717, 6146, 1, 4),
 (4387, 7835, 1, 4),
 (4809, 8982, 1, 4),
 (6113, 8747, 1, 4),
 (6123, 7110, 1, 4),
 (7184, 7789, 1, 4),
 (5807, 9610, 1, 4),
 (5546, 6330, 1, 4),
 (4882, 8659, 1, 4),
 (45, 7205, 1, 5),
 (54, 9209, 1, 5),
 (127, 7809, 1, 5),
 (1, 8684, 1, 5),
 (12, 4572, 1, 5),
 (59, 5230, 1, 5),
 (120, 5616, 1, 5),
 (152, 4082, 1, 5),
 (291, 5443, 1, 5),
 (502, 9203, 1, 5),
 (471, 8605, 1, 5),
 (567, 1758, 1, 5),
 (654, 7490, 1, 5),
 (663, 4705, 1, 5),
 (753, 7522, 1, 5),
 (651, 705, 1, 5),
 (705, 2452, 1, 5),
 (594, 6272, 1, 5),
 (814, 4377, 1, 5),
 (877, 8991, 1, 5),
 (882, 8338, 1, 5),
 (890, 8279, 1, 5),
 (907, 8108, 1, 5),
 (965, 9331, 1, 5),
 (913, 5333, 1, 5),
 (927, 5850, 1, 5),
 (943, 4093, 1, 5),
 (796, 1090, 1, 5),
 (1040, 9642, 1, 5),
 (1124, 9795, 1, 5),
 (1052, 7152, 1, 5),
 (1083, 6969, 1, 5),
 (538, 3676, 1, 5),
 (705, 46