[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/RobinL/uk_address_matcher/blob/main/match_example_data.ipynb)


In [4]:
# !pip install uk_address_matcher==0.0.1.dev7

In [11]:
import duckdb
import pandas as pd
from IPython.display import display

from uk_address_matcher.cleaning_pipelines import (
    clean_data_using_precomputed_rel_tok_freq,
)
from uk_address_matcher.splink_model import _performance_predict

pd.options.display.max_colwidth = 1000

# -----------------------------------------------------------------------------
# Step 1: Load in some example data.  If using your own data, it must be in
# the same format as the example data.
# -----------------------------------------------------------------------------



p_fhrs = "https://github.com/RobinL/uk_address_matcher/raw/main/example_data/fhrs_addresses_sample.parquet"
p_ch = "https://github.com/RobinL/uk_address_matcher/raw/main/example_data/companies_house_addresess_postcode_overlap.parquet"

con = duckdb.connect(database=":memory:")
con.execute(f"CREATE TABLE df_fhrs AS SELECT * FROM read_parquet('{p_fhrs}')")
con.execute(f"CREATE TABLE df_ch AS SELECT * FROM read_parquet('{p_ch}')")



# Display length of the dataset
print(f"Length of FHRS dataset: {len(df_fhrs):,.0f}")
print(f"Length of Companies House dataset: {len(df_ch):,.0f}")

display(df_fhrs.limit(5).df())
display(df_ch.limit(5).df())


Length of FHRS dataset: 5,000
Length of Companies House dataset: 21,952


Unnamed: 0,unique_id,source_dataset,address_concat,postcode
0,300681,fhrs,AMERICA ROAD EARLS COLNE COLCHESTER,C06 2LB
1,300359,fhrs,THE AIRFIELD EARLS COLNE COLCHESTER ESSEX,C06 2NS
2,300880,fhrs,THE AIRFIELD EARLS COLNE COLCHESTER ESSEX,C06 2NS
3,300752,fhrs,9A BRIDGE STREET BURES HAMLET SUDBURY SUFFOLK,C08 5AD
4,300821,fhrs,MOUNT HILL HALSTEAD ESSEX,C09 1AA


Unnamed: 0,unique_id,source_dataset,address_concat,postcode
0,702725,companies_house,UNIT 11 SPITFIRE DRIVE EARLS COLNE BUSINESS PARK EARLS COLNE ESSEX,C06 2NS
1,988738,companies_house,BARKER HOUSE PHOENIX ROAD HAVERHILL,CB9 7AE
2,15507930,companies_house,HILLTOP VILLA PALE GREEN HELIONS BUMPSTEAD HAVERHILL,CB9 7AF
3,6932985,companies_house,"ESSEX COTTAGE PALE GREEN, HELIONS BUMPSTEAD HAVERHILL SUFFOLK",CB9 7AF
4,13021290,companies_house,CHESTNUT LODGE HELIONS BUMPSTEAD HAVERHILL SUFFOLK,CB9 7AF


In [6]:
# -----------------------------------------------------------------------------
# Step 2: Clean the data/feature engineering to prepare for matching model
# -----------------------------------------------------------------------------

df_fhrs_clean = clean_data_using_precomputed_rel_tok_freq(df_fhrs, con=con)
df_ch_clean = clean_data_using_precomputed_rel_tok_freq(df_ch, con=con)


In [7]:


linker, predictions = _performance_predict(
    [df_fhrs_clean, df_ch_clean],
    con=con,
    match_weight_threshold=-10,
    output_all_cols=True,
    include_full_postcode_block=True,
)



Initialise df_concat_with_tf took 1.56 seconds
Time taken to block: 1.92 seconds
Time taken to predict: 1.88 seconds


In [8]:
# ------------------------------------------------------------------------------------
# Step 3: Inspect the results:
# ------------------------------------------------------------------------------------

sql = """
SELECT
    match_probability,
    match_weight,
    concat_ws(' ', original_address_concat_l, postcode_l) AS address_l,
    concat_ws(' ', original_address_concat_r, postcode_r) AS address_r,
    unique_id_l,
    unique_id_r,
    source_dataset_l,
    source_dataset_r
FROM
    predictions
WHERE
    match_weight > 5
QUALIFY
    row_number() OVER (
        PARTITION BY unique_id_l ORDER BY match_weight DESC
    ) = 1

"""

top_predict = con.sql(sql).df()

display(top_predict.head())


Unnamed: 0,match_probability,match_weight,address_l,address_r,unique_id_l,unique_id_r,source_dataset_l,source_dataset_r
0,1.0,24.100119,46-54 HIGH STREET INGATESTONE ESSEX CM4 9DW,46 - 54 HIGH STREET INGATESTONE ESSEX CM4 9DW,547373,431542,companies_house,fhrs
1,0.504517,0.026069,LYNDERSWOOD FARM LONDON ROAD BLACK NOTLEY BRAINTREE ESSEX CM77 8QN,UPPER LONDON ROAD BLACK NOTLEY BRAINTREE ESSEX CM77 8QN,713093,300326,companies_house,fhrs
2,0.999977,15.417853,POUND FARM POUND LANE NORTH TUDDENHAM DEREHAM NORFOLK NR20 3DA,POUND FARM POUND LANE NORTH TUDDENHAM NORFOLK NR20 3DA,842828,1176617,companies_house,fhrs
3,0.99933,10.543259,2 PRIORY COTTAGE THE STREET PRESTON ST MARY SUDBURY SUFFOLK CO10 9NF,THE SUFFOLK SPICE CO 2 PRIORY COTTAGES THE STREET PRESTON ST MARY SUDBURY CO10 9NF,1064753,1395180,companies_house,fhrs
4,0.66443,0.9855,124 NEWLAND STREET WITHAM ESSEX CM8 1BA,130 NEWLAND STREET WITHAM ESSEX CM8 1BA,1075328,1083715,companies_house,fhrs


In [9]:


sql = """
SELECT * FROM predictions WHERE match_weight > 0
QUALIFY row_number() OVER (PARTITION BY unique_id_l ORDER BY match_weight DESC) = 1
order by random()
limit 3
"""

recs = con.sql(sql).df().to_dict(orient="records")


for rec in recs:
    print("-" * 80)
    print(rec["unique_id_l"], rec["original_address_concat_l"])
    print(rec["unique_id_r"], rec["original_address_concat_r"])
    display(linker.waterfall_chart([rec]))


--------------------------------------------------------------------------------
12387655 UNIT 7A RADFORD CRESCENT BILLERICAY ESSEX
1035706 UNIT 19 RADFORD CRESCENT BILLERICAY ESSEX


--------------------------------------------------------------------------------
14599094 20A RUNWELL ROAD WICKFORD ESSEX
194522 20 RUNWELL ROAD WICKFORD ESSEX


--------------------------------------------------------------------------------
07768244 67 NEWLAND STREET NEWLAND STREET WITHAM ESSEX
1422832 63B NEWLAND STREET WITHAM ESSEX
