[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/RobinL/uk_address_matcher/blob/main/match_example_data.ipynb)


In [9]:
!pip install --pre uk_address_matcher



In [3]:
import time

import duckdb
import pandas as pd

from uk_address_matcher import clean_data_using_precomputed_rel_tok_freq, get_linker
from uk_address_matcher.post_linkage.analyse_results import (
    distinguishability_by_id,
    distinguishability_summary,
)
from uk_address_matcher.post_linkage.identify_distinguishing_tokens import (
    improve_predictions_using_distinguishing_tokens,
)

# Show long address strings in full when DataFrames are rendered.
pd.options.display.max_colwidth = 1000

# -----------------------------------------------------------------------------
# Step 1: Load in some example data.  If using your own data, it must be in
# the same format as the example data.
# Any additional columns should be retained as-is by the cleaning code
# -----------------------------------------------------------------------------

p_fhrs = "https://github.com/RobinL/uk_address_matcher/raw/main/example_data/fhrs_addresses_sample.parquet"
p_ch = "https://github.com/RobinL/uk_address_matcher/raw/main/example_data/companies_house_addresess_postcode_overlap.parquet"

# Copy both parquet files into tables of an in-memory DuckDB database and keep
# a relation handle on each table for the later steps.
con = duckdb.connect(database=":memory:")
con.sql(f"CREATE TABLE df_fhrs AS SELECT * FROM read_parquet('{p_fhrs}')")
con.sql(f"CREATE TABLE df_ch AS SELECT * FROM read_parquet('{p_ch}')")
df_fhrs = con.table("df_fhrs")
df_ch = con.table("df_ch")

# Display length of the dataset.  The rows are counted inside DuckDB so the
# full tables do not have to be converted to pandas just to measure them.
n_fhrs = con.sql("SELECT count(*) FROM df_fhrs").fetchone()[0]
n_ch = con.sql("SELECT count(*) FROM df_ch").fetchone()[0]
print(f"Length of FHRS dataset: {n_fhrs:,.0f}")
print(f"Length of Companies House dataset: {n_ch:,.0f}")

# Preview the first few rows of each dataset.
display(df_fhrs.limit(5).df())
display(df_ch.limit(5).df())


Length of FHRS dataset: 5,000
Length of Companies House dataset: 21,952


Unnamed: 0,unique_id,source_dataset,address_concat,postcode
0,1543406,fhrs,1 OAK CHILTON DAY CENTRE UNIT 2 MARTINS ROAD CHILTON INDUSTRIAL ESTATE SUDBURY,CO10 2FT
1,1395196,fhrs,38 STATION ROAD SUDBURY SUFFOLK,CO10 2SS
2,1394874,fhrs,33 SWAN STREET BOXFORD SUDBURY SUFFOLK,CO10 5NZ
3,1649158,fhrs,11A FRIARS STREET SUDBURY SUFFOLK,CO10 2AA
4,1689685,fhrs,13-14 MARKET PLACE LAVENHAM SUDBURY SUFFOLK,CO10 9QZ


Unnamed: 0,unique_id,source_dataset,address_concat,postcode
0,6911165,companies_house,NORFOLK HOUSE 22 -24 MARKET PLACE SWAFFHAM NORFOLK,PE37 7QH
1,13048420,companies_house,10-11 THURLOW STREET BEDFORD,MK40 1LR
2,12870226,companies_house,69 RUNWELL ROAD WICKFORD,SS11 7HL
3,9146129,companies_house,249 ONGAR ROAD BRENTWOOD,CM15 9DZ
4,12061693,companies_house,C/O CJAS 105 HIGH STREET BRENTWOOD,CM14 4RR


In [2]:
# -----------------------------------------------------------------------------
# Step 2: Feature engineering - clean both datasets so that they are ready for
# the matching model
# -----------------------------------------------------------------------------

# The same cleaning routine is applied to each input in turn (FHRS first, then
# Companies House), sharing the one DuckDB connection.
df_fhrs_clean, df_ch_clean = (
    clean_data_using_precomputed_rel_tok_freq(raw_addresses, con=con)
    for raw_addresses in (df_fhrs, df_ch)
)


In [4]:
# Build the linker: the cleaned FHRS addresses are the records to match and
# the cleaned Companies House addresses are the records searched within.
# The original address string is carried through so results stay readable.
linker = get_linker(
    con=con,
    df_addresses_to_match=df_fhrs_clean,
    df_addresses_to_search_within=df_ch_clean,
    additional_columns_to_retain=["original_address_concat"],
    include_full_postcode_block=True,
)

# Score candidate pairs using a match-weight threshold of -50.
df_predict = linker.inference.predict(
    experimental_optimisation=True,
    threshold_match_weight=-50,
)

# Expose the predictions as a DuckDB relation for the post-linkage steps.
df_predict_ddb = df_predict.as_duckdbpyrelation()

Blocking time: 0.01 seconds
Predict time: 1.15 seconds


In [None]:
# Time the distinguishing-token refinement of the predictions.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; unlike time.time() it is not affected by system clock changes.
start_time = time.perf_counter()
df_predict_improved = improve_predictions_using_distinguishing_tokens(
    df_predict=df_predict_ddb,
    con=con,
    match_weight_threshold=-20,
)

# Rendering the preview is deliberately kept inside the timed section.
df_predict_improved.show(max_width=500, max_rows=5)

end_time = time.perf_counter()
print(f"Time taken: {end_time - start_time} seconds")

┌─────────────┬─────────────┬────────────────┬───────────────────────┬─────────────────────┬────────────────────────────┬───────────────────────┬───────────────────────────────────┬───────────────────────────────┬─────────────────────────────────────────────────────────────────────────────────────┬────────────┬────────────────────────────────────────────────────┬────────────┐
│ unique_id_r │ unique_id_l │ dist_tok_match │ match_weight_original │    match_weight     │ match_probability_original │   match_probability   │ canonical_distinguishing_tokens_1 │ messy_distinguishing_tokens_1 │                              original_address_concat_l                              │ postcode_l │             original_address_concat_r              │ postcode_r │
│   varchar   │   varchar   │    boolean     │        double         │       double        │           double           │        double         │             varchar[]             │           varchar[]           │                             

In [6]:
# Distinguishability breakdown for the predictions straight from the linker.
dsum_1 = distinguishability_summary(
    con=con,
    df_addresses_to_match=df_fhrs_clean,
    df_predict=df_predict_ddb,
)
dsum_1.show(max_rows=20, max_width=500)

# The same breakdown after the distinguishing-token improvement step, so the
# two tables can be compared side by side.
dsum_2 = distinguishability_summary(
    con=con,
    df_addresses_to_match=df_fhrs_clean,
    df_predict=df_predict_improved,
)
dsum_2.show(max_rows=20, max_width=500)


┌─────────────────────────────┬───────┬────────────┐
│ distinguishability_category │ count │ percentage │
│           varchar           │ int64 │  varchar   │
├─────────────────────────────┼───────┼────────────┤
│ 01: One match only          │   757 │ 15.14%     │
│ 02: Distinguishability > 10 │   601 │ 12.02%     │
│ 03: Distinguishability > 5  │   116 │ 2.32%      │
│ 04: Distinguishability > 1  │   449 │ 8.98%      │
│ 05: Distinguishability > 0  │   126 │ 2.52%      │
│ 06.: Distinguishability = 0 │  1781 │ 35.62%     │
│ 99: No match                │  1170 │ 23.40%     │
└─────────────────────────────┴───────┴────────────┘

┌─────────────────────────────┬───────┬────────────┐
│ distinguishability_category │ count │ percentage │
│           varchar           │ int64 │  varchar   │
├─────────────────────────────┼───────┼────────────┤
│ 01: One match only          │   758 │ 15.16%     │
│ 02: Distinguishability > 10 │   830 │ 16.60%     │
│ 03: Distinguishability > 5  │   388 │ 7.76%

In [8]:
# Inspect ten rows of the per-id distinguishability results for the improved
# predictions.  A fixed random_state keeps the displayed sample reproducible
# across runs; change the value to look at a different sample.
distinguishability_by_id(
    df_predict=df_predict_improved,
    df_addresses_to_match=df_fhrs_clean,
    con=con,
).df().sample(10, random_state=42)


Unnamed: 0,unique_id_r,unique_id_l,original_address_concat_l,postcode_l,match_probability,match_weight,distinguishability,distinguishability_category,original_address_concat_r,postcode_r
4786,300821,,,,,,,99: No match,MOUNT HILL HALSTEAD ESSEX,C09 1AA
2939,55538,13384142.0,12 RUNWELL RAOD WICKFORD ESSEX,SS11 7AB,0.000351,-16.473931,0.0,06.: Distinguishability = 0,22-28 RUNWELL ROAD WICKFORD ESSEX,SS11 7AB
4496,108578,,,,,,,99: No match,MILTON ERNEST LOWER SCHOOL THURLEIGH ROAD MILTON ERNEST BEDFORD,MK44 1RF
845,1689688,15110190.0,FLAT 265 ROMAN ROAD MOUNTNESSING BRENTWOOD,CM15 0UH,0.98496,11.033219,22.50715,02: Distinguishability > 10,265 ROMAN ROAD MOUNTNESSING ESSEX,CM15 0UH
2832,1612840,8277743.0,THE WHITE HART INN 15 HIGH STREET HALSTEAD ESSEX,CO9 2AA,0.000351,-16.473931,0.0,06.: Distinguishability = 0,7 HIGH STREET HALSTEAD ESSEX,CO9 2AA
4511,1552285,,,,,,,99: No match,UNIT 4 WESTGATE PARK FODDERWICK BASILDON ESSEX,SS14 1WP
931,1558364,15460542.0,OFFICE F 12 THE PLAZA 1 RUTHERFORD PARK GREAT NOTLEY BRAINTREE ESSEX,CM77 7AU,0.954001,9.374331,13.758781,02: Distinguishability > 10,S 12 THE PLAZA ENTERPRISE AND INNOVATION CENTRE 1 RUTHERFORD PARK GREAT NOTLEY BRAINTREE,CM77 7AU
3630,1646994,14354135.0,WALNUT TREE HOUSE HALL STREET LONG MELFORD SUDBURY,CO10 9JG,8.8e-05,-18.473931,0.0,06.: Distinguishability = 0,THE BULL HOTEL THE BULL HOTEL HALL STREET LONG MELFORD SUDBURY,CO10 9JG
1214,1576531,14282879.0,UNIT 4 CHURCH ARCADE BEDFORD,MK40 1LQ,0.733768,6.462643,22.936574,02: Distinguishability > 10,4 CHURCH ARCADE BEDFORD,MK40 1LQ
3955,1691544,,,,,,,99: No match,VENTENFALL PROJECT - MURPHY CONSTRUCTION SITE SCARNING NORFOLK,NR19 2QN


In [13]:


# For each unique_id_l keep only its highest-weighted prediction with a
# positive match weight, then pick three of those pairs at random.
# `df_predict_ddb` is referenced by name inside the SQL text.
sql = """
SELECT * FROM df_predict_ddb WHERE match_weight > 0
QUALIFY row_number() OVER (PARTITION BY unique_id_l ORDER BY match_weight DESC) = 1
order by random()
limit 3
"""

# One plain dict per selected row.
recs = con.sql(sql).df().to_dict("records")

# Print both addresses of each pair under a separator line, then draw the
# waterfall chart for that single record.
for rec in recs:
    print("-" * 80)
    print(f'{rec["unique_id_l"]} {rec["original_address_concat_l"]}')
    print(f'{rec["unique_id_r"]} {rec["original_address_concat_r"]}')
    display(linker.visualisations.waterfall_chart([rec]))


--------------------------------------------------------------------------------
12261043 OLIVERS BARN MALDON ROAD WITHAM ESSEX
300994 OLIVERS FARM NURSERIES MALDON ROAD WITHAM ESSEX


--------------------------------------------------------------------------------
14928577 WOODLAND MANOR GREEN LANE CLAPHAM BEDFORD BEDFORDSHIRE
1082241 WOODLAND MANOR GREEN LANE CLAPHAM BEDFORD


--------------------------------------------------------------------------------
12539680 BEDFORD HEIGHTS BRICKHILL DRIVE BEDFORD
846804 BEDFORD HEIGHTS BRICKHILL DRIVE BEDFORD
