[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/RobinL/uk_address_matcher/blob/main/match_example_data.ipynb)


In [5]:
# !pip install uk_address_matcher==0.0.1.dev9

In [1]:
import duckdb
import pandas as pd
from IPython.display import display

from uk_address_matcher.cleaning_pipelines import (
    clean_data_using_precomputed_rel_tok_freq,
)
from uk_address_matcher.splink_model import _performance_predict

pd.options.display.max_colwidth = 1000

# -----------------------------------------------------------------------------
# Step 1: Load in some example data.  If using your own data, it must be in»
# the same format as the example data.
# -----------------------------------------------------------------------------
p_fhrs = "https://github.com/RobinL/uk_address_matcher/raw/main/example_data/fhrs_addresses_sample.parquet"
p_ch = "https://github.com/RobinL/uk_address_matcher/raw/main/example_data/companies_house_addresess_postcode_overlap.parquet"

con = duckdb.connect(database=":memory:")
con.sql(f"CREATE TABLE df_fhrs AS SELECT * FROM read_parquet('{p_fhrs}')")
con.sql(f"CREATE TABLE df_ch AS SELECT * FROM read_parquet('{p_ch}')")
df_fhrs = con.table("df_fhrs")
df_ch = con.table("df_ch")


# Display length of the dataset
print(f"Length of FHRS dataset: {len(df_fhrs):,.0f}")
print(f"Length of Companies House dataset: {len(df_ch):,.0f}")

display(df_fhrs.limit(5).df())
display(df_ch.limit(5).df())


Length of FHRS dataset: 5,000
Length of Companies House dataset: 21,952


Unnamed: 0,unique_id,source_dataset,address_concat,postcode
0,1543406,fhrs,1 OAK CHILTON DAY CENTRE UNIT 2 MARTINS ROAD CHILTON INDUSTRIAL ESTATE SUDBURY,CO10 2FT
1,1395196,fhrs,38 STATION ROAD SUDBURY SUFFOLK,CO10 2SS
2,1394874,fhrs,33 SWAN STREET BOXFORD SUDBURY SUFFOLK,CO10 5NZ
3,1649158,fhrs,11A FRIARS STREET SUDBURY SUFFOLK,CO10 2AA
4,1689685,fhrs,13-14 MARKET PLACE LAVENHAM SUDBURY SUFFOLK,CO10 9QZ


Unnamed: 0,unique_id,source_dataset,address_concat,postcode
0,6911165,companies_house,NORFOLK HOUSE 22 -24 MARKET PLACE SWAFFHAM NORFOLK,PE37 7QH
1,13048420,companies_house,10-11 THURLOW STREET BEDFORD,MK40 1LR
2,12870226,companies_house,69 RUNWELL ROAD WICKFORD,SS11 7HL
3,9146129,companies_house,249 ONGAR ROAD BRENTWOOD,CM15 9DZ
4,12061693,companies_house,C/O CJAS 105 HIGH STREET BRENTWOOD,CM14 4RR


In [2]:
# -----------------------------------------------------------------------------
# Step 2: Clean the data/feature engineering to prepare for matching model
# -----------------------------------------------------------------------------

df_fhrs_clean = clean_data_using_precomputed_rel_tok_freq(df_fhrs, con=con)
df_ch_clean = clean_data_using_precomputed_rel_tok_freq(df_ch, con=con)


In [3]:


linker, predictions = _performance_predict(
    df_addresses_to_match=df_fhrs_clean,
    df_addresses_to_search_within=df_ch_clean,
    con=con,
    match_weight_threshold=-10,
    output_all_cols=True,
    print_timings=True
)



Initialise df_concat_with_tf took 1.77 seconds
Time taken to block: 0.86 seconds
Time taken to predict: 1.93 seconds


In [4]:
from uk_address_matcher.analyse_results import distinguishability_summary
distinguishability_summary(df_predict=predictions, df_addresses_to_match=df_fhrs_clean, con=con)

┌─────────────────────────────┬───────┬────────────┐
│ distinguishability_category │ count │ percentage │
│           varchar           │ int64 │  varchar   │
├─────────────────────────────┼───────┼────────────┤
│ 01: One match only          │   746 │ 14.92%     │
│ 02: Distinguishability > 10 │   528 │ 10.56%     │
│ 03: Distinguishability > 5  │   191 │ 3.82%      │
│ 04: Distinguishability > 1  │   455 │ 9.10%      │
│ 05: Distinguishability > 0  │   132 │ 2.64%      │
│ 06.: Distinguishability = 0 │  1130 │ 22.60%     │
│ 99: No match                │  1818 │ 36.36%     │
└─────────────────────────────┴───────┴────────────┘

In [11]:
from uk_address_matcher.analyse_results import distinguishability_by_id

distinguishability_by_id(df_predict=predictions, df_addresses_to_match=df_fhrs_clean, con=con).df().sample(10)


Unnamed: 0,unique_id_l,distinguishability,match_probability,match_weight,distinguishability_category,original_address_concat_l,postcode_l,original_address_concat_r,postcode_r
974,1287102,19.162582,0.999945,14.148082,02: Distinguishability > 10,16A LIME STREET BEDFORD,MK40 1LD,16A LIME STREET BEDFORD,MK40 1LD
212,102651,,0.66443,0.9855,01: One match only,3 BERRY DRIVE BROMHAM BEDFORD,MK43 8QA,17 BERRY DRIVE BROMHAM BEDFORD,MK43 8QA
185,109162,,0.887892,2.9855,01: One match only,42 HOOKHAMS LANE RENHOLD BEDFORD,MK41 0JU,43 HOOKHAMS LANE RENHOLD BEDFORD,MK41 0JU
2941,1695007,0.0,0.011125,-6.473931,06.: Distinguishability = 0,THE OLD ALEHOUSE NEW ROAD NEDGING WITH NAUGHTON IPSWICH,IP7 7BX,OLD ALE HOUSE NEW ROAD NAUGHTON IPSWICH SUFFOLK,IP7 7BX
2604,833172,0.0,0.152542,-2.473931,06.: Distinguishability = 0,DAY CENTRE RASHS GREEN DEREHAM NORFOLK,NR19 1JG,4 RASHS GREEN DEREHAM NORFOLK,NR19 1JG
2817,1298101,0.0,0.030009,-5.0145,06.: Distinguishability = 0,24 CROWN STREET BRENTWOOD ESSEX,CM14 4BA,29A CROWN STREET BRENTWOOD ESSEX,CM14 4BA
2185,930407,0.0,0.999635,11.417853,06.: Distinguishability = 0,BARLEYLANDS FARM BARLEYLANDS ROAD BILLERICAY ESSEX,CM11 2UD,BARLEYLANDS FARM OFFICE BARLEYLANDS ROAD BILLERICAY ESSEX,CM11 2UD
1967,50444,0.5,0.504517,0.026069,05: Distinguishability > 0,UNIT 11 FESTIVAL LEISURE PARK FESTIVAL WAY BASILDON ESSEX,SS14 3WB,8 FESTIVAL LEISURE PARK BASILDON,SS14 3WB
1725,1560971,3.321928,0.310345,-1.152003,04: Distinguishability > 1,MARKET PLACE NEW BUCKENHAM NORWICH,NR16 2AN,LOVELLS MARKET PLACE NEW BUCKENHAM NORWICH,NR16 2AN
777,300715,15.691256,0.999999,20.176756,02: Distinguishability > 10,125 SWAN STREET SIBLE HEDINGHAM HALSTEAD ESSEX,CO9 3PP,125 SWAN STREET SIBLE HEDINGHAM HALSTEAD,CO9 3PP


In [6]:


sql = """
SELECT * FROM predictions WHERE match_weight > 0
QUALIFY row_number() OVER (PARTITION BY unique_id_l ORDER BY match_weight DESC) = 1
order by random()
limit 3
"""

recs = con.sql(sql).df().to_dict(orient="records")


for rec in recs:
    print("-" * 80)
    print(rec["unique_id_l"], rec["original_address_concat_l"])
    print(rec["unique_id_r"], rec["original_address_concat_r"])
    display(linker.waterfall_chart([rec]))


--------------------------------------------------------------------------------
1544642 15 RONALD CLOSE KEMPSTON BEDFORD
11926343 11A RONALD CLOSE KEMPSTON BEDFORD


--------------------------------------------------------------------------------
106498 47 HIGH STREET KEMPSTON BEDFORD
14698176 47 HIGH STREET KEMPSTON BEDFORD


--------------------------------------------------------------------------------
107501 46 ST JOHNS STREET KEMPSTON BEDFORD
12040504 10 ST JOHNS STREET KEMPSTON BEDFORD
