[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/RobinL/uk_address_matcher/blob/main/match_example_data.ipynb)


In [9]:
# Optional: uncomment the line below to install the pinned package version
# (e.g. when running this notebook in a fresh environment such as Colab).
# !pip install uk_address_matcher==0.0.1.dev11

In [10]:
import duckdb
import pandas as pd
from IPython.display import display

from uk_address_matcher.analyse_results import (
    distinguishability_by_id,
    distinguishability_summary,
)
from uk_address_matcher.cleaning_pipelines import (
    clean_data_using_precomputed_rel_tok_freq,
)
from uk_address_matcher.splink_model import _performance_predict

# Allow up to 1000 characters per column so long address strings are not
# cut short when DataFrames are displayed.
pd.set_option("display.max_colwidth", 1000)

# -----------------------------------------------------------------------------
# Step 1: Load in some example data.  If using your own data, it must be in
# the same format as the example data.
# -----------------------------------------------------------------------------
p_fhrs = "https://github.com/RobinL/uk_address_matcher/raw/main/example_data/fhrs_addresses_sample.parquet"
p_ch = "https://github.com/RobinL/uk_address_matcher/raw/main/example_data/companies_house_addresess_postcode_overlap.parquet"

con = duckdb.connect(database=":memory:")

# Materialise each parquet file as a table in the in-memory database, under
# the table name that is looked up again just below.
for table_name, parquet_url in (("df_fhrs", p_fhrs), ("df_ch", p_ch)):
    con.sql(f"CREATE TABLE {table_name} AS SELECT * FROM read_parquet('{parquet_url}')")

df_fhrs, df_ch = con.table("df_fhrs"), con.table("df_ch")

# Report how many rows each dataset contains.
print(f"Length of FHRS dataset: {len(df_fhrs):,.0f}")
print(f"Length of Companies House dataset: {len(df_ch):,.0f}")

# Preview the first five rows of each dataset as a pandas DataFrame.
for relation in (df_fhrs, df_ch):
    display(relation.limit(5).df())


Length of FHRS dataset: 5,000
Length of Companies House dataset: 21,952


Unnamed: 0,unique_id,source_dataset,address_concat,postcode
0,1543406,fhrs,1 OAK CHILTON DAY CENTRE UNIT 2 MARTINS ROAD CHILTON INDUSTRIAL ESTATE SUDBURY,CO10 2FT
1,1395196,fhrs,38 STATION ROAD SUDBURY SUFFOLK,CO10 2SS
2,1394874,fhrs,33 SWAN STREET BOXFORD SUDBURY SUFFOLK,CO10 5NZ
3,1649158,fhrs,11A FRIARS STREET SUDBURY SUFFOLK,CO10 2AA
4,1689685,fhrs,13-14 MARKET PLACE LAVENHAM SUDBURY SUFFOLK,CO10 9QZ


Unnamed: 0,unique_id,source_dataset,address_concat,postcode
0,6911165,companies_house,NORFOLK HOUSE 22 -24 MARKET PLACE SWAFFHAM NORFOLK,PE37 7QH
1,13048420,companies_house,10-11 THURLOW STREET BEDFORD,MK40 1LR
2,12870226,companies_house,69 RUNWELL ROAD WICKFORD,SS11 7HL
3,9146129,companies_house,249 ONGAR ROAD BRENTWOOD,CM15 9DZ
4,12061693,companies_house,C/O CJAS 105 HIGH STREET BRENTWOOD,CM14 4RR


In [11]:
# -----------------------------------------------------------------------------
# Step 2: Clean the data/feature engineering to prepare for matching model
# -----------------------------------------------------------------------------

# Run the same cleaning pipeline over both datasets (FHRS first, then
# Companies House), producing a new relation for each and leaving the raw
# tables untouched.
df_fhrs_clean, df_ch_clean = (
    clean_data_using_precomputed_rel_tok_freq(raw_addresses, con=con)
    for raw_addresses in (df_fhrs, df_ch)
)


In [12]:
# -----------------------------------------------------------------------------
# Step 3: Score candidate matches between the two cleaned datasets
# -----------------------------------------------------------------------------
# FHRS addresses are the records to match; Companies House addresses are the
# pool searched for candidates.
# NOTE(review): match_weight_threshold=-10 presumably discards candidate pairs
# scoring below that match weight -- confirm against the library docs.
linker, predictions = _performance_predict(
    con=con,
    df_addresses_to_match=df_fhrs_clean,
    df_addresses_to_search_within=df_ch_clean,
    match_weight_threshold=-10,
    output_all_cols=True,
    print_timings=True,
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Initialise df_concat_with_tf took 2.20 seconds
Time taken to block: 0.76 seconds
Time taken to predict: 2.01 seconds


In [None]:
# Count the addresses-to-match by distinguishability category (one match only,
# distinguishability bands, no match). `distinguishability_summary` is
# imported in the imports cell at the top of the notebook.
distinguishability_summary(
    df_predict=predictions,
    df_addresses_to_match=df_fhrs_clean,
    con=con,
)

┌─────────────────────────────┬───────┬────────────┐
│ distinguishability_category │ count │ percentage │
│           varchar           │ int64 │  varchar   │
├─────────────────────────────┼───────┼────────────┤
│ 01: One match only          │   734 │ 14.68%     │
│ 02: Distinguishability > 10 │   514 │ 10.28%     │
│ 03: Distinguishability > 5  │   161 │ 3.22%      │
│ 04: Distinguishability > 1  │   442 │ 8.84%      │
│ 05: Distinguishability > 0  │   128 │ 2.56%      │
│ 06.: Distinguishability = 0 │  1159 │ 23.18%     │
│ 99: No match                │  1862 │ 37.24%     │
└─────────────────────────────┴───────┴────────────┘

In [None]:
# Inspect ten per-address distinguishability results alongside the matched
# addresses. `distinguishability_by_id` is imported in the imports cell at the
# top of the notebook.
# random_state pins the sample so the rows shown are reproducible on
# Restart & Run All; change or remove it to browse a different sample.
distinguishability_by_id(
    df_predict=predictions,
    df_addresses_to_match=df_fhrs_clean,
    con=con,
).df().sample(10, random_state=42)


Unnamed: 0,unique_id_l,distinguishability,match_probability,match_weight,distinguishability_category,original_address_concat_l,postcode_l,original_address_concat_r,postcode_r
851,1681333.0,14.83289,0.999981,15.680887,02: Distinguishability > 10,UNIT F WREXHAM ROAD LAINDON BASILDON,SS15 6PX,UNIT F WREXHAM ROAD LAINDON BASILDON,SS15 6PX
1714,1395075.0,2.0,0.152542,-2.473931,04: Distinguishability > 1,BRIGHT HORIZONS THOMAS GAINSBOROUGH SCHOOL WELLS HALL ROAD GREAT CORNARD SUDBURY,CO10 0NH,63 WELLS HALL ROAD GREAT CORNARD SUDBURY,CO10 0NH
4333,,,,,99: No match,UNIT 1 ST JOHNS CENTRE ROPE WALK BEDFORD,MK42 0XE,,
3312,,,,,99: No match,RAYNE ROAD BRAINTREE ESSEX,CM7 2QS,,
4515,,,,,99: No match,BRENTWOOD ROAD BRENTWOOD ESSEX,CM13 3PN,,
2197,1645032.0,0.0,0.998783,9.680887,06.: Distinguishability = 0,WHERSTEAD PARK THE STREET WHERSTEAD IPSWICH,IP9 2BJ,WHERSTEAD PARK THE STREET WHERSTEAD IPSWICH,IP9 2BJ
3969,,,,,99: No match,ST JOSEPHS ST JOSEPHS HOME THE CROFT SUDBURY SUFFOLK,CO10 1HR,,
230,1395007.0,,0.418605,-0.473931,01: One match only,BOOTS UK LTD UNIT 5 APPLEGATE CENTRE APPLEGATE MEWS GREAT CORNARD,CO10 0GL,UNIT 6 APPLEGATE CENTRE APPLEGATE MEWS GREAT CORNARD SUDBURY,CO10 0GL
2708,300433.0,0.0,0.101124,-3.152003,06.: Distinguishability = 0,18 TRINITY STREET HALSTEAD ESSEX,CO9 1JA,8A TRINITY STREET HALSTEAD,CO9 1JA
503,1683747.0,,0.011125,-6.473931,01: One match only,ST GILES CHURCH OF ENGLAND PRIMARY SCHOOL CHURCH STREET GREAT MAPLESTEAD ESSEX,CO9 2RG,RAFTERS CHURCH STREET GREAT MAPLESTEAD HALSTEAD,CO9 2RG


In [None]:


# Keep only predictions with a positive match weight, retain the single
# highest-weight candidate for each left-hand address, then pick three of
# those best matches at random.
sql = """
SELECT * FROM predictions WHERE match_weight > 0
QUALIFY row_number() OVER (PARTITION BY unique_id_l ORDER BY match_weight DESC) = 1
order by random()
limit 3
"""

# One dict per sampled prediction row, keyed by column name.
recs = con.sql(sql).df().to_dict(orient="records")

divider = "-" * 80
for rec in recs:
    print(divider)
    # Print the id and original address of the left record, then the right.
    for side in ("l", "r"):
        print(rec[f"unique_id_{side}"], rec[f"original_address_concat_{side}"])
    # Waterfall chart for this single prediction.
    # NOTE(review): presumably shows how each comparison contributes to the
    # final match weight -- confirm against the Splink docs.
    display(linker.waterfall_chart([rec]))


--------------------------------------------------------------------------------
802004 FOUR ACRES FARM LOWER PARK ROAD WICKFORD ESSEX
14836927 FOUR ACRES FARM LOWER PARK ROAD WICKFORD


--------------------------------------------------------------------------------
637096 141 KINGS ROAD BRENTWOOD ESSEX
11734868 141 KINGS ROAD BRENTWOOD ESSEX


--------------------------------------------------------------------------------
816884 CHURCH FARM TURVEY ROAD CARLTON BEDFORD
11105141 CHURCH FARM TURVEY ROAD BEDFORD
