[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/RobinL/uk_address_matcher/blob/main/match_example_data.ipynb)


In [1]:
!pip install --pre uk_address_matcher



In [2]:
import time

import duckdb
import pandas as pd

from uk_address_matcher import clean_data_using_precomputed_rel_tok_freq, get_linker
from uk_address_matcher.post_linkage.analyse_results import (
    best_matches_summary,
    best_matches_with_distinguishability,
)
from uk_address_matcher.post_linkage.identify_distinguishing_tokens import (
    improve_predictions_using_distinguishing_tokens,
)

# Widen pandas' column display so long address strings are not truncated.
pd.options.display.max_colwidth = 1000

# -----------------------------------------------------------------------------
# Step 1: Load in some example data.  If using your own data, it must be in
# the same format as the example data.
# -----------------------------------------------------------------------------
# Any additional columns should be retained as-is by the cleaning code

p_fhrs = "https://github.com/RobinL/uk_address_matcher/raw/main/example_data/fhrs_addresses_sample.parquet"
p_ch = "https://github.com/RobinL/uk_address_matcher/raw/main/example_data/companies_house_addresess_postcode_overlap.parquet"

# Materialise both parquet files as tables in an in-memory DuckDB database and
# keep a relation handle on each for the later steps.
con = duckdb.connect(database=":memory:")
con.sql(f"CREATE TABLE df_fhrs AS SELECT * FROM read_parquet('{p_fhrs}')")
con.sql(f"CREATE TABLE df_ch AS SELECT * FROM read_parquet('{p_ch}')")
df_fhrs = con.table("df_fhrs")
df_ch = con.table("df_ch")

# Display length of the dataset.  The count is done inside DuckDB so the full
# tables are not copied into pandas just to measure their length.
n_fhrs_rows = con.sql("SELECT COUNT(*) FROM df_fhrs").fetchone()[0]
n_ch_rows = con.sql("SELECT COUNT(*) FROM df_ch").fetchone()[0]
print(f"Length of FHRS dataset: {n_fhrs_rows:,.0f}")
print(f"Length of Companies House dataset: {n_ch_rows:,.0f}")

# Preview the first few rows of each input.
df_fhrs.limit(5).show(max_width=500)
df_ch.limit(5).show(max_width=500)


Length of FHRS dataset: 5,000
Length of Companies House dataset: 21,952
┌───────────┬────────────────┬────────────────────────────────────────────────────────────────────────────────┬──────────┐
│ unique_id │ source_dataset │                                 address_concat                                 │ postcode │
│  varchar  │    varchar     │                                    varchar                                     │ varchar  │
├───────────┼────────────────┼────────────────────────────────────────────────────────────────────────────────┼──────────┤
│ 1543406   │ fhrs           │ 1 OAK CHILTON DAY CENTRE UNIT 2 MARTINS ROAD CHILTON INDUSTRIAL ESTATE SUDBURY │ CO10 2FT │
│ 1395196   │ fhrs           │ 38 STATION ROAD SUDBURY SUFFOLK                                                │ CO10 2SS │
│ 1394874   │ fhrs           │ 33 SWAN STREET BOXFORD SUDBURY SUFFOLK                                         │ CO10 5NZ │
│ 1649158   │ fhrs           │ 11A FRIARS STREET SUDBURY SUFFOLK   

In [3]:
# -----------------------------------------------------------------------------
# Step 2: Feature engineering - run the same cleaning routine over both input
# datasets so they are ready for the matching model
# -----------------------------------------------------------------------------

# FHRS is cleaned first, then Companies House; both share the one connection.
df_fhrs_clean, df_ch_clean = (
    clean_data_using_precomputed_rel_tok_freq(raw_addresses, con=con)
    for raw_addresses in (df_fhrs, df_ch)
)


In [4]:
# Build the linker: the cleaned FHRS addresses are the records to match, and
# the cleaned Companies House addresses are the pool searched for candidates.
linker = get_linker(
    con=con,
    df_addresses_to_match=df_fhrs_clean,
    df_addresses_to_search_within=df_ch_clean,
    additional_columns_to_retain=["original_address_concat"],
    include_full_postcode_block=True,
)

# Score candidate pairs.
# NOTE(review): -50 looks like a deliberately permissive match-weight cut-off
# so that few candidates are dropped at this stage - confirm.
df_predict = linker.inference.predict(
    experimental_optimisation=True,
    threshold_match_weight=-50,
)

# Hand the predictions back as a DuckDB relation for the post-linkage steps.
df_predict_ddb = df_predict.as_duckdbpyrelation()

Blocking time: 0.14 seconds
Predict time: 0.56 seconds


In [5]:
# Adjust the predictions using distinguishing tokens and report how long the
# step takes.  perf_counter() is a monotonic, high-resolution clock, so the
# measured interval cannot be skewed by system clock adjustments.
start_time = time.perf_counter()
df_predict_improved = improve_predictions_using_distinguishing_tokens(
    df_predict=df_predict_ddb,
    con=con,
    match_weight_threshold=-20,
)

# NOTE(review): show() is intentionally kept inside the timed region; if the
# returned relation is evaluated lazily, this is where the work actually runs -
# confirm before moving it outside the timer.
df_predict_improved.show(max_width=500, max_rows=5)

end_time = time.perf_counter()
print(f"Time taken: {end_time - start_time} seconds")

┌─────────────┬─────────────┬─────────────────────┬──────────────────────┬──────────────────────┬────────────────────┬─────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬───────────────────────────┬───────────────────────────────────────────┬──────────────────────────────────────────────────┬────────────────────────────────────────────────────────────────┬────────────┬─────────────────────────────────────────────────────┬────────────┐
│ unique_id_r │ unique_id_l │    mw_adjustment    │ match_weight_origi…  │ match_probability_…  │    match_weight    │  match_probability  │ overlapping_tokens…  │ tokens_elsewhere_i…  │    missing_tokens    │ overlapping_bigram…  │ bigrams_elsewhere_in_bl…  │ overlapping_bigrams_this_l_and_r_filtered │ bigrams_elsewhere_in_block_but_not_this_filtered │                   original_address_concat_l                    │ postcode_l │              original_address_concat_r              │ postcode_r 

In [6]:
# Summary of best matches by distinguishability category, computed from the
# original model predictions (df_predict_ddb).
dsum_1 = best_matches_summary(
    df_predict=df_predict_ddb, df_addresses_to_match=df_fhrs, con=con
)
dsum_1.show(max_width=500, max_rows=20)

# The same summary computed from df_predict_improved, i.e. the output of
# improve_predictions_using_distinguishing_tokens, so the two tables can be
# compared side by side.
dsum_2 = best_matches_summary(
    df_predict=df_predict_improved, df_addresses_to_match=df_fhrs, con=con
)
dsum_2.show(max_width=500, max_rows=20)


┌─────────────────────────────┬───────┬────────────┐
│ distinguishability_category │ count │ percentage │
│           varchar           │ int64 │  varchar   │
├─────────────────────────────┼───────┼────────────┤
│ 01: One match only          │   749 │ 14.98%     │
│ 02: Distinguishability > 10 │   641 │ 12.82%     │
│ 03: Distinguishability > 5  │   199 │ 3.98%      │
│ 04: Distinguishability > 1  │   643 │ 12.86%     │
│ 05: Distinguishability > 0  │   132 │ 2.64%      │
│ 06.: Distinguishability = 0 │  1535 │ 30.70%     │
│ 99: No match                │  1101 │ 22.02%     │
└─────────────────────────────┴───────┴────────────┘

┌─────────────────────────────┬───────┬────────────┐
│ distinguishability_category │ count │ percentage │
│           varchar           │ int64 │  varchar   │
├─────────────────────────────┼───────┼────────────┤
│ 01: One match only          │   751 │ 15.02%     │
│ 02: Distinguishability > 10 │   860 │ 17.20%     │
│ 03: Distinguishability > 5  │   220 │ 4.40%

In [7]:
# Show the best matches from the improved predictions together with their
# distinguishability score and category.
# (best_matches_with_distinguishability is imported with the other
# uk_address_matcher imports in the notebook's import cell.)
best_matches_with_distinguishability(
    df_predict=df_predict_improved,
    df_addresses_to_match=df_fhrs,
    con=con,
).show(max_width=500)


┌─────────────┬─────────────┬───────────────────────────────────────────────────────────────────────────┬────────────┬────────────────────────────────────────────────────────────────────────────┬────────────┬────────────────────┬────────────────────┬────────────────────┬─────────────────────────────┐
│ unique_id_r │ unique_id_l │                             address_concat_r                              │ postcode_r │                         original_address_concat_l                          │ postcode_l │ match_probability  │    match_weight    │ distinguishability │ distinguishability_category │
│   varchar   │   varchar   │                                  varchar                                  │  varchar   │                                  varchar                                   │  varchar   │       double       │       double       │       double       │           varchar           │
├─────────────┼─────────────┼─────────────────────────────────────────────────────────────────