[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/RobinL/uk_address_matcher/blob/main/interactive_comparison.ipynb)


In [1]:
# !pip install uk_address_matcher==0.0.1.dev11 ipywidgets


In [5]:
import ipywidgets as widgets
from IPython.display import display
import duckdb
import pandas as pd

from uk_address_matcher.cleaning_pipelines import (
    clean_data_using_precomputed_rel_tok_freq,
)
from uk_address_matcher.splink_model import _performance_predict

# Example pair of addresses used to pre-fill the text boxes.
record_1 = {
    "address_concat": "flat 11A 243 high street birmingham",
    "postcode": "B12 0AB",
}

record_2 = {
    "address_concat": "flat A, 11 spitfire court 243 high street birmingham",
    "postcode": "B12 0AB",
}

fields = ["address_concat", "postcode"]

# One text box per field on each side. continuous_update=False is passed so
# the value is only pushed once an edit is committed, not on every keystroke.
left_text_boxes = [
    widgets.Text(
        description=f,
        value=str(record_1[f]),
        layout=widgets.Layout(width="500px"),
        continuous_update=False,
    )
    for f in fields
]
right_text_boxes = [
    widgets.Text(
        description=f,
        value=str(record_2[f]),
        layout=widgets.Layout(width="500px"),
        continuous_update=False,
    )
    for f in fields
]

# Keyword name -> widget mapping handed to interactive_output further down.
# The "_l" / "_r" suffix tells the callback which record a value belongs to.
inputs_to_interactive_output = {}
for f, wl, wr in zip(fields, left_text_boxes, right_text_boxes):
    inputs_to_interactive_output[f"{f}_l"] = wl
    inputs_to_interactive_output[f"{f}_r"] = wr

# Left record and right record side by side, one column each.
b1 = widgets.VBox(left_text_boxes)
b2 = widgets.VBox(right_text_boxes)
ui = widgets.HBox([b1, b2])


def myfn(**kwargs):
    """Score the left record against the right record and show a waterfall chart.

    Intended as the callback for ``widgets.interactive_output``: it receives
    one keyword argument per text box. Keys ending in ``_l`` fill the left
    record and keys ending in ``_r`` fill the right record, with the two
    character suffix removed to obtain the column name. An empty text box is
    stored as ``None`` rather than as an empty string.

    Args:
        **kwargs: Current text box values keyed by ``<field>_l`` / ``<field>_r``.

    Returns:
        None. The chart is emitted with ``display``.
    """
    record_left = {"unique_id": 1, "source_dataset": "dataset 1"}
    record_right = {"unique_id": 2, "source_dataset": "dataset 2"}

    for key, value in kwargs.items():
        if value == "":
            value = None
        if key.endswith("_l"):
            record_left[key[:-2]] = value
        elif key.endswith("_r"):
            record_right[key[:-2]] = value

    dataset_1 = pd.DataFrame([record_left])
    dataset_2 = pd.DataFrame([record_right])

    # A new in-memory database is opened for every widget update, so it must
    # be closed again; otherwise connections accumulate for the lifetime of
    # the kernel.
    con = duckdb.connect(database=":memory:")
    try:
        con.register("dataset_1", dataset_1)
        con.register("dataset_2", dataset_2)

        cleaned_1 = clean_data_using_precomputed_rel_tok_freq(dataset_1, con=con)
        cleaned_2 = clean_data_using_precomputed_rel_tok_freq(dataset_2, con=con)

        linker, predictions = _performance_predict(
            df_addresses_to_match=cleaned_1,
            df_addresses_to_search_within=cleaned_2,
            con=con,
            match_weight_threshold=None,
            output_all_cols=True,
            full_block=True,
        )

        # Materialise the predictions as plain dicts before charting them.
        recs = predictions.df().to_dict(orient="records")

        # Rendered inside the try block so the connection is still open while
        # the chart is built.
        display(linker.waterfall_chart(recs))
    finally:
        con.close()


# Bind myfn to the text boxes: each entry of inputs_to_interactive_output is
# passed to myfn as a keyword argument, and whatever myfn displays is captured
# in `out`.
out = widgets.interactive_output(myfn, inputs_to_interactive_output)

# Show the input boxes followed by the captured output.
display(ui, out)

HBox(children=(VBox(children=(Text(value='flat 11A 243 high street birmingham', continuous_update=False, descr…

Output()

In [3]:
import ipywidgets as widgets
from IPython.display import display
import duckdb
import pandas as pd
import re

from uk_address_matcher.cleaning_pipelines import (
    clean_data_using_precomputed_rel_tok_freq,
)
from uk_address_matcher.splink_model import _performance_predict
from uk_address_matcher.display_results import display_l_r

# Starting values: one address to match and two candidate addresses.
record_1 = {"full_address": "flat 11A 243 high street birmingham B12 0AB"}
record_2 = {
    "full_address": "flat A, 11 spitfire court 243 high street birmingham B12 0AB"
}
record_3 = {"full_address": "12B 243 high street birmingham B12 0AB"}

fields = ["full_address"]

# One text box per field for the original address (left), the first candidate
# (right) and the second candidate (extra).
left_text_boxes = [
    widgets.Text(
        description=f,
        value=str(record_1[f]),
        layout=widgets.Layout(width="500px"),
        continuous_update=True,
    )
    for f in fields
]
right_text_boxes = [
    widgets.Text(
        description=f,
        value=str(record_2[f]),
        layout=widgets.Layout(width="500px"),
        continuous_update=True,
    )
    for f in fields
]
extra_text_boxes = [
    widgets.Text(
        description=f,
        value=str(record_3[f]),
        layout=widgets.Layout(width="500px"),
        continuous_update=True,
    )
    for f in fields
]

# Keyword name -> widget mapping for interactive_output. The "_l", "_r" and
# "_e" suffixes tell the callback which record each value belongs to.
inputs_to_interactive_output = {}
for f, wl, wr, we in zip(
    fields, left_text_boxes, right_text_boxes, extra_text_boxes
):
    inputs_to_interactive_output[f"{f}_l"] = wl
    inputs_to_interactive_output[f"{f}_r"] = wr
    inputs_to_interactive_output[f"{f}_e"] = we

# Two labelled columns: the original address and its candidate matches.
header_left = widgets.Label("Original Address")
header_right = widgets.Label("Candidate Matches")

b1 = widgets.VBox([header_left] + left_text_boxes)
b2 = widgets.VBox([header_right] + right_text_boxes + extra_text_boxes)
ui = widgets.HBox([b1, b2])


def extract_postcode(full_address):
    """Return the trailing postcode-like text of ``full_address``, or ``None``.

    The match is purely positional: it takes the last one or two tokens that
    consist only of upper-case letters and digits (for example ``"B12 0AB"``).
    The result is not validated against the UK postcode format, and
    lower-case text is never matched.

    Args:
        full_address: Free-text address, or ``None`` / ``""`` when the input
            is empty.

    Returns:
        The matched text with surrounding whitespace removed, or ``None`` when
        the input is empty or does not end in such tokens.
    """
    # Empty text boxes arrive as "" or None; re.search would raise on None.
    if not full_address:
        return None
    # The pattern tolerates at most one trailing whitespace character, so trim
    # the end first to keep stray spaces from hiding the postcode.
    match = re.search(r"(\b[A-Z0-9]+\b\s?){1,2}$", full_address.rstrip())
    if match:
        return match.group(0).strip()
    return None


def _split_full_address(full_address):
    """Split a free-text address into ``(address_concat, postcode)``.

    Either element is ``None`` when there is nothing to put in it, so an empty
    text box or an address without a recognisable postcode no longer raises.
    """
    if not full_address:
        return None, None

    postcode = extract_postcode(full_address)
    if not postcode:
        return full_address.strip() or None, None

    # Remove only the final occurrence: the postcode is taken from the end of
    # the string, and the same text may legitimately appear earlier on.
    start = full_address.rfind(postcode)
    remainder = full_address[:start] + full_address[start + len(postcode):]
    return remainder.strip() or None, postcode


def myfn(**kwargs):
    """Score the original address against both candidates and show a table.

    Intended as the callback for ``widgets.interactive_output``. Each keyword
    argument holds a full address; the two character suffix of its name picks
    the record it belongs to (``_l`` original, ``_r`` first candidate, ``_e``
    second candidate). Every address is split into ``address_concat`` and
    ``postcode`` before being cleaned and scored.

    Args:
        **kwargs: Current text box values keyed by ``<field>_l``,
            ``<field>_r`` and ``<field>_e``.

    Returns:
        None. The result table is emitted with ``display``.
    """
    record_left = {"unique_id": 1, "source_dataset": "dataset 1"}
    record_right_1 = {"unique_id": 2, "source_dataset": "dataset 2"}
    record_right_2 = {"unique_id": 3, "source_dataset": "dataset 2"}

    records_by_suffix = {
        "_l": record_left,
        "_r": record_right_1,
        "_e": record_right_2,
    }

    for key, value in kwargs.items():
        record = records_by_suffix.get(key[-2:])
        if record is None:
            continue
        address_concat, postcode = _split_full_address(value)
        # Same insertion order as before (postcode first) so the DataFrame
        # column order is unchanged.
        # NOTE(review): missing parts are passed on as None; how the cleaning
        # pipeline treats NULLs is not visible here - confirm.
        record["postcode"] = postcode
        record["address_concat"] = address_concat

    dataset_1 = pd.DataFrame([record_left])
    dataset_2 = pd.DataFrame([record_right_1, record_right_2])

    # A new in-memory database is opened for every widget update (every
    # keystroke with continuous_update=True), so it must be closed again.
    con = duckdb.connect(database=":memory:")
    try:
        con.register("dataset_1", dataset_1)
        con.register("dataset_2", dataset_2)

        cleaned_1 = clean_data_using_precomputed_rel_tok_freq(dataset_1, con=con)
        cleaned_2 = clean_data_using_precomputed_rel_tok_freq(dataset_2, con=con)

        # `linker` is unused here but the call returns a pair. `predictions`
        # must keep this exact name: the SQL below reads "from predictions",
        # presumably resolved from this local variable by DuckDB - confirm
        # before renaming.
        linker, predictions = _performance_predict(
            df_addresses_to_match=cleaned_1,
            df_addresses_to_search_within=cleaned_2,
            con=con,
            match_weight_threshold=None,
            output_all_cols=True,
            full_block=True,
        )

        sql = """
    select match_probability, match_weight, concat_ws(' ', original_address_concat_l, postcode_l) as address_l, concat_ws(' ', original_address_concat_r, postcode_r) as address_r, unique_id_l, unique_id_r,  source_dataset_l, source_dataset_r,
    postcode_l, postcode_r
    from predictions
    order by unique_id_r
    """
        # .df() copies the result into pandas, so nothing below needs the
        # connection any more.
        recs = con.sql(sql).df()
    finally:
        con.close()

    # Widen the pandas display limits so long addresses are not truncated.
    # These are global options and stay in effect after this call.
    pd.options.display.max_colwidth = 150
    pd.options.display.max_columns = 1000
    display(recs)


# Bind myfn to the three address boxes: each entry of
# inputs_to_interactive_output is passed to myfn as a keyword argument, and
# whatever myfn displays is captured in `out`.
out = widgets.interactive_output(myfn, inputs_to_interactive_output)

# Show the input boxes followed by the captured output.
display(ui, out)

HBox(children=(VBox(children=(Label(value='Original Address'), Text(value='flat 11A 243 high street birmingham…

Output()

In [4]:
import ipywidgets as widgets
from IPython.display import display
import duckdb
import pandas as pd
import re

from uk_address_matcher.cleaning_pipelines import clean_data_using_precomputed_rel_tok_freq

# Address used to pre-fill the text box.
record_1 = {"full_address": "flat 11A 243 high street birmingham B12 0AB"}

# Single text box holding the full address, postcode included.
address_input = widgets.Text(
    value=str(record_1["full_address"]),
    description="Full Address",
    continuous_update=True,
    layout=widgets.Layout(width="500px"),
)

def extract_postcode(full_address):
    """Return the trailing postcode-like text of ``full_address``, or ``None``.

    The match is purely positional: it takes the last one or two tokens that
    consist only of upper-case letters and digits (for example ``"B12 0AB"``).
    The result is not validated against the UK postcode format, and
    lower-case text is never matched.

    Args:
        full_address: Free-text address, or ``None`` / ``""`` when the input
            is empty.

    Returns:
        The matched text with surrounding whitespace removed, or ``None`` when
        the input is empty or does not end in such tokens.
    """
    # Guard against empty input; re.search would raise on None.
    if not full_address:
        return None
    # The pattern tolerates at most one trailing whitespace character, so trim
    # the end first to keep stray spaces from hiding the postcode.
    match = re.search(r"(\b[A-Z0-9]+\b\s?){1,2}$", full_address.rstrip())
    if match:
        return match.group(0).strip()
    return None

def format_token_rel_freq(data):
    """Render token/frequency records as one space-separated string.

    Args:
        data: Iterable of mappings, each with a ``"tok"`` entry and a numeric
            ``"rel_freq"`` entry.

    Returns:
        A string of the form ``"<tok> <rel_freq> <tok> <rel_freq> ..."`` where
        each frequency is in scientific notation with three decimals. An empty
        input yields an empty string.
    """
    pieces = []
    for entry in data:
        token = entry["tok"]
        frequency = entry["rel_freq"]
        pieces.append(f"{token} {frequency:.3e}")
    return " ".join(pieces)

def myfn(full_address):
    """Clean a single address and display the cleaned record.

    Intended as the callback for ``widgets.interactive_output``. The address
    is split into ``address_concat`` and ``postcode``, run through
    ``clean_data_using_precomputed_rel_tok_freq`` and shown as: the scalar
    columns (in two chunks when there are more than seven), then the
    ``token_rel_freq_arr`` and ``common_end_tokens`` values as lists.

    Args:
        full_address: Text of the address box; may be empty or lack a postcode.

    Returns:
        None. Everything is emitted with ``display``.
    """
    postcode = extract_postcode(full_address) if full_address else None

    if postcode:
        # Remove only the final occurrence: the postcode is taken from the end
        # of the string, and the same text may appear earlier in the address.
        start = full_address.rfind(postcode)
        address_concat = (
            full_address[:start] + full_address[start + len(postcode):]
        ).strip()
    else:
        # No postcode found (or empty input): keep whatever text there is
        # instead of calling str.replace with None, which raises TypeError.
        address_concat = (full_address or "").strip()

    # NOTE(review): missing parts are passed on as None; how the cleaning
    # pipeline treats NULLs is not visible here - confirm.
    record = {
        "unique_id": 1,
        "source_dataset": "dataset 1",
        "postcode": postcode,
        "address_concat": address_concat or None,
    }
    dataset = pd.DataFrame([record])

    # A new in-memory database is opened for every widget update (every
    # keystroke with continuous_update=True), so it must be closed again.
    con = duckdb.connect(database=":memory:")
    try:
        con.register("dataset", dataset)
        cleaned = clean_data_using_precomputed_rel_tok_freq(dataset, con=con)
        # .df() copies the result into pandas, so the connection can go.
        cleaned_df = cleaned.df()
    finally:
        con.close()

    # The two list-valued columns are displayed separately below.
    scalar_columns = cleaned_df.drop(
        columns=['token_rel_freq_arr', 'common_end_tokens']
    )

    # Show at most seven columns per table.
    display(scalar_columns.iloc[:, :7])
    if scalar_columns.shape[1] > 7:
        display(scalar_columns.iloc[:, 7:])
    display(list(cleaned_df.loc[0, "token_rel_freq_arr"]))
    display(list(cleaned_df.loc[0, "common_end_tokens"]))

# Bind myfn to the address box: its value is passed as the `full_address`
# argument, and whatever myfn displays is captured in `out`.
out = widgets.interactive_output(myfn, {'full_address': address_input})

# Show the input box followed by the captured output.
display(address_input, out)


Text(value='flat 11A 243 high street birmingham B12 0AB', description='Full Address', layout=Layout(width='500…

Output()