In [5]:
import pandas as pd

# Load the cleaned county records; later cells draw all of their samples from `df`.
# low_memory=False has pandas parse the file in one pass instead of in chunks,
# which avoids mixed-dtype inference within a column.
df = pd.read_csv("data/clean_county.csv", low_memory=False)

# Two quick 100-row trial samples. Fixed (and distinct) seeds make the draw
# repeatable after Restart & Run All while keeping dfA and dfB different.
dfA = df.sample(100, random_state=0)
dfB = df.sample(100, random_state=1)


In [34]:
import pandas as pd
import altair as alt
from IPython.display import IFrame
from IPython.display import display

alt.renderers.enable('mimetype')

from splink.duckdb.duckdb_linker import DuckDBLinker
import splink.duckdb.duckdb_comparison_library as cl


# Fraction of rows to blank out in each column, given in df.columns order.
# Expected order (as shown when dfA is printed): id, first_name, middle_name,
# last_name, res_street_address, birth_year, zip_code.
# The first and last entries are 0.0, so id and zip_code are never blanked.
missing_percent = [0.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.0]

# The fractions are matched to columns by position, so fail loudly if the
# frame's shape has drifted instead of raising IndexError halfway through.
if len(missing_percent) != len(df.columns):
    raise ValueError(
        f"missing_percent has {len(missing_percent)} entries but df has "
        f"{len(df.columns)} columns: {list(df.columns)}"
    )

# Base seed for every random draw in this cell, so the generated datasets
# (and therefore the linkage results) are reproducible.
SEED = 42

x = [1000]
for size in x:

    # Each output frame holds 2 * cut rows: `cut` rows shared by both frames
    # plus `cut` rows unique to that frame. Drawing exactly 3 * cut rows keeps
    # the three slices disjoint for any `size` (for even sizes this equals
    # round(size * 1.5)).
    cut = size // 2
    sample_size = 3 * cut
    sample_set = df.sample(sample_size, random_state=SEED)

    # Inject missing values column by column. A different seed per column
    # means different rows are blanked in each column.
    for i, (col, frac) in enumerate(zip(df.columns, missing_percent)):
        blank_rows = sample_set.sample(frac=frac, random_state=SEED + 1 + i).index
        sample_set.loc[blank_rows, col] = None

    # Shared slice: the same rows go to both frames (the true matches).
    dfA_first = sample_set.iloc[0:cut]
    dfB_first = sample_set.iloc[0:cut]

    # Unique slices: rows that appear in only one of the two frames.
    dfA_last = sample_set.iloc[cut:2 * cut]
    dfB_last = sample_set.iloc[2 * cut:3 * cut]

    frame_a = [dfA_first, dfA_last]
    frame_b = [dfB_first, dfB_last]

    dfA = pd.concat(frame_a)
    dfB = pd.concat(frame_b)

# Splink model settings for linking dfA against dfB on the "id" key.
# The four text fields share one comparison recipe (Levenshtein with a
# threshold of 1 and no separate exact-match level); birth_year is compared
# by exact match. Candidate pairs are generated from records with equal zip_code.
settings = {
    "link_type": "link_only",
    "unique_id_column_name": "id",
    "comparisons": [
        *(
            cl.levenshtein_at_thresholds(
                col_name=fuzzy_col,
                distance_threshold_or_thresholds=1,
                include_exact_match_level=False,
            )
            for fuzzy_col in (
                "first_name",
                "last_name",
                "middle_name",
                "res_street_address",
            )
        ),
        cl.exact_match(col_name="birth_year"),
    ],
    "blocking_rules_to_generate_predictions": ["l.zip_code = r.zip_code"],
    "retain_matching_columns": False,
    "max_iterations": 100,
    "em_convergence": 1e-4,
}

# Build the DuckDB-backed linker over the two synthetic datasets.
linker = DuckDBLinker([dfA, dfB], settings)

In [35]:
# Step 1: estimate the u probabilities from randomly sampled record pairs.
linker.estimate_u_using_random_sampling(target_rows=1e6)

# Step 2: estimate the m probabilities with one EM session per blocking rule.
# As the training log reports, a comparison cannot be estimated in the session
# that blocks on its own column, so each column takes a turn as the blocking
# key and is estimated in the other sessions.
training = [
    "l.first_name = r.first_name",
    "l.middle_name = r.middle_name",
    "l.last_name = r.last_name",
    "l.res_street_address = r.res_street_address",
    "l.birth_year = r.birth_year",
]

for blocking_rule in training:
    linker.estimate_parameters_using_expectation_maximisation(blocking_rule)

# Step 3: score candidate pairs, keeping those with match probability >= 0.95.
df_predict = linker.predict(threshold_match_probability=0.95)

----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - first_name (no m values are trained).
    - last_name (no m values are trained).
    - middle_name (no m values are trained).
    - res_street_address (no m values are trained).
    - birth_year (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.first_name = r.first_name

Parameter estimates will be made for the following comparison(s):
    - last_name
    - middle_name
    - res_street_address
    - birth_year

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - first_name

Iteration 1: Largest change in params was 0.169 in probability_two_random_records_match
Iteration 2: Largest change in params was 0.00208 in probability_two_random_records_match
Iteration 3: Larges

In [39]:
# Pull the predictions into pandas, report how many pairs cleared the
# threshold, then render the table.
df_full_predict = df_predict.as_pandas_dataframe()
print(df_full_predict.shape[0])
display(df_full_predict)




505


Unnamed: 0,match_weight,match_probability,source_dataset_l,id_l,source_dataset_r,id_r
0,36.327828,1.000000,_a,36542.0,_b,36542.0
1,36.327828,1.000000,_a,110985.0,_b,110985.0
2,26.916930,1.000000,_a,48143.0,_b,48143.0
3,28.634680,1.000000,_a,40799.0,_b,40799.0
4,29.231716,1.000000,_a,54531.0,_b,54531.0
...,...,...,...,...,...,...
500,36.327828,1.000000,_a,72802.0,_b,72802.0
501,36.327828,1.000000,_a,34087.0,_b,34087.0
502,19.820818,0.999999,_a,75694.0,_b,75694.0
503,36.327828,1.000000,_a,9459.0,_b,9459.0


In [40]:
# Order the predicted pairs from weakest to strongest match weight so the
# least certain links appear at the top of the table.
test = df_full_predict.sort_values(by="match_weight", ascending=True)
test

Unnamed: 0,match_weight,match_probability,source_dataset_l,id_l,source_dataset_r,id_r
92,4.823960,0.965898,_a,76979.0,_b,111080.0
94,5.420995,0.977191,_a,90027.0,_b,4182.0
262,5.420995,0.977191,_a,28089.0,_b,122275.0
263,5.420995,0.977191,_a,44045.0,_b,122275.0
93,5.420995,0.977191,_a,12949.0,_b,12186.0
...,...,...,...,...,...,...
201,36.327828,1.000000,_a,106826.0,_b,106826.0
199,36.327828,1.000000,_a,74596.0,_b,74596.0
198,36.327828,1.000000,_a,106734.0,_b,106734.0
236,36.327828,1.000000,_a,28688.0,_b,28688.0


In [41]:
# Inspect the dataset-A record behind the lowest-weight predicted pair
# (id_l == 76979 in the sorted predictions).
print(dfA[dfA["id"] == 76979])

Unnamed: 0,id,first_name,middle_name,last_name,res_street_address,birth_year,zip_code
18244,21312.0,COREY,MASON,COBB,415 DOGGETT DR,2004.0,272.0
8072,9459.0,LAWRENCE,DOUGLAS,BLANK,136 WESTVIEW DR,1980.0,272.0
64655,75694.0,JAMES,,MERRITT,,1925.0,272.0
29173,34087.0,JACOB,CHARLES,FAIRCLOTH,5101 S NC HWY 49,1995.0,272.0
62172,72802.0,MEGAN,DOWNING,MCCLURE,622 JOHNSON AVE,1982.0,272.0
...,...,...,...,...,...,...,...
19256,22451.0,THOMAS,ANDREW,COLLINS,1704 N NC HWY 62,1981.0,272.0
23050,26921.0,EMILEE,CAROL,DAVIDSON,909 CENTRAL AVE,2002.0,272.0
63485,74306.0,HELEN,TOBELMANN,MCLEOD,1211 ROCKWOOD AVE,1949.0,272.0
26908,31436.0,EDWARD,JOSEPH,DUSZLAK,2033 STUART CT,,272.0
