In [7]:
# Load normalized data for blocking stage

import re
from itertools import combinations
from pathlib import Path

import pandas as pd

# Phone_norm and Zip_norm are forced to str: later cells slice them with .str
# and concatenate them with '_' to build block keys, which requires strings.
# Presumably this also preserves leading zeros that a numeric parse would
# drop — confirm against the data.
df = pd.read_csv("data/clear_data.csv", dtype={"Phone_norm": str, "Zip_norm": str})

In [None]:
# Helper columns used as blocking-key components. ``assign`` builds a new
# frame, so `df` is rebound to a copy that carries the extra columns.
df = df.assign(
    email_domain=df['Email_norm'].str.split('@').str.get(-1),  # text after the last '@'
    phone_last4=df['Phone_norm'].str.slice(start=-4),           # trailing four characters
    name0=df['Name_norm'].str.slice(stop=1),                    # first character
)


In [None]:
# Each blocking key is "<feature>_<zip>"; records sharing a key under any one
# of these schemes become candidate pairs.
blocks = {
    block_name: df[feature_col] + '_' + df['Zip_norm']
    for block_name, feature_col in (
        ('domain_zip', 'email_domain'),
        ('pl4_zip', 'phone_last4'),
        ('name0_zip', 'name0'),
    )
}
# City is optional: the city/zip scheme is added only when the column exists.
if 'City_norm' in df.columns:
    blocks['city_zip'] = df['City_norm'] + '_' + df['Zip_norm']


In [15]:
# Apply grouping or merging to reduce search space for record linkage

def true_pairs(df, uid_col='uid'):
    """Return the ground-truth matching pairs implied by ``uid_col``.

    Rows that share a value in ``uid_col`` are treated as the same entity.
    Every unordered pair of such rows is returned as a tuple ``(i, j)`` of
    index labels with ``i < j``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index labels identify the records.
    uid_col : str
        Column holding the entity identifier to group by.

    Returns
    -------
    set of tuple
        All within-group label pairs; empty when no identifier repeats.
    """
    return {
        pair
        for labels in df.groupby(uid_col).groups.values()
        # Sorting first guarantees the smaller label comes first in each pair.
        for pair in combinations(sorted(labels), 2)
    }

def candidate_pairs_from_blocks(df, block_specs: dict):
    """Build the union of candidate pairs produced by several blocking keys.

    Parameters
    ----------
    df : pandas.DataFrame
        Not used by the function body; kept in the signature for callers.
    block_specs : dict
        Maps a block-scheme name to a Series of blocking keys. Each Series is
        grouped by its own values, so rows with equal keys share a block.

    Returns
    -------
    tuple
        ``(pairs, size_hist)``. ``pairs`` is a set of ``(i, j)`` tuples of
        int-cast index labels with ``i < j``, collected across all schemes.
        ``size_hist`` maps each scheme name to a Series of
        ``block size -> number of blocks`` that counts only blocks with more
        than one member.
    """
    candidates = set()
    size_hist = {}
    for scheme_name, keys in block_specs.items():
        block_sizes = []
        for labels in keys.groupby(keys).groups.values():
            members = sorted(int(label) for label in labels)
            if len(members) < 2:
                # A singleton block yields no pairs and is left out of the histogram.
                continue
            block_sizes.append(len(members))
            candidates.update(combinations(members, 2))
        size_hist[scheme_name] = pd.Series(block_sizes).value_counts().sort_index()
    return candidates, size_hist

def blocking_metrics(df, blocks, uid_col='uid'):
    """Score a blocking configuration against the ground-truth pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        Records being linked; ``uid_col`` is passed to ``true_pairs`` to
        derive the ground-truth matches.
    blocks : dict
        Block-scheme name -> Series of blocking keys, forwarded to
        ``candidate_pairs_from_blocks``.
    uid_col : str
        Column used to derive the true matching pairs.

    Returns
    -------
    dict
        ``PC``  pairs completeness (recall): true pairs kept / all true pairs.
        ``RR``  reduction ratio: 1 - candidate pairs / all possible pairs.
        ``PQ``  pairs quality: true pairs kept / candidate pairs.
        ``true_pairs``, ``cand_pairs``  sizes of the two pair sets.
        ``size_hist``  per-scheme block-size histograms.
        ``cand_set``  the candidate pair set itself.

    Every denominator is floored at 1, so degenerate inputs (no true pairs,
    no candidates, or fewer than two rows) produce a number instead of a
    ZeroDivisionError. With fewer than two rows RR evaluates to 1.0.
    """
    T = true_pairs(df, uid_col)
    C, size_hist = candidate_pairs_from_blocks(df, blocks)
    n = len(df)
    total = n * (n - 1) // 2            # number of unordered record pairs
    hits = len(T & C)                   # true pairs that survived blocking
    PC = hits / max(len(T), 1)          # recall (pairs completeness)
    RR = 1 - len(C) / max(total, 1)     # reduction ratio; total is 0 when n < 2
    PQ = hits / max(len(C), 1)          # pairs quality
    return {'PC': PC, 'RR': RR, 'PQ': PQ,
            'true_pairs': len(T), 'cand_pairs': len(C),
            'size_hist': size_hist, 'cand_set': C}

m = blocking_metrics(df, blocks, uid_col='uid')

# Show only the scalar metrics here; the histograms are printed below and the
# raw candidate set is too large to print inline.
print({key: m[key] for key in m if key not in ('size_hist', 'cand_set')})

# The header text is Russian for "size -> number of groups".
for name, hist in m['size_hist'].items():
    print(f'\n{name} – размер→кол-во групп')
    print(hist.head(10))


{'PC': 1.0, 'RR': 0.9988178922051515, 'PQ': 0.9905660377358491, 'true_pairs': 315, 'cand_pairs': 318}

domain_zip – размер→кол-во групп
2    74
3    72
Name: count, dtype: int64

pl4_zip – размер→кол-во групп
2    76
3    71
Name: count, dtype: int64

name0_zip – размер→кол-во групп
2    78
3    73
Name: count, dtype: int64

city_zip – размер→кол-во групп
2    74
3    72
Name: count, dtype: int64


In [None]:
# Check number of candidate pairs generated by blocking

cand_pairs = m['cand_set']      # the set of (i, j) candidate pairs stored by blocking_metrics
len(cand_pairs)                  # same count as m['cand_pairs']; shown as the cell output

318

In [None]:
# Export candidate pairs to CSV for downstream matching

out_path = Path('out/cand_pairs.csv')
# Create the output folder first: to_csv does not create missing directories,
# so a fresh checkout without out/ would otherwise fail here.
out_path.parent.mkdir(parents=True, exist_ok=True)

# A set has no defined order; sorting makes the exported file reproducible
# and diff-friendly across runs.
cand_df = pd.DataFrame(sorted(m['cand_set']), columns=['i', 'j'])
cand_df.to_csv(out_path, index=False)
