In [12]:
import pandas as pd 
from itertools import combinations
import re

# Phone and zip are read as text so that leading zeros are not lost to numeric parsing.
df = pd.read_csv(
    "data/clear_data.csv",
    dtype={"Phone_norm": str, "Zip_norm": str},
)

In [14]:
df = df.copy()

# Normalize the matching fields to trimmed, lower-cased strings.
# Missing values are deliberately kept missing: `astype(str)` turns NaN into the
# text 'nan', which used to end up as '' for phones and '00nan' for zips, i.e. as
# real-looking values that put all incomplete records into the same blocks.
df['Email_norm'] = df['Email_norm'].str.strip().str.lower()
phone_digits = df['Phone_norm'].astype(str).str.replace(r'\D','', regex=True)
df['Phone_norm'] = phone_digits.where(phone_digits != '')   # no digits at all -> missing
df['Zip_norm']   = df['Zip_norm'].astype(str).str.zfill(5).where(df['Zip_norm'].notna())
df['Name_norm']  = df['Name_norm'].str.strip().str.lower()
df['Street_norm']= df['Street_norm'].str.strip().str.lower()
if 'City_norm' in df:
    df['City_norm'] = df['City_norm'].str.strip().str.lower()

# Helper features used to build the blocking keys.
# A missing source value yields a missing feature, so it never forms a key.
df['email_domain'] = df['Email_norm'].str.split('@').str[-1]
df['phone_last4']  = df['Phone_norm'].str[-4:]
df['name0']        = df['Name_norm'].str[:1]


In [15]:
# Each blocking key combines one coarse feature with the normalized zip code.
blocks = {
    block_name: df[feature] + '_' + df['Zip_norm']
    for block_name, feature in [
        ('domain_zip', 'email_domain'),
        ('pl4_zip', 'phone_last4'),
        ('name0_zip', 'name0'),
    ]
}
# The city column is optional; add a city-based key only when it is present.
if 'City_norm' in df.columns:
    blocks.update(city_zip=df['City_norm'] + '_' + df['Zip_norm'])


In [19]:
def true_pairs(df, uid_col='uid'):
    """Return the ground-truth pairs: all rows that share a value in ``uid_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``uid_col`` column.
    uid_col : str
        Name of the column that identifies the underlying entity.

    Returns
    -------
    set of tuple
        Pairs ``(i, j)`` of index labels of ``df`` with ``i`` sorted before ``j``,
        one for every two rows that fall into the same ``uid_col`` group.
    """
    members_by_uid = df.groupby(uid_col).groups
    return {
        pair
        for labels in members_by_uid.values()
        for pair in combinations(sorted(labels), 2)
    }

def candidate_pairs_from_blocks(df, block_specs: dict):
    """Collect candidate pairs produced by a set of blocking keys.

    Parameters
    ----------
    df : pandas.DataFrame
        Not read by this function; only the key Series in ``block_specs`` are used.
    block_specs : dict
        Maps a block name to a Series of blocking-key values. Rows with equal
        key values form one block.

    Returns
    -------
    tuple
        ``(pairs, size_hist)``. ``pairs`` is the union over all block specs of
        the ``(i, j)`` label pairs (labels converted to ``int``, ``i < j``)
        sharing a block. ``size_hist`` maps each block name to a Series giving,
        per block size (index), the number of blocks with more than one member.
    """
    candidates = set()
    size_hist = {}
    for block_name, keys in block_specs.items():
        block_sizes = []
        for members in keys.groupby(keys).groups.values():
            labels = sorted(int(label) for label in members)
            if len(labels) < 2:
                # A single-row block cannot produce a pair.
                continue
            block_sizes.append(len(labels))
            candidates.update(combinations(labels, 2))
        size_hist[block_name] = pd.Series(block_sizes).value_counts().sort_index()
    return candidates, size_hist

def blocking_metrics(df, blocks, uid_col='uid'):
    """Score a blocking scheme against the ground truth in ``uid_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Records; rows sharing a value in ``uid_col`` are true duplicates.
    blocks : dict
        Maps a block name to a Series of blocking-key values.
    uid_col : str
        Name of the ground-truth entity column.

    Returns
    -------
    dict
        ``PC`` (pairs completeness, i.e. recall of true pairs), ``RR``
        (reduction ratio versus all ``n*(n-1)/2`` pairs), ``PQ`` (pairs
        quality, the share of candidates that are true pairs), the counts
        ``true_pairs`` and ``cand_pairs``, the per-block ``size_hist`` and the
        candidate set itself under ``cand_set``.

    Notes
    -----
    Every denominator is clamped to at least 1, so a frame with fewer than two
    rows (no possible pairs) yields ``RR == 1.0`` instead of raising
    ``ZeroDivisionError``.
    """
    T = true_pairs(df, uid_col)
    C, size_hist = candidate_pairs_from_blocks(df, blocks)
    n = len(df)
    total = n * (n - 1) // 2
    hits = len(T & C)                                   # true pairs kept by blocking
    PC = hits / max(len(T), 1)                          # recall (pairs completeness)
    RR = 1 - len(C) / max(total, 1)                     # reduction ratio
    PQ = hits / max(len(C), 1)                          # pairs quality
    return {'PC':PC,'RR':RR,'PQ':PQ,
            'true_pairs':len(T),'cand_pairs':len(C),
            'size_hist':size_hist, 'cand_set':C}

m = blocking_metrics(df, blocks, uid_col='uid')

# Print the scalar metrics only; the histograms are shown separately below
# and the raw candidate set is not printed here.
print({key: value for key, value in m.items() if key not in ('size_hist', 'cand_set')})

# Per blocking key: block size -> number of blocks of that size (first 10 rows).
for name, hist in m['size_hist'].items():
    print(f'\n{name} – размер→кол-во групп')
    print(hist.head(10))


{'PC': 1.0, 'RR': 0.9988178922051515, 'PQ': 0.9905660377358491, 'true_pairs': 315, 'cand_pairs': 318}

domain_zip – размер→кол-во групп
2    74
3    72
Name: count, dtype: int64

pl4_zip – размер→кол-во групп
2    76
3    71
Name: count, dtype: int64

name0_zip – размер→кол-во групп
2    78
3    73
Name: count, dtype: int64

city_zip – размер→кол-во групп
2    74
3    72
Name: count, dtype: int64


In [17]:
cand_pairs = m['cand_set']      # or m2['cand_set'] after pruning
len(cand_pairs)                  # this set is the input for the rules/model

318

In [12]:

# Persist the candidate pairs for the matching step.
# Set iteration order is arbitrary, so the pairs are sorted first to make the
# exported file reproducible from run to run.
cand_df = pd.DataFrame(sorted(m['cand_set']), columns=['i','j'])
cand_df.to_csv('data/cand_pairs.csv', index=False)


In [None]:
# from rapidfuzz.distance import JaroWinkler
# from rapidfuzz import fuzz
# import pandas as pd
# from itertools import combinations
# from collections import defaultdict, deque

# df = pd.read_csv("data/clear_data.csv", dtype={"Phone_norm": str, "Zip_norm": str})

# df = df.copy()

# # на всякий случай — служебные поля
# df['Email_norm']  = df['Email_norm'].str.strip().str.lower()
# df['Phone_norm']  = df['Phone_norm'].astype(str).str.replace(r'\D','', regex=True)
# df['Zip_norm']    = df['Zip_norm'].astype(str).str.zfill(5)
# df['Name_norm']   = df['Name_norm'].str.strip().str.lower()
# df['Street_norm'] = df['Street_norm'].str.strip().str.lower()
# if 'City_norm' in df.columns:
#     df['City_norm'] = df['City_norm'].str.strip().str.lower()

# df['email_user']  = df['Email_norm'].str.split('@').str[0]
# df['phone_last4'] = df['Phone_norm'].str[-4:]

# # Пороги из твоего EDA (средние/квантили у истинных дублей)
# NAME_THR   = 0.92   # Jaro–Winkler по name
# STREET_THR = 88     # token_set_ratio по street
# HARD_NAME  = 0.95   # «строгий» порог для некоторых правил


In [None]:
# def pair_features(i, j):
#     a, b = df.loc[i], df.loc[j]
#     # similarity
#     name_sim   = JaroWinkler.normalized_similarity(a['Name_norm'], b['Name_norm'])
#     street_sim = fuzz.token_set_ratio(a['Street_norm'], b['Street_norm'])
#     # бинарные совпадения
#     zip_eq      = a['Zip_norm'] == b['Zip_norm']
#     city_eq     = ('City_norm' in df and a['City_norm'] == b['City_norm'])
#     email_eq    = a['Email_norm'] == b['Email_norm']
#     phone_eq    = a['Phone_norm'] == b['Phone_norm']
#     email_user_eq  = a['email_user']  == b['email_user']
#     phone_last4_eq = a['phone_last4'] == b['phone_last4']
#     return {
#         'name_sim': name_sim, 'street_sim': street_sim,
#         'zip_eq': zip_eq, 'city_eq': city_eq,
#         'email_eq': email_eq, 'phone_eq': phone_eq,
#         'email_user_eq': email_user_eq, 'phone_last4_eq': phone_last4_eq
#     }


In [None]:
# def is_match(i, j):
#     f = pair_features(i, j)

#     # 1) Якоря: точные совпадения
#     if f['email_eq'] or f['phone_eq']:
#         return True

#     # 2) Мягкие правила (пороги из EDA)
#     if f['name_sim'] >= NAME_THR and (f['zip_eq'] or f['city_eq']):
#         return True

#     if f['street_sim'] >= STREET_THR and f['zip_eq']:
#         return True

#     # 3) Подстраховочные
#     if f['email_user_eq'] and (f['zip_eq'] or f['name_sim'] >= HARD_NAME):
#         return True

#     if f['phone_last4_eq'] and f['name_sim'] >= HARD_NAME and (f['zip_eq'] or f['city_eq']):
#         return True

#     return False


In [None]:
# # cand_pairs — твой набор кандидатных пар после блокинга
# # если нет под рукой:
# # cand_pairs = m['cand_set']  # где m = blocking_metrics(...)



# # истина (пары из одного uid)
# def true_pairs(df, uid='uid'):
#     S=set()
#     for _, g in df.groupby(uid).groups.items():
#         g=list(g)
#         for i,j in combinations(sorted(g),2):
#             S.add((i,j))
#     return S

# T = true_pairs(df, uid='uid')

# # предсказанные совпадения
# P = {p for p in cand_pairs if is_match(*p)}

# tp = len(P & T)
# fp = len(P - T)
# fn = len(T - P)
# prec = tp / (tp + fp) if (tp+fp) else 0.0
# rec  = tp / (tp + fn) if (tp+fn) else 0.0
# f1   = 0 if (prec+rec)==0 else 2*prec*rec/(prec+rec)

# print(f'Pairs — Precision: {prec:.3f}  Recall: {rec:.3f}  F1: {f1:.3f}')
# print(f'tp={tp}, fp={fp}, fn={fn}, |P|={len(P)}, |T|={len(T)}, |cand|={len(cand_pairs)}')


Pairs — Precision: 1.000  Recall: 0.997  F1: 0.998
tp=314, fp=0, fn=1, |P|=314, |T|=315, |cand|=318


In [None]:
# # граф смежности по предсказанным совпадениям
# adj = defaultdict(set)
# for i,j in P:
#     adj[i].add(j); adj[j].add(i)

# # компоненты связности — это и есть кластеры сущностей
# visited=set(); clusters=[]
# for v in df.index:
#     if v in visited: continue
#     comp=[]; q=deque([v]); visited.add(v)
#     while q:
#         u=q.popleft(); comp.append(u)
#         for w in adj[u]:
#             if w not in visited: visited.add(w); q.append(w)
#     clusters.append(sorted(comp))

# print('Кластеров:', len(clusters))
# # пример: первые 5 кластеров и их размеры
# print([len(c) for c in clusters[:5]])


Кластеров: 500
[1, 1, 1, 2, 1]


In [None]:
# def heavy_keys(ser, min_size=10, top=20):
#     sizes = ser.groupby(ser).size().sort_values(ascending=False)
#     return sizes[sizes >= min_size].head(top)

# for name, ser in blocks.items():
#     print(f'\n{name}')
#     print(heavy_keys(ser, min_size=10))  # должно вывести пусто



domain_zip
Series([], dtype: int64)

pl4_zip
Series([], dtype: int64)

name0_zip
Series([], dtype: int64)

city_zip
Series([], dtype: int64)


In [None]:
# MAX_BLOCK_SIZE = 100  # подбери порог под свой объём
# pruned_blocks = {}
# for name, ser in blocks.items():
#     counts = ser.value_counts()
#     ok_keys = counts[counts<=MAX_BLOCK_SIZE].index
#     pruned_blocks[name] = ser.where(ser.isin(ok_keys), other=pd.NA)

# m2 = blocking_metrics(df, pruned_blocks, uid_col='uid')
# print({k:v for k,v in m2.items() if k not in ['size_hist','cand_set']})


{'PC': 1.0, 'RR': 0.9988178922051515, 'PQ': 0.9905660377358491, 'true_pairs': 315, 'cand_pairs': 318}
