In [4]:
import pandas as pd
from src.rules import pair_features, is_match, prepare_aux_cols, true_pairs, predict_pairs, evaluate_pairwise, NAME_THR, STREET_THR, HARD_NAME
from src.cluster import build_clusters, summarize_clusters, show_cluster, cluster_cohesion

# 1) Load the normalized records and the candidate pairs.
# Phone_norm / Zip_norm are read as strings instead of being parsed as numbers.
df = pd.read_csv("data/clear_data.csv", dtype={'Phone_norm': str, 'Zip_norm': str})
cand_df = pd.read_csv("out/cand_pairs.csv")
# Each candidate is an (i, j) tuple taken from the 'i' and 'j' columns.
cand_pairs = list(map(tuple, cand_df[['i','j']].to_numpy()))

# 2) Prepare the auxiliary columns (prepare_aux_cols comes from src.rules).
# Applied exactly once: running it a second time on its own output is redundant.
df = prepare_aux_cols(df)

# --- 0) Frame used for matching/clustering: df with the 'uid' column removed ---
# errors='ignore' makes the drop a no-op when 'uid' is not present.
df_match = df.drop(columns='uid', errors='ignore')

# --- 1) PAIRWISE EVALUATION (matching) WITHOUT uid ---
# Ground-truth pairs are derived from uid and are kept only as the reference for metrics.
T = true_pairs(df, uid_col='uid')

# Predictions are computed on df_match, which has no uid column.
match_params = dict(name_thr=NAME_THR, street_thr=STREET_THR, hard_name=HARD_NAME)
P = predict_pairs(df_match, cand_pairs, **match_params)

# Quality metrics: ground-truth pairs vs. predicted pairs.
metrics = evaluate_pairwise(T, P)
print(metrics)

# --- 2) SAVE THE PREDICTED PAIRS (no uid columns) ---
import os
os.makedirs('out', exist_ok=True)

# One record per predicted pair, in sorted order: both row indices plus the
# pair features, which are also computed on the uid-free frame. Nothing about
# uid is written, so it is not carried further downstream.
rows = [
    {'i': int(i), 'j': int(j), **pair_features(df_match, i, j)}
    for i, j in sorted(P)
]

pairs_path = 'out/pairs_pred.csv'
pairs_df = pd.DataFrame(rows)
pairs_df.to_csv(pairs_path, index=False)
print(f"saved {len(P)} pairs -> {pairs_path}")

# --- 3) CLUSTERING (without uid) ---
# build_clusters receives the predicted pairs and the row index of df_match.
clusters = build_clusters(P, df_match.index)
clust_df = summarize_clusters(df_match, clusters)
display(clust_df.head(10))

# (optional) Suspicious clusters: at least 5 rows, or more than one uid.
# NOTE(review): n_uids is empty in the recorded output, presumably because
# df_match carries no uid column — confirm the second condition can ever fire.
is_large = clust_df['size'] >= 5
is_mixed = clust_df['n_uids'] > 1
sus = clust_df[is_large | is_mixed]
display(sus.head(10))

# Show the members of the cluster listed in the first summary row.
# NOTE(review): the name implies this is the largest cluster, which relies on
# summarize_clusters returning rows ordered by size — confirm.
if len(clust_df) > 0:
    biggest_id = int(clust_df.iloc[0]['cluster_id'])
    display(show_cluster(df_match, clusters, biggest_id))



{'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'tp': 315, 'fp': 0, 'fn': 0, 'P': 315, 'T': 315}
saved 315 pairs -> out/pairs_pred.csv


Unnamed: 0,cluster_id,size,n_uids,top_uid,top_uid_share
368,368,3,,,
465,465,3,,,
63,63,3,,,
102,102,3,,,
103,103,3,,,
431,431,3,,,
201,201,3,,,
57,57,3,,,
56,56,3,,,
366,366,3,,,


Unnamed: 0,cluster_id,size,n_uids,top_uid,top_uid_share


Unnamed: 0,row_id,Name_norm,City_norm,Street_norm,Email_norm,Zip_norm,Phone_norm,email_user,phone_last4
536,537,lisa campbell,port jose,tyler rest 194,lisa.campbell@example.com,69070,102767478,lisa.campbell,7478
537,538,lisa cmbpell,port josf,tyler rest 194,lisa.campbell@example.com,69070,102747478,lisa.campbell,7478
538,539,ljsa campbell,port jose,tyler rest1 94,lisa.campbell@example.com,69070,102767478,lisa.campbell,7478


In [5]:
from itertools import combinations
from src.rules import pair_features, true_pairs  # если они у тебя в rules.py

# False negatives: ground-truth pairs (derived from uid) that were not predicted.
T  = true_pairs(df, uid_col='uid')
# Sorted so that the printed sample and FN[0] are the same on every run;
# the order of a plain list(T - P) is arbitrary (assumes T and P are sets of
# (i, j) tuples, as the set difference implies).
FN = sorted(T - P)
print('FN:', len(FN), FN[:5])  # an empty list means no true pair was missed

def inspect_pair(i, j, cols=('uid','Name_norm','Street_norm','City_norm','Zip_norm','Email_norm','Phone_norm')):
    """Print the pair features for rows i and j, then display both rows.

    Reads the notebook-level ``df``. ``i`` and ``j`` are index labels of
    ``df``; ``cols`` selects the columns shown for the two rows.
    """
    feats = pair_features(df, i, j)
    print('features:', feats)
    selected = list(cols)
    display(df.loc[[i, j], selected])

# Inspect the first missed pair, if there is one.
if len(FN) > 0:
    (i, j) = FN[0]
    inspect_pair(i=i, j=j)


FN: 0 []


In [6]:
from src.canonicalize import canonicalize_all, majority, longest, most_frequent_valid

# Canonicalization rule per column (keys follow this dataset's column names).
# Key order is kept as-is.
canon_rules = dict(
    Name_norm=longest,                 # name: the longest normalized string
    Street_norm=majority,              # street: majority vote
    City_norm=majority,                # city: majority vote
    Zip_norm=majority,                 # zip code: majority vote
    Email_norm=most_frequent_valid,    # most frequent valid value
    Phone_norm=most_frequent_valid,    # most frequent valid value
)

# Returns the frame with entity ids (df_eid) and the entity table (entities).
df_eid, entities = canonicalize_all(df, clusters, canon_rules)
display(entities.head())


Unnamed: 0,entity_id,support_size,Name_norm,Street_norm,City_norm,Zip_norm,Email_norm,Phone_norm,name_share,all_emails,all_phones
0,ent_0659fd3b3bd7,3,patrick mccall,smith viaduct 83,larryfort,91751,patrick.mccall@example.com,561681472,0.333333,patrick.mccall@example.co;patrick.mccall@examp...,0560681472;0561681472
1,ent_06642f492c0a,3,ashley greer phd,golden bypass 158,bakerland,58050,ashley.greer.phd@example.com,26012796,0.333333,ashle.greer.phd@example.com;ashley.greer.phd@e...,0026012796
2,ent_0b3092293a05,3,jimmy scott,alexis junction 48,ashleyborough,23503,jimmy.scott@example.com,565270477,0.666667,jimmy.scott@example.com,0565270477;0765270472
3,ent_0f1ec283362f,3,norman cook,pham way 2,west erik,92418,norman.cook@example.com,546688937,0.666667,norman.cook@example.com,0546188937;0546688937
4,ent_11f67ba52a2b,3,chelsea travis,weaver extensions 189,johnsonton,30991,chelsea.travis@example.com,833146591,0.666667,chelsea.trais@example.com;chelsea.travis@examp...,0833146591


In [7]:
# Drop uid before persisting (a no-op when the column is already absent).
df_eid = df_eid.drop(columns=['uid'], errors='ignore')

# Sanity checks run BEFORE anything is written, so a failed check never leaves
# invalid CSV files on disk.
assert df_eid['entity_id'].notna().all(), "Есть строки без entity_id"
assert entities['entity_id'].is_unique, "Дубли entity_id в entities"

# Save the per-row entity assignment and the entity table.
df_eid.to_csv('out/rows_with_entity_id.csv', index=False)
entities.to_csv('out/entities.csv', index=False)

# Quick look at the first entities with their support size and name share.
# NOTE(review): head(10) shows rows in their existing order; nothing here sorts
# by support_size or name_share, so these are not necessarily the largest or
# least confident entities.
display(entities[['entity_id','support_size','name_share','Name_norm','Street_norm','City_norm','Zip_norm']].head(10))


Unnamed: 0,entity_id,support_size,name_share,Name_norm,Street_norm,City_norm,Zip_norm
0,ent_0659fd3b3bd7,3,0.333333,patrick mccall,smith viaduct 83,larryfort,91751
1,ent_06642f492c0a,3,0.333333,ashley greer phd,golden bypass 158,bakerland,58050
2,ent_0b3092293a05,3,0.666667,jimmy scott,alexis junction 48,ashleyborough,23503
3,ent_0f1ec283362f,3,0.666667,norman cook,pham way 2,west erik,92418
4,ent_11f67ba52a2b,3,0.666667,chelsea travis,weaver extensions 189,johnsonton,30991
5,ent_13f380eca875,3,0.333333,jacob williams,april turnpike 8,paulmouth,6587
6,ent_16a059ceb722,3,0.666667,john hancock,franklin alley 177,bakermouth,37889
7,ent_1d30df464877,3,0.333333,anthony adams,tonya springs 88,dwayneton,91018
8,ent_1e295c05aed8,3,0.666667,michelle craig,rich manors 121,new danielfurt,19549
9,ent_1e4ca479bf0e,3,0.333333,andrea jones,richard port 103,new brooke,81684


In [None]:
# Low-confidence entities: supported by at least 2 rows, with name_share below 0.7.
has_support = entities['support_size'] >= 2
weak_name = entities['name_share'] < 0.7
low_conf = entities[has_support & weak_name]
display(low_conf.head(10))

Unnamed: 0,entity_id,support_size,Name_norm,Street_norm,City_norm,Zip_norm,Email_norm,Phone_norm,name_share,all_emails,all_phones
0,ent_0659fd3b3bd7,3,patrick mccall,smith viaduct 83,larryfort,91751,patrick.mccall@example.com,561681472,0.333333,patrick.mccall@example.co;patrick.mccall@examp...,0560681472;0561681472
1,ent_06642f492c0a,3,ashley greer phd,golden bypass 158,bakerland,58050,ashley.greer.phd@example.com,26012796,0.333333,ashle.greer.phd@example.com;ashley.greer.phd@e...,0026012796
2,ent_0b3092293a05,3,jimmy scott,alexis junction 48,ashleyborough,23503,jimmy.scott@example.com,565270477,0.666667,jimmy.scott@example.com,0565270477;0765270472
3,ent_0f1ec283362f,3,norman cook,pham way 2,west erik,92418,norman.cook@example.com,546688937,0.666667,norman.cook@example.com,0546188937;0546688937
4,ent_11f67ba52a2b,3,chelsea travis,weaver extensions 189,johnsonton,30991,chelsea.travis@example.com,833146591,0.666667,chelsea.trais@example.com;chelsea.travis@examp...,0833146591
5,ent_13f380eca875,3,jacob williams,april turnpike 8,paulmouth,6587,jacob.williams@example.com,902559056,0.333333,jacob.williams@example.com,0902559056
6,ent_16a059ceb722,3,john hancock,franklin alley 177,bakermouth,37889,john.hancock@example.com,280598262,0.666667,john.hancock@example.com,0220598262;0280598262
7,ent_1d30df464877,3,anthony adams,tonya springs 88,dwayneton,91018,anthony.adams@example.com,579887749,0.333333,anthony.adams@example.com,0579887749
8,ent_1e295c05aed8,3,michelle craig,rich manors 121,new danielfurt,19549,michelle.craig@example.com,884795137,0.666667,michelle.craig@example.com;michellecraig@examp...,0884795137
9,ent_1e4ca479bf0e,3,andrea jones,richard port 103,new brooke,81684,andrea.jones@example.com,850142940,0.333333,andrea.jone@example.com;andrea.jones@example.com,0850142940
