In [1]:
import numpy as np, pandas as pd
from itertools import combinations
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, precision_recall_fscore_support
from src.rules import pair_features, is_match, true_pairs, prepare_aux_cols
df = df = pd.read_csv("data/clear_data.csv", dtype={"Phone_norm": str, "Zip_norm": str})
# df — нормализованный DataFrame с uid и *_norm колонками
df = prepare_aux_cols(df)  # на всякий случай

# cand_pairs: либо уже есть, либо загрузить
cand_df = pd.read_csv('out/cand_pairs.csv'); cand_pairs = list(map(tuple, cand_df[['i','j']].to_numpy()))


In [2]:
def make_pairs_df(df, pairs):
    rows=[]
    for i,j in pairs:
        f = pair_features(df, i, j)                 # признаки
        y = int(df.at[i,'uid'] == df.at[j,'uid'])   # метка
        rows.append({**f, 'y':y, 'i':i, 'j':j,
                     'uid_i':df.at[i,'uid'], 'uid_j':df.at[j,'uid']})
    Xy = pd.DataFrame(rows)
    for c in Xy.columns:             # bool → int
        if Xy[c].dtype==bool: Xy[c]=Xy[c].astype(int)
    return Xy

Xy = make_pairs_df(df, cand_pairs)

# балансируем: все позитивы + до 3x негативов
pos = Xy[Xy.y==1]
neg = Xy[Xy.y==0].sample(n=min(len(Xy[Xy.y==0]), len(pos)*3), random_state=42, replace=False)
Xy_bal = pd.concat([pos,neg]).sample(frac=1, random_state=42).reset_index(drop=True)
len(pos), len(neg), Xy_bal.y.mean()


(315, 3, 0.9905660377358491)

In [3]:
uids = df['uid'].unique()
rng = np.random.default_rng(42)
test_uids = set(rng.choice(uids, size=max(1, int(0.2*len(uids))), replace=False))

mask_test = Xy_bal.apply(lambda r: r.uid_i in test_uids and r.uid_j in test_uids, axis=1)
train = Xy_bal.loc[~mask_test].copy()
test  = Xy_bal.loc[mask_test].copy()

feat_cols = ['name_sim','street_sim','zip_eq','city_eq','email_user_eq','phone_last4_eq']  # базовый набор
X_train, y_train = train[feat_cols], train['y']
X_test,  y_test  = test[feat_cols],  test['y']


In [4]:
from sklearn.model_selection import GroupKFold
gkf = GroupKFold(n_splits=5)
groups = train[['uid_i','uid_j']].max(axis=1)  # любая группировка по uid

scores=[]
for tr_idx, va_idx in gkf.split(train[feat_cols], train['y'], groups=groups):
    clf_cv = LogisticRegression(max_iter=1000, class_weight='balanced').fit(
        train.iloc[tr_idx][feat_cols], train.iloc[tr_idx]['y']
    )
    proba = clf_cv.predict_proba(train.iloc[va_idx][feat_cols])[:,1]
    p,r,t = precision_recall_curve(train.iloc[va_idx]['y'], proba)
    f1 = 2*p*r/(p+r+1e-12); scores.append(f1[:-1].max())
print('CV F1 (GroupKFold):', np.mean(scores), '±', np.std(scores))


CV F1 (GroupKFold): 0.9999999999995 ± 0.0


In [5]:
# 1) helper: сэмпл негативных пар (i,j) с разными uid, которых ещё нет в наборе
import numpy as np
rng = np.random.default_rng(42)

def sample_random_negatives(df, n, banned_pairs, seed=42):
    rng = np.random.default_rng(seed)
    idx = df.index.to_numpy()
    out = set()
    while len(out) < n:
        i, j = map(int, rng.choice(idx, size=2, replace=False))
        if df.at[i,'uid'] != df.at[j,'uid'] and (i,j) not in banned_pairs and (j,i) not in banned_pairs:
            out.add((i,j))
    return list(out)

# 2) если в TRAIN нет (или мало) нулей — добавим, например, 20
need_neg_train = 20
if (train['y'] == 0).sum() < need_neg_train:
    banned_tr = set(map(tuple, train[['i','j']].to_numpy()))
    extra_negs_tr = sample_random_negatives(df, n=need_neg_train, banned_pairs=banned_tr, seed=123)
    Xy_negs_tr = make_pairs_df(df, extra_negs_tr)
    train = pd.concat([train, Xy_negs_tr], ignore_index=True)

print('train y:', train['y'].value_counts().to_dict())
print('test  y:',  test['y'].value_counts().to_dict())

# 3) заново соберём матрицы признаков
feat_cols = ['name_sim','street_sim','zip_eq','city_eq','email_user_eq','phone_last4_eq']
X_train = train[feat_cols].copy();  X_train['street_sim'] /= 100.0
y_train = train['y']

X_test  = test[feat_cols].copy();   X_test['street_sim']  /= 100.0
y_test  = test['y']


train y: {1: 227, 0: 23}
test  y: {1: 88}


In [6]:
rng = np.random.default_rng(42)

def sample_random_negatives(df, n, banned):
    """Сэмплим пары (i,j) с разными uid, которых нет в banned."""
    idx = df.index.to_numpy()
    out=set()
    while len(out)<n:
        i, j = map(int, rng.choice(idx, size=2, replace=False))
        if df.at[i,'uid'] != df.at[j,'uid'] and (i,j) not in banned and (j,i) not in banned:
            out.add((i,j))
    return list(out)

# если в тесте один класс — добавим, например, 20 отрицательных пар
if y_test.nunique() < 2:
    banned = set(map(tuple, test[['i','j']].to_numpy()))
    extra_negs = sample_random_negatives(df, n=20, banned=banned)
    Xy_negs = make_pairs_df(df, extra_negs)

    test = pd.concat([test, Xy_negs], ignore_index=True)
    X_test = test[feat_cols].copy()
    X_test['street_sim'] /= 100.0
    y_test = test['y']

In [7]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, confusion_matrix, classification_report
from sklearn.model_selection import GroupKFold

# --- гиперпараметры качества ---
MIN_PREC = 0.990   # хотим очень высокую precision (минимум ложных склеек)
MIN_REC  = 0.980   # и при этом высокую recall

def pick_threshold_by_constraints(y_true, proba, min_prec=MIN_PREC, min_rec=MIN_REC, beta_fallback=0.5):
    """Подбор порога: сначала ищем точку с p>=min_prec и r>=min_rec (берём max F1),
    если таких нет — берём максимум F-beta (beta<1 сильнее штрафует FP)."""
    p, r, t = precision_recall_curve(y_true, proba)
    p, r = p[:-1], r[:-1]   # выровнять длины
    mask = (p >= min_prec) & (r >= min_rec)
    if mask.any():
        f1 = 2 * p * r / (p + r + 1e-12)
        thr = float(t[mask][np.argmax(f1[mask])])
    else:
        beta = beta_fallback
        fbeta = (1 + beta**2) * (p * r) / (beta**2 * p + r + 1e-12)
        thr = float(t[np.argmax(fbeta)])
    return thr

# --------- 1) групповая валидация на train ---------
# train — это твой dataframe с колонками feat_cols + ['y', 'uid_i', 'uid_j']
# X_train, y_train — уже подготовлены ранее (масштабирование, выбор признаков)
groups = train[['uid_i', 'uid_j']].max(axis=1)   # любая детерминированная группировка по uid

gkf = GroupKFold(n_splits=5)
fold_thrs = []

for fold, (tr_idx, va_idx) in enumerate(gkf.split(X_train, y_train, groups=groups), 1):
    clf_cv = LogisticRegression(max_iter=1000, class_weight='balanced').fit(
        X_train.iloc[tr_idx], y_train.iloc[tr_idx]
    )
    proba_val = clf_cv.predict_proba(X_train.iloc[va_idx])[:, 1]
    thr_fold = pick_threshold_by_constraints(y_train.iloc[va_idx], proba_val)
    fold_thrs.append(thr_fold)
    print(f'[fold {fold}] thr={thr_fold:.6f}')

# финальный порог — медиана/среднее по фолдам (робастно)
best_thr = float(np.median(fold_thrs))
print('best_thr (GroupKFold, median):', best_thr)

# --------- 2) обучаем финальную модель на всём train ---------
clf = LogisticRegression(max_iter=1000, class_weight='balanced').fit(X_train, y_train)

# --------- 3) применяем на test ---------
proba_test = clf.predict_proba(X_test)[:, 1]
y_pred = (proba_test >= best_thr).astype(int)

print(confusion_matrix(y_test, y_pred, labels=[0, 1]))
print(classification_report(y_test, y_pred, labels=[0, 1], zero_division=0, digits=3))


print('mean:', np.mean(fold_thrs), 'std:', np.std(fold_thrs),
      'iqr:', np.percentile(fold_thrs, 75) - np.percentile(fold_thrs, 25))

[fold 1] thr=0.929998
[fold 2] thr=0.733296
[fold 3] thr=0.804076
[fold 4] thr=0.433905
[fold 5] thr=0.333092
best_thr (GroupKFold, median): 0.7332955615590503
[[20  0]
 [ 0 88]]
              precision    recall  f1-score   support

           0      1.000     1.000     1.000        20
           1      1.000     1.000     1.000        88

    accuracy                          1.000       108
   macro avg      1.000     1.000     1.000       108
weighted avg      1.000     1.000     1.000       108

mean: 0.6468733837461482 std: 0.22634274877114427 iqr: 0.37017093472596124


In [8]:
def model_score_pair(df, i, j, clf, feat_cols):
    f = pair_features(df, i, j)
    x = pd.DataFrame([[f.get(c, 0) for c in feat_cols]], columns=feat_cols)
    return float(clf.predict_proba(x)[0, 1])

def hybrid_is_match(df, i, j, clf, feat_cols, thr):
    if is_match(df, i, j):                   # твой бейзлайн-правила
        return True
    return model_score_pair(df, i, j, clf, feat_cols) >= thr

# предсказания на всех кандидатах
pred_pairs_hybrid = {(i,j) for i,j in cand_pairs if hybrid_is_match(df, i, j, clf, feat_cols, best_thr)}

# метрики
T = true_pairs(df, uid_col='uid')
tp = len(pred_pairs_hybrid & T); fp = len(pred_pairs_hybrid - T); fn = len(T - pred_pairs_hybrid)
prec = tp/(tp+fp) if tp+fp else 0.0
rec  = tp/(tp+fn) if tp+fn else 0.0
f1   = 0 if prec+rec==0 else 2*prec*rec/(prec+rec)
print(f'Hybrid — P:{prec:.3f} R:{rec:.3f} F1:{f1:.3f}   tp={tp} fp={fp} fn={fn}')


Hybrid — P:0.991 R:1.000 F1:0.995   tp=315 fp=3 fn=0


In [9]:

def model_score_pair(df, i, j, clf, feat_cols):
    f = pair_features(df, i, j)
    x = pd.DataFrame([[f.get(c, 0) for c in feat_cols]], columns=feat_cols)
    return float(clf.predict_proba(x)[0, 1])

def model_only_is_match(df, i, j, clf, feat_cols, thr):
    # if is_match(df, i, j):                   # твой бейзлайн-правила
    #     return True
    return model_score_pair(df, i, j, clf, feat_cols) >= thr

# предсказания на всех кандидатах
pred_pairs_model = {(i, j) for i, j in cand_pairs if model_only_is_match(df, i, j, clf, feat_cols, best_thr)}


# метрики
T = true_pairs(df, uid_col='uid')
tp = len(pred_pairs_hybrid & T); fp = len(pred_pairs_hybrid - T); fn = len(T - pred_pairs_hybrid)
prec = tp/(tp+fp) if tp+fp else 0.0
rec  = tp/(tp+fn) if tp+fn else 0.0
f1   = 0 if prec+rec==0 else 2*prec*rec/(prec+rec)
print(f'Only_Model — P:{prec:.3f} R:{rec:.3f} F1:{f1:.3f}   tp={tp} fp={fp} fn={fn}')



Only_Model — P:0.991 R:1.000 F1:0.995   tp=315 fp=3 fn=0


In [10]:
# размер разбиения
print('train:', len(train), 'test:', len(test))
print('pos/neg test:', test['y'].value_counts().to_dict())

# confusion matrix и отчёт
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))

# коэффициенты модели (на будущее — важность признаков)
coef = pd.Series(clf.coef_[0], index=feat_cols).sort_values(ascending=False)
print(coef)


train: 250 test: 108
pos/neg test: {1: 88, 0: 20}
[[20  0]
 [ 0 88]]
              precision    recall  f1-score   support

           0      1.000     1.000     1.000        20
           1      1.000     1.000     1.000        88

    accuracy                          1.000       108
   macro avg      1.000     1.000     1.000       108
weighted avg      1.000     1.000     1.000       108

city_eq           1.965282
street_sim        1.941397
phone_last4_eq    1.928396
zip_eq            1.801245
name_sim          1.537217
email_user_eq     1.438728
dtype: float64


In [11]:
import os, json, joblib
os.makedirs('data', exist_ok=True)
joblib.dump({'clf': clf, 'feat_cols': feat_cols, 'threshold': best_thr},
            'data/pair_model.joblib')
with open('data/pair_model_meta.json','w',encoding='utf-8') as f:
    json.dump({'features': feat_cols, 'threshold': best_thr}, f, ensure_ascii=False, indent=2)
