In [21]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, classification_report
from urllib.parse import quote_plus

# Connection settings for the local MySQL database that holds the RQ3 dataset.
import os
from getpass import getpass

user = "root"
# The password is deliberately NOT stored in the notebook: it is taken from the
# AIDEV_DB_PASSWORD environment variable, falling back to an interactive prompt.
_raw_password = os.environ.get("AIDEV_DB_PASSWORD")
if _raw_password is None:
    _raw_password = getpass("MySQL password for aidev: ")
# quote_plus keeps special characters such as @ : / in the password from
# breaking the connection URL.
password = quote_plus(_raw_password)
host = "127.0.0.1"
port = 3306
db = "aidev"

engine = create_engine(
    f"mysql+pymysql://{user}:{password}@{host}:{port}/{db}?charset=utf8mb4",
    pool_pre_ping=True,
)

# 2) Load the RQ3 dataset (early-visible features + label only).
# 3) Basic cleaning in the same chain: rows missing any modelling column are
#    dropped (state is no longer needed, so it is not part of the subset) and
#    the label is cast to int.
df = (
    pd.read_sql("""
SELECT pr_id, repo_id, agent, scenario_label, state, high_cost
FROM pr_rq3_dataset;
""", engine)
    .dropna(subset=["repo_id", "agent", "scenario_label", "high_cost"])
    .copy()
    .astype({"high_cost": int})
)

# 4) Keep only agent + scenario_label as features; repo_id is the grouping key.
X = df.loc[:, ["agent", "scenario_label"]]
y = df.loc[:, "high_cost"]
groups = df.loc[:, "repo_id"]

# 5) Single 80/20 split grouped by repo_id (fixed seed for reproducibility).
gss = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=42)
# n_splits=1, so the split generator yields exactly one (train, test) pair.
[(train_idx, test_idx)] = gss.split(X, y, groups=groups)

X_train = X.iloc[train_idx]
X_test = X.iloc[test_idx]
y_train = y.iloc[train_idx]
y_test = y.iloc[test_idx]

# ===== Baseline: agent prior risk (train-only, no leakage) =====
# Score every test PR with the high_cost rate of its agent, where the rates are
# estimated on the training rows only.
df_train, df_test = (df.iloc[idx].copy() for idx in (train_idx, test_idx))

agent_rate = df_train.groupby("agent")["high_cost"].mean()
# Fallback score for agents that do not occur in the training rows.
global_rate = df_train["high_cost"].mean()

agent_prior = df_test["agent"].map(agent_rate)
proba_agent_prior = agent_prior.fillna(global_rate).to_numpy()

y_test_np = df_test["high_cost"].to_numpy()
prid_test = df_test["pr_id"].to_numpy()  # needed for the Top-k tie-break

baseline_auc = roc_auc_score(y_test_np, proba_agent_prior)
print("\n[Baseline] Agent prior AUC:", round(baseline_auc, 4))

# NOTE: eval_topk must already be defined in the kernel when this cell runs.
for ar in (0.10, 0.20):
    out = eval_topk(ar, proba_agent_prior, y_test_np, prid_test)
    baseline_line = (
        f"[Baseline] Top-{int(ar*100)}% | k={out['k']} | "
        f"AUC={out['AUC']:.3f} | P={out['precision']:.4f} | R={out['recall']:.4f} | F1={out['f1']:.4f}"
    )
    print(baseline_line)




[Baseline] Agent prior AUC: 0.8567
[Baseline] Top-10% | k=1301 | AUC=0.857 | P=0.4750 | R=0.4972 | F1=0.4858
[Baseline] Top-20% | k=2601 | AUC=0.857 | P=0.4198 | R=0.8785 | F1=0.5682


In [15]:

# 新增三种

from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

# Sanity check: is the split fixed? Print the test-set size, the number of
# distinct repos in it, and its positive rate.
_test_rows = df.iloc[test_idx]
for _label, _value in (
    ("n_test:", len(test_idx)),
    ("test_repo_unique:", _test_rows["repo_id"].nunique()),
    ("pos_rate_test:", _test_rows["high_cost"].mean()),
):
    print(_label, _value)


n_test: 13002
test_repo_unique: 562
pos_rate_test: 0.09560067681895093


In [18]:

def auc_on_cols(cols):
    """Return the test ROC AUC of a one-hot + logistic-regression model on `cols`.

    Reads the notebook-level `df`, `train_idx` and `test_idx`: the model is
    fitted on the training rows and scored on the test rows, with `high_cost`
    as the target.

    Parameters
    ----------
    cols : list of str
        Categorical columns of `df` to use as features.

    Returns
    -------
    float
        ROC AUC of the second `predict_proba` column on the test rows.
    """
    train_rows = df.iloc[train_idx]
    test_rows = df.iloc[test_idx]

    encoder = OneHotEncoder(handle_unknown="ignore", drop="first")
    pipeline = Pipeline(
        [
            (
                "preprocess",
                ColumnTransformer(transformers=[("cat", encoder, cols)], remainder="drop"),
            ),
            ("clf", LogisticRegression(max_iter=2000, class_weight="balanced", C=10.0)),
        ]
    )
    pipeline.fit(train_rows[cols], train_rows["high_cost"])

    scores = pipeline.predict_proba(test_rows[cols])[:, 1]
    return roc_auc_score(test_rows["high_cost"], scores)

# Compare test AUC for each feature set on the same fixed split.
auc_agent = auc_on_cols(["agent"])
auc_scen = auc_on_cols(["scenario_label"])
auc_both = auc_on_cols(["agent", "scenario_label"])

for _label, _auc in (
    ("AUC (agent only):", auc_agent),
    ("AUC (scenario only):", auc_scen),
    ("AUC (agent + scenario):", auc_both),
):
    print(_label, round(_auc, 4))



AUC (agent only): 0.8567
AUC (scenario only): 0.6654
AUC (agent + scenario): 0.8284


In [23]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import roc_auc_score

def fit_predict_proba(train_idx, test_idx, cat_cols):
    """Fit the one-hot + logistic-regression model on one split and score its test rows.

    Reads the notebook-level `df`; `high_cost` is the target.

    Parameters
    ----------
    train_idx, test_idx : array-like of int
        Positional row indices into `df` for the training and test rows.
    cat_cols : list of str
        Categorical columns of `df` to one-hot encode as features.

    Returns
    -------
    tuple
        (test labels, second `predict_proba` column for the test rows,
        test `pr_id` values) - the labels and ids as NumPy arrays.
    """
    train_rows, test_rows = df.iloc[train_idx], df.iloc[test_idx]

    model = Pipeline(
        [
            (
                "preprocess",
                ColumnTransformer(
                    transformers=[
                        ("cat", OneHotEncoder(handle_unknown="ignore", drop="first"), cat_cols)
                    ],
                    remainder="drop",
                ),
            ),
            ("clf", LogisticRegression(max_iter=2000, class_weight="balanced", C=1.0)),
        ]
    )
    model.fit(train_rows[cat_cols], train_rows["high_cost"])

    return (
        test_rows["high_cost"].to_numpy(),
        model.predict_proba(test_rows[cat_cols])[:, 1],
        test_rows["pr_id"].to_numpy(),
    )

# ===== Repeated repo-level splits =====
# Re-draw the repo-grouped 80/20 split with a different seed per run and
# collect AUC plus Top-10% / Top-20% precision, recall and F1.
# NOTE: eval_topk must already be defined in the kernel when this cell runs.
cat_cols = ["agent", "scenario_label"]  # or ["agent"], depending on the main model
groups = df["repo_id"].to_numpy()

aucs, p10s, r10s, f10s, p20s, r20s, f20s = ([] for _ in range(7))

n_runs = 30
for rs in range(n_runs):
    gss = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=rs)
    [(tr, te)] = gss.split(df, df["high_cost"], groups=groups)

    yte, proba, prid = fit_predict_proba(tr, te, cat_cols)

    # Skip the extreme case of a single-class test set (usually rare).
    if np.unique(yte).size < 2:
        continue

    aucs.append(roc_auc_score(yte, proba))
    out10 = eval_topk(0.10, proba, yte, prid)
    out20 = eval_topk(0.20, proba, yte, prid)

    p10s.append(out10["precision"])
    r10s.append(out10["recall"])
    f10s.append(out10["f1"])
    p20s.append(out20["precision"])
    r20s.append(out20["recall"])
    f20s.append(out20["f1"])

print(f"\n[Repeated GSS] runs used: {len(aucs)}/{n_runs}")
print("[Repeated GSS] AUC mean±std:", round(np.mean(aucs), 4), "±", round(np.std(aucs), 4))
print(
    "[Repeated GSS] Top-10% P/R/F1 mean:",
    *(round(np.mean(vals), 4) for vals in (p10s, r10s, f10s)),
)
print(
    "[Repeated GSS] Top-20% P/R/F1 mean:",
    *(round(np.mean(vals), 4) for vals in (p20s, r20s, f20s)),
)



[Repeated GSS] runs used: 30/30
[Repeated GSS] AUC mean±std: 0.7974 ± 0.0395
[Repeated GSS] Top-10% P/R/F1 mean: 0.5893 0.2685 0.3511
[Repeated GSS] Top-20% P/R/F1 mean: 0.5756 0.4869 0.506


In [19]:


# 6) One-hot + Logistic Regression (state has been removed from cat_cols)
cat_cols = ["agent", "scenario_label"]
preprocess = ColumnTransformer(
    transformers=[("cat", OneHotEncoder(handle_unknown="ignore", drop="first"), cat_cols)],
    remainder="drop",
)
clf = LogisticRegression(max_iter=2000, class_weight="balanced")

# Pipeline.fit returns the fitted pipeline itself.
model = Pipeline(steps=[("preprocess", preprocess), ("clf", clf)]).fit(X_train, y_train)

# 7) Evaluation on the held-out split: scores, then hard labels at the 0.5 cut-off.
proba = model.predict_proba(X_test)[:, 1]
pred = (proba >= 0.5).astype(int)

auc, f1, cm = (
    roc_auc_score(y_test, proba),
    f1_score(y_test, pred),
    confusion_matrix(y_test, pred),
)

print("Test AUC:", round(auc, 4))
print("Test F1:", round(f1, 4))
print("Confusion matrix:\n", cm)
print("\nClassification report:\n", classification_report(y_test, pred, digits=4))

# 8) Show the most important features: pair every one-hot column name with its
#    logistic-regression coefficient and list the 20 largest by absolute value.
ohe = model.named_steps["preprocess"].named_transformers_["cat"]
feature_names = ohe.get_feature_names_out(cat_cols)
coefs = model.named_steps["clf"].coef_[0]

imp = pd.DataFrame({"feature": feature_names, "coef": coefs})
imp["abs_coef"] = imp["coef"].abs()
imp = imp.sort_values("abs_coef", ascending=False).head(20)

print("\nTop 20 features by |coef|:\n", imp.to_string(index=False))




Test AUC: 0.8272
Test F1: 0.7146
Confusion matrix:
 [[11340   419]
 [  319   924]]

Classification report:
               precision    recall  f1-score   support

           0     0.9726    0.9644    0.9685     11759
           1     0.6880    0.7434    0.7146      1243

    accuracy                         0.9432     13002
   macro avg     0.8303    0.8539    0.8416     13002
weighted avg     0.9454    0.9432    0.9442     13002


Top 20 features by |coef|:
                          feature      coef  abs_coef
scenario_label_S2_Human_coedited  2.847765  2.847765
scenario_label_S1_Human_reviewed  2.716060  2.716060
              agent_OpenAI_Codex -1.161978  1.161978
                    agent_Cursor  0.826845  0.826845
                   agent_Copilot  0.816780  0.816780
                     agent_Devin  0.815365  0.815365


In [10]:
# pr_id of every test row, in test_idx order.
prid_test = df["pr_id"].to_numpy()[test_idx]


In [20]:
# 按 Top-K 评估

import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

def eval_topk(alert_rate: float, proba, y_test, prid_test):
    """Evaluate a "flag the k highest scores" alerting policy.

    The k = ceil(alert_rate * n) samples with the highest score are predicted
    positive and everything else negative; the resulting hard predictions are
    compared with the true labels. Ties on score are broken by the smaller
    ``prid_test`` value so the ranking is reproducible.

    Parameters
    ----------
    alert_rate : float
        Fraction of samples to flag. Must be finite and non-negative; values
        above 1 flag every sample.
    proba : array-like of shape (n,)
        Predicted scores, higher meaning more likely positive.
    y_test : array-like of shape (n,)
        True binary labels.
    prid_test : array-like of shape (n,)
        Identifier of each sample, used only as the tie-break key.

    Returns
    -------
    dict
        Keys ``alert_rate``, ``k``, ``predicted_positive_rate``, ``AUC``
        (computed on the raw scores, independent of k), ``precision``,
        ``recall`` and ``f1``.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length, or if ``alert_rate`` is
        negative or not finite.
    """
    # Accept lists / pandas objects as well as arrays; unary minus below
    # needs a numeric ndarray.
    proba = np.asarray(proba, dtype=float)
    y_test = np.asarray(y_test)
    prid_test = np.asarray(prid_test)

    n = len(proba)
    if n == 0:
        raise ValueError("proba is empty; nothing to evaluate")
    if len(y_test) != n or len(prid_test) != n:
        raise ValueError(
            f"length mismatch: proba={n}, y_test={len(y_test)}, prid_test={len(prid_test)}"
        )
    if not np.isfinite(alert_rate) or alert_rate < 0:
        # A negative rate would make k negative and order[:k] would then flag
        # almost every sample instead of none.
        raise ValueError(f"alert_rate must be finite and >= 0, got {alert_rate!r}")

    # Round before taking the ceiling so binary floating-point noise
    # (e.g. 0.07 * 100 == 7.000000000000001) does not add a spurious alert,
    # and never flag more samples than exist.
    k = min(n, int(np.ceil(round(alert_rate * n, 9))))

    # Reproducible ranking: the last lexsort key is the primary one, so this
    # sorts by descending proba and, within ties, by ascending prid_test.
    order = np.lexsort((prid_test, -proba))
    pred_k = np.zeros(n, dtype=int)
    pred_k[order[:k]] = 1

    out = {
        "alert_rate": alert_rate,
        "k": k,
        "predicted_positive_rate": pred_k.mean(),
        "AUC": roc_auc_score(y_test, proba),
        "precision": precision_score(y_test, pred_k, zero_division=0),
        "recall": recall_score(y_test, pred_k, zero_division=0),
        "f1": f1_score(y_test, pred_k, zero_division=0),
    }
    return out

# Report Top-k metrics of the main model at each alert rate
# (to also evaluate 5%, use (0.05, 0.10, 0.20)).
for ar in (0.10, 0.20):
    out = eval_topk(ar, proba, y_test, prid_test)
    topk_line = (
        f"Top-{int(ar*100)}% | k={out['k']} | pos_rate={out['predicted_positive_rate']:.4f} | "
        f"AUC={out['AUC']:.3f} | P={out['precision']:.4f} | R={out['recall']:.4f} | F1={out['f1']:.4f}"
    )
    print(topk_line)



Top-10% | k=1301 | pos_rate=0.1001 | AUC=0.827 | P=0.6941 | R=0.7265 | F1=0.7099
Top-20% | k=2601 | pos_rate=0.2000 | AUC=0.827 | P=0.4079 | R=0.8536 | F1=0.5520
