In [7]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from cleanlab.filter import find_label_issues

In [8]:
# 生成一个简单的二分类数据集
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)

# 将数据集分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# 随机修改一些训练集的标签
np.random.seed(42)
num_noisy_labels = 50  # 修改50个标签
noisy_indices = np.random.choice(len(y_train), num_noisy_labels, replace=False)
y_train[noisy_indices] = 1 - y_train[noisy_indices]  # 反转标签

In [16]:
# 使用逻辑回归模型进行训练
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

In [17]:
# 获取训练集上的预测概率
y_train_pred_proba = model.predict_proba(X_train)

In [18]:
# 使用cleanlab找出标签有问题的样本
ordered_label_issues = find_label_issues(
    labels=y_train,
    pred_probs=y_train_pred_proba,
    return_indices_ranked_by='self_confidence'
)

In [22]:
# 输出找到的标签有问题的样本
print("可能标注错误的样本编号为:", ordered_label_issues)

可能标注错误的样本编号为: [174 448  63 563 741 632 139 526 346 360 571 514 201 696 721 534 595 176
 596  64 667 330 622  67 355 692  77 529 582  65 637 581 123 273 286 166
  99 578 525 556 433 344 327 536 323 398  16 383 416 635 713  47 544  39
  95 475 763  23 468 723 613 746 796 367 377 369 215 621 409  69 793 302
  53 472 130 124 669 760 636]


In [20]:
# 验证找到的标签问题是否包括我们故意修改的标签
found_noisy_indices = set(ordered_label_issues)
true_noisy_indices = set(noisy_indices)

In [23]:
# 计算召回率
recall = len(found_noisy_indices.intersection(true_noisy_indices)) / len(true_noisy_indices)
print(f"Recall of noisy labels: {recall:.2f}")

Recall of noisy labels: 0.70
