In [33]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification

In [34]:
# Build a synthetic binary-classification dataset with make_classification.
# Later cells divide it into labeled, unlabeled, and test portions.
X, y = make_classification(
    n_samples=10000,
    n_features=20,
    n_informative=15,
    n_classes=2,
    random_state=42,
)

In [35]:
# Display the dimensions of the generated feature matrix and label vector.
X.shape, y.shape

((10000, 20), (10000,))

In [36]:
# Split the data into a labeled training set, an unlabeled pool, and a test set.
#
# Explicit names are used for every intermediate result instead of `_`:
# the pool's labels are real data that the second split needs, and in a
# notebook `_` is also the variable IPython uses for the last cell output.

# First split: 20% of the samples keep their labels for training; the
# remaining 80% form a pool that is divided again below.
X_train_labeled, X_pool, y_train_labeled, y_pool = train_test_split(
    X, y, test_size=0.8, random_state=42
)

# Second split: 25% of the pool becomes the held-out test set (with labels);
# the other 75% is treated as unlabeled. Its true labels are kept under a
# separate name only so they are not silently discarded; the self-training
# code does not use them.
X_unlabeled, X_test, y_unlabeled_hidden, y_test = train_test_split(
    X_pool, y_pool, test_size=0.25, random_state=42
)

In [37]:

# Report how many samples ended up in each partition.
print(f"Labeled data: {X_train_labeled.shape[0]} samples")
print(f"Unlabeled data: {X_unlabeled.shape[0]} samples")
print(f"Test data: {X_test.shape[0]} samples")

# Step 1: train the initial model on the labeled data only.
initial_model = LogisticRegression()
initial_model.fit(X_train_labeled, y_train_labeled)

# Baseline: accuracy of the initial model on the test set.
y_pred_initial = initial_model.predict(X_test)
initial_accuracy = accuracy_score(y_test, y_pred_initial)
print(f"Initial model accuracy: {initial_accuracy:.4f}")

# Step 2: predict on the unlabeled data and derive pseudo-labels.
unlabeled_probabilities = initial_model.predict_proba(X_unlabeled)
# Confidence of a sample = its highest predicted class probability.
unlabeled_confidence = np.max(unlabeled_probabilities, axis=1)
# The columns of predict_proba follow the order of initial_model.classes_, so
# the argmax column index is mapped back to the actual class label. Using the
# raw index as the label is only correct when the classes are exactly 0..k-1.
unlabeled_labels = initial_model.classes_[np.argmax(unlabeled_probabilities, axis=1)]

# Keep only the samples whose confidence reaches the threshold.
confidence_threshold = 0.9
high_confidence_indices = np.where(unlabeled_confidence >= confidence_threshold)[0]
X_pseudo_labeled = X_unlabeled[high_confidence_indices]
y_pseudo_labeled = unlabeled_labels[high_confidence_indices]

print(f"Number of high-confidence pseudo-labeled samples: {len(y_pseudo_labeled)}")

# Step 3: append the pseudo-labeled samples to the labeled training set.
X_train_combined = np.vstack([X_train_labeled, X_pseudo_labeled])
y_train_combined = np.hstack([y_train_labeled, y_pseudo_labeled])

# Retrain a fresh model on the combined (true + pseudo) labels.
model_with_pseudo_labels = LogisticRegression()
model_with_pseudo_labels.fit(X_train_combined, y_train_combined)

# Step 4: evaluate the retrained model on the same test set as the baseline.
y_pred_with_pseudo_labels = model_with_pseudo_labels.predict(X_test)
final_accuracy = accuracy_score(y_test, y_pred_with_pseudo_labels)
print(f"Model accuracy after self-training: {final_accuracy:.4f}")

Labeled data: 2000 samples
Unlabeled data: 6000 samples
Test data: 2000 samples
Initial model accuracy: 0.8275
Number of high-confidence pseudo-labeled samples: 2704
Model accuracy after self-training: 0.8275
