# 自训练: 对手写数字进行分类

In [88]:
import numpy as np
import warnings

warnings.filterwarnings("ignore")

## 1. 数据加载

In [89]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# Load the MNIST dataset as plain NumPy arrays. Without as_frame=False,
# fetch_openml may hand back pandas objects, and the later cells would then mix
# them with the ndarrays produced by np.vstack / np.concatenate when refitting
# and predicting.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X, y = mnist["data"], mnist["target"]

# Hold out 10% of all samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
# Split the remaining training data: 70% keeps its labels, 30% is treated as
# unlabeled. y_train_unlabeled holds the true labels of that 30%; the
# self-training loop later uses them only to measure pseudo-label quality.
X_train_labeled, X_train_unlabeled, y_train_labeled, y_train_unlabeled = train_test_split(X_train, y_train, test_size=0.3, random_state=42)

## 2. 训练初始模型

In [90]:
# Baseline model: a 10-tree random forest fitted on the labeled subset only.
clf = RandomForestClassifier(random_state=42, n_estimators=10)
clf.fit(X=X_train_labeled, y=y_train_labeled)

In [91]:
# Score the baseline model on the held-out test set.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"模型准确率: {accuracy:.4f}")

模型准确率: 0.9441


## 3. 对无标签数据进行预测

In [92]:
# Class-probability estimates for every unlabeled sample; the columns follow
# the order of clf.classes_.
probs = clf.predict_proba(X_train_unlabeled)
# Derive the pseudo-labels from the probabilities already computed instead of
# calling clf.predict as well: the forest predicts the class with the highest
# mean probability, so a second pass over the data is redundant.
pseudo_labels = clf.classes_[probs.argmax(axis=1)]

# Keep only high-confidence predictions (top class probability above the
# threshold).
CONFIDENCE_THRESHOLD = 0.95
confident_indices = probs.max(axis=1) > CONFIDENCE_THRESHOLD
X_confident = X_train_unlabeled[confident_indices]
y_confident = pseudo_labels[confident_indices]

In [93]:
# Peek at the class-probability vector predicted for the first unlabeled sample.
probs[0]

array([0. , 0. , 0. , 0.7, 0. , 0.3, 0. , 0. , 0. , 0. ])

## 4. 更新训练集

In [94]:
# Extend the labeled training set with the confidently pseudo-labeled samples:
# features are stacked row-wise, labels are joined end to end.
X_combined = np.concatenate([X_train_labeled, X_confident], axis=0)
y_combined = np.hstack([y_train_labeled, y_confident])

## 5. 用新的训练集重新训练模型

In [95]:
# Refit the same classifier object on the labeled + pseudo-labeled training set.
clf.fit(X_combined, y_combined)

## 6. 评估模型

In [96]:
# Measure how the retrained model performs on the held-out test set.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"模型准确率: {accuracy:.4f}")

模型准确率: 0.9400


## 7. 循环自训练

In [99]:
N_ROUNDS = 20           # number of self-training rounds
BASE_THRESHOLD = 0.95   # confidence cut-off used in the first round
THRESHOLD_STEP = 0.01   # amount by which the cut-off is relaxed every round

acc = []  # test-set accuracy recorded after each round
for round_idx in range(N_ROUNDS):
    # Predict class probabilities once and derive the pseudo-labels from them;
    # the forest's predicted class is the one with the highest mean
    # probability, so a separate clf.predict call would repeat the same work.
    probs = clf.predict_proba(X_train_unlabeled)
    pseudo_labels = clf.classes_[probs.argmax(axis=1)]

    # Keep predictions whose top probability exceeds this round's threshold.
    # NOTE(review): the threshold is computed in floating point and can land
    # just below a round value (e.g. slightly under 0.9 for round_idx == 5),
    # which lets probabilities equal to that value pass the strict ">" --
    # confirm whether this is intended.
    confident_indices = probs.max(axis=1) > BASE_THRESHOLD - round_idx * THRESHOLD_STEP
    X_confident = X_train_unlabeled[confident_indices]
    y_confident = pseudo_labels[confident_indices]

    # Pseudo-label quality: share of selected samples whose pseudo-label equals
    # the withheld true label. A dedicated name is used so the global `y`
    # (the full target vector) is not overwritten, and an empty selection
    # prints nan instead of raising ZeroDivisionError.
    y_true_confident = np.asarray(y_train_unlabeled[confident_indices])
    n_selected = len(y_true_confident)
    n_correct = int(np.sum(y_true_confident == y_confident))
    print(n_correct / n_selected if n_selected else float("nan"))

    # Add the high-confidence samples to the original labeled set. The set is
    # rebuilt from X_train_labeled every round, so selections do not accumulate.
    X_combined = np.vstack((X_train_labeled, X_confident))
    y_combined = np.concatenate((y_train_labeled, y_confident))

    # Retrain the model on the new training set.
    clf.fit(X_combined, y_combined)

    # Track test-set accuracy and the size of this round's training set.
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    acc.append(accuracy)
    print(X_combined.shape)

0.9997629208155524
(52536, 784)
0.9994288325336989
(52854, 784)
0.9995505112934038
(52999, 784)
0.9995471527227443
(52933, 784)
0.9992115341293084
(52978, 784)
0.9988924859430908
(55838, 784)
0.9984875019901289
(56662, 784)
0.9986884740009258
(57062, 784)
0.9979383017715333
(57196, 784)
0.9982615268329554
(57330, 784)
0.9982553288325874
(57283, 784)
0.9985582030657156
(57278, 784)
0.9987118284458589
(57297, 784)
0.9987191078963231
(57372, 784)
0.9983393719806763
(57348, 784)
0.9972733469665985
(58770, 784)
0.9965753424657534
(59284, 784)
0.9966981742846044
(59546, 784)
0.9961444544403033
(59662, 784)
0.9959615384615385
(59700, 784)


In [None]:
# Test-set accuracy recorded after each self-training round.
acc

[0.9397142857142857,
 0.9397142857142857,
 0.937,
 0.94,
 0.9405714285714286,
 0.9428571428571428,
 0.9377142857142857,
 0.9385714285714286,
 0.9415714285714286,
 0.9424285714285714,
 0.9408571428571428,
 0.9384285714285714,
 0.9404285714285714,
 0.9414285714285714,
 0.9355714285714286,
 0.9441428571428572,
 0.9394285714285714,
 0.9412857142857143,
 0.9392857142857143,
 0.9407142857142857]