In [1]:
import numpy as np
import xgboost as xgb
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm

d:\python_etc\miniconda\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
d:\python_etc\miniconda\lib\site-packages\numpy\.libs\libopenblas.GK7GX5KEQ4F6UYO3P26ULGBQYHGQO7J4.gfortran-win_amd64.dll


In [2]:
# 加载 Digits 数据集
digits = load_digits()
X = digits.data
y = digits.target

In [3]:
# 切分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [4]:
# 训练基础 XGBoost 模型（使用 GPU 加速）
xgb_model = xgb.XGBClassifier(
    eval_metric='mlogloss',
    tree_method='hist',  # GPU加速需要设置为"hist"
    device='cuda',  # 使用GPU
)

In [5]:
def train_with_progress(model, X_train, y_train, num_round=5):
    for epoch in tqdm(range(num_round), desc="Training", unit="epoch"):
        model.fit(X_train, y_train)

In [6]:
# 训练初始模型
train_with_progress(xgb_model, X_train, y_train, num_round=5)

Training: 100%|███████████████████████████████████████████████████████████████████████| 5/5 [00:05<00:00,  1.16s/epoch]


In [7]:
# 评估初始模型
initial_preds = xgb_model.predict(X_test)
initial_acc = accuracy_score(y_test, initial_preds)
print(f"Initial model accuracy: {initial_acc:.4f}")

Initial model accuracy: 0.9685


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [43]:
# 生成伪标签（把预测概率>= 0.8的拿来当训练集）
probs = xgb_model.predict_proba(X_test) # 预测测试集
pseudo_labels = np.argmax(probs, axis=1) # 将每个样本概率最大的类别作为标签
confidences = np.max(probs, axis=1) #拿到标签概率
high_confidence_indices = confidences >= 0.8 # 只要预测概率>=0.8的

pseudo_images = X_test[high_confidence_indices]
pseudo_labels = pseudo_labels[high_confidence_indices]

# 扩充训练集
expanded_train_images = np.vstack([X_train, pseudo_images])
expanded_train_labels = np.hstack([y_train, pseudo_labels])

In [44]:
# 重新训练模型
xgb_model_expanded = xgb.XGBClassifier(
    eval_metric='mlogloss',
    tree_method='hist',  # GPU加速需要设置为"hist"
    device='cuda',  # 使用GPU
)

In [45]:
train_with_progress(xgb_model_expanded, expanded_train_images, expanded_train_labels, num_round=5)


Training: 100%|███████████████████████████████████████████████████████████████████████| 5/5 [00:07<00:00,  1.47s/epoch]


In [46]:
# 评估扩展后的模型
expanded_preds = xgb_model_expanded.predict(X_test)
expanded_acc = accuracy_score(y_test, expanded_preds)
print(f"Expanded model accuracy: {expanded_acc:.4f}")

Expanded model accuracy: 0.9741
