In [1]:
import gdown
file_id = "1cPmNRjDZDevMK5vzXK7M12kwhFfpcjsb"
gdown.download(f"https://drive.google.com/uc?id={file_id}", output="TCGA_BRCA_RNA.h5ad", quiet=False)

!pip install scanpy
# !pip install tabpfn
!pip install autogluon


Downloading...
From (original): https://drive.google.com/uc?id=1cPmNRjDZDevMK5vzXK7M12kwhFfpcjsb
From (redirected): https://drive.google.com/uc?id=1cPmNRjDZDevMK5vzXK7M12kwhFfpcjsb&confirm=t&uuid=02db59ed-bf4b-430b-99fc-37a4a4305146
To: /content/TCGA_BRCA_RNA.h5ad
100%|██████████| 575M/575M [00:04<00:00, 126MB/s]


Collecting scanpy
  Downloading scanpy-1.11.4-py3-none-any.whl.metadata (9.2 kB)
Collecting anndata>=0.8 (from scanpy)
  Downloading anndata-0.12.2-py3-none-any.whl.metadata (9.6 kB)
Collecting legacy-api-wrap>=1.4.1 (from scanpy)
  Downloading legacy_api_wrap-1.4.1-py3-none-any.whl.metadata (2.1 kB)
Collecting session-info2 (from scanpy)
  Downloading session_info2-0.2.1-py3-none-any.whl.metadata (3.4 kB)
Collecting array-api-compat>=1.7.1 (from anndata>=0.8->scanpy)
  Downloading array_api_compat-1.12.0-py3-none-any.whl.metadata (2.5 kB)
Collecting zarr!=3.0.*,>=2.18.7 (from anndata>=0.8->scanpy)
  Downloading zarr-3.1.2-py3-none-any.whl.metadata (10 kB)
Collecting donfig>=0.8 (from zarr!=3.0.*,>=2.18.7->anndata>=0.8->scanpy)
  Downloading donfig-0.8.1.post1-py3-none-any.whl.metadata (5.0 kB)
Collecting numcodecs>=0.14 (from numcodecs[crc32c]>=0.14->zarr!=3.0.*,>=2.18.7->anndata>=0.8->scanpy)
  Downloading numcodecs-0.16.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.me

In [5]:
# ========== 配置 ==========
H5AD_PATH = "/content/TCGA_BRCA_RNA.h5ad"
SEED = 42

import os
os.environ["SCIPY_ARRAY_API"] = "1"


## Autogluon

In [6]:
#Autogluon
import os
import numpy as np
import pandas as pd
import anndata
import scanpy as sc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

from autogluon.tabular import TabularDataset, TabularPredictor

# ========== 配置 ==========
# H5AD_PATH = "/Users/xin/Desktop/DATA5703/TCGA-DNA-RNA-IMAGE-stage-classifier/RNA/TCGA_BRCA_RNA.h5ad"
SEED = 42
TOP_K = 500

# ========== 读取数据 ==========
adata = anndata.read_h5ad(H5AD_PATH)
print("✅ 原始数据维度:", adata.shape)

# ========== 清洗 inf/nan 和异常值 ==========
X = adata.X.toarray() if not isinstance(adata.X, np.ndarray) else adata.X.copy()
X = X.astype(np.float32)
X[np.isinf(X)] = np.nan
col_means = np.nanmean(X, axis=0)
inds = np.where(np.isnan(X))
X[inds] = np.take(col_means, inds[1])
X = np.clip(X, a_min=0, a_max=1e6)
adata.X = X

# ========== 标签处理 ==========
# 删除 stage 为 "Unknown" 的样本
adata = adata[adata.obs["stage"].astype(str) != "Unknown", :].copy()

# 提取标签
y = adata.obs["stage"].astype(str)
le = LabelEncoder()
y_encoded = le.fit_transform(y)
label_names = le.classes_
print("✅ 标签类别:", {i: name for i, name in enumerate(label_names)})

# 打印标签分布
from collections import Counter
print("📊 标签分布:", dict(Counter(y_encoded)))

# ========== 划分数据集 ==========
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    adata.X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=SEED
)

# ========== 特征选择 ==========
selector = SelectKBest(mutual_info_classif, k=TOP_K)
X_train_temp_sel = selector.fit_transform(X_train_temp, y_train_temp)
X_test_sel = selector.transform(X_test)

# 获取基因名
selected_genes = selector.get_support(indices=True)
gene_names = adata.var_names[selected_genes]

# ========== SMOTE 增加稀有类样本 ==========
smote = SMOTE(sampling_strategy="not majority", random_state=SEED)
X_train_sel, y_train = smote.fit_resample(X_train_temp_sel, y_train_temp)
print("✅ 训练集 after SMOTE:", X_train_sel.shape)

# ========== AutoGluon 训练 ==========
# 转为 dataframe 形式
train_df = pd.DataFrame(X_train_sel, columns=gene_names)
train_df["label"] = [label_names[i] for i in y_train]  # 用标签名更直观

test_df = pd.DataFrame(X_test_sel, columns=gene_names)
test_df["label"] = [label_names[i] for i in y_test]

# 创建 AutoGluon 数据集
train_data = TabularDataset(train_df)
test_data = TabularDataset(test_df)

print("🚀 AutoGluon 开始训练...")
predictor = TabularPredictor(label="label", path="autogluon_rna_output", eval_metric="f1_weighted").fit(
    train_data,
    time_limit=300,
    presets="high_quality",
    save_bag_folds=True,
    hyperparameters={
        'GBM': {},
        'CAT': {},
        'XGB': {},
        'NN_TORCH': {},
        'FASTAI': {},
        'RF': {}
    }
)

# ========== 评估 ==========
print("📊 Leaderboard:")
print(predictor.leaderboard(test_data, silent=True))

y_pred = predictor.predict(test_data)
print("\n📊 Classification Report (AutoGluon):")
print(classification_report(test_data["label"], y_pred, target_names=label_names))



✅ 原始数据维度: (1231, 58048)
✅ 标签类别: {0: 'Stage I', 1: 'Stage II', 2: 'Stage III', 3: 'Stage IV'}
📊 标签分布: {np.int64(1): 705, np.int64(0): 201, np.int64(3): 22, np.int64(2): 278}


Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.12.11
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Sun Mar 30 16:01:29 UTC 2025
CPU Count:          2
Memory Avail:       10.14 GB / 12.67 GB (80.0%)
Disk Space Avail:   66.83 GB / 112.64 GB (59.3%)
Presets specified: ['high_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked overfitting.
	Running DyStack for up to 75s of th

✅ 训练集 after SMOTE: (2252, 500)
🚀 AutoGluon 开始训练...


	Running DyStack sub-fit in a ray process to avoid memory leakage. Enabling ray logging (enable_ray_logging=True). Specify `ds_args={'enable_ray_logging': False}` if you experience logging issues.
2025-09-05 07:31:12,206	INFO worker.py:1843 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m
		Context path: "/content/autogluon_rna_output/ds_sub_fit/sub_fit_ho"
[36m(_dystack pid=4807)[0m Running DyStack sub-fit ...
[36m(_dystack pid=4807)[0m Beginning AutoGluon training ... Time limit = 70s
[36m(_dystack pid=4807)[0m AutoGluon will save models to "/content/autogluon_rna_output/ds_sub_fit/sub_fit_ho"
[36m(_dystack pid=4807)[0m Train Data Rows:    2001
[36m(_dystack pid=4807)[0m Train Data Columns: 500
[36m(_dystack pid=4807)[0m Label Column:       label
[36m(_dystack pid=4807)[0m Problem Type:       multiclass
[36m(_dystack pid=4807)[0m Preprocessing data ...
[36m(_dystack pid=4807)[0m Train Data Class Count: 4
[36m(_dystack

📊 Leaderboard:
                         model  score_test  score_val  eval_metric  \
0     WeightedEnsemble_L2_FULL    0.513073        NaN  f1_weighted   
1          WeightedEnsemble_L2    0.501567   0.904910  f1_weighted   
2     WeightedEnsemble_L3_FULL    0.487110        NaN  f1_weighted   
3          WeightedEnsemble_L3    0.486460   0.899178  f1_weighted   
4         LightGBM_BAG_L1_FULL    0.484817        NaN  f1_weighted   
5       NeuralNetFastAI_BAG_L2    0.478218   0.896251  f1_weighted   
6  NeuralNetFastAI_BAG_L2_FULL    0.473388        NaN  f1_weighted   
7              LightGBM_BAG_L1    0.468291   0.883297  f1_weighted   
8  NeuralNetFastAI_BAG_L1_FULL    0.452281        NaN  f1_weighted   
9       NeuralNetFastAI_BAG_L1    0.444549   0.887683  f1_weighted   

   pred_time_test  pred_time_val    fit_time  pred_time_test_marginal  \
0        0.026739            NaN   12.014978                 0.002405   
1        0.202433       0.286155  143.403640                 0.00198

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Lightgbm

In [7]:
import os
import numpy as np
import pandas as pd
import anndata
import scanpy as sc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import lightgbm as lgb

# ========= 配置 =========
# H5AD_PATH = "/Users/xin/Desktop/DATA5703/5703TCGA_BRCA/RNA/TCGA_BRCA_RNA.h5ad"
SEED = 42
TOP_K = 500

# ========= 读取数据 =========
adata = anndata.read_h5ad(H5AD_PATH)
print("✅ 原始数据维度:", adata.shape)

# ========= 清洗 inf/nan 和异常值 =========
X = adata.X.toarray() if not isinstance(adata.X, np.ndarray) else adata.X.copy()
X = X.astype(np.float32)
X[np.isinf(X)] = np.nan
col_means = np.nanmean(X, axis=0)
inds = np.where(np.isnan(X))
X[inds] = np.take(col_means, inds[1])
X = np.clip(X, a_min=0, a_max=1e6)
adata.X = X

# ========= 标签处理 =========
adata.obs["stage_merged"] = adata.obs["stage"].astype(str).replace({"Unknown": "Stage IV"})
y = adata.obs["stage_merged"]
le = LabelEncoder()
y_encoded = le.fit_transform(y)
label_map = {i: label for i, label in enumerate(le.classes_)}
num_classes = len(label_map)
print("✅ 标签类别:", label_map)

# ========= 划分数据集 =========
X_train_temp, X_temp, y_train_temp, y_temp = train_test_split(
    adata.X, y_encoded, test_size=0.3, stratify=y_encoded, random_state=SEED
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=SEED
)

# ========= 特征选择 =========
selector = SelectKBest(f_classif, k=TOP_K)
X_train_temp_sel = selector.fit_transform(X_train_temp, y_train_temp)
X_val_sel = selector.transform(X_val)
X_test_sel = selector.transform(X_test)

selected_genes = selector.get_support(indices=True)
selected_gene_names = adata.var_names[np.array(selected_genes)]
print("✅ 特征选择完成，Top K 基因数:", len(selected_gene_names))

# ========= 仅 SMOTE 过采样 =========
smote = SMOTE(sampling_strategy="not majority", random_state=SEED)
# smote = SMOTE(sampling_strategy={'Stage I': 80, 'Stage IV': 80}, random_state=SEED)
X_train_sel, y_train = smote.fit_resample(X_train_temp_sel, y_train_temp)
print("✅ SMOTE 后训练集维度:", X_train_sel.shape)

# ========= LightGBM 训练 =========
print("🚀 Training LightGBM (with class_weight='balanced') ...")
lgb_train = lgb.Dataset(X_train_sel, label=y_train)
lgb_val = lgb.Dataset(X_val_sel, label=y_val, reference=lgb_train)

params = {
    "objective": "multiclass",
    "num_class": num_classes,
    "metric": "multi_logloss",
    "learning_rate": 0.05,
    "num_leaves": 31,
    "seed": SEED,
    "verbosity": -1,
    "class_weight": "balanced"  # ✅ 加入类别平衡参数
}

model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_val],
    num_boost_round=100
)

# ========= 评估 =========
print("\n📊 Classification Report on Test Set (LightGBM):")
y_pred = model.predict(X_test_sel)
y_pred_labels = np.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred_labels, target_names=list(label_map.values())))


✅ 原始数据维度: (1231, 58048)
✅ 标签类别: {0: 'Stage I', 1: 'Stage II', 2: 'Stage III', 3: 'Stage IV'}


 25678 25992 27209 28021 28121 28634 29832 29946 30191 30227 30256 30316
 30406 31125 31558 31758 31848 31929 32815 32869 32988 33419 33785 33966
 35383 35547 37196 37399 37574 38002 38250 38533 39587 39878 41806 42909
 44204 44756 45041 45226 45451 45502 45539 45541 45965 45975 46639 47101
 47517 48796 49278 49488 49601 49621 50585 50747 50774 50947 51310 51801
 52359 52832 52911 53706 55028 55056 55840 56594] are constant.
  f = msb / msw


✅ 特征选择完成，Top K 基因数: 500
✅ SMOTE 后训练集维度: (1972, 500)
🚀 Training LightGBM (with class_weight='balanced') ...

📊 Classification Report on Test Set (LightGBM):
              precision    recall  f1-score   support

     Stage I       0.17      0.03      0.06        30
    Stage II       0.55      0.78      0.65       106
   Stage III       0.28      0.19      0.23        42
    Stage IV       0.00      0.00      0.00         7

    accuracy                           0.50       185
   macro avg       0.25      0.25      0.23       185
weighted avg       0.41      0.50      0.43       185



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## CatBoost

In [8]:
import os
import numpy as np
import pandas as pd
import anndata
import scanpy as sc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from collections import Counter
from catboost import CatBoostClassifier

# ========= 配置 =========
# H5AD_PATH = "/Users/xin/Desktop/DATA5703/5703TCGA_BRCA/RNA/TCGA_BRCA_RNA.h5ad"
SEED = 42
TOP_K = 500

# ========= 读取数据 =========
adata = anndata.read_h5ad(H5AD_PATH)
print("✅ 原始数据维度:", adata.shape)

# ========= 清洗 inf/nan 和异常值 =========
X = adata.X.toarray() if not isinstance(adata.X, np.ndarray) else adata.X.copy()
X = X.astype(np.float32)
X[np.isinf(X)] = np.nan
col_means = np.nanmean(X, axis=0)
inds = np.where(np.isnan(X))
X[inds] = np.take(col_means, inds[1])
X = np.clip(X, a_min=0, a_max=1e6)
adata.X = X

# ========= 标签处理 =========
adata.obs["stage_merged"] = adata.obs["stage"].astype(str).replace({"Unknown": "Stage IV"})
y = adata.obs["stage_merged"]
le = LabelEncoder()
y_encoded = le.fit_transform(y)
label_map = {i: label for i, label in enumerate(le.classes_)}
num_classes = len(label_map)
print("✅ 标签类别:", label_map)

# ========= 划分数据集 =========
X_train_temp, X_temp, y_train_temp, y_temp = train_test_split(
    adata.X, y_encoded, test_size=0.3, stratify=y_encoded, random_state=SEED
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=SEED
)

# ========= 特征选择 =========
selector = SelectKBest(score_func=mutual_info_classif, k=TOP_K)
X_train_sel_temp = selector.fit_transform(X_train_temp, y_train_temp)
X_val_sel = selector.transform(X_val)
X_test_sel = selector.transform(X_test)
selected_genes = selector.get_support(indices=True)
selected_gene_names = adata.var_names[selected_genes]
print("✅ 特征选择完成，Top K 基因数:", len(selected_gene_names))

# ========= 定向 SMOTE（只增强稀有类）=========
print("📊 训练前类别分布:", Counter(y_train_temp))
smote = SMOTE(sampling_strategy={0: 300, 2: 350, 3: 200}, random_state=SEED)  # Stage I, III, IV
X_train_sel, y_train = smote.fit_resample(X_train_sel_temp, y_train_temp)
print("✅ SMOTE 后类别分布:", Counter(y_train))

# ========= CatBoost 训练 =========
print("🚀 Training CatBoost...")
catboost_model = CatBoostClassifier(
    iterations=300,
    learning_rate=0.05,
    depth=6,
    loss_function="MultiClass",
    random_seed=SEED,
    verbose=100,
    class_weights=[2.5, 1.0, 2.0, 5.0]  # 可选：手动设置类别权重
)
catboost_model.fit(X_train_sel, y_train)

# ========= 评估 =========
print("\n📊 Classification Report on Test Set(CatBoost):")
y_pred = catboost_model.predict(X_test_sel)
print(classification_report(y_test, y_pred, target_names=list(label_map.values())))


✅ 原始数据维度: (1231, 58048)
✅ 标签类别: {0: 'Stage I', 1: 'Stage II', 2: 'Stage III', 3: 'Stage IV'}
✅ 特征选择完成，Top K 基因数: 500
📊 训练前类别分布: Counter({np.int64(1): 493, np.int64(2): 194, np.int64(0): 141, np.int64(3): 33})
✅ SMOTE 后类别分布: Counter({np.int64(1): 493, np.int64(2): 350, np.int64(0): 300, np.int64(3): 200})
🚀 Training CatBoost...
0:	learn: 1.3612850	total: 415ms	remaining: 2m 4s
100:	learn: 0.6561172	total: 29.4s	remaining: 57.9s
200:	learn: 0.4257998	total: 58.7s	remaining: 28.9s
299:	learn: 0.3042500	total: 1m 29s	remaining: 0us

📊 Classification Report on Test Set(CatBoost):
              precision    recall  f1-score   support

     Stage I       0.26      0.27      0.26        30
    Stage II       0.59      0.58      0.58       106
   Stage III       0.26      0.31      0.28        42
    Stage IV       0.00      0.00      0.00         7

    accuracy                           0.44       185
   macro avg       0.28      0.29      0.28       185
weighted avg       0.44      0.44     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## TabPFN


In [10]:
!pip install tabpfn

Collecting tabpfn
  Downloading tabpfn-2.1.3-py3-none-any.whl.metadata (27 kB)
Collecting eval-type-backport>=0.2.2 (from tabpfn)
  Downloading eval_type_backport-0.2.2-py3-none-any.whl.metadata (2.2 kB)
Downloading tabpfn-2.1.3-py3-none-any.whl (160 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.8/160.8 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading eval_type_backport-0.2.2-py3-none-any.whl (5.8 kB)
Installing collected packages: eval-type-backport, tabpfn
Successfully installed eval-type-backport-0.2.2 tabpfn-2.1.3


In [11]:
import os
import numpy as np
import pandas as pd
import torch
import anndata
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from tabpfn import TabPFNClassifier
from collections import Counter

# ========== 配置 ==========
# H5AD_PATH = "./RNA/TCGA_BRCA_RNA.h5ad"
SEED = 42
TOP_K = 500

# ========== 读取 .h5ad ==========
adata = anndata.read_h5ad(H5AD_PATH)
X = adata.X.toarray() if not isinstance(adata.X, np.ndarray) else adata.X.copy()
X = X.astype(np.float32)
print("✅ 原始数据维度:", X.shape)

# ========== 数据清洗 ==========
X[np.isinf(X)] = np.nan
col_means = np.nanmean(X, axis=0)
inds = np.where(np.isnan(X))
X[inds] = np.take(col_means, inds[1])
X = np.clip(X, a_min=0, a_max=1e6)
adata.X = X

# ========== 标签处理（去除 Unknown）==========
adata = adata[adata.obs["stage"].astype(str) != "Unknown", :].copy()
y = adata.obs["stage"].astype(str)
le = LabelEncoder()
y_encoded = le.fit_transform(y)
label_names = le.classes_
print("✅ 标签类别:", dict(enumerate(label_names)))
print("📊 标签分布:", dict(Counter(y_encoded)))

# ========== 划分数据集（Stratify）==========
X_train, X_test, y_train, y_test = train_test_split(
    adata.X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=SEED
)

# ========== 特征选择（互信息）==========
selector = SelectKBest(mutual_info_classif, k=TOP_K)
X_train_sel = selector.fit_transform(X_train, y_train)
X_test_sel = selector.transform(X_test)

# ========== SMOTE 过采样稀有类 ==========
smote = SMOTE(sampling_strategy="not majority", random_state=SEED)
X_train_res, y_train_res = smote.fit_resample(X_train_sel, y_train)
print("✅ SMOTE后训练集维度:", X_train_res.shape)
print("📊 新标签分布:", dict(Counter(y_train_res)))

# ========== TabPFN 训练 ==========
clf = TabPFNClassifier(
    device="cuda" if torch.cuda.is_available() else "cpu",
    ignore_pretraining_limits=True
)
print("🚀 开始训练 TabPFN...")
clf.fit(X_train_res, y_train_res)

# ========== 预测 & 报告 ==========
y_pred = clf.predict(X_test_sel)
print("\n📊 Classification Report (TabPFN):")
print(classification_report(y_test, y_pred, target_names=label_names))

✅ 原始数据维度: (1231, 58048)
✅ 标签类别: {0: 'Stage I', 1: 'Stage II', 2: 'Stage III', 3: 'Stage IV'}
📊 标签分布: {np.int64(1): 705, np.int64(0): 201, np.int64(3): 22, np.int64(2): 278}
✅ SMOTE后训练集维度: (2252, 500)
📊 新标签分布: {np.int64(1): 563, np.int64(2): 563, np.int64(0): 563, np.int64(3): 563}
🚀 开始训练 TabPFN...


tabpfn-v2-classifier-finetuned-zk73skhh.(…):   0%|          | 0.00/29.0M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/37.0 [00:00<?, ?B/s]


📊 Classification Report (TabPFN):
              precision    recall  f1-score   support

     Stage I       0.00      0.00      0.00        40
    Stage II       0.59      0.97      0.73       142
   Stage III       0.60      0.05      0.10        56
    Stage IV       0.00      0.00      0.00         4

    accuracy                           0.58       242
   macro avg       0.30      0.26      0.21       242
weighted avg       0.48      0.58      0.45       242



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
