In [1]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import os
import random
import matplotlib.pyplot as plt

In [2]:
seed=1234
random.seed(seed)
np.random.seed(seed)
Datasets=['Matek', 'Acevedo']
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [3]:
x = np.load('./X.npy')
y = np.load('./y.npy')
dataset = np.load('./dataset.npy')
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
print(x.shape)

(32890, 768)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from matplotlib.colors import ListedColormap

# 加载数据
x = np.load('./X.npy')
y = np.load('./y.npy')

# 定义标签映射
label_map = {
    'basophil': 0,
    'eosinophil': 1,
    'erythroblast': 2,
    'myeloblast': 3,
    'promyelocyte': 4,
    'myelocyte': 5,
    'metamyelocyte': 6,
    'neutrophil_banded': 7,
    'neutrophil_segmented': 8,
    'monocyte': 9,
    'lymphocyte_typical': 10,
    'lymphocyte_atypical': 11,
    'smudge_cell': 12,
}

# 反向映射：从数字到细胞名称
inverse_label_map = {v: k.replace("_", " ") for k, v in label_map.items()}

# 使用 t-SNE 降维到2D空间
tsne = TSNE(n_components=2, perplexity=30, n_iter=1000)
x_tsne = tsne.fit_transform(x)

# 定义13种易于区分的颜色
colors = [
    '#956CB9', '#F9BB7A', '#C4AFD2', '#FE8111', '#8C554C', '#FA9A92', 
    '#97E187', '#39A035', '#AFC7E6', '#2377B3', '#D3292C', '#5DD1DD', 
    '#D0D065'
]
cmap = ListedColormap(colors[:13])  # 确保只取13种颜色

# 可视化
plt.figure(figsize=(12, 8))
scatter = plt.scatter(x_tsne[:, 0], x_tsne[:, 1], c=y, cmap=cmap, s=5, alpha=0.8)

# # 创建图例
# handles = [
#     plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=color, markersize=8) 
#     for color in colors[:13]
# ]
# labels = [inverse_label_map[i] for i in range(13)]
# plt.legend(handles, labels, loc='best', title="Cell Types", fontsize=10, title_fontsize=12)

# 去除标题、坐标轴和网格
plt.gca().set_title('')
plt.gca().set_xlabel('')
plt.gca().set_ylabel('')
plt.gca().grid(False)

# 去除所有坐标轴
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['left'].set_visible(False)
plt.gca().spines['bottom'].set_visible(False)
plt.gca().tick_params(left=False, bottom=False, labelleft=False, labelbottom=False)

# 保存为 PDF
plt.savefig("t-SNE.pdf", format="pdf")
plt.show()


In [4]:
#preprocess for xgboost
y[y==10]=-1
y[y==3]=10
y[y==-1]=3

In [5]:
X={}
Y={}
for ds in range(len(Datasets)):
    X[ds] = x[dataset == ds]
    Y[ds] = y[dataset == ds]

In [6]:
result={ds : {ds : np.zeros(5) for ds in Datasets} for ds in Datasets}
cm=[np.zeros((13,13)), np.zeros((11,11))]
print("RandomForestClassifier :")
for ds in range(len(Datasets)):
    for fold, (train_index, test_index) in enumerate(kf.split(X[ds], Y[ds])):
        rf = RandomForestClassifier(n_estimators=200, max_depth=16, n_jobs=64, random_state=seed)
        rf.fit(X[ds][train_index],Y[ds][train_index])
        pred = rf.predict(X[ds][test_index])
        accuracy = accuracy_score(Y[ds][test_index], pred)
        result[Datasets[ds]][Datasets[ds]][fold] = accuracy
        for val_ds in range(len(Datasets)):
            if val_ds == ds:
                continue
            pred = rf.predict(x[dataset == val_ds])
            # cm[val_ds] += confusion_matrix(y[dataset == val_ds], pred)
            accuracy = accuracy_score(y[dataset == val_ds], pred)
            result[Datasets[ds]][Datasets[val_ds]][fold] = accuracy

for ds in Datasets:
    if ds != 'SYSU3H':
        print("train on {} :".format(ds))
        for val_ds in Datasets:
            print("test on {}, acc mean : {:.6}, acc std : {:.4}".format(val_ds, result[ds][val_ds].mean(), result[ds][val_ds].std()))

RandomForestClassifier :
train on Matek :
test on Matek, acc mean : 0.945386, acc std : 0.003678
test on Acevedo, acc mean : 0.367545, acc std : 0.007421
train on Acevedo :
test on Matek, acc mean : 0.425933, acc std : 0.01854
test on Acevedo, acc mean : 0.870179, acc std : 0.009402


In [7]:
result={ds : {ds : np.zeros(5) for ds in Datasets} for ds in Datasets}
print("XGBoost :")
for ds in range(len(Datasets)):
    if Datasets[ds] != 'SYSU3H':
        for fold, (train_index, test_index) in enumerate(kf.split(X[ds], Y[ds])):
            xgboost = XGBClassifier(tree_method = "hist", device = "cuda",random_state=seed)
            xgboost.fit(X[ds][train_index],Y[ds][train_index])
            pred = xgboost.predict(X[ds][test_index])
            accuracy = accuracy_score(Y[ds][test_index], pred)
            result[Datasets[ds]][Datasets[ds]][fold] = accuracy
            for val_ds in range(len(Datasets)):
                if val_ds == ds:
                    continue
                pred = xgboost.predict(x[dataset == val_ds])
                accuracy = accuracy_score(y[dataset == val_ds], pred)
                result[Datasets[ds]][Datasets[val_ds]][fold] = accuracy

    
for ds in Datasets:
    if ds != 'SYSU3H':
        print("train on {} :".format(ds))
        for val_ds in Datasets:
            print("test on {}, acc mean : {:.6}, acc std : {:.4}".format(val_ds, result[ds][val_ds].mean(), result[ds][val_ds].std()))

XGBoost :


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




train on Matek :
test on Matek, acc mean : 0.954652, acc std : 0.003707
test on Acevedo, acc mean : 0.422911, acc std : 0.03275
train on Acevedo :
test on Matek, acc mean : 0.355546, acc std : 0.01374
test on Acevedo, acc mean : 0.882213, acc std : 0.009202


In [8]:
result={ds : {ds : np.zeros(5) for ds in Datasets} for ds in Datasets}
print('SVM(poly) :')
for ds in range(len(Datasets)):
    for fold, (train_index, test_index) in enumerate(kf.split(X[ds], Y[ds])):
        scaler = StandardScaler()
        svc = SVC(kernel='poly', random_state=seed)
        svc.fit(scaler.fit_transform(X[ds][train_index]),Y[ds][train_index])
        pred = svc.predict(scaler.transform(X[ds][test_index]))
        accuracy = accuracy_score(Y[ds][test_index], pred)
        result[Datasets[ds]][Datasets[ds]][fold] = accuracy
        for val_ds in range(len(Datasets)):
            if val_ds == ds:
                continue
            pred = svc.predict(scaler.transform(x[dataset == val_ds]))
            accuracy = accuracy_score(y[dataset == val_ds], pred)
            result[Datasets[ds]][Datasets[val_ds]][fold] = accuracy

    
for ds in Datasets:
    if ds != 'SYSU3H':
        print("train on {} :".format(ds))
        for val_ds in Datasets:
            print("test on {}, acc mean : {:.6}, acc std : {:.4}".format(val_ds, result[ds][val_ds].mean(), result[ds][val_ds].std()))

SVM(poly) :
train on Matek :
test on Matek, acc mean : 0.958522, acc std : 0.00266
test on Acevedo, acc mean : 0.507213, acc std : 0.01042
train on Acevedo :
test on Matek, acc mean : 0.400131, acc std : 0.008172
test on Acevedo, acc mean : 0.890326, acc std : 0.00587


In [9]:
result={ds : {ds : np.zeros(5) for ds in Datasets} for ds in Datasets}
print('SVM(linear) :')
for ds in range(len(Datasets)):
    for fold, (train_index, test_index) in enumerate(kf.split(X[ds], Y[ds])):
        scaler = StandardScaler()
        svc = SVC(kernel='linear', random_state=seed)
        svc.fit(scaler.fit_transform(X[ds][train_index]),Y[ds][train_index])
        pred = svc.predict(scaler.transform(X[ds][test_index]))
        accuracy = accuracy_score(Y[ds][test_index], pred)
        result[Datasets[ds]][Datasets[ds]][fold] = accuracy
        for val_ds in range(len(Datasets)):
            if val_ds == ds:
                continue
            pred = svc.predict(scaler.transform(x[dataset == val_ds]))
            accuracy = accuracy_score(y[dataset == val_ds], pred)
            # if val_ds==2 and ds == 1:
            #     print(pred)
            result[Datasets[ds]][Datasets[val_ds]][fold] = accuracy

    
for ds in Datasets:
    if ds =='SYSU3H':
        continue
    print("train on {} :".format(ds))
    for val_ds in Datasets:
        print("test on {}, acc mean : {:.6}, acc std : {:.4}".format(val_ds, result[ds][val_ds].mean(), result[ds][val_ds].std()))

SVM(linear) :
train on Matek :
test on Matek, acc mean : 0.948111, acc std : 0.002211
test on Acevedo, acc mean : 0.519961, acc std : 0.06289
train on Acevedo :
test on Matek, acc mean : 0.617016, acc std : 0.02109
test on Acevedo, acc mean : 0.893282, acc std : 0.004845


In [10]:
result={ds : {ds : np.zeros(5) for ds in Datasets} for ds in Datasets}
print('mlp :')
for ds in range(len(Datasets)):
    for fold, (train_index, test_index) in enumerate(kf.split(X[ds], Y[ds])):
        scaler = StandardScaler()
        mlp = MLPClassifier(max_iter=1000, early_stopping=True, random_state=seed)
        mlp.fit(scaler.fit_transform(X[ds][train_index]),Y[ds][train_index])
        pred = mlp.predict(scaler.transform(X[ds][test_index]))
        accuracy = accuracy_score(Y[ds][test_index], pred)
        result[Datasets[ds]][Datasets[ds]][fold] = accuracy
        for val_ds in range(len(Datasets)):
            if val_ds == ds:
                continue
            pred = mlp.predict(scaler.transform(x[dataset == val_ds]))
            accuracy = accuracy_score(y[dataset == val_ds], pred)
            result[Datasets[ds]][Datasets[val_ds]][fold] = accuracy

    
for ds in Datasets:
    if ds != 'SYSU3H':
        print("train on {} :".format(ds))
        for val_ds in Datasets:
            print("test on {}, acc mean : {:.6}, acc std : {:.4}".format(val_ds, result[ds][val_ds].mean(), result[ds][val_ds].std()))

mlp :
train on Matek :
test on Matek, acc mean : 0.959503, acc std : 0.002268
test on Acevedo, acc mean : 0.565138, acc std : 0.0127
train on Acevedo :
test on Matek, acc mean : 0.684275, acc std : 0.02959
test on Acevedo, acc mean : 0.893626, acc std : 0.004349


In [11]:
result={ds : {ds : np.zeros(5) for ds in Datasets} for ds in Datasets}
print('LogisticRegression :')
for ds in range(len(Datasets)):
    for fold, (train_index, test_index) in enumerate(kf.split(X[ds], Y[ds])):
        scaler = StandardScaler()
        lr = LogisticRegression(max_iter=500, random_state=seed)
        lr.fit(scaler.fit_transform(X[ds][train_index]),Y[ds][train_index])
        pred = lr.predict(scaler.transform(X[ds][test_index]))
        accuracy = accuracy_score(Y[ds][test_index], pred)
        result[Datasets[ds]][Datasets[ds]][fold] = accuracy
        for val_ds in range(len(Datasets)):
            if val_ds == ds:
                continue
            pred = lr.predict(scaler.transform(x[dataset == val_ds]))
            accuracy = accuracy_score(y[dataset == val_ds], pred)
            result[Datasets[ds]][Datasets[val_ds]][fold] = accuracy

    
for ds in Datasets:
    if ds != 'SYSU3H':
        print("train on {} :".format(ds))
        for val_ds in Datasets:
            print("test on {}, acc mean : {:.6}, acc std : {:.4}".format(val_ds, result[ds][val_ds].mean(), result[ds][val_ds].std()))

LogisticRegression :


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

train on Matek :
test on Matek, acc mean : 0.954816, acc std : 0.001304
test on Acevedo, acc mean : 0.415691, acc std : 0.07102
train on Acevedo :
test on Matek, acc mean : 0.682826, acc std : 0.02353
test on Acevedo, acc mean : 0.894933, acc std : 0.002537
