In [1]:
!pip install scanpy

import gdown
import os
import random
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.sparse
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_selection import SelectKBest, f_classif

Collecting scanpy
  Downloading scanpy-1.11.4-py3-none-any.whl.metadata (9.2 kB)
Collecting anndata>=0.8 (from scanpy)
  Downloading anndata-0.12.2-py3-none-any.whl.metadata (9.6 kB)
Collecting legacy-api-wrap>=1.4.1 (from scanpy)
  Downloading legacy_api_wrap-1.4.1-py3-none-any.whl.metadata (2.1 kB)
Collecting session-info2 (from scanpy)
  Downloading session_info2-0.2.2-py3-none-any.whl.metadata (3.4 kB)
Collecting array-api-compat>=1.7.1 (from anndata>=0.8->scanpy)
  Downloading array_api_compat-1.12.0-py3-none-any.whl.metadata (2.5 kB)
Collecting zarr!=3.0.*,>=2.18.7 (from anndata>=0.8->scanpy)
  Downloading zarr-3.1.3-py3-none-any.whl.metadata (10 kB)
Collecting donfig>=0.8 (from zarr!=3.0.*,>=2.18.7->anndata>=0.8->scanpy)
  Downloading donfig-0.8.1.post1-py3-none-any.whl.metadata (5.0 kB)
Collecting numcodecs>=0.14 (from numcodecs[crc32c]>=0.14->zarr!=3.0.*,>=2.18.7->anndata>=0.8->scanpy)
  Downloading numcodecs-0.16.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.me

In [2]:
# Download dataset
file_id = "1hjpD4dIdVZGsOsdvYVrBskpCTGLNXjMN"
gdown.download(f"https://drive.google.com/uc?id={file_id}", output="TCGA_BRCA_RNA_HiSeqV2.h5ad", quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=1hjpD4dIdVZGsOsdvYVrBskpCTGLNXjMN
From (redirected): https://drive.google.com/uc?id=1hjpD4dIdVZGsOsdvYVrBskpCTGLNXjMN&confirm=t&uuid=11ac1972-b503-4d3c-a472-585dabb3a955
To: /content/TCGA_BRCA_RNA_HiSeqV2.h5ad
100%|██████████| 258M/258M [00:01<00:00, 156MB/s]


'TCGA_BRCA_RNA_HiSeqV2.h5ad'

In [18]:
# Config
os.environ["SCIPY_ARRAY_API"] = "1"

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", DEVICE)
os.environ['PYTHONHASHSEED'] = str(SEED)

H5AD_PATH = "/content/TCGA_BRCA_RNA_HiSeqV2.h5ad"

# ===== 参数 =====
BATCH_SIZE = 16
LR = 1e-3
NUM_EPOCHS = 20
HIDDEN1 = 1024
HIDDEN2 = 256
DROPOUT_RATE = 0.5
MODEL_SAVE_PATH = "./mlp_best.pt"


Using device: cuda


In [19]:
# ===== 读取数据 =====
adata = sc.read_h5ad(H5AD_PATH)
adata.var_names_make_unique()

# ===== 只保留合法标签 =====
valid_stages = ["Stage I", "Stage II", "Stage III", "Stage IV"]
adata = adata[adata.obs["stage"].isin(valid_stages)].copy()

# ===== 特征矩阵 & 标签 =====
import scipy.sparse
X = adata.X.toarray() if scipy.sparse.issparse(adata.X) else np.array(adata.X)
labels = adata.obs["stage"].astype(str).values
le = LabelEncoder()
y = le.fit_transform(labels)  # 0~3

selector = SelectKBest(score_func=f_classif, k=1000)
X_rna_selected = selector.fit_transform(X, y)
X = X_rna_selected

INPUT_DIM = X.shape[1]
NUM_CLASSES = len(le.classes_)
print("X shape:", X.shape)
print("Classes:", le.classes_)

sample_indices = np.arange(len(X))  # 或者 adata.obs_names.to_numpy()
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X, y, sample_indices, test_size=0.2, stratify=y, random_state=SEED
)

# ===== 类别权重 =====
class_weights = compute_class_weight("balanced", classes=np.unique(y_train), y=y_train)
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(DEVICE)
print("Class weights:", class_weights.cpu().numpy())

# ===== 自定义 Dataset =====
class RNAStageDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)
    def __len__(self): return len(self.y)
    def __getitem__(self, idx): return self.X[idx], self.y[idx]

train_loader = DataLoader(RNAStageDataset(X_train, y_train), batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(RNAStageDataset(X_test, y_test), batch_size=BATCH_SIZE, shuffle=False)

# ===== 定义 MLP 模型 =====
class MLPClassifier(nn.Module):
    def __init__(self, input_dim, hidden1, hidden2, num_classes, dropout_rate=0.5):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden1)
        self.bn1 = nn.BatchNorm1d(hidden1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.bn2 = nn.BatchNorm1d(hidden2)
        self.out = nn.Linear(hidden2, num_classes)
    def forward(self, x):
        x = self.fc1(x); x = self.bn1(x); x = self.relu(x); x = self.dropout(x)
        x = self.fc2(x); x = self.bn2(x); x = self.relu(x); x = self.dropout(x)
        return self.out(x)

model = MLPClassifier(INPUT_DIM, HIDDEN1, HIDDEN2, NUM_CLASSES, DROPOUT_RATE).to(DEVICE)
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=1e-4)


X shape: (1216, 1000)
Classes: ['Stage I' 'Stage II' 'Stage III' 'Stage IV']
Class weights: [1.5283018  0.43862817 1.0995475  6.394737  ]


  1100  1188  1255  1256  1259  1338  1363  1367  1399  1425  1430  1461
  1619  1655  1743  1766  1891  1993  1994  2003  2031  2032  2224  2225
  2231  2236  2248  2476  2691  2693  2698  2699  2733  2889  2903  3039
  3048  3049  3076  3079  3273  3284  3288  3425  3463  3895  3994  4036
  4050  4051  4052  4135  4933  5009  5033  5242  5299  5371  5418  5439
  5606  5607  5609  5648  5755  6030  6049  6051  6118  6125  6147  6323
  6324  6825  6852  6855  6856  7207  7218  7357  7403  7420  7421  7440
  7602  7800  7857  7858  8000  8025  8107  8565  9029  9200  9305  9485
  9621  9625  9666  9755 10262 10315 10458 10464 10502 10527 10528 10585
 10605 10608 10668 10690 10958 10959 10962 10963 10964 10965 10966 10996
 10999 11124 11125 11192 11372 11776 11905 12140 12144 12231 12585 12593
 12628 12680 12709 12752 12820 12822 12829 13030 13032 13108 13189 13236
 13297 13303 13304 13307 13322 13472 13608 13629 13667 13668 13746 13862
 13866 13882 13905 13908 13909 13911 13912 13918 13

In [20]:
# ===== 训练循环 =====
best_f1 = 0.0
for epoch in range(1, NUM_EPOCHS + 1):
    model.train(); total_loss = 0.0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(DEVICE), batch_y.to(DEVICE)
        optimizer.zero_grad()
        logits = model(batch_X)
        loss = criterion(logits, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * batch_X.size(0)
    avg_loss = total_loss / len(train_loader.dataset)

    # ===== 验证阶段 =====
    model.eval(); preds, truths = [], []
    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            batch_X = batch_X.to(DEVICE)
            logits = model(batch_X)
            preds_batch = torch.argmax(logits, dim=1).cpu().numpy()
            preds.extend(preds_batch)
            truths.extend(batch_y.numpy())

    acc = accuracy_score(truths, preds)
    f1 = f1_score(truths, preds, average="macro", zero_division=0)
    print(f"Epoch {epoch}/{NUM_EPOCHS} - Loss: {avg_loss:.4f} | Acc: {acc:.4f} | Macro F1: {f1:.4f}")

    if f1 > best_f1:
        best_f1 = f1
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
        print("✅ New best model saved!")

Epoch 1/20 - Loss: 1.4935 | Acc: 0.3770 | Macro F1: 0.2517
✅ New best model saved!
Epoch 2/20 - Loss: 1.3448 | Acc: 0.3525 | Macro F1: 0.3034
✅ New best model saved!
Epoch 3/20 - Loss: 1.2240 | Acc: 0.4631 | Macro F1: 0.3189
✅ New best model saved!
Epoch 4/20 - Loss: 1.0791 | Acc: 0.2951 | Macro F1: 0.2410
Epoch 5/20 - Loss: 1.0181 | Acc: 0.4508 | Macro F1: 0.4013
✅ New best model saved!
Epoch 6/20 - Loss: 0.8815 | Acc: 0.4057 | Macro F1: 0.3582
Epoch 7/20 - Loss: 0.7686 | Acc: 0.4344 | Macro F1: 0.3443
Epoch 8/20 - Loss: 0.7527 | Acc: 0.1680 | Macro F1: 0.1808
Epoch 9/20 - Loss: 0.6938 | Acc: 0.4016 | Macro F1: 0.3590
Epoch 10/20 - Loss: 0.6115 | Acc: 0.3320 | Macro F1: 0.2643
Epoch 11/20 - Loss: 0.6036 | Acc: 0.3525 | Macro F1: 0.2749
Epoch 12/20 - Loss: 0.5665 | Acc: 0.5082 | Macro F1: 0.3286
Epoch 13/20 - Loss: 0.5624 | Acc: 0.4467 | Macro F1: 0.3180
Epoch 14/20 - Loss: 0.6076 | Acc: 0.3279 | Macro F1: 0.2966
Epoch 15/20 - Loss: 0.4878 | Acc: 0.4672 | Macro F1: 0.2784
Epoch 16/20 -

In [21]:
# ====== ✅ 测试阶段：Raw + Soft Threshold ======
model.load_state_dict(torch.load(MODEL_SAVE_PATH))
model.eval()

soft_preds, raw_preds, truths = [], [], []
proba_all = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        batch_X = batch_X.to(DEVICE)
        logits = model(batch_X)
        probs = torch.softmax(logits, dim=1).cpu().numpy()
        preds = np.argmax(probs, axis=1)

        raw_preds.extend(preds)
        truths.extend(batch_y.numpy())
        proba_all.extend(probs)

# numpy 化
y_test = np.array(truths)
y_pred_raw = np.array(raw_preds)
y_proba = np.array(proba_all)

# ========= 测试数据集评估 =========
print("MLP Classification Report(Raw data):")
print(classification_report(y_test, y_pred_raw, target_names=le.classes_, zero_division=0))

macro_f1 = f1_score(y_test, y_pred_raw, average="macro")
acc = accuracy_score(y_test, y_pred_raw)
print(f"MLP Macro F1-score: {macro_f1:.4f}, Accuracy: {acc:.4f}")


print("MLP Confusion Matrix(Raw data):")
print(confusion_matrix(y_test, y_pred_raw))

MLP Classification Report(Raw data):
              precision    recall  f1-score   support

     Stage I       0.33      0.17      0.23        40
    Stage II       0.62      0.84      0.71       139
   Stage III       0.56      0.27      0.37        55
    Stage IV       0.57      0.40      0.47        10

    accuracy                           0.59       244
   macro avg       0.52      0.42      0.44       244
weighted avg       0.56      0.59      0.55       244

MLP Macro F1-score: 0.4448, Accuracy: 0.5861
MLP Confusion Matrix(Raw data):
[[  7  31   2   0]
 [ 12 117   9   1]
 [  2  36  15   2]
 [  0   5   1   4]]


In [None]:
import json

# 保存测试集对应 patient id（需提前划分 sample_indices）
test_patient_ids = adata.obs.iloc[idx_test]["patient_id"].values

output_data = []
for i, probs in enumerate(y_proba):
    output_data.append({
        "patient_id": test_patient_ids[i],
        "probs": probs.tolist(),
        "modality": "RNA",
        "weight": 1.0
    })

print(output_data[:2])  # 打印前两个样本查看格式

output_path = "./RNA_test_results.json"
with open(output_path, "w") as f:
    json.dump(output_data, f, indent=4)

print(f"✅ JSON 文件已保存到: {output_path}")

[{'patient_id': 'TCGA-B6-A0IB', 'probs': [0.0027299015782773495, 0.2252773642539978, 0.08863842487335205, 0.683354377746582], 'modality': 'RNA', 'weight': 1.0}, {'patient_id': 'TCGA-BH-A0BG', 'probs': [0.26045361161231995, 0.7311237454414368, 0.0012419125996530056, 0.007180718705058098], 'modality': 'RNA', 'weight': 1.0}]


In [None]:
# import json

# # ========== 加载已训练模型 ==========
# model = MLPClassifier(INPUT_DIM, HIDDEN1, HIDDEN2, NUM_CLASSES, DROPOUT_RATE).to(DEVICE)
# model.load_state_dict(torch.load(MODEL_SAVE_PATH))
# model.eval()

# # ========== 推理测试集 ==========
# raw_preds, truths, proba_all = [], [], []

# with torch.no_grad():
#     for batch_X, batch_y in test_loader:
#         batch_X = batch_X.to(DEVICE)
#         logits = model(batch_X)
#         probs = torch.softmax(logits, dim=1).cpu().numpy()
#         preds = np.argmax(probs, axis=1)

#         raw_preds.extend(preds)
#         truths.extend(batch_y.numpy())
#         proba_all.extend(probs)

# y_test = np.array(truths)
# y_proba = np.array(proba_all)
# y_pred_raw = np.array(raw_preds)

# # ========== soft threshold 后处理 ==========
# from itertools import product
# from sklearn.metrics import precision_recall_fscore_support

# stage_i_range   = [0.4, 0.5, 0.6]
# stage_ii_range  = [0.5, 0.6, 0.7]
# stage_iii_range = [0.4, 0.5, 0.6]
# stage_iv_range  = [0.2, 0.3, 0.4]

# results = []
# class2index = {cls: i for i, cls in enumerate(le.classes_)}

# for th_i, th_ii, th_iii, th_iv in product(stage_i_range, stage_ii_range, stage_iii_range, stage_iv_range):
#     thresholds = {
#         "Stage I": th_i,
#         "Stage II": th_ii,
#         "Stage III": th_iii,
#         "Stage IV": th_iv
#     }

#     y_pred = np.copy(y_pred_raw)
#     for i in range(len(y_pred)):
#         if y_proba[i][class2index["Stage IV"]] > thresholds["Stage IV"]:
#             y_pred[i] = class2index["Stage IV"]
#         elif y_proba[i][class2index["Stage III"]] > thresholds["Stage III"]:
#             y_pred[i] = class2index["Stage III"]
#         elif y_proba[i][class2index["Stage I"]] > thresholds["Stage I"]:
#             y_pred[i] = class2index["Stage I"]
#         elif y_proba[i][class2index["Stage II"]] > thresholds["Stage II"]:
#             y_pred[i] = class2index["Stage II"]

#     macro_f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)
#     results.append({
#         "Macro F1": macro_f1,
#         "th_i": th_i, "th_ii": th_ii, "th_iii": th_iii, "th_iv": th_iv,
#         "y_pred": y_pred
#     })

# best_result = sorted(results, key=lambda x: x["Macro F1"], reverse=True)[0]
# best_pred = best_result["y_pred"]

# # ========== 提取测试集的 patient_id ==========
# test_indices = X_test.shape[0]
# patient_ids = adata.obs["patient_id"].values
# test_patient_ids = adata.obs.iloc[-test_indices:]["patient_id"].values  # 最后 N 个为 test set

# # ========== 生成 JSON 输出 ==========
# output_data = []
# for i in range(len(test_patient_ids)):
#     output_data.append({
#         "patient_id": test_patient_ids[i],
#         "probs": y_proba[i].tolist(),
#         "pred_label": le.inverse_transform([best_pred[i]])[0],
#         "modality": "RNA",
#         "weight": 1.0
#     })

# print(output_data[:2])
# # 保存 JSON
# json_output_path = "./rna_test_predictions.json"
# with open(json_output_path, "w") as f:
#     json.dump(output_data, f, indent=2)

# print(f"✅ JSON 保存成功：{json_output_path}")

[{'patient_id': 'TCGA-JL-A3YW', 'probs': [0.0027299015782773495, 0.2252773642539978, 0.08863842487335205, 0.683354377746582], 'pred_label': 'Stage IV', 'modality': 'RNA', 'weight': 1.0}, {'patient_id': 'TCGA-AC-A3YI', 'probs': [0.26045361161231995, 0.7311237454414368, 0.0012419125996530056, 0.007180718705058098], 'pred_label': 'Stage II', 'modality': 'RNA', 'weight': 1.0}]
✅ JSON 保存成功：./rna_test_predictions.json
