In [None]:
# Mount Google Drive into the Colab filesystem; the dataset path used later
# in this notebook lives under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 30-Day Readmission Prediction with a Transformer Classifier (Colab–GitHub sync test)


In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, classification_report,
                             confusion_matrix, precision_recall_curve, roc_curve)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from torch.utils.data import Dataset, DataLoader
# Blanket filter: every warning category is silenced for the rest of the session.
warnings.simplefilter('ignore')

# Report the library version and whether a CUDA device is visible.
print(
    "✓ 库导入成功",
    f"PyTorch 版本: {torch.__version__}",
    f"CUDA 可用: {torch.cuda.is_available()}",
    sep="\n",
)

✓ 库导入成功
PyTorch 版本: 2.8.0+cu126
CUDA 可用: True


In [None]:
banner = "=" * 60
print(banner)
print("数据加载")
print(banner)

# Read the feature table from the mounted Drive folder.
df = pd.read_csv('/content/drive/MyDrive/2025Fall_CS526_GroupProject/readmission_features_30d_v1.csv')

print(f"原始数据形状: {df.shape}")
print(f"再入院率: {df['readmit_label'].mean():.2%}")

# Keep only the columns whose share of missing values is below the threshold (30%).
missing_threshold = 0.30
missing_fraction = df.isna().mean()
columns_to_keep = missing_fraction.index[missing_fraction < missing_threshold].tolist()
df_clean = df.loc[:, columns_to_keep].copy()

print(f"\n保留列数: {len(columns_to_keep)}")

# Drop every row that still has at least one missing value.
df_clean = df_clean.dropna()
retained_pct = len(df_clean) / len(df) * 100
print(f"删除缺失行后: {df_clean.shape}")
print(f"保留了 {retained_pct:.1f}% 的数据")

数据加载
原始数据形状: (546028, 51)
再入院率: 20.33%

保留列数: 46
删除缺失行后: (327118, 46)
保留了 59.9% 的数据


In [None]:
print("="*60)
print("特征工程")
print("="*60)

# Columns that must not be used as model inputs (the target `readmit_label`
# plus what appear, by name, to be identifiers and timestamps). Only the ones
# actually present in df_clean are kept in the list.
columns_to_drop = ['subject_id', 'hadm_id', 'admittime', 'dischtime', 'readmit_label', 'index']
columns_to_drop = [col for col in columns_to_drop if col in df_clean.columns]

# Split the columns by dtype: numeric vs. object (string) columns.
numeric_cols = df_clean.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df_clean.select_dtypes(include=['object']).columns.tolist()

categorical_features = [col for col in categorical_cols if col not in columns_to_drop]
encoded_features = [col + '_encoded' for col in categorical_features]

# This cell writes '<col>_encoded' columns into df_clean (below). On a re-run
# those columns are numeric and would be picked up here as well, so each
# encoded feature would end up in feature_columns twice. Excluding them keeps
# the cell idempotent.
excluded_from_numeric = set(columns_to_drop) | set(encoded_features)
numeric_features = [col for col in numeric_cols if col not in excluded_from_numeric]

print(f"数值特征: {len(numeric_features)}")
print(f"分类特征: {len(categorical_features)}")
print(f"分类特征列表: {categorical_features}")

# Integer-encode each categorical column. The fitted encoders are kept per
# column so the code -> category mapping can be recovered later.
# NOTE(review): the integer codes are later scaled and fed to the model as
# ordinary numeric inputs, which imposes an arbitrary order on the categories.
label_encoders = {}
for col, encoded_col in zip(categorical_features, encoded_features):
    le = LabelEncoder()
    df_clean[encoded_col] = le.fit_transform(df_clean[col].astype(str))
    label_encoders[col] = le

# With no categorical columns, encoded_features is empty and this is just
# numeric_features.
feature_columns = numeric_features + encoded_features

print(f"\n最终特征数: {len(feature_columns)}")

特征工程
数值特征: 33
分类特征: 8
分类特征列表: ['last_service', 'gender', 'language', 'marital_status', 'insurance', 'admission_type', 'admission_location', 'discharge_location']

最终特征数: 41


In [None]:
# Pull the selected feature columns and the target column out as NumPy arrays.
X = df_clean[feature_columns].to_numpy()
y = df_clean['readmit_label'].to_numpy()

positive_rate = y.mean()

print(f"特征矩阵形状: {X.shape}")
print(f"标签形状: {y.shape}")
print(f"正类比例: {positive_rate:.2%}")
print(f"负类比例: {1-positive_rate:.2%}")

特征矩阵形状: (327118, 41)
标签形状: (327118,)
正类比例: 22.61%
负类比例: 77.39%


In [None]:
# Stratified 80/20 train/test split with a fixed random_state.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

for split_name, split_X, split_y in (("训练集", X_train, y_train), ("测试集", X_test, y_test)):
    print(f"{split_name}: {split_X.shape}, 正类: {split_y.mean():.2%}")

# Standardise features: statistics are fitted on the training split only and
# then applied unchanged to the test split.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print("\n标准化完成")
print(f"X_train 范围: [{X_train.min():.2f}, {X_train.max():.2f}]")

训练集: (261694, 41), 正类: 22.61%
测试集: (65424, 41), 正类: 22.61%

标准化完成
X_train 范围: [-12.79, 510.56]


In [None]:
class ReadmissionDataset(Dataset):
    """In-memory dataset that pairs each feature row with its class label.

    Args:
        X: Feature rows; converted with ``torch.FloatTensor`` (float32).
        y: One label per row of ``X``; converted with ``torch.LongTensor`` (int64).

    Raises:
        ValueError: If ``X`` and ``y`` do not hold the same number of samples.
    """

    def __init__(self, X, y):
        # Fail early: a length mismatch would otherwise only show up as an
        # index error (or silently unused rows) in the middle of training.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of samples, got {len(X)} and {len(y)}"
            )
        self.X = torch.FloatTensor(X)
        self.y = torch.LongTensor(y)

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]

# Wrap the scaled arrays in Dataset objects.
train_dataset = ReadmissionDataset(X_train, y_train)
test_dataset = ReadmissionDataset(X_test, y_test)

# Build the DataLoaders; only the training loader shuffles.
loader_options = dict(batch_size=256, pin_memory=True)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

print("✓ DataLoader 创建成功")
print(f"训练批次数: {len(train_loader)}")
print(f"测试批次数: {len(test_loader)}")

✓ DataLoader 创建成功
训练批次数: 1023
测试批次数: 256


In [None]:
class TransformerClassifier(nn.Module):
    """Two-logit classifier: input projection -> Transformer encoder -> MLP head.

    Every sample is projected to ``d_model`` dimensions and passed to the
    encoder as a sequence of length one (see ``forward``), so self-attention
    only ever sees a single token per sample.

    Args:
        input_dim: Number of input features per sample.
        d_model: Width of the projected representation and of the encoder.
        nhead: Number of attention heads in each encoder layer.
        num_layers: Number of stacked encoder layers.
        dropout: Dropout rate for the projection, the encoder layers and the
            first head stage; the second head stage uses half of it.
    """

    def __init__(self, input_dim, d_model=256, nhead=8, num_layers=3, dropout=0.2):
        super().__init__()

        half_width = d_model // 2
        quarter_width = d_model // 4

        # Input projection: features -> d_model
        self.input_projection = nn.Sequential(
            nn.Linear(input_dim, d_model),
            nn.LayerNorm(d_model),
            nn.ReLU(),
            nn.Dropout(dropout),
        )

        # Learned offset added to the single token (small random init)
        self.pos_encoding = nn.Parameter(0.02 * torch.randn(1, 1, d_model))

        # Transformer encoder stack (pre-LayerNorm layers, GELU feed-forward)
        # followed by a final LayerNorm
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=4 * d_model,
                dropout=dropout,
                activation='gelu',
                batch_first=True,
                norm_first=True,
            ),
            num_layers=num_layers,
            norm=nn.LayerNorm(d_model),
        )

        # Classification head: d_model -> d_model/2 -> d_model/4 -> 2 logits
        self.classifier = nn.Sequential(
            nn.Linear(d_model, half_width),
            nn.LayerNorm(half_width),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(half_width, quarter_width),
            nn.LayerNorm(quarter_width),
            nn.GELU(),
            nn.Dropout(0.5 * dropout),
            nn.Linear(quarter_width, 2),
        )

        # Re-initialise the linear layers
        self._init_weights()

    def _init_weights(self):
        """Xavier-uniform every ``nn.Linear`` weight and zero its bias, if any."""
        linear_layers = (module for module in self.modules() if isinstance(module, nn.Linear))
        for layer in linear_layers:
            nn.init.xavier_uniform_(layer.weight)
            if layer.bias is not None:
                nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Return the two class logits for each row of ``x``.

        ``x`` is projected by a linear layer over its last dimension, given a
        sequence axis of length one at position 1, encoded, and reduced back
        before the classification head.
        """
        # Project, then insert the length-1 sequence axis and add the learned offset
        token = self.input_projection(x).unsqueeze(1) + self.pos_encoding

        # Encode and drop the sequence axis again
        encoded = self.transformer_encoder(token).squeeze(1)

        return self.classifier(encoded)

print("✓ Transformer 模型定义完成")

✓ Transformer 模型定义完成


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"使用设备: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# Seed the RNGs before any weights are drawn so that weight initialisation and
# the training loader's shuffle order are repeatable across runs. The value
# matches the random_state used for the train/test split.
# NOTE(review): some CUDA kernels are non-deterministic, so GPU runs may still
# differ slightly between executions.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Build the model
model = TransformerClassifier(
    input_dim=X_train.shape[1],
    d_model=256,
    nhead=8,
    num_layers=3,
    dropout=0.2
).to(device)

print(f"\n模型参数量: {sum(p.numel() for p in model.parameters()):,}")

# Class weights: class 0 keeps weight 1, class 1 is weighted by the
# class-0 / class-1 count ratio of the training labels.
class_counts = np.bincount(y_train)
class_weights = torch.FloatTensor([1.0, class_counts[0] / class_counts[1]]).to(device)
print(f"类别权重: {class_weights.cpu().numpy()}")

# Weighted cross-entropy loss
criterion = nn.CrossEntropyLoss(weight=class_weights)

# Optimizer
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=0.0001,
    weight_decay=0.01,
    betas=(0.9, 0.999)
)

# Learning-rate schedule: cosine annealing with warm restarts. T_0 (and the
# cycle lengths derived from it through T_mult) are counted in calls to
# scheduler.step(), so the real cycle length depends on how often the training
# code steps the scheduler.
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2, eta_min=1e-6)

print("✓ 模型和优化器初始化完成")

使用设备: cuda
GPU: NVIDIA A100-SXM4-40GB

模型参数量: 2,422,978
类别权重: [1.       3.422748]
✓ 模型和优化器初始化完成


In [None]:
def train_epoch(model, loader, criterion, optimizer, scheduler, device):
    """Run one optimisation pass over ``loader`` and advance the LR schedule once.

    Args:
        model: Network to train; switched to train mode.
        loader: Sized iterable of ``(features, labels)`` tensor batches.
        criterion: Loss taking ``(logits, labels)``.
        optimizer: Optimizer over ``model``'s parameters.
        scheduler: LR scheduler; ``step()`` is called exactly once, after the
            last batch.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(mean_loss, accuracy)``: the mean of the per-batch losses and
        the fraction of samples whose arg-max prediction matched the label.
        Predictions are taken in train mode, before each batch's weight update.
    """
    model.train()
    running_loss = 0.0
    n_correct = 0
    n_seen = 0

    for X_batch, y_batch in loader:
        X_batch = X_batch.to(device, non_blocking=True)
        y_batch = y_batch.to(device, non_blocking=True)

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()

        # Clip the global gradient norm to 1.0 before the update
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()

        running_loss += loss.item()
        # Count hits directly instead of collecting every prediction in a
        # Python list and scoring them afterwards.
        n_correct += (outputs.argmax(dim=1) == y_batch).sum().item()
        n_seen += y_batch.size(0)

    # Step the schedule once per epoch. Stepping after every batch makes the
    # scheduler's period arguments (e.g. T_0 of a warm-restart schedule) count
    # batches, so a "10-epoch" cycle would restart after only 10 batches.
    scheduler.step()

    return running_loss / len(loader), n_correct / n_seen

def evaluate(model, loader, device):
    """Score every batch in ``loader`` with gradients disabled.

    The model is switched to eval mode. For each batch the arg-max class, the
    true label and the softmax probability of class 1 are collected.

    Returns:
        Tuple of NumPy arrays ``(predictions, labels, class1_probabilities)``
        in loader order.
    """
    model.eval()
    pred_chunks, label_chunks, prob_chunks = [], [], []

    with torch.no_grad():
        for X_batch, y_batch in loader:
            logits = model(X_batch.to(device, non_blocking=True))
            class_probs = torch.softmax(logits, dim=1)
            predicted = torch.max(logits, 1)[1]

            pred_chunks.append(predicted.cpu().numpy())
            label_chunks.append(y_batch.cpu().numpy())
            prob_chunks.append(class_probs[:, 1].cpu().numpy())

    def _flatten(chunks):
        # An empty loader yields an empty array, as np.array([]) would.
        return np.concatenate(chunks) if chunks else np.array(chunks)

    return _flatten(pred_chunks), _flatten(label_chunks), _flatten(prob_chunks)

print("✓ 训练和评估函数定义完成")

✓ 训练和评估函数定义完成


In [None]:
banner = "=" * 60
print(banner)
print("开始训练 Transformer")
print(banner)

num_epochs = 100
best_auc = 0
best_f1 = 0
patience = 15
patience_counter = 0

for epoch in range(num_epochs):
    epoch_number = epoch + 1
    # Metrics are printed on every 5th epoch only
    report_now = epoch_number % 5 == 0

    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, scheduler, device)

    # NOTE(review): the test loader is what drives checkpoint selection and
    # early stopping below, so the test metrics reported for the selected
    # checkpoint are optimistically biased.
    preds, labels, probs = evaluate(model, test_loader, device)

    acc = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, zero_division=0)
    recall = recall_score(labels, preds, zero_division=0)
    f1 = f1_score(labels, preds, zero_division=0)
    auc = roc_auc_score(labels, probs)

    if report_now:
        print(f"\nEpoch {epoch_number}/{num_epochs}")
        print(f"  Train - Loss: {train_loss:.4f}, Acc: {train_acc:.4f}")
        print(f"  Test  - Acc: {acc:.4f}, Prec: {precision:.4f}, Rec: {recall:.4f}")
        print(f"  Test  - F1: {f1:.4f}, AUC: {auc:.4f}")
        print(f"  LR: {optimizer.param_groups[0]['lr']:.6f}")

    if auc > best_auc:
        # New best AUC: checkpoint the weights and reset the patience counter.
        # best_f1 is the F1 at the best-AUC epoch, not the best F1 seen.
        best_auc, best_f1 = auc, f1
        torch.save(model.state_dict(), 'best_transformer_model.pth')
        patience_counter = 0
        if report_now:
            print("  ✓ 保存最佳模型")
    else:
        patience_counter += 1
        # Early stopping after `patience` consecutive epochs without improvement
        if patience_counter >= patience:
            print(f"\n早停于 Epoch {epoch_number}")
            break

print(f"\n{banner}")
print("训练完成！")
print(f"最佳 AUC: {best_auc:.4f}")
print(f"最佳 F1: {best_f1:.4f}")
print(f"{banner}")

开始训练 Transformer

Epoch 5/100
  Train - Loss: 0.6376, Acc: 0.6246
  Test  - Acc: 0.6285, Prec: 0.3329, Rec: 0.6405
  Test  - F1: 0.4381, AUC: 0.6863
  LR: 0.000100
  ✓ 保存最佳模型

Epoch 10/100
  Train - Loss: 0.6319, Acc: 0.6292
  Test  - Acc: 0.6263, Prec: 0.3325, Rec: 0.6480
  Test  - F1: 0.4395, AUC: 0.6900
  LR: 0.000100
  ✓ 保存最佳模型

Epoch 15/100
  Train - Loss: 0.6297, Acc: 0.6331
  Test  - Acc: 0.6062, Prec: 0.3257, Rec: 0.6929
  Test  - F1: 0.4431, AUC: 0.6924
  LR: 0.000051
  ✓ 保存最佳模型

Epoch 20/100
  Train - Loss: 0.6272, Acc: 0.6355
  Test  - Acc: 0.6385, Prec: 0.3402, Rec: 0.6375
  Test  - F1: 0.4437, AUC: 0.6942
  LR: 0.000001
  ✓ 保存最佳模型

Epoch 25/100
  Train - Loss: 0.6269, Acc: 0.6359
  Test  - Acc: 0.6320, Prec: 0.3371, Rec: 0.6497
  Test  - F1: 0.4439, AUC: 0.6956
  LR: 0.000086
  ✓ 保存最佳模型

Epoch 30/100
  Train - Loss: 0.6241, Acc: 0.6386
  Test  - Acc: 0.6377, Prec: 0.3402, Rec: 0.6415
  Test  - F1: 0.4446, AUC: 0.6966
  LR: 0.000051
  ✓ 保存最佳模型

Epoch 35/100
  Train - Loss: 

In [None]:
print("="*60)
print("最终模型评估")
print("="*60)

# Restore the best checkpoint. map_location remaps the saved tensors onto the
# current `device`, so a checkpoint written during a CUDA run also loads on a
# CPU-only runtime.
best_state = torch.load('best_transformer_model.pth', map_location=device)
model.load_state_dict(best_state)
preds, labels, probs = evaluate(model, test_loader, device)

# Metrics at the default arg-max decision
print(f"\nAccuracy:  {accuracy_score(labels, preds):.4f}")
print(f"Precision: {precision_score(labels, preds):.4f}")
print(f"Recall:    {recall_score(labels, preds):.4f}")
print(f"F1 Score:  {f1_score(labels, preds):.4f}")
print(f"AUC-ROC:   {roc_auc_score(labels, probs):.4f}")

print("\n分类报告:")
print(classification_report(labels, preds, target_names=['无再入院', '再入院']))

cm = confusion_matrix(labels, preds)
print(f"\n混淆矩阵:")
print(cm)

最终模型评估

Accuracy:  0.6435
Precision: 0.3441
Recall:    0.6368
F1 Score:  0.4468
AUC-ROC:   0.7000

分类报告:
              precision    recall  f1-score   support

        无再入院       0.86      0.65      0.74     50632
         再入院       0.34      0.64      0.45     14792

    accuracy                           0.64     65424
   macro avg       0.60      0.64      0.59     65424
weighted avg       0.74      0.64      0.67     65424


混淆矩阵:
[[32681 17951]
 [ 5373  9419]]


In [None]:
print("="*60)
print("阈值优化")
print("="*60)

# NOTE(review): the threshold is tuned on the same labels/probs it is then
# scored on, so the "optimised" metrics below are optimistic.
precisions, recalls, thresholds = precision_recall_curve(labels, probs)

# precision_recall_curve returns one more precision/recall point than
# thresholds: the final (precision=1, recall=0) point has no threshold.
# Drop it so every F1 value lines up with thresholds[i] and the arg-max can
# never index past the end of `thresholds`.
f1_scores = 2 * (precisions[:-1] * recalls[:-1]) / (precisions[:-1] + recalls[:-1] + 1e-10)
best_idx = int(np.argmax(f1_scores))
best_threshold = thresholds[best_idx]

print(f"\n最佳阈值: {best_threshold:.3f} (默认是 0.5)")
print(f"在该阈值下:")
print(f"  Precision: {precisions[best_idx]:.4f}")
print(f"  Recall: {recalls[best_idx]:.4f}")
print(f"  F1: {f1_scores[best_idx]:.4f}")

# Re-score with the selected threshold applied to the class-1 probability
new_preds = (probs >= best_threshold).astype(int)
print(f"\n使用优化阈值的最终结果:")
print(f"  Accuracy:  {accuracy_score(labels, new_preds):.4f}")
print(f"  Precision: {precision_score(labels, new_preds):.4f}")
print(f"  Recall:    {recall_score(labels, new_preds):.4f}")
print(f"  F1 Score:  {f1_score(labels, new_preds):.4f}")

In [None]:
# Recompute both curves and the AUC from the evaluation arrays, so the figure
# always describes the data it plots and does not rely on variables left
# behind by the training loop (best_auc) or the threshold cell
# (precisions / recalls).
fpr, tpr, _ = roc_curve(labels, probs)
roc_auc = roc_auc_score(labels, probs)
pr_precisions, pr_recalls, _ = precision_recall_curve(labels, probs)

fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(12, 5))

# ROC curve
ax_roc.plot(fpr, tpr, label=f'AUC = {roc_auc:.3f}', linewidth=2)
ax_roc.plot([0, 1], [0, 1], 'k--', label='Random', linewidth=1)
ax_roc.set_xlabel('False Positive Rate', fontsize=12)
ax_roc.set_ylabel('True Positive Rate', fontsize=12)
ax_roc.set_title('ROC Curve', fontsize=14)
ax_roc.legend(fontsize=11)
ax_roc.grid(True, alpha=0.3)

# Precision-Recall curve
ax_pr.plot(pr_recalls, pr_precisions, linewidth=2)
ax_pr.set_xlabel('Recall', fontsize=12)
ax_pr.set_ylabel('Precision', fontsize=12)
ax_pr.set_title('Precision-Recall Curve', fontsize=14)
ax_pr.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig('transformer_results.png', dpi=150, bbox_inches='tight')
print("✓ 图表已保存")
plt.show()