In [19]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import HeteroData
from torch_geometric.loader import DataLoader
from torch_geometric.nn import HeteroConv, SAGEConv, to_hetero
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# 1. 数据预处理
def preprocess_data(file_path):
    df = pd.read_csv(file_path)
    
    # 编码标签
    label_encoders = {}
    for col in ['TL', 'YL', 'TS', 'TZ']:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le.classes_  # 存储类别列表而不是编码器对象

    # 分割特征
    terrain_features = ['DEM_MEAN', 'Slope_MEAN', 'Aspect_MEAN', 'TopographicWetnessIndex_MEAN']
    climate_features = ['PRE2022_mean_MEAN', 'TMP2022_mean_MEAN', 'ETP2022_mean_MEAN']
    vegetation_features = ['ndvi_MEAN', 'evi_MEAN']
    
    # 识别分类特征和数值特征
    categorical_features = df.select_dtypes(include=['object']).columns.tolist()
    numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    numeric_features = [f for f in numeric_features if f not in terrain_features + climate_features + vegetation_features + ['TL', 'YL', 'TS', 'TZ']]

    # 创建预处理管道
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ])

    # 拟合和转换数据
    X_transformed = preprocessor.fit_transform(df)
    
    # 获取特征名称
    onehot_encoder = preprocessor.named_transformers_['cat']
    cat_feature_names = onehot_encoder.get_feature_names_out(categorical_features).tolist()
    feature_names = numeric_features + cat_feature_names
    
    # 创建新的 DataFrame
    df_transformed = pd.DataFrame(X_transformed, columns=feature_names)
    
    # 添加回标签和其他特征
    for col in ['TL', 'YL', 'TS', 'TZ'] + terrain_features + climate_features + vegetation_features:
        df_transformed[col] = df[col]

    other_features = [col for col in feature_names if col not in terrain_features + climate_features + vegetation_features]

    return df_transformed, label_encoders, terrain_features, climate_features, vegetation_features, other_features

# 2. 创建异构图 (保持不变)
def create_hetero_data(df, terrain_features, climate_features, vegetation_features, other_features):
    data = HeteroData()
    
    # 添加节点
    data['sample'].x = torch.tensor(df[other_features].values, dtype=torch.float)
    data['terrain'].x = torch.tensor(df[terrain_features].values, dtype=torch.float)
    data['climate'].x = torch.tensor(df[climate_features].values, dtype=torch.float)
    data['vegetation'].x = torch.tensor(df[vegetation_features].values, dtype=torch.float)
    
    # 确保所有节点类型都有特征，即使是空特征
    for node_type in ['sample', 'terrain', 'climate', 'vegetation']:
        if data[node_type].x is None:
            data[node_type].x = torch.zeros((len(df), 1), dtype=torch.float)
    
    # 添加边
    num_samples = len(df)
    edge_index = torch.stack([torch.arange(num_samples), torch.arange(num_samples)], dim=0)
    
    data['sample', 'has', 'terrain'].edge_index = edge_index
    data['terrain', 'belongs_to', 'sample'].edge_index = edge_index.flip([0])
    
    data['sample', 'has', 'climate'].edge_index = edge_index
    data['climate', 'belongs_to', 'sample'].edge_index = edge_index.flip([0])
    
    data['sample', 'has', 'vegetation'].edge_index = edge_index
    data['vegetation', 'belongs_to', 'sample'].edge_index = edge_index.flip([0])
    
    # 添加标签
    data['sample'].y_TL = torch.tensor(df['TL'].values, dtype=torch.long)
    data['sample'].y_YL = torch.tensor(df['YL'].values, dtype=torch.long)
    data['sample'].y_TS = torch.tensor(df['TS'].values, dtype=torch.long)
    data['sample'].y_TZ = torch.tensor(df['TZ'].values, dtype=torch.long)
    
    return data

# 3. 定义模型 (保持不变)
class GNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

class HeteroGNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels, num_layers, metadata):
        super().__init__()
        self.convs = torch.nn.ModuleList()
        for _ in range(num_layers):
            conv = HeteroConv({
                edge_type: SAGEConv((-1, -1), hidden_channels)
                for edge_type in metadata[1]
            })
            self.convs.append(conv)

        self.lin = torch.nn.Linear(hidden_channels, out_channels)

    def forward(self, x_dict, edge_index_dict):
        for conv in self.convs:
            x_dict = conv(x_dict, edge_index_dict)
            x_dict = {key: x.relu() if x is not None else x for key, x in x_dict.items()}
        return {key: self.lin(x) if x is not None else x for key, x in x_dict.items()}
# SoilClassifier 类
class SoilClassifier(torch.nn.Module):
    def __init__(self, hidden_channels, metadata, num_classes):
        super().__init__()
        self.gnn = HeteroGNN(hidden_channels, hidden_channels, num_layers=2, metadata=metadata)
        self.classifier_TL = torch.nn.Linear(hidden_channels, num_classes['TL'])
        self.classifier_YL = torch.nn.Linear(hidden_channels, num_classes['YL'])
        self.classifier_TS = torch.nn.Linear(hidden_channels, num_classes['TS'])
        self.classifier_TZ = torch.nn.Linear(hidden_channels, num_classes['TZ'])

    def forward(self, x_dict, edge_index_dict):
        x_dict = self.gnn(x_dict, edge_index_dict)
        x = x_dict.get('sample')
        if x is None:
            return None
        return {
            'TL': self.classifier_TL(x),
            'YL': self.classifier_YL(x),
            'TS': self.classifier_TS(x),
            'TZ': self.classifier_TZ(x)
        }
# 4. 训练函数 (保持不变)
def train(model, train_loader, optimizer, device):
    model.train()
    total_loss = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        out = model(data.x_dict, data.edge_index_dict)
        loss = (F.cross_entropy(out['TL'], data['sample'].y_TL) +
                F.cross_entropy(out['YL'], data['sample'].y_YL) +
                F.cross_entropy(out['TS'], data['sample'].y_TS) +
                F.cross_entropy(out['TZ'], data['sample'].y_TZ)) / 4
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

# 5. 评估函数 (保持不变)
@torch.no_grad()
def test(model, loader, device):
    model.eval()
    total_correct = {key: 0 for key in ['TL', 'YL', 'TS', 'TZ']}
    total_examples = 0
    for data in loader:
        data = data.to(device)
        out = model(data.x_dict, data.edge_index_dict)
        for key in ['TL', 'YL', 'TS', 'TZ']:
            pred = out[key].argmax(dim=-1)
            total_correct[key] += int((pred == data['sample'][f'y_{key}']).sum())
        total_examples += data['sample'].y_TL.size(0)
    return {key: total_correct[key] / total_examples for key in ['TL', 'YL', 'TS', 'TZ']}

# 主函数
def main(data_path):
    # 数据预处理
    df, label_encoders, terrain_features, climate_features, vegetation_features, other_features = preprocess_data(data_path)
    
    # 创建异构图数据
    data = create_hetero_data(df, terrain_features, climate_features, vegetation_features, other_features)
    
    # 直接分割索引，而不是分割 HeteroData 对象
    num_samples = df.shape[0]
    indices = list(range(num_samples))
    train_indices, test_indices = train_test_split(indices, test_size=0.2, random_state=42)

    # 创建子图的索引字典
    train_subset = {
        'sample': torch.tensor(train_indices),
        'terrain': torch.tensor(train_indices),
        'climate': torch.tensor(train_indices),
        'vegetation': torch.tensor(train_indices)
    }
    test_subset = {
        'sample': torch.tensor(test_indices),
        'terrain': torch.tensor(test_indices),
        'climate': torch.tensor(test_indices),
        'vegetation': torch.tensor(test_indices)
    }

    # 使用索引创建训练集和测试集
    train_data = data.subgraph(train_subset)
    test_data = data.subgraph(test_subset)

    train_loader = DataLoader([train_data], batch_size=32, shuffle=True)
    test_loader = DataLoader([test_data], batch_size=32, shuffle=False)

    # 定义模型
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    num_classes = {key: len(classes) for key, classes in label_encoders.items()}
    model = SoilClassifier(hidden_channels=64, metadata=data.metadata(), num_classes=num_classes).to(device)
    
    # 训练模型
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    for epoch in range(100):
        loss = train(model, train_loader, optimizer, device)
        train_acc = test(model, train_loader, device)
        test_acc = test(model, test_loader, device)
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc}, Test Acc: {test_acc}')


In [20]:
data_path = r"C:\Users\Runker\Desktop\train_polygon.csv"
main(data_path)

Epoch: 000, Loss: nan, Train Acc: {'TL': 0.00625, 'YL': 0.6625, 'TS': 0.0125, 'TZ': 0.00625}, Test Acc: {'TL': 0.0, 'YL': 0.6, 'TS': 0.075, 'TZ': 0.025}
Epoch: 001, Loss: nan, Train Acc: {'TL': 0.00625, 'YL': 0.6625, 'TS': 0.0125, 'TZ': 0.00625}, Test Acc: {'TL': 0.0, 'YL': 0.6, 'TS': 0.075, 'TZ': 0.025}
Epoch: 002, Loss: nan, Train Acc: {'TL': 0.00625, 'YL': 0.6625, 'TS': 0.0125, 'TZ': 0.00625}, Test Acc: {'TL': 0.0, 'YL': 0.6, 'TS': 0.075, 'TZ': 0.025}
Epoch: 003, Loss: nan, Train Acc: {'TL': 0.00625, 'YL': 0.6625, 'TS': 0.0125, 'TZ': 0.00625}, Test Acc: {'TL': 0.0, 'YL': 0.6, 'TS': 0.075, 'TZ': 0.025}
Epoch: 004, Loss: nan, Train Acc: {'TL': 0.00625, 'YL': 0.6625, 'TS': 0.0125, 'TZ': 0.00625}, Test Acc: {'TL': 0.0, 'YL': 0.6, 'TS': 0.075, 'TZ': 0.025}
Epoch: 005, Loss: nan, Train Acc: {'TL': 0.00625, 'YL': 0.6625, 'TS': 0.0125, 'TZ': 0.00625}, Test Acc: {'TL': 0.0, 'YL': 0.6, 'TS': 0.075, 'TZ': 0.025}
Epoch: 006, Loss: nan, Train Acc: {'TL': 0.00625, 'YL': 0.6625, 'TS': 0.0125, 'TZ'

In [5]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from torch_geometric.data import HeteroData
from torch_geometric.loader import DataLoader
from torch_geometric.nn import HeteroConv, SAGEConv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# 数据预处理
def preprocess_data(file_path):
    df = pd.read_csv(file_path)
    
    # 只编码 TZ 标签
    le_tz = LabelEncoder()
    df['TZ'] = le_tz.fit_transform(df['TZ'])
    label_encoder = le_tz

    # 分割特征
    terrain_features = ['DEM_MEAN', 'Slope_MEAN', 'Aspect_MEAN', 'TopographicWetnessIndex_MEAN']
    climate_features = ['PRE2022_mean_MEAN', 'TMP2022_mean_MEAN', 'ETP2022_mean_MEAN']
    vegetation_features = ['ndvi_MEAN', 'evi_MEAN']
    
    # 识别分类特征和数值特征
    categorical_features = df.select_dtypes(include=['object']).columns.tolist()
    numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    numeric_features = [f for f in numeric_features if f not in terrain_features + climate_features + vegetation_features + ['TZ']]

    # 对分类特征进行编码
    le_dict = {}
    for cat_feat in categorical_features:
        le = LabelEncoder()
        df[cat_feat] = le.fit_transform(df[cat_feat])
        le_dict[cat_feat] = le

    # 对数值特征进行标准化
    scaler = StandardScaler()
    df[numeric_features] = scaler.fit_transform(df[numeric_features])

    # 合并所有特征
    all_features = numeric_features + categorical_features + terrain_features + climate_features + vegetation_features
    
    other_features = [col for col in all_features if col not in terrain_features + climate_features + vegetation_features]

    return df, label_encoder, terrain_features, climate_features, vegetation_features, other_features, le_dict

# 创建异构图
def create_hetero_data(df, terrain_features, climate_features, vegetation_features, other_features):
    data = HeteroData()
    
    # 添加节点
    data['sample'].x = torch.tensor(df[other_features].values, dtype=torch.float)
    data['terrain'].x = torch.tensor(df[terrain_features].values, dtype=torch.float)
    data['climate'].x = torch.tensor(df[climate_features].values, dtype=torch.float)
    data['vegetation'].x = torch.tensor(df[vegetation_features].values, dtype=torch.float)
    
    # 确保所有节点类型都有特征，即使是空特征
    for node_type in ['sample', 'terrain', 'climate', 'vegetation']:
        if data[node_type].x is None:
            data[node_type].x = torch.zeros((len(df), 1), dtype=torch.float)
    
    # 添加边
    num_samples = len(df)
    edge_index = torch.stack([torch.arange(num_samples), torch.arange(num_samples)], dim=0)
    
    data['sample', 'has', 'terrain'].edge_index = edge_index
    data['terrain', 'belongs_to', 'sample'].edge_index = edge_index.flip([0])
    
    data['sample', 'has', 'climate'].edge_index = edge_index
    data['climate', 'belongs_to', 'sample'].edge_index = edge_index.flip([0])
    
    data['sample', 'has', 'vegetation'].edge_index = edge_index
    data['vegetation', 'belongs_to', 'sample'].edge_index = edge_index.flip([0])
    
    # 添加标签
    data['sample'].y = torch.tensor(df['TZ'].values, dtype=torch.long)
    
    return data

# 定义模型
class HeteroGNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels, num_layers, metadata):
        super().__init__()
        self.convs = torch.nn.ModuleList()
        for _ in range(num_layers):
            conv = HeteroConv({
                edge_type: SAGEConv((-1, -1), hidden_channels)
                for edge_type in metadata[1]
            })
            self.convs.append(conv)

        self.lin = torch.nn.Linear(hidden_channels, out_channels)

    def forward(self, x_dict, edge_index_dict):
        for conv in self.convs:
            x_dict = conv(x_dict, edge_index_dict)
            x_dict = {key: x.relu() if x is not None else x for key, x in x_dict.items()}
        return {key: self.lin(x) if x is not None else x for key, x in x_dict.items()}

class SoilClassifier(torch.nn.Module):
    def __init__(self, hidden_channels, metadata, num_classes):
        super().__init__()
        self.gnn = HeteroGNN(hidden_channels, hidden_channels, num_layers=2, metadata=metadata)
        self.classifier = torch.nn.Linear(hidden_channels, num_classes)

    def forward(self, x_dict, edge_index_dict):
        x_dict = self.gnn(x_dict, edge_index_dict)
        x = x_dict['sample']
        return self.classifier(x)

# 训练函数
def train(model, train_loader, optimizer, device):
    model.train()
    total_loss = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        out = model(data.x_dict, data.edge_index_dict)
        loss = F.cross_entropy(out, data['sample'].y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

# 评估函数
@torch.no_grad()
def test(model, loader, device):
    model.eval()
    total_correct = 0
    total_examples = 0
    predictions = []
    true_labels = []
    for data in loader:
        data = data.to(device)
        out = model(data.x_dict, data.edge_index_dict)
        pred = out.argmax(dim=-1)
        total_correct += int((pred == data['sample'].y).sum())
        total_examples += data['sample'].y.size(0)
        predictions.extend(pred.cpu().numpy())
        true_labels.extend(data['sample'].y.cpu().numpy())
    return total_correct / total_examples, predictions, true_labels

# 绘制混淆矩阵
def plot_confusion_matrix(y_true, y_pred, classes):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=classes, yticklabels=classes)
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()

# 主函数
def main(data_path):
    # 数据预处理
    df, label_encoder, terrain_features, climate_features, vegetation_features, other_features, le_dict = preprocess_data(data_path)
    
    # 创建异构图数据
    data = create_hetero_data(df, terrain_features, climate_features, vegetation_features, other_features)
    
    # 划分训练集和测试集
    num_samples = df.shape[0]
    indices = list(range(num_samples))
    train_indices, test_indices = train_test_split(indices, test_size=0.2, random_state=42)

    # 为每种节点类型创建子图索引
    train_subset = {
        node_type: torch.tensor(train_indices)
        for node_type in data.node_types
    }
    test_subset = {
        node_type: torch.tensor(test_indices)
        for node_type in data.node_types
    }

    # 使用索引创建训练集和测试集
    train_data = data.subgraph(train_subset)
    test_data = data.subgraph(test_subset)

    train_loader = DataLoader([train_data], batch_size=32, shuffle=True)
    test_loader = DataLoader([test_data], batch_size=32, shuffle=False)

    # 定义模型
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    num_classes = len(label_encoder.classes_)
    model = SoilClassifier(hidden_channels=64, metadata=data.metadata(), num_classes=num_classes).to(device)
    
    # 训练模型
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    for epoch in range(100):
        loss = train(model, train_loader, optimizer, device)
        train_acc, _, _ = test(model, train_loader, device)
        test_acc, test_pred, test_true = test(model, test_loader, device)
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')

    # 打印分类报告
    print(classification_report(test_true, test_pred, target_names=label_encoder.classes_))

    # 绘制混淆矩阵
    plot_confusion_matrix(test_true, test_pred, label_encoder.classes_)


In [6]:
data_path = r"C:\Users\Runker\Desktop\train_polygon.csv"
main(data_path)

Epoch: 000, Loss: nan, Train Acc: 0.0063, Test Acc: 0.0250
Epoch: 001, Loss: nan, Train Acc: 0.0063, Test Acc: 0.0250
Epoch: 002, Loss: nan, Train Acc: 0.0063, Test Acc: 0.0250
Epoch: 003, Loss: nan, Train Acc: 0.0063, Test Acc: 0.0250
Epoch: 004, Loss: nan, Train Acc: 0.0063, Test Acc: 0.0250
Epoch: 005, Loss: nan, Train Acc: 0.0063, Test Acc: 0.0250
Epoch: 006, Loss: nan, Train Acc: 0.0063, Test Acc: 0.0250
Epoch: 007, Loss: nan, Train Acc: 0.0063, Test Acc: 0.0250
Epoch: 008, Loss: nan, Train Acc: 0.0063, Test Acc: 0.0250
Epoch: 009, Loss: nan, Train Acc: 0.0063, Test Acc: 0.0250
Epoch: 010, Loss: nan, Train Acc: 0.0063, Test Acc: 0.0250
Epoch: 011, Loss: nan, Train Acc: 0.0063, Test Acc: 0.0250
Epoch: 012, Loss: nan, Train Acc: 0.0063, Test Acc: 0.0250
Epoch: 013, Loss: nan, Train Acc: 0.0063, Test Acc: 0.0250
Epoch: 014, Loss: nan, Train Acc: 0.0063, Test Acc: 0.0250
Epoch: 015, Loss: nan, Train Acc: 0.0063, Test Acc: 0.0250
Epoch: 016, Loss: nan, Train Acc: 0.0063, Test Acc: 0.02

ValueError: Number of classes, 24, does not match size of target_names, 40. Try specifying the labels parameter