In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import joblib
import os

In [8]:
# 定义数据集类
class SoilDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=256):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        description = self.data.iloc[index]['description']
        encoding = self.tokenizer.encode_plus(
            description,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        spectral_columns = ['Centroid_X', 'Centroid_Y', 'DEM_MEAN', 'DEM_STD', 'ndvi_MEAN',
                            'PCA_0_MEAN', 'Slope_MEAN', 'Aspect_MEAN', 'MRVBF_MEAN',
                            'TopographicWetnessIndex_MEAN', 'Mean_MEAN', 'PH_MEAN', 'PRE_MEAN',
                            'SRA_MEAN', 'TMP_MEAN', 'VAP_MEAN', 'WIND_MEAN', 'PlanCurvature_MEAN']
        spectral_data = self.data.iloc[index][spectral_columns].values.astype(float)

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'spectral_data': torch.FloatTensor(spectral_data),
            'soil_class': torch.tensor(self.data.iloc[index]['soil_class_encoded'], dtype=torch.long),
            'soil_subclass': torch.tensor(self.data.iloc[index]['soil_subclass_encoded'], dtype=torch.long),
            'soil_group': torch.tensor(self.data.iloc[index]['soil_group_encoded'], dtype=torch.long),
            'soil_type': torch.tensor(self.data.iloc[index]['soil_type_encoded'], dtype=torch.long)
        }

# 定义模型
class HierarchicalSoilClassificationModel(nn.Module):
    def __init__(self, n_classes, n_spectral_features=18):
        super(HierarchicalSoilClassificationModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-chinese')
        self.dropout = nn.Dropout(0.1)
        self.spectral_fc = nn.Linear(n_spectral_features, 64)
        self.fc = nn.Linear(self.bert.config.hidden_size + 64, 256)
        
        self.soil_class = nn.Linear(256, n_classes['soil_class'])
        self.soil_subclass = nn.Linear(256 + n_classes['soil_class'], n_classes['soil_subclass'])
        self.soil_group = nn.Linear(256 + n_classes['soil_subclass'], n_classes['soil_group'])
        self.soil_type = nn.Linear(256 + n_classes['soil_group'], n_classes['soil_type'])

        self.class_to_subclass = self._create_hierarchy_matrix(n_classes['soil_class'], n_classes['soil_subclass'])
        self.subclass_to_group = self._create_hierarchy_matrix(n_classes['soil_subclass'], n_classes['soil_group'])
        self.group_to_type = self._create_hierarchy_matrix(n_classes['soil_group'], n_classes['soil_type'])

    def _create_hierarchy_matrix(self, parent_size, child_size):
        matrix = torch.zeros(parent_size, child_size)
        for i in range(child_size):
            matrix[np.random.randint(0, parent_size), i] = 1
        return nn.Parameter(matrix, requires_grad=False)

    def forward(self, input_ids, attention_mask, spectral_data):
        _, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        pooled_output = self.dropout(pooled_output)
        
        spectral_features = self.spectral_fc(spectral_data)
        spectral_features = torch.relu(spectral_features)
        
        combined = torch.cat((pooled_output, spectral_features), dim=1)
        x = self.fc(combined)
        x = torch.relu(x)
        
        soil_class = self.soil_class(x)
        soil_class_probs = torch.softmax(soil_class, dim=1)
        
        subclass_input = torch.cat((x, soil_class_probs), dim=1)
        soil_subclass = self.soil_subclass(subclass_input)
        soil_subclass_probs = torch.softmax(soil_subclass, dim=1)
        
        group_input = torch.cat((x, soil_subclass_probs), dim=1)
        soil_group = self.soil_group(group_input)
        soil_group_probs = torch.softmax(soil_group, dim=1)
        
        type_input = torch.cat((x, soil_group_probs), dim=1)
        soil_type = self.soil_type(type_input)
        
        soil_subclass_probs = soil_subclass_probs * (soil_class_probs @ self.class_to_subclass)
        soil_group_probs = soil_group_probs * (soil_subclass_probs @ self.subclass_to_group)
        soil_type_probs = torch.softmax(soil_type, dim=1) * (soil_group_probs @ self.group_to_type)
        
        return soil_class_probs, soil_subclass_probs, soil_group_probs, soil_type_probs

# 训练函数
def train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs=5):
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            spectral_data = batch['spectral_data'].to(device)
            soil_class = batch['soil_class'].to(device)
            soil_subclass = batch['soil_subclass'].to(device)
            soil_group = batch['soil_group'].to(device)
            soil_type = batch['soil_type'].to(device)

            optimizer.zero_grad()
            class_out, subclass_out, group_out, type_out = model(input_ids, attention_mask, spectral_data)
            loss = (criterion(class_out, soil_class) + criterion(subclass_out, soil_subclass) +
                    criterion(group_out, soil_group) + criterion(type_out, soil_type))
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # 验证
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                spectral_data = batch['spectral_data'].to(device)
                soil_class = batch['soil_class'].to(device)
                soil_subclass = batch['soil_subclass'].to(device)
                soil_group = batch['soil_group'].to(device)
                soil_type = batch['soil_type'].to(device)

                class_out, subclass_out, group_out, type_out = model(input_ids, attention_mask, spectral_data)
                loss = (criterion(class_out, soil_class) + criterion(subclass_out, soil_subclass) +
                        criterion(group_out, soil_group) + criterion(type_out, soil_type))
                val_loss += loss.item()

        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}')

# 评估函数
def evaluate_model(model, data_loader, device):
    model.eval()
    correct_predictions = {
        'soil_class': 0,
        'soil_subclass': 0,
        'soil_group': 0,
        'soil_type': 0
    }
    total_predictions = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            spectral_data = batch['spectral_data'].to(device)
            soil_class = batch['soil_class'].to(device)
            soil_subclass = batch['soil_subclass'].to(device)
            soil_group = batch['soil_group'].to(device)
            soil_type = batch['soil_type'].to(device)

            outputs = model(input_ids, attention_mask, spectral_data)
            
            for i, output in enumerate(outputs):
                _, preds = torch.max(output, dim=1)
                correct_predictions[list(correct_predictions.keys())[i]] += torch.sum(preds == locals()[list(correct_predictions.keys())[i]])
            
            total_predictions += input_ids.size(0)

    accuracies = {k: (v.item() / total_predictions) * 100 for k, v in correct_predictions.items()}
    return accuracies

# 创建层级关系映射
def create_hierarchy_mappings(data):
    mappings = {
        'soil_class_to_soil_subclass': {},
        'soil_subclass_to_soil_group': {},
        'soil_group_to_soil_type': {}
    }
    
    for _, row in data.iterrows():
        class_id = row['soil_class_encoded']
        subclass_id = row['soil_subclass_encoded']
        group_id = row['soil_group_encoded']
        type_id = row['soil_type_encoded']
        
        if class_id not in mappings['soil_class_to_soil_subclass']:
            mappings['soil_class_to_soil_subclass'][class_id] = set()
        mappings['soil_class_to_soil_subclass'][class_id].add(subclass_id)
        
        if subclass_id not in mappings['soil_subclass_to_soil_group']:
            mappings['soil_subclass_to_soil_group'][subclass_id] = set()
        mappings['soil_subclass_to_soil_group'][subclass_id].add(group_id)
        
        if group_id not in mappings['soil_group_to_soil_type']:
            mappings['soil_group_to_soil_type'][group_id] = set()
        mappings['soil_group_to_soil_type'][group_id].add(type_id)
    
    # 将集合转换为列表以便JSON序列化
    for mapping in mappings.values():
        for key in mapping:
            mapping[key] = list(mapping[key])
    
    return mappings

# 主函数
def main(data, num_epochs=5, lr=2e-5):
    # 加载数据
    data = data
    
    # 编码标签
    label_encoders = {}
    for column in ['soil_class', 'soil_subclass', 'soil_group', 'soil_type']:
        le = LabelEncoder()
        data[f'{column}_encoded'] = le.fit_transform(data[column])
        label_encoders[column] = le
    
    # 划分训练集和测试集
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
    
    # 初始化tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    
    # 创建数据集和数据加载器
    train_dataset = SoilDataset(train_data, tokenizer)
    test_dataset = SoilDataset(test_data, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=16)
    
    # 设置设备
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # 初始化模型
    n_classes = {
        'soil_class': len(label_encoders['soil_class'].classes_),
        'soil_subclass': len(label_encoders['soil_subclass'].classes_),
        'soil_group': len(label_encoders['soil_group'].classes_),
        'soil_type': len(label_encoders['soil_type'].classes_)
    }
    model = HierarchicalSoilClassificationModel(n_classes).to(device)
    
    # 设置损失函数和优化器
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    # 训练模型
    train_model(model, train_loader, test_loader, criterion, optimizer, device, num_epochs=num_epochs)
    
    # 评估模型
    print("\n评估模型性能...")
    accuracies = evaluate_model(model, test_loader, device)
    
    print("模型准确率：")
    for category, accuracy in accuracies.items():
        print(f"{category}: {accuracy:.2f}%")

    # 创建层级关系映射
    hierarchy_mappings = create_hierarchy_mappings(data)
    
    # 保存模型和其他必要的数据
    save_path = 'model_output'
    os.makedirs(save_path, exist_ok=True)
    torch.save(model, os.path.join(save_path, 'hierarchical_soil_classification_model.pth'))
    joblib.dump(label_encoders, os.path.join(save_path, 'label_encoders.joblib'))
    joblib.dump(hierarchy_mappings, os.path.join(save_path, 'hierarchy_mappings.joblib'))

In [9]:
data = pd.read_csv("result.csv")
data.rename(columns={'TL':'soil_class','YL':'soil_subclass','TS':'soil_group','TZ':'soil_type'},inplace=True)

In [10]:
main(data=data,num_epochs=5,lr=1e-4)

Epoch 1/5, Train Loss: 12.2034, Val Loss: 12.2039
Epoch 2/5, Train Loss: 12.2034, Val Loss: 12.2039
Epoch 3/5, Train Loss: 12.2034, Val Loss: 12.2039
Epoch 4/5, Train Loss: 12.2034, Val Loss: 12.2039
Epoch 5/5, Train Loss: 12.2034, Val Loss: 12.2039

评估模型性能...
模型准确率：
soil_class: 1.56%
soil_subclass: 0.56%
soil_group: 0.56%
soil_type: 0.22%


In [12]:
for i in range(3000,3060,1):
    row_index = i
    test_text = data['description'][row_index]
    columns_from_fifth = data.iloc[:, 5:].iloc[row_index].tolist()
    result = data.iloc[:,1:5].iloc[row_index].to_list()
    print(test_text)
    print(result)
    usage_example(test_text,columns_from_fifth)
    print('-'*100)

地块成土母质为砂岩,土地利用类型为水田
['水稻土', '淹育型水稻土', '浅白粉泥田', '黄浅白粉泥田']


NameError: name 'HierarchicalSoilClassifier' is not defined