In [4]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import joblib
import os
import pandas as pd
import numpy as np

In [2]:
class SoilDataset(Dataset):
    """Dataset that pairs a tokenized text description with numeric covariates.

    Every item is built from one row of ``dataframe`` and contains the token
    ids / attention mask of the ``description`` column, a float tensor with the
    covariate columns, and the four integer-encoded taxonomy labels read from
    the ``*_encoded`` columns.

    Args:
        dataframe: pandas DataFrame holding ``description``, the covariate
            columns and the ``soil_*_encoded`` label columns.
        tokenizer: object exposing ``encode_plus`` (a BERT tokenizer in this
            notebook).
        max_len: length every description is padded / truncated to.
        feature_columns: optional sequence of covariate column names, in the
            order they are fed to the model. Defaults to ``FEATURE_COLUMNS``.
    """

    # Default covariate columns; the order defines the layout of `spectral_data`.
    FEATURE_COLUMNS = (
        'Centroid_X', 'Centroid_Y', 'DEM_MEAN', 'DEM_STD', 'ndvi_MEAN',
        'PCA_0_MEAN', 'Slope_MEAN', 'Aspect_MEAN', 'MRVBF_MEAN',
        'TopographicWetnessIndex_MEAN', 'Mean_MEAN', 'PH_MEAN', 'PRE_MEAN',
        'SRA_MEAN', 'TMP_MEAN', 'VAP_MEAN', 'WIND_MEAN', 'PlanCurvature_MEAN',
    )

    # Encoded label columns, keyed by the name used in the returned item.
    LABEL_COLUMNS = {
        'soil_class': 'soil_class_encoded',
        'soil_subclass': 'soil_subclass_encoded',
        'soil_group': 'soil_group_encoded',
        'soil_type': 'soil_type_encoded',
    }

    def __init__(self, dataframe, tokenizer, max_len=256, feature_columns=None):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        if feature_columns is None:
            feature_columns = self.FEATURE_COLUMNS
        self.feature_columns = list(feature_columns)

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Build the model inputs and labels for the row at position ``index``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (flattened tokenizer
            output), ``spectral_data`` (float32 tensor, one value per feature
            column) and one ``torch.long`` scalar per taxonomy level.
        """
        # Materialise the row once; each `.iloc[index]` builds a new Series.
        row = self.data.iloc[index]

        encoding = self.tokenizer.encode_plus(
            row['description'],
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # NOTE(review): the covariates are passed on as-is (no scaling here).
        spectral_data = row[self.feature_columns].values.astype(float)

        item = {
            # The tokenizer returns a leading batch axis; flatten it away.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'spectral_data': torch.tensor(spectral_data, dtype=torch.float32),
        }
        for level, column in self.LABEL_COLUMNS.items():
            item[level] = torch.tensor(int(row[column]), dtype=torch.long)
        return item

class HierarchicalSoilClassificationModel(nn.Module):
    """BERT + covariate network with four chained classification heads.

    The second element returned by BERT (its pooled output) is concatenated
    with a 64-unit projection of the numeric covariates and mapped to a shared
    256-unit representation. Four linear heads then predict soil class,
    subclass, group and type; every head after the first also receives the
    softmax probabilities of the previous level.

    Args:
        n_classes: mapping with the number of labels for the keys
            ``'soil_class'``, ``'soil_subclass'``, ``'soil_group'`` and
            ``'soil_type'``.
        n_spectral_features: width of the ``spectral_data`` input.
        bert_model_name: name or path passed to ``BertModel.from_pretrained``.
    """

    def __init__(self, n_classes, n_spectral_features=18, bert_model_name='bert-base-chinese'):
        super().__init__()
        # Submodule names and creation order are kept stable so that existing
        # checkpoints and seeded initialisation are unaffected.
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.spectral_fc = nn.Linear(n_spectral_features, 64)
        self.fc = nn.Linear(self.bert.config.hidden_size + 64, 256)

        # Each head below the first is widened by the previous level's class count
        # because it also consumes that level's probability vector.
        self.soil_class = nn.Linear(256, n_classes['soil_class'])
        self.soil_subclass = nn.Linear(256 + n_classes['soil_class'], n_classes['soil_subclass'])
        self.soil_group = nn.Linear(256 + n_classes['soil_subclass'], n_classes['soil_group'])
        self.soil_type = nn.Linear(256 + n_classes['soil_group'], n_classes['soil_type'])

    def forward(self, input_ids, attention_mask, spectral_data):
        """Return the raw logits of the four levels.

        Args:
            input_ids: token ids forwarded to BERT.
            attention_mask: attention mask forwarded to BERT.
            spectral_data: covariates; concatenated with the pooled output
                along ``dim=1`` after projection.

        Returns:
            Tuple ``(soil_class, soil_subclass, soil_group, soil_type)`` of
            un-normalised logits (softmax is only applied to the copies that
            are fed to the next head).
        """
        _, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        pooled_output = self.dropout(pooled_output)

        # NOTE(review): covariates enter the linear layer without any normalisation.
        spectral_features = torch.relu(self.spectral_fc(spectral_data))

        shared = torch.relu(self.fc(torch.cat((pooled_output, spectral_features), dim=1)))

        soil_class = self.soil_class(shared)

        # Condition each finer level on the probabilities of the coarser one.
        subclass_input = torch.cat((shared, torch.softmax(soil_class, dim=1)), dim=1)
        soil_subclass = self.soil_subclass(subclass_input)

        group_input = torch.cat((shared, torch.softmax(soil_subclass, dim=1)), dim=1)
        soil_group = self.soil_group(group_input)

        type_input = torch.cat((shared, torch.softmax(soil_group, dim=1)), dim=1)
        soil_type = self.soil_type(type_input)

        return soil_class, soil_subclass, soil_group, soil_type

def _hierarchical_batch_loss(model, batch, criterion, device):
    """Run one batch through ``model`` and return the summed loss of the four levels."""
    outputs = model(
        batch['input_ids'].to(device),
        batch['attention_mask'].to(device),
        batch['spectral_data'].to(device),
    )
    levels = ('soil_class', 'soil_subclass', 'soil_group', 'soil_type')
    # The model returns its logits in the same order as `levels`.
    return sum(criterion(logits, batch[level].to(device)) for logits, level in zip(outputs, levels))


def train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs=5):
    """Train ``model`` and report the mean train / validation loss after every epoch.

    Args:
        model: callable as ``model(input_ids, attention_mask, spectral_data)``
            returning the logits of the four taxonomy levels.
        train_loader: iterable of batch dicts used for optimisation.
        val_loader: iterable of batch dicts evaluated without gradients.
        criterion: loss applied to every level; the four losses are summed.
        optimizer: optimizer stepping the model parameters.
        device: device the batch tensors are moved to.
        num_epochs: number of passes over ``train_loader``.

    Returns:
        List with one ``{'epoch', 'train_loss', 'val_loss'}`` dict per epoch.
        A loss is NaN when the corresponding loader yielded no batches.
    """
    history = []
    for epoch in range(num_epochs):
        model.train()
        train_total, train_batches = 0.0, 0
        for batch in train_loader:
            optimizer.zero_grad()
            loss = _hierarchical_batch_loss(model, batch, criterion, device)
            loss.backward()
            optimizer.step()
            train_total += loss.item()
            train_batches += 1

        # Validation
        model.eval()
        val_total, val_batches = 0.0, 0
        with torch.no_grad():
            for batch in val_loader:
                val_total += _hierarchical_batch_loss(model, batch, criterion, device).item()
                val_batches += 1

        # Average per batch; an empty loader yields NaN instead of dividing by zero.
        train_loss = train_total / train_batches if train_batches else float('nan')
        val_loss = val_total / val_batches if val_batches else float('nan')
        history.append({'epoch': epoch + 1, 'train_loss': train_loss, 'val_loss': val_loss})

        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

    return history

class HierarchicalSoilClassifier:
    """Inference wrapper around a saved hierarchical soil classification model.

    Args:
        model_path: file containing a model saved with ``torch.save(model, ...)``.
        tokenizer_path: directory or name passed to ``BertTokenizer.from_pretrained``.
        label_encoders: mapping from level name (``'soil_class'``,
            ``'soil_subclass'``, ``'soil_group'``, ``'soil_type'``) to an object
            with ``inverse_transform``.
        device: torch device; defaults to CUDA when available, otherwise CPU.
        max_len: length the text is padded / truncated to. The default matches
            the default ``max_len`` of ``SoilDataset`` so that training and
            inference truncate descriptions identically.
    """

    LEVELS = ('soil_class', 'soil_subclass', 'soil_group', 'soil_type')

    def __init__(self, model_path, tokenizer_path, label_encoders, device=None, max_len=256):
        self.device = device if device else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
        # SECURITY: the file holds a fully pickled module; unpickling can run
        # arbitrary code, so only load model files from a trusted source.
        try:
            # Needed wherever `weights_only` defaults to True (PyTorch 2.6 and later).
            self.model = torch.load(model_path, map_location=self.device, weights_only=False)
        except TypeError:
            # Older PyTorch versions do not know the `weights_only` keyword.
            self.model = torch.load(model_path, map_location=self.device)
        self.model.eval()
        self.label_encoders = label_encoders
        self.max_len = max_len

    def predict(self, text, spectral_data):
        """Predict the four taxonomy labels for one sample.

        Args:
            text: description string to tokenize.
            spectral_data: flat sequence of covariate values; a batch axis is
                added before it is passed to the model.

        Returns:
            dict mapping each level name to the decoded label of the highest
            scoring class.
        """
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)
        spectral_tensor = torch.as_tensor(spectral_data, dtype=torch.float32).unsqueeze(0).to(self.device)

        with torch.no_grad():
            outputs = self.model(input_ids, attention_mask, spectral_tensor)

        results = {}
        # The model returns its logits in the same order as LEVELS.
        for level, logits in zip(self.LEVELS, outputs):
            best = torch.argmax(logits, dim=1).item()
            results[level] = self.label_encoders[level].inverse_transform([best])[0]

        return results

def main(data, num_epochs, lr=2e-5, save_path='model_path', batch_size=16):
    """Train the hierarchical soil classifier on ``data`` and save all artifacts.

    Args:
        data: DataFrame with ``description``, the covariate columns and the four
            label columns ``soil_class``, ``soil_subclass``, ``soil_group`` and
            ``soil_type``. The frame is copied, not modified.
        num_epochs: number of training epochs.
        lr: learning rate for Adam.
        save_path: directory the model, label encoders and tokenizer are
            written to; created if it does not exist.
        batch_size: batch size of both data loaders.
    """
    # Create the output directory up front so that a missing folder cannot
    # make the save step fail after training has already finished.
    os.makedirs(save_path, exist_ok=True)

    # Work on a copy so the caller's frame does not gain the *_encoded columns.
    data = data.copy()

    # Encode every label column as integers and keep the encoders for decoding.
    label_encoders = {}
    for column in ['soil_class', 'soil_subclass', 'soil_group', 'soil_type']:
        le = LabelEncoder()
        data[f'{column}_encoded'] = le.fit_transform(data[column])
        label_encoders[column] = le

    # Split into training and test sets.
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

    # Initialise the tokenizer.
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

    # Build the datasets and data loaders.
    train_dataset = SoilDataset(train_data, tokenizer)
    test_dataset = SoilDataset(test_data, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Select the device.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Initialise the model with one output per label seen by each encoder.
    n_classes = {column: len(encoder.classes_) for column, encoder in label_encoders.items()}
    model = HierarchicalSoilClassificationModel(n_classes).to(device)

    # Loss function and optimizer.
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Train the model.
    # NOTE(review): the same held-out split serves as the validation loader here
    # and as the test set for the accuracies reported below.
    train_model(model, train_loader, test_loader, criterion, optimizer, device, num_epochs=num_epochs)

    # Evaluate the model.
    print("\n评估模型性能...")
    accuracies = evaluate_model(model, test_loader, device)

    print("模型准确率：")
    for category, accuracy in accuracies.items():
        print(f"{category}: {accuracy:.2f}%")

    # Save the whole model object (HierarchicalSoilClassifier loads it with torch.load).
    torch.save(model, os.path.join(save_path, 'hierarchical_soil_classification_model.pth'))

    # Save the label encoders.
    joblib.dump(label_encoders, os.path.join(save_path, 'hierarchical_label_encoders.joblib'))

    # Save the tokenizer.
    tokenizer.save_pretrained(os.path.join(save_path, 'hierarchical_soil_tokenizer'))

    print("分层土壤分类模型训练完成并保存。")

def evaluate_model(model, data_loader, device):
    """Compute the per-level accuracy (in percent) of ``model`` on ``data_loader``.

    Args:
        model: callable as ``model(input_ids, attention_mask, spectral_data)``
            returning the logits of the four taxonomy levels.
        data_loader: iterable of batch dicts that include the four label tensors.
        device: device the batch tensors are moved to.

    Returns:
        dict mapping ``'soil_class'``, ``'soil_subclass'``, ``'soil_group'`` and
        ``'soil_type'`` to the accuracy in percent. Every value is NaN when the
        loader yields no samples.
    """
    levels = ('soil_class', 'soil_subclass', 'soil_group', 'soil_type')

    model.eval()
    correct_predictions = dict.fromkeys(levels, 0)
    total_predictions = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            spectral_data = batch['spectral_data'].to(device)

            outputs = model(input_ids, attention_mask, spectral_data)

            # Outputs come back in the same order as `levels`; fetch the matching
            # labels from the batch by name rather than through locals().
            for level, logits in zip(levels, outputs):
                preds = torch.argmax(logits, dim=1)
                targets = batch[level].to(device)
                correct_predictions[level] += (preds == targets).sum().item()

            total_predictions += input_ids.size(0)

    if total_predictions == 0:
        # Nothing was evaluated, so no accuracy can be stated.
        return {level: float('nan') for level in levels}

    return {level: (hits / total_predictions) * 100 for level, hits in correct_predictions.items()}

def usage_example(sample_text, spectral_data, save_path='model_path'):
    """Load the saved artifacts, classify one sample and print the prediction.

    Args:
        sample_text: description text of the sample.
        spectral_data: covariate values of the sample, forwarded unchanged to
            ``HierarchicalSoilClassifier.predict``.
        save_path: directory that holds the saved model, label encoders and
            tokenizer.

    Returns:
        The prediction dict returned by the classifier (level name -> label).
    """
    # Load the saved model and label encoders.
    # NOTE: everything is re-read from disk on every call; build one
    # HierarchicalSoilClassifier and reuse it when predicting many samples.
    label_encoders = joblib.load(os.path.join(save_path, 'hierarchical_label_encoders.joblib'))
    classifier = HierarchicalSoilClassifier(
        os.path.join(save_path, 'hierarchical_soil_classification_model.pth'),
        os.path.join(save_path, 'hierarchical_soil_tokenizer'),
        label_encoders,
    )

    # Run the prediction.
    results = classifier.predict(sample_text, spectral_data)

    # Print the results.
    print("预测结果：")
    for key, value in results.items():
        print(f"{key}: {value}")

    return results

In [5]:
# Load the result table and give the four label columns descriptive names.
# NOTE(review): TL/YL/TS/TZ are presumably the pinyin initials of the four
# taxonomy levels (土类/亚类/土属/土种) - confirm against the data source.
data = pd.read_csv("result.csv").rename(
    columns={
        'TL': 'soil_class',
        'YL': 'soil_subclass',
        'TS': 'soil_group',
        'TZ': 'soil_type',
    }
)

In [8]:
# Train for five epochs with a learning rate of 1e-4 (overriding main's 2e-5 default),
# then evaluate and save the artifacts.
main(
    data=data,
    num_epochs=5,
    lr=1e-4,
)

Epoch 1/5, Train Loss: 14033.6881, Val Loss: 3517.9369
Epoch 2/5, Train Loss: 2493.3161, Val Loss: 2153.2983
Epoch 3/5, Train Loss: 2249.2113, Val Loss: 2311.3001
Epoch 4/5, Train Loss: 1988.1255, Val Loss: 1854.6986
Epoch 5/5, Train Loss: 1712.2149, Val Loss: 1638.9516

评估模型性能...
模型准确率：
soil_class: 64.33%
soil_subclass: 65.89%
soil_group: 30.77%
soil_type: 12.15%
分层土壤分类模型训练完成并保存。


In [7]:
# Load the saved artifacts once; going through usage_example() would re-read the
# model, tokenizer and label encoders from disk on every iteration.
save_path = 'model_path'
label_encoders = joblib.load(os.path.join(save_path, 'hierarchical_label_encoders.joblib'))
classifier = HierarchicalSoilClassifier(
    os.path.join(save_path, 'hierarchical_soil_classification_model.pth'),
    os.path.join(save_path, 'hierarchical_soil_tokenizer'),
    label_encoders,
)

# NOTE(review): these rows come from the full table, so some of them were part of
# the training split used in main(); this is a spot check, not a held-out test.
for row_index in range(3000, 3060):
    # Positional access throughout, so text, labels and covariates always
    # refer to the same row.
    test_text = data['description'].iloc[row_index]
    # NOTE(review): assumes the columns from position 5 onward are exactly the
    # covariates the model was trained on, in the same order - confirm.
    columns_from_fifth = data.iloc[:, 5:].iloc[row_index].tolist()
    result = data.iloc[:, 1:5].iloc[row_index].to_list()
    print(test_text)
    print(result)

    prediction = classifier.predict(test_text, columns_from_fifth)
    print("预测结果：")
    for key, value in prediction.items():
        print(f"{key}: {value}")
    print('-'*100)

地块成土母质为砂岩,土地利用类型为水田
['水稻土', '淹育型水稻土', '浅白粉泥田', '黄浅白粉泥田']
预测结果：
soil_class: 黄壤
soil_subclass: 淹育型水稻土
soil_group: 浅白粉泥田
soil_type: 黄浅白粉泥田
----------------------------------------------------------------------------------------------------
地块成土母质为砂岩,土地利用类型为水田
['水稻土', '淹育型水稻土', '浅白粉泥田', '黄浅白粉泥田']
预测结果：
soil_class: 粗骨土
soil_subclass: 淹育型水稻土
soil_group: 壤质黄色石灰土
soil_type: 黄浅白粉泥田
----------------------------------------------------------------------------------------------------
地块成土母质为砂岩,土地利用类型为水田
['水稻土', '淹育型水稻土', '浅白粉泥田', '黄浅白粉泥田']
预测结果：
soil_class: 粗骨土
soil_subclass: 淹育型水稻土
soil_group: 壤质黄色石灰土
soil_type: 黄浅白粉泥田
----------------------------------------------------------------------------------------------------
地块成土母质为砂岩,土地利用类型为水田
['水稻土', '淹育型水稻土', '浅白粉泥田', '黄浅白粉泥田']
预测结果：
soil_class: 黄壤
soil_subclass: 淹育型水稻土
soil_group: 壤质黄色石灰土
soil_type: 黄浅白粉泥田
----------------------------------------------------------------------------------------------------
地块成土母质为砂岩,土地利用类型为水田
['水稻土', '淹育型水稻土', '浅白粉泥田