In [1]:
import os
import hashlib
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertModel
import joblib

def hash_chinese_category(category):
    """Return a short, stable identifier for a category name.

    The name is UTF-8 encoded, hashed with MD5, and the first ten hexadecimal
    characters of the digest are returned.

    Args:
        category (str): Category name to hash.

    Returns:
        str: Ten lowercase hexadecimal characters.
    """
    digest = hashlib.md5()
    digest.update(category.encode('utf-8'))
    return digest.hexdigest()[:10]

# 1. 改进的数据集类
class OptimizedSoilDataset(Dataset):
    """Torch dataset yielding tokenised descriptions, numeric features and labels.

    Each item is a dict with the keys ``input_ids``, ``attention_mask``,
    ``spectral_data``, ``soil_class``, ``soil_subclass``, ``soil_group`` and
    ``soil_type``.

    The input frame must contain a ``description`` column, every column in
    ``FEATURE_COLUMNS``, the four label columns in ``LABEL_COLUMNS`` and their
    ``<label>_encoded`` integer counterparts.
    """

    LABEL_COLUMNS = ('soil_class', 'soil_subclass', 'soil_group', 'soil_type')
    FEATURE_COLUMNS = (
        'Centroid_X', 'Centroid_Y', 'DEM_MEAN', 'DEM_STD', 'ndvi_MEAN',
        'PCA_0_MEAN', 'Slope_MEAN', 'Aspect_MEAN', 'MRVBF_MEAN',
        'TopographicWetnessIndex_MEAN', 'Mean_MEAN', 'PH_MEAN', 'PRE_MEAN',
        'SRA_MEAN', 'TMP_MEAN', 'VAP_MEAN', 'WIND_MEAN', 'PlanCurvature_MEAN',
    )

    def __init__(self, dataframe, tokenizer, max_len=256):
        """Store a private copy of ``dataframe`` and pre-extract per-row arrays.

        Args:
            dataframe (pandas.DataFrame): Source rows (see class docstring).
            tokenizer: Object exposing ``encode_plus`` with the keyword
                arguments used in ``__getitem__``.
            max_len (int): Token length every description is padded or
                truncated to.
        """
        # Copy first: adding the hashed columns to the caller's frame (often a
        # slice produced by train_test_split) mutates it and triggers
        # SettingWithCopyWarning.
        self.data = dataframe.copy()
        self.tokenizer = tokenizer
        self.max_len = max_len

        # Hash category names
        for column in self.LABEL_COLUMNS:
            self.data[f'{column}_hashed'] = self.data[column].apply(hash_chinese_category)

        # Extract everything __getitem__ needs once, instead of running
        # several DataFrame.iloc row lookups for every sample.
        self._descriptions = self.data['description'].tolist()
        self._features = self.data[list(self.FEATURE_COLUMNS)].to_numpy(dtype=np.float32)
        self._labels = {
            column: self.data[f'{column}_encoded'].to_numpy(dtype=np.int64)
            for column in self.LABEL_COLUMNS
        }

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the tensors for the row at position ``index``."""
        encoding = self.tokenizer.encode_plus(
            self._descriptions[index],
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        item = {
            # The tokenizer returns (1, max_len) tensors; flatten drops the
            # leading dimension so the DataLoader can stack samples.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # torch.tensor copies, so callers cannot mutate the cached array.
            'spectral_data': torch.tensor(self._features[index], dtype=torch.float32),
        }
        for column in self.LABEL_COLUMNS:
            item[column] = torch.tensor(int(self._labels[column][index]), dtype=torch.long)
        return item

# 2. 模型定义（保持不变）
class RevisedHierarchicalSoilClassificationModel(nn.Module):
    """BERT text encoder fused with numeric features, with four hierarchical heads.

    The first head scores soil classes directly. Each further head produces
    one row of child logits per parent category, so its output is reshaped to
    ``(batch, n_parent, n_child)``.
    """

    def __init__(self, n_classes, n_spectral_features=18):
        """Build the encoder, the fusion layers and the four heads.

        Args:
            n_classes (dict): Number of categories per level, keyed by
                ``soil_class``, ``soil_subclass``, ``soil_group`` and
                ``soil_type``.
            n_spectral_features (int): Width of the numeric feature vector.
        """
        super().__init__()
        self.n_classes = n_classes
        self.bert = BertModel.from_pretrained('bert-base-chinese')
        self.dropout = nn.Dropout(0.1)
        self.spectral_fc = nn.Linear(n_spectral_features, 64)
        fused_width = self.bert.config.hidden_size + 64
        self.fc = nn.Linear(fused_width, 256)

        n_class = n_classes['soil_class']
        n_subclass = n_classes['soil_subclass']
        n_group = n_classes['soil_group']
        n_type = n_classes['soil_type']
        # Every head below the first emits n_parent * n_child logits.
        self.soil_class = nn.Linear(256, n_class)
        self.soil_subclass = nn.Linear(256, n_class * n_subclass)
        self.soil_group = nn.Linear(256, n_subclass * n_group)
        self.soil_type = nn.Linear(256, n_group * n_type)

    def forward(self, input_ids, attention_mask, spectral_data):
        """Return ``(class, subclass, group, type)`` logits for a batch.

        Returns:
            tuple: Logits shaped ``(B, n_class)``, ``(B, n_class, n_subclass)``,
            ``(B, n_subclass, n_group)`` and ``(B, n_group, n_type)``.
        """
        _sequence_output, text_summary = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        text_summary = self.dropout(text_summary)

        numeric_summary = torch.relu(self.spectral_fc(spectral_data))

        fused = torch.cat((text_summary, numeric_summary), dim=1)
        hidden = torch.relu(self.fc(fused))

        sizes = self.n_classes
        class_logits = self.soil_class(hidden)
        subclass_logits = self.soil_subclass(hidden).view(-1, sizes['soil_class'], sizes['soil_subclass'])
        group_logits = self.soil_group(hidden).view(-1, sizes['soil_subclass'], sizes['soil_group'])
        type_logits = self.soil_type(hidden).view(-1, sizes['soil_group'], sizes['soil_type'])

        return class_logits, subclass_logits, group_logits, type_logits

# 3. 训练函数（移除了调试输出）
def _hierarchical_batch_loss(model, batch, criterion, device):
    """Run ``model`` on one batch and return the summed loss of all four levels."""
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    spectral_data = batch['spectral_data'].to(device)
    soil_class = batch['soil_class'].to(device)
    soil_subclass = batch['soil_subclass'].to(device)
    soil_group = batch['soil_group'].to(device)
    soil_type = batch['soil_type'].to(device)

    class_out, subclass_out, group_out, type_out = model(input_ids, attention_mask, spectral_data)

    # Row selector created on the logits' device to avoid a host-to-device copy.
    rows = torch.arange(class_out.size(0), device=class_out.device)
    # Each child head is scored on the row of its TRUE parent label, so an
    # error at one level does not change the target of the next.
    return (
        criterion(class_out, soil_class)
        + criterion(subclass_out[rows, soil_class], soil_subclass)
        + criterion(group_out[rows, soil_subclass], soil_group)
        + criterion(type_out[rows, soil_group], soil_type)
    )


def _mean_or_nan(total, count):
    """Return ``total / count``, or NaN when ``count`` is zero."""
    return total / count if count else float('nan')


def train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs=5):
    """Train ``model`` and print the mean train/validation loss after each epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask, spectral_data)``
            and returning the four hierarchical logit tensors.
        train_loader: Sized iterable of batch dicts used for optimisation.
        val_loader: Sized iterable of batch dicts used for the validation loss.
        criterion: Loss applied to each level, e.g. ``nn.CrossEntropyLoss()``.
        optimizer: Optimiser bound to ``model``'s parameters.
        device: Device every batch tensor is moved to.
        num_epochs (int): Number of passes over ``train_loader``.

    Returns:
        list[dict]: One ``{'epoch', 'train_loss', 'val_loss'}`` entry per
        epoch. A loss is NaN when the corresponding loader is empty.
    """
    history = []
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for batch in train_loader:
            optimizer.zero_grad()
            loss = _hierarchical_batch_loss(model, batch, criterion, device)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in val_loader:
                val_loss += _hierarchical_batch_loss(model, batch, criterion, device).item()

        # Guarded division: an empty loader previously raised ZeroDivisionError.
        mean_train_loss = _mean_or_nan(train_loss, len(train_loader))
        mean_val_loss = _mean_or_nan(val_loss, len(val_loader))
        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {mean_train_loss:.4f}, Val Loss: {mean_val_loss:.4f}')
        history.append({
            'epoch': epoch + 1,
            'train_loss': mean_train_loss,
            'val_loss': mean_val_loss,
        })

    return history

# 4. 评估函数（保持不变）
def evaluate_model(model, data_loader, device):
    """Compute the per-level accuracy of ``model`` over ``data_loader``.

    Predictions are chained: each child level is read from the logit row of
    the parent category that was PREDICTED, not the true one, so an error at
    one level usually carries over to the levels below it.

    Args:
        model: Module called as ``model(input_ids, attention_mask, spectral_data)``
            and returning the four hierarchical logit tensors.
        data_loader: Iterable of batch dicts.
        device: Device every batch tensor is moved to.

    Returns:
        dict: Accuracy in percent keyed by ``soil_class``, ``soil_subclass``,
        ``soil_group`` and ``soil_type``. Every value is NaN when
        ``data_loader`` yields no samples.
    """
    levels = ('soil_class', 'soil_subclass', 'soil_group', 'soil_type')
    model.eval()
    # Plain ints throughout: the counters used to turn into tensors only after
    # the first batch, so an empty loader crashed on ``int.item()``.
    correct_predictions = dict.fromkeys(levels, 0)
    total_predictions = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            spectral_data = batch['spectral_data'].to(device)
            targets = {level: batch[level].to(device) for level in levels}

            class_out, subclass_out, group_out, type_out = model(input_ids, attention_mask, spectral_data)

            rows = torch.arange(class_out.size(0), device=class_out.device)
            class_preds = class_out.argmax(dim=1)
            subclass_preds = subclass_out[rows, class_preds].argmax(dim=1)
            group_preds = group_out[rows, subclass_preds].argmax(dim=1)
            type_preds = type_out[rows, group_preds].argmax(dim=1)
            predictions = {
                'soil_class': class_preds,
                'soil_subclass': subclass_preds,
                'soil_group': group_preds,
                'soil_type': type_preds,
            }

            for level in levels:
                correct_predictions[level] += int((predictions[level] == targets[level]).sum().item())
            total_predictions += input_ids.size(0)

    if total_predictions == 0:
        return {level: float('nan') for level in levels}

    accuracies = {
        level: (correct / total_predictions) * 100
        for level, correct in correct_predictions.items()
    }
    return accuracies

# 5. 改进的预测类
class OptimizedConstrainedHierarchicalSoilClassifier:
    """Inference wrapper that only predicts children valid under the chosen parent.

    The label encoders are expected to be fitted on HASHED category names
    (``hash_chinese_category(name)``), which is how ``main`` fits them, while
    ``hierarchy_map`` holds the original names. This class translates between
    the two.
    """

    LEVELS = ('soil_class', 'soil_subclass', 'soil_group', 'soil_type')

    def __init__(self, model_path, tokenizer_path, label_encoders, hierarchy_map, device=None, max_len=256):
        """Load the model and tokenizer and build the hash-to-name lookup.

        Args:
            model_path (str): File written by ``torch.save(model, ...)``.
            tokenizer_path (str): Directory written by ``save_pretrained``.
            label_encoders (dict): Fitted encoder per level, keyed by level name.
            hierarchy_map (dict): ``{class: {subclass: {group: [type, ...]}}}``
                using original category names.
            device: Torch device; defaults to CUDA when available, else CPU.
            max_len (int): Token length used at inference. The default matches
                the default ``max_len`` of ``OptimizedSoilDataset`` so text is
                padded and truncated as it was during training.
        """
        self.device = device if device else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
        # SECURITY: the checkpoint is a pickled nn.Module and unpickling can
        # execute arbitrary code. Only load files you produced yourself.
        # PyTorch >= 2.6 defaults to weights_only=True, which rejects whole
        # pickled modules, so full unpickling is requested explicitly.
        try:
            self.model = torch.load(model_path, map_location=self.device, weights_only=False)
        except TypeError:
            # Older PyTorch releases do not accept the weights_only argument.
            self.model = torch.load(model_path, map_location=self.device)
        self.model.eval()
        self.label_encoders = label_encoders
        self.hierarchy_map = hierarchy_map
        self.max_len = max_len
        # Built from hierarchy_map (original names). Hashing
        # ``label_encoders[...].classes_`` would hash values that are already
        # hashes and produce keys that never match.
        self.hash_to_category = self._build_hash_to_category(hierarchy_map)

    @staticmethod
    def _build_hash_to_category(hierarchy_map):
        """Map each level to ``{hash: original name}`` by walking ``hierarchy_map``."""
        lookup = {
            'soil_class': {},
            'soil_subclass': {},
            'soil_group': {},
            'soil_type': {},
        }
        for soil_class, subclasses in hierarchy_map.items():
            lookup['soil_class'][hash_chinese_category(soil_class)] = soil_class
            for soil_subclass, groups in subclasses.items():
                lookup['soil_subclass'][hash_chinese_category(soil_subclass)] = soil_subclass
                for soil_group, soil_types in groups.items():
                    lookup['soil_group'][hash_chinese_category(soil_group)] = soil_group
                    for soil_type in soil_types:
                        lookup['soil_type'][hash_chinese_category(soil_type)] = soil_type
        return lookup

    def _best_valid_child(self, column, logits, candidates):
        """Return ``(encoded index, name)`` of the best-scoring allowed child.

        Args:
            column (str): Level name selecting the label encoder.
            logits (torch.Tensor): 1-D scores over every category of ``column``.
            candidates: Iterable of original names allowed under the parent.
        """
        names = list(candidates)
        # The encoder knows hashes, not names, so hash before transforming.
        hashed = [hash_chinese_category(name) for name in names]
        indices = [int(i) for i in self.label_encoders[column].transform(hashed)]
        best = int(torch.argmax(logits[indices]).item())
        return indices[best], names[best]

    def predict(self, text, spectral_data):
        """Classify one sample top-down through the hierarchy.

        Args:
            text (str): Description text.
            spectral_data: Sequence of numeric features in training order.

        Returns:
            dict: Original category names keyed by ``soil_class``,
            ``soil_subclass``, ``soil_group`` and ``soil_type``.
        """
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)
        # Going through numpy also accepts lists of numpy scalars.
        spectral_tensor = torch.as_tensor(
            np.asarray(spectral_data, dtype=np.float32)
        ).reshape(1, -1).to(self.device)

        with torch.no_grad():
            class_out, subclass_out, group_out, type_out = self.model(input_ids, attention_mask, spectral_tensor)

        # The top level is unconstrained: plain argmax, then hash -> name.
        soil_class_idx = int(torch.argmax(class_out, dim=1).item())
        soil_class_hash = self.label_encoders['soil_class'].inverse_transform([soil_class_idx])[0]
        soil_class = self.hash_to_category['soil_class'][soil_class_hash]

        # Each lower level is restricted to the children hierarchy_map lists
        # under the parent chosen so far.
        subclass_map = self.hierarchy_map[soil_class]
        soil_subclass_idx, soil_subclass = self._best_valid_child(
            'soil_subclass', subclass_out[0, soil_class_idx], subclass_map.keys())

        group_map = subclass_map[soil_subclass]
        soil_group_idx, soil_group = self._best_valid_child(
            'soil_group', group_out[0, soil_subclass_idx], group_map.keys())

        _, soil_type = self._best_valid_child(
            'soil_type', type_out[0, soil_group_idx], group_map[soil_group])

        results = {
            'soil_class': soil_class,
            'soil_subclass': soil_subclass,
            'soil_group': soil_group,
            'soil_type': soil_type
        }

        return results
# 6. 改进的层级映射函数
def create_optimized_hierarchy_map(data):
    """Collect the class -> subclass -> group -> types tree observed in ``data``.

    Args:
        data (pandas.DataFrame): Frame with the columns ``soil_class``,
            ``soil_subclass``, ``soil_group`` and ``soil_type``.

    Returns:
        dict: ``{soil_class: {soil_subclass: {soil_group: [soil_type, ...]}}}``
        with every entry in order of first appearance and no duplicate types.
    """
    tree = {}
    level_values = zip(
        data['soil_class'],
        data['soil_subclass'],
        data['soil_group'],
        data['soil_type'],
    )
    for class_name, subclass_name, group_name, type_name in level_values:
        # setdefault creates each missing level on the way down.
        known_types = (
            tree.setdefault(class_name, {})
                .setdefault(subclass_name, {})
                .setdefault(group_name, [])
        )
        if type_name not in known_types:
            known_types.append(type_name)

    return tree

# 7. 改进的主函数
def main(data, num_epochs, lr=2e-5, save_path='model_path', batch_size=16, seed=42):
    """Train, evaluate and save the hierarchical soil classifier.

    Writes four artifacts into ``save_path``: the pickled model, the label
    encoders, the hierarchy map and the tokenizer.

    Args:
        data (pandas.DataFrame): Rows with ``description``, the four label
            columns and the numeric feature columns read by
            ``OptimizedSoilDataset``. The frame is not modified.
        num_epochs (int): Number of training epochs.
        lr (float): Adam learning rate.
        save_path (str): Directory the artifacts are written to.
        batch_size (int): Batch size of both data loaders.
        seed (int): Seed for the train/test split and the numpy/torch RNGs.

    Returns:
        dict: Per-level accuracy (percent) on the held-out split, as returned
        by ``evaluate_model``.

    Raises:
        ValueError: If a required column is missing or a label is null.

    NOTE(review): the numeric features are fed to the model unscaled. The
    first-epoch loss in the recorded run is about 53404, far above what four
    cross-entropy terms give at initialisation on well-scaled inputs, so this
    is presumably hurting training. Standardising them needs the same
    transform at inference time, so it is left as a follow-up.

    NOTE(review): the loss printed as "Val Loss" is computed on the same
    held-out split that is reported as test accuracy.
    """
    label_columns = ['soil_class', 'soil_subclass', 'soil_group', 'soil_type']

    # Fail early with a readable message. A NaN label otherwise surfaces as
    # "'float' object has no attribute 'encode'" inside the hashing step.
    missing = [column for column in label_columns + ['description'] if column not in data.columns]
    if missing:
        raise ValueError(f"data is missing required columns: {missing}")
    null_counts = data[label_columns].isna().sum()
    if null_counts.any():
        raise ValueError(f"label columns contain null values: {null_counts[null_counts > 0].to_dict()}")

    os.makedirs(save_path, exist_ok=True)

    # Seed the generators that drive weight initialisation and shuffling.
    np.random.seed(seed)
    torch.manual_seed(seed)

    data = data.copy()

    # Encode labels. The encoders are fitted on HASHED names, so their
    # classes_ hold hashes rather than the original category names.
    label_encoders = {}
    for column in label_columns:
        le = LabelEncoder()
        data[f'{column}_hashed'] = data[column].apply(hash_chinese_category)
        data[f'{column}_encoded'] = le.fit_transform(data[f'{column}_hashed'])
        label_encoders[column] = le

    # Build the hierarchy map (keyed by original category names).
    hierarchy_map = create_optimized_hierarchy_map(data)

    # Split into train and test sets.
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=seed)

    # Initialise the tokenizer.
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

    # Create the datasets and data loaders.
    train_dataset = OptimizedSoilDataset(train_data, tokenizer)
    test_dataset = OptimizedSoilDataset(test_data, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Select the device.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Initialise the model with one output size per level.
    n_classes = {column: len(label_encoders[column].classes_) for column in label_columns}
    model = RevisedHierarchicalSoilClassificationModel(n_classes).to(device)

    # Loss function and optimiser.
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Train.
    train_model(model, train_loader, test_loader, criterion, optimizer, device, num_epochs=num_epochs)

    # Evaluate.
    print("\n评估模型性能...")
    accuracies = evaluate_model(model, test_loader, device)

    print("模型准确率：")
    for category, accuracy in accuracies.items():
        print(f"{category}: {accuracy:.2f}%")

    # Save the whole model object; the classifier loads it with torch.load.
    torch.save(model, os.path.join(save_path, 'soil_classification_model.pth'))

    # Save the label encoders.
    joblib.dump(label_encoders, os.path.join(save_path, 'label_encoders.joblib'))

    # Save the hierarchy map.
    joblib.dump(hierarchy_map, os.path.join(save_path, 'hierarchy_map.joblib'))

    # Save the tokenizer.
    tokenizer.save_pretrained(os.path.join(save_path, 'soil_tokenizer'))

    print("模型训练完成并保存。")

    return accuracies

# 8. 改进的使用示例
def usage_example(sample_text, spectral_data, save_path='model_path'):
    """Load the saved artifacts, classify one sample, and print the result.

    Args:
        sample_text (str): Description text to classify.
        spectral_data: Sequence of numeric features for the sample.
        save_path (str): Directory holding the artifacts written by ``main``.

    Returns:
        dict: The prediction returned by the classifier's ``predict``.
    """
    # SECURITY: joblib.load unpickles arbitrary objects. Only point this at a
    # directory whose contents you produced yourself.
    label_encoders = joblib.load(os.path.join(save_path, 'label_encoders.joblib'))
    hierarchy_map = joblib.load(os.path.join(save_path, 'hierarchy_map.joblib'))

    classifier = OptimizedConstrainedHierarchicalSoilClassifier(
        os.path.join(save_path, 'soil_classification_model.pth'),
        os.path.join(save_path, 'soil_tokenizer'),
        label_encoders,
        hierarchy_map
    )

    # Run the prediction.
    results = classifier.predict(sample_text, spectral_data)

    # Print the result.
    print("预测结果：")
    for key, value in results.items():
        print(f"{key}: {value}")

    return results


In [2]:
# Load the table and rename the abbreviated taxonomy columns to the names
# the rest of the notebook uses.
data = pd.read_csv("result.csv").rename(
    columns={
        'TL': 'soil_class',
        'YL': 'soil_subclass',
        'TS': 'soil_group',
        'TZ': 'soil_type',
    }
)

In [3]:
# Train for five epochs; main also evaluates the model and saves all artifacts.
main(data=data, num_epochs=5)

Epoch 1/5, Train Loss: 53404.3823, Val Loss: 20883.3025
Epoch 2/5, Train Loss: 11952.0659, Val Loss: 7576.5264
Epoch 3/5, Train Loss: 5384.3865, Val Loss: 4085.1571
Epoch 4/5, Train Loss: 3066.4898, Val Loss: 2318.0420
Epoch 5/5, Train Loss: 1807.1385, Val Loss: 1424.8783

评估模型性能...
模型准确率：
soil_class: 39.02%
soil_subclass: 28.99%
soil_group: 26.31%
soil_type: 26.31%
模型训练完成并保存。


In [None]:
# Spot check: classify one row with the saved model and print its true labels
# next to the prediction.
# Columns are selected by NAME. Positional slices such as data.iloc[:, 5:]
# only match the training features if the CSV happens to contain exactly
# those columns in exactly that order.
FEATURE_COLUMNS = [
    'Centroid_X', 'Centroid_Y', 'DEM_MEAN', 'DEM_STD', 'ndvi_MEAN',
    'PCA_0_MEAN', 'Slope_MEAN', 'Aspect_MEAN', 'MRVBF_MEAN',
    'TopographicWetnessIndex_MEAN', 'Mean_MEAN', 'PH_MEAN', 'PRE_MEAN',
    'SRA_MEAN', 'TMP_MEAN', 'VAP_MEAN', 'WIND_MEAN', 'PlanCurvature_MEAN',
]
LABEL_COLUMNS = ['soil_class', 'soil_subclass', 'soil_group', 'soil_type']

for row_index in range(4000, 4001):
    sample = data.iloc[row_index]
    test_text = sample['description']
    spectral_values = sample[FEATURE_COLUMNS].astype(float).tolist()
    result = sample[LABEL_COLUMNS].tolist()
    print(test_text, type(test_text))
    print(result)
    usage_example(test_text, spectral_values)
    print('-' * 100)