In [79]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertModel
import joblib

In [84]:
# 1. 数据生成函数
def generate_soil_data(n_samples=1000, seed=None):
    """Build a synthetic soil table with a four-level label hierarchy.

    Every row gets a randomly chosen description, a soil class, and a
    subclass/group/type drawn from the children of the level above it, plus
    eleven uniformly sampled numeric columns.

    Args:
        n_samples: Number of rows to generate.
        seed: Optional integer. When given, all draws come from a private
            ``np.random.RandomState(seed)`` so repeated calls return the same
            table and the global NumPy state is left untouched. When ``None``
            the global ``np.random`` state is used.

    Returns:
        pd.DataFrame with the columns ``description``, ``soil_class``,
        ``soil_subclass``, ``soil_group``, ``soil_type`` and the eleven numeric
        columns. The frame is empty when ``n_samples`` is 0.
    """
    # The np.random module and RandomState expose the same choice/uniform API.
    rng = np.random if seed is None else np.random.RandomState(seed)

    soil_classes = ['砂姜黑土', '赤红壤', '潮土', '黄壤', '盐土']

    # Parent -> children lookup tables, built once instead of re-scanning the
    # full name lists with substring tests for every generated sample.
    subclasses_of = {c: [f'{c}亚类{i}' for i in range(1, 3)] for c in soil_classes}
    groups_of = {
        sc: [f'{sc}土属{i}' for i in range(1, 3)]
        for children in subclasses_of.values() for sc in children
    }
    types_of = {
        sg: [f'{sg}土种{i}' for i in range(1, 3)]
        for children in groups_of.values() for sg in children
    }

    descriptions = [
        "位于平原地区，土壤呈黄褐色，质地为砂壤，有机质含量中等。",
        "山地坡地，土壤呈红色，质地粘重，有机质含量较低。",
        "河流阶地，土壤呈灰褐色，质地为壤土，有机质含量高。",
        "丘陵地带，土壤呈黄棕色，质地为粉砂质壤土，有机质含量中等。",
        "沿海平原，土壤呈灰色，质地为粘土，有机质含量较低。"
    ]

    # (column name, lower bound, upper bound) of each uniformly sampled column.
    numeric_ranges = [
        ("NDVI", 0, 1),
        ("EVI", 0, 2),
        ("红波段反射率", 0, 0.3),
        ("绿波段反射率", 0, 0.3),
        ("蓝波段反射率", 0, 0.3),
        ("近红外波段反射率", 0.2, 0.9),
        ("短波红外波段反射率", 0.1, 0.5),
        ("土壤含水量", 5, 40),
        ("土壤温度", 10, 30),
        ("海拔", 0, 3000),
        ("坡度", 0, 45),
    ]

    records = []
    for _ in range(n_samples):
        # str() turns the NumPy string scalar into a plain dict key.
        soil_class = str(rng.choice(soil_classes))
        soil_subclass = str(rng.choice(subclasses_of[soil_class]))
        soil_group = str(rng.choice(groups_of[soil_subclass]))
        soil_type = str(rng.choice(types_of[soil_group]))
        description = rng.choice(descriptions)

        record = {
            "description": description,
            "soil_class": soil_class,
            "soil_subclass": soil_subclass,
            "soil_group": soil_group,
            "soil_type": soil_type,
        }
        for name, low, high in numeric_ranges:
            record[name] = rng.uniform(low, high)
        records.append(record)

    return pd.DataFrame(records)

# 2. 数据集类
class SoilDataset(Dataset):
    """Torch dataset pairing a tokenised description with numeric features.

    Each item is built from one row of ``dataframe``: the ``description`` text
    is tokenised, the numeric feature columns become a float tensor, and the
    four ``*_encoded`` label columns become long tensors.

    Args:
        dataframe: Table holding ``description``, the feature columns and the
            ``soil_class_encoded`` / ``soil_subclass_encoded`` /
            ``soil_group_encoded`` / ``soil_type_encoded`` columns.
        tokenizer: Object exposing ``encode_plus`` (for example a
            ``BertTokenizer``).
        max_len: Length every description is padded or truncated to.
        feature_columns: Optional list of numeric column names to use instead
            of ``FEATURE_COLUMNS``; their order is the order in the tensor.
    """

    # Default numeric feature columns, in the order they are fed to the model.
    FEATURE_COLUMNS = [
        'Centroid_X', 'Centroid_Y', 'DEM_MEAN', 'DEM_STD', 'ndvi_MEAN',
        'PCA_0_MEAN', 'Slope_MEAN', 'Aspect_MEAN', 'MRVBF_MEAN',
        'TopographicWetnessIndex_MEAN', 'Mean_MEAN', 'PH_MEAN', 'PRE_MEAN',
        'SRA_MEAN', 'TMP_MEAN', 'VAP_MEAN', 'WIND_MEAN', 'PlanCurvature_MEAN',
    ]

    # Label names; the dataframe stores each one as "<name>_encoded".
    LABEL_NAMES = ('soil_class', 'soil_subclass', 'soil_group', 'soil_type')

    def __init__(self, dataframe, tokenizer, max_len=256, feature_columns=None):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.feature_columns = list(
            self.FEATURE_COLUMNS if feature_columns is None else feature_columns
        )

    def __len__(self):
        """Return the number of rows in the underlying dataframe."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the model inputs and the four label tensors for one row.

        Args:
            index: Positional row index.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (1-D tensors),
            ``spectral_data`` (float32 tensor, one value per feature column)
            and one long tensor per label name.
        """
        # Materialise the row once; repeated ``iloc`` lookups on a mixed-dtype
        # frame rebuild the row every time.
        row = self.data.iloc[index]

        encoding = self.tokenizer.encode_plus(
            row['description'],
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        features = row[self.feature_columns].values.astype(np.float32)

        item = {
            # return_tensors='pt' yields a leading batch axis; drop it here.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'spectral_data': torch.from_numpy(features),
        }
        for name in self.LABEL_NAMES:
            item[name] = torch.tensor(row[f'{name}_encoded'], dtype=torch.long)
        return item

# 3. 模型定义
class SoilClassificationModel(nn.Module):
    """BERT text encoder fused with numeric features, with four chained heads.

    The pooled BERT output and a 64-unit projection of the numeric features are
    concatenated and reduced to a 256-unit shared representation. The
    ``soil_class`` head reads that representation; each following head
    (subclass, group, type) reads it concatenated with the softmax
    probabilities of the previous head.

    Args:
        n_classes: dict with the number of classes for the keys
            ``soil_class``, ``soil_subclass``, ``soil_group``, ``soil_type``.
        n_spectral_features: Width of the numeric feature vector.
        bert_model_name: Name or local path handed to
            ``BertModel.from_pretrained``.
    """

    def __init__(self, n_classes, n_spectral_features=18, bert_model_name='bert-base-chinese'):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.spectral_fc = nn.Linear(n_spectral_features, 64)
        self.fc = nn.Linear(self.bert.config.hidden_size + 64, 256)
        self.soil_class = nn.Linear(256, n_classes['soil_class'])
        # Each lower-level head also sees the probabilities of its parent level.
        self.soil_subclass = nn.Linear(256 + n_classes['soil_class'], n_classes['soil_subclass'])
        self.soil_group = nn.Linear(256 + n_classes['soil_subclass'], n_classes['soil_group'])
        self.soil_type = nn.Linear(256 + n_classes['soil_group'], n_classes['soil_type'])

    def forward(self, input_ids, attention_mask, spectral_data):
        """Return the raw logits of the four heads.

        Args:
            input_ids: Token ids passed to BERT.
            attention_mask: Attention mask passed to BERT.
            spectral_data: Numeric features, one row per sample.

        Returns:
            Tuple ``(soil_class, soil_subclass, soil_group, soil_type)`` of
            unnormalised logits.
        """
        # With return_dict=False BERT returns a tuple; the second entry is the
        # pooled output.
        _, pooled_output = self.bert(
            input_ids=input_ids, attention_mask=attention_mask, return_dict=False
        )
        pooled_output = self.dropout(pooled_output)

        # NOTE(review): the numeric features reach this layer exactly as the
        # caller supplied them; no scaling happens inside the model. Confirm
        # the inputs are standardised upstream.
        spectral_features = torch.relu(self.spectral_fc(spectral_data))

        shared = torch.relu(self.fc(torch.cat((pooled_output, spectral_features), dim=1)))

        soil_class = self.soil_class(shared)
        class_probs = torch.softmax(soil_class, dim=1)

        soil_subclass = self.soil_subclass(torch.cat((shared, class_probs), dim=1))
        subclass_probs = torch.softmax(soil_subclass, dim=1)

        soil_group = self.soil_group(torch.cat((shared, subclass_probs), dim=1))
        group_probs = torch.softmax(soil_group, dim=1)

        soil_type = self.soil_type(torch.cat((shared, group_probs), dim=1))

        return soil_class, soil_subclass, soil_group, soil_type

# 4. 训练函数
def _soil_batch_loss(model, batch, criterion, device):
    """Return the summed loss of the four classification heads for one batch."""
    outputs = model(
        batch['input_ids'].to(device),
        batch['attention_mask'].to(device),
        batch['spectral_data'].to(device),
    )
    label_keys = ('soil_class', 'soil_subclass', 'soil_group', 'soil_type')
    return sum(
        criterion(logits, batch[key].to(device))
        for logits, key in zip(outputs, label_keys)
    )


def train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs=5):
    """Train ``model`` and report the mean train/validation loss per epoch.

    The loss of a batch is the unweighted sum of ``criterion`` over the four
    outputs of the model (class, subclass, group, type).

    Args:
        model: Module called as ``model(input_ids, attention_mask, spectral_data)``
            and returning four logit tensors.
        train_loader: Sized iterable of batch dicts used for optimisation.
        val_loader: Sized iterable of batch dicts evaluated without gradients.
        criterion: Loss applied to each (logits, target) pair.
        optimizer: Optimiser stepping the model parameters.
        device: Device the batch tensors are moved to.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        List with one dict per epoch holding ``epoch``, ``train_loss`` and
        ``val_loss``. A loss is NaN when the corresponding loader is empty.
    """
    history = []
    for epoch in range(num_epochs):
        model.train()
        train_total = 0.0
        for batch in train_loader:
            optimizer.zero_grad()
            loss = _soil_batch_loss(model, batch, criterion, device)
            loss.backward()
            optimizer.step()
            train_total += loss.item()

        # Validation
        model.eval()
        val_total = 0.0
        with torch.no_grad():
            for batch in val_loader:
                val_total += _soil_batch_loss(model, batch, criterion, device).item()

        # An empty loader has no mean loss; report NaN instead of dividing by zero.
        n_train, n_val = len(train_loader), len(val_loader)
        train_loss = train_total / n_train if n_train else float('nan')
        val_loss = val_total / n_val if n_val else float('nan')

        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
        history.append({'epoch': epoch + 1, 'train_loss': train_loss, 'val_loss': val_loss})

    return history

# 5. 预测类
class SoilClassifier:
    """Inference wrapper around a saved soil model, tokenizer and label encoders.

    Args:
        model_path: File holding the model object written with ``torch.save``.
        tokenizer_path: Directory or name passed to
            ``BertTokenizer.from_pretrained``.
        label_encoders: dict keyed by ``soil_class``, ``soil_subclass``,
            ``soil_group`` and ``soil_type``; each value must provide
            ``inverse_transform``.
        device: Optional torch device; CUDA is used when available otherwise.
        max_len: Token length descriptions are padded or truncated to. The
            default matches the default ``max_len`` of ``SoilDataset`` so
            inference tokenises the way training does.
    """

    def __init__(self, model_path, tokenizer_path, label_encoders, device=None, max_len=256):
        self.device = device if device else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
        # NOTE(security): the file is a fully pickled module and unpickling can
        # run arbitrary code -- only load checkpoints from a trusted source.
        try:
            # Newer torch releases default to weights_only=True, which refuses
            # to unpickle a whole module object.
            self.model = torch.load(model_path, map_location=self.device, weights_only=False)
        except TypeError:
            # Older torch releases do not know the weights_only argument.
            self.model = torch.load(model_path, map_location=self.device)
        self.model.to(self.device)
        self.model.eval()
        self.label_encoders = label_encoders
        self.max_len = max_len

    def predict(self, text, spectral_data):
        """Predict the four soil labels for one description and feature vector.

        Args:
            text: Description string to tokenise.
            spectral_data: Sequence of numeric features for a single sample, in
                the column order the model was trained with.

        Returns:
            dict mapping ``soil_class``, ``soil_subclass``, ``soil_group`` and
            ``soil_type`` to the decoded label of the highest-scoring class.
        """
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)
        # Shape the single sample as a batch of one row.
        features = np.asarray(spectral_data, dtype=np.float32).reshape(1, -1)
        spectral_tensor = torch.from_numpy(features).to(self.device)

        with torch.no_grad():
            outputs = self.model(input_ids, attention_mask, spectral_tensor)

        results = {}
        for name, logits in zip(('soil_class', 'soil_subclass', 'soil_group', 'soil_type'), outputs):
            best = torch.argmax(logits, dim=1).item()
            results[name] = self.label_encoders[name].inverse_transform([best])[0]

        return results
    
# 6. 主函数
def main(data, num_epochs, save_path=r'D:\model_path'):
    """Train the soil classifier on ``data`` and persist all artefacts.

    Steps: label-encode the four target columns, split 80/20, build datasets
    and loaders, train, evaluate on the held-out split, then save the model,
    the label encoders and the tokenizer under ``save_path``.

    Args:
        data: DataFrame with ``description``, the numeric feature columns read
            by ``SoilDataset`` and the columns ``soil_class``,
            ``soil_subclass``, ``soil_group``, ``soil_type``.
        num_epochs: Number of training epochs.
        save_path: Directory that receives the saved model, encoders and
            tokenizer; it is created when missing.
    """
    # Create the output directory before training so a missing folder cannot
    # make the save step fail after all epochs have already run.
    os.makedirs(save_path, exist_ok=True)

    # Work on a copy so the caller's frame does not gain the *_encoded columns.
    data = data.copy()

    # Encode each label column as integers and keep the encoder for decoding.
    label_encoders = {}
    for column in ['soil_class', 'soil_subclass', 'soil_group', 'soil_type']:
        le = LabelEncoder()
        data[f'{column}_encoded'] = le.fit_transform(data[column])
        label_encoders[column] = le

    # Hold out 20% of the rows; the same split serves for validation and the
    # final accuracy report.
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

    # Tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

    # Datasets and loaders
    train_dataset = SoilDataset(train_data, tokenizer)
    test_dataset = SoilDataset(test_data, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=16)

    # Device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # One output size per hierarchy level, taken from the fitted encoders.
    n_classes = {
        column: len(encoder.classes_) for column, encoder in label_encoders.items()
    }
    model = SoilClassificationModel(n_classes).to(device)

    # Loss and optimiser
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=2e-5)

    # Train
    train_model(model, train_loader, test_loader, criterion, optimizer, device, num_epochs=num_epochs)

    # Evaluate
    print("\n评估模型性能...")
    accuracies = evaluate_model(model, test_loader, device)

    print("模型准确率：")
    for category, accuracy in accuracies.items():
        print(f"{category}: {accuracy:.2f}%")

    # Persist the whole model object, the label encoders and the tokenizer.
    torch.save(model, os.path.join(save_path, 'soil_classification_model.pth'))
    joblib.dump(label_encoders, os.path.join(save_path, 'label_encoders.joblib'))
    tokenizer.save_pretrained(os.path.join(save_path, 'soil_tokenizer'))

    print("模型训练完成并保存。")


# 7. 评估函数
def evaluate_model(model, data_loader, device):
    """Compute the accuracy of each of the four heads over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask, spectral_data)``
            and returning four logit tensors in the order class, subclass,
            group, type.
        data_loader: Iterable of batch dicts holding the model inputs and the
            target tensors ``soil_class``, ``soil_subclass``, ``soil_group``,
            ``soil_type``.
        device: Device the batch tensors are moved to.

    Returns:
        dict mapping each label name to its accuracy in percent. Every value is
        NaN when ``data_loader`` yields no samples.
    """
    label_names = ('soil_class', 'soil_subclass', 'soil_group', 'soil_type')

    model.eval()
    correct = dict.fromkeys(label_names, 0)
    total = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            spectral_data = batch['spectral_data'].to(device)

            outputs = model(input_ids, attention_mask, spectral_data)

            # Pair every head with its target by name instead of going through
            # locals(), and keep plain Python ints as counters.
            for name, logits in zip(label_names, outputs):
                targets = batch[name].to(device)
                correct[name] += (torch.argmax(logits, dim=1) == targets).sum().item()

            total += input_ids.size(0)

    if total == 0:
        # Accuracy is undefined without samples.
        return {name: float('nan') for name in label_names}

    return {name: (hits / total) * 100 for name, hits in correct.items()}




# 8. 使用示例
def usage_example(sample_text, spectral_data, save_path=r'D:\model_path', classifier=None):
    """Predict the soil labels for one sample, print them and return them.

    Args:
        sample_text: Description string of the sample.
        spectral_data: Numeric feature values of the sample, in the order the
            model was trained with.
        save_path: Directory holding the saved model, label encoders and
            tokenizer. Only read when ``classifier`` is not supplied.
        classifier: Optional object with a ``predict(text, spectral_data)``
            method. Pass an already constructed ``SoilClassifier`` when
            predicting many samples, so the model is not reloaded per call.

    Returns:
        The dict returned by ``classifier.predict``.
    """
    if classifier is None:
        # Load the saved label encoders and model from disk.
        label_encoders = joblib.load(os.path.join(save_path, 'label_encoders.joblib'))
        classifier = SoilClassifier(
            os.path.join(save_path, 'soil_classification_model.pth'),
            os.path.join(save_path, 'soil_tokenizer'),
            label_encoders,
        )

    results = classifier.predict(sample_text, spectral_data)

    print("预测结果：")
    for key, value in results.items():
        print(f"{key}: {value}")

    return results



In [81]:
# Read the soil sample table from disk.
RESULT_CSV_PATH = r"D:\model_path\DATA_PATH\result.csv"
data = pd.read_csv(RESULT_CSV_PATH)

In [82]:
# Map the short column codes of the source file onto the label column names
# that main() encodes. Rebinding instead of inplace=True keeps the cell free
# of in-place mutation.
data = data.rename(columns={'TL': 'soil_class', 'YL': 'soil_subclass', 'TS': 'soil_group', 'TZ': 'soil_type'})

In [85]:
# Train on the loaded table and save the resulting artefacts.
NUM_EPOCHS = 10
main(data=data, num_epochs=NUM_EPOCHS)

Epoch 1/10, Train Loss: 33395.2562, Val Loss: 9842.3006
Epoch 2/10, Train Loss: 5464.5580, Val Loss: 3741.8538
Epoch 3/10, Train Loss: 2481.3411, Val Loss: 2077.6315
Epoch 4/10, Train Loss: 1553.4873, Val Loss: 1435.7081
Epoch 5/10, Train Loss: 1178.9606, Val Loss: 1129.6154
Epoch 6/10, Train Loss: 966.2015, Val Loss: 1086.9942
Epoch 7/10, Train Loss: 838.6094, Val Loss: 890.2121
Epoch 8/10, Train Loss: 710.9616, Val Loss: 732.1697
Epoch 9/10, Train Loss: 617.2803, Val Loss: 707.3560


In [77]:
# Show the table as the cell's rich output for a quick visual check.
data

Unnamed: 0,description,soil_class,soil_subclass,soil_group,soil_type,Centroid_X,Centroid_Y,DEM_MEAN,DEM_STD,ndvi_MEAN,...,MRVBF_MEAN,TopographicWetnessIndex_MEAN,Mean_MEAN,PH_MEAN,PRE_MEAN,SRA_MEAN,TMP_MEAN,VAP_MEAN,WIND_MEAN,PlanCurvature_MEAN
0,"地块成土母质为第四系红粘土,土地利用类型为水田",水稻土,潴育型水稻土,红泥田,黄红泥田,467930.568043,2.910107e+06,908.972731,1.111586,0.267906,...,1.919678,11.793416,9.191919,6.663789,100.204545,12928.856001,15.276515,1.490682,1.696970,-0.001112
1,"地块成土母质为碳酸岩,土地利用类型为水田",水稻土,潴育型水稻土,潮泥田,潮泥田,449692.580828,2.920495e+06,878.200797,1.398296,0.187911,...,4.002467,7.638341,10.636364,6.162026,98.250000,12872.166992,15.458333,1.516667,1.700000,0.000126
2,"地块成土母质为河流冲积物,土地利用类型为水田",水稻土,潴育型水稻土,潮泥田,潮泥田,444126.059554,2.909142e+06,868.042613,0.683362,0.242981,...,5.348103,7.707957,9.502974,6.382572,98.666664,12905.000000,15.525000,1.527500,1.666667,-0.000065
3,"地块成土母质为河流冲积物,土地利用类型为水田",水稻土,潴育型水稻土,潮泥田,潮泥田,443877.846932,2.909191e+06,865.562968,0.303861,0.336722,...,4.957116,9.356075,10.666667,6.256682,98.666664,12905.000000,15.525000,1.527500,1.666667,-0.000194
4,"地块成土母质为碳酸岩,土地利用类型为水田",水稻土,潴育型水稻土,潮泥田,潮泥田,443105.787397,2.909506e+06,872.290627,0.350318,0.202226,...,4.982545,6.719328,8.725694,6.508204,98.500000,12897.250000,15.525000,1.530000,1.666667,0.000960
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4476,"地块成土母质为砂页岩,土地利用类型为其他园地",黄壤,典型黄壤,砂泥黄壤,中层砂泥黄壤,439043.406222,2.886773e+06,913.729417,0.535521,0.136971,...,1.479170,7.786333,12.143791,5.724277,100.333336,12968.833008,15.208333,1.513333,1.675000,0.002820
4477,"地块成土母质为砂页岩,土地利用类型为其他园地",黄壤,典型黄壤,砂泥黄壤,薄层砂泥黄壤,439634.626966,2.886422e+06,915.437504,1.154813,0.161166,...,0.726420,5.706834,11.062500,5.642592,100.083336,12997.083008,15.391666,1.529167,1.658333,0.001188
4478,"地块成土母质为砂页岩,土地利用类型为设施农用地",黄壤,典型黄壤,砂泥黄壤,薄层砂泥黄壤,439109.743837,2.884765e+06,1032.277988,13.169603,0.311776,...,0.941045,6.099571,9.708314,5.493602,100.891543,12944.676989,15.007248,1.484895,1.742960,0.000118
4479,"地块成土母质为砂页岩,土地利用类型为设施农用地",黄壤,典型黄壤,砂泥黄壤,薄层砂泥黄壤,439022.029676,2.887336e+06,1032.277988,13.169603,0.311776,...,0.941045,6.099571,9.708314,5.493602,100.891543,12944.676989,15.007248,1.484895,1.742960,0.000118


In [78]:
# Spot-check the saved model on the first rows of the table.
N_PREVIEW = 100
save_path = r'D:\model_path'

# Load the encoders and the model once; calling usage_example() per row would
# reload them from disk on every iteration.
label_encoders = joblib.load(os.path.join(save_path, 'label_encoders.joblib'))
classifier = SoilClassifier(
    os.path.join(save_path, 'soil_classification_model.pth'),
    os.path.join(save_path, 'soil_tokenizer'),
    label_encoders,
)

# Positional column slices, taken once.
# NOTE(review): this assumes columns 1-4 are the four label columns and that
# everything from column 5 on is the numeric feature set in training order --
# confirm against the CSV layout.
label_frame = data.iloc[:, 1:5]
feature_frame = data.iloc[:, 5:]

# Never run past the end of a table shorter than N_PREVIEW rows.
for row_index in range(min(N_PREVIEW, len(data))):
    # Positional access throughout, so a non-default index cannot mix up rows.
    test_text = data['description'].iloc[row_index]
    columns_from_fifth = feature_frame.iloc[row_index].tolist()
    result = label_frame.iloc[row_index].to_list()
    print(test_text, columns_from_fifth, result)
    print(result)

    results = classifier.predict(test_text, columns_from_fifth)
    print("预测结果：")
    for key, value in results.items():
        print(f"{key}: {value}")
    print('-'*100)

地块成土母质为第四系红粘土,土地利用类型为水田 [467930.5680434627, 2910107.437456494, 908.9727311567826, 1.111586472439619, 0.267906056209044, 0.0075683630156246, 0.1409964459393946, 3.5534467914734376, 1.919677777131173, 11.793415546417238, 9.191919326782228, 6.663788640909091, 100.20454545454544, 12928.856001420454, 15.27651509371671, 1.4906818866729734, 1.6969697258689187, -0.0011116969915085] ['水稻土', '潴育型水稻土', '红泥田', '黄红泥田']
['水稻土', '潴育型水稻土', '红泥田', '黄红泥田']
预测结果：
soil_class: 石灰土
soil_subclass: 淹育型水稻土
soil_group: 浅白粉泥田
soil_type: 黄浅白粉泥田
----------------------------------------------------------------------------------------------------
地块成土母质为碳酸岩,土地利用类型为水田 [449692.5808281945, 2920495.1442477047, 878.2007974871035, 1.3982957518900296, 0.1879109262683364, 0.0789232633850483, 0.0612043585058773, 2.136254038861872, 4.002466899356204, 7.638341393767362, 10.6363637740152, 6.162025771291868, 98.25, 12872.1669921875, 15.458333015441896, 1.5166666507720947, 1.7000000476837158, 0.0001261899870927] ['水稻土', '潴育型水稻土',