In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
import torch
from transformers import BertTokenizer
from torch import nn

class SoilClassifier:
    def __init__(self, model_path, tokenizer_path, label_encoders, device=None):
        self.device = device if device else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
        self.model = torch.load(model_path, map_location=self.device)
        self.model.eval()
        self.label_encoders = label_encoders

    def predict(self, text, spectral_data):
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=128,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)
        spectral_tensor = torch.FloatTensor(spectral_data).unsqueeze(0).to(self.device)

        with torch.no_grad():
            class_out, subclass_out, group_out, type_out = self.model(input_ids, attention_mask, spectral_tensor)
            
        soil_class = torch.argmax(class_out, dim=1).item()
        soil_subclass = torch.argmax(subclass_out, dim=1).item()
        soil_group = torch.argmax(group_out, dim=1).item()
        soil_type = torch.argmax(type_out, dim=1).item()
        
        results = {
            'soil_class': self.label_encoders['soil_class'].inverse_transform([soil_class])[0],
            'soil_subclass': self.label_encoders['soil_subclass'].inverse_transform([soil_subclass])[0],
            'soil_group': self.label_encoders['soil_group'].inverse_transform([soil_group])[0],
            'soil_type': self.label_encoders['soil_type'].inverse_transform([soil_type])[0]
        }
        
        return results

In [8]:


# 1. 数据生成函数
def generate_soil_data(n_samples=1000):
    soil_classes = ['砂姜黑土', '赤红壤', '潮土', '黄壤', '盐土']
    soil_subclasses = [f'{c}亚类{i}' for c in soil_classes for i in range(1, 3)]
    soil_groups = [f'{sc}土属{i}' for sc in soil_subclasses for i in range(1, 3)]
    soil_types = [f'{sg}土种{i}' for sg in soil_groups for i in range(1, 3)]

    descriptions = [
        "位于平原地区，土壤呈黄褐色，质地为砂壤，有机质含量中等。",
        "山地坡地，土壤呈红色，质地粘重，有机质含量较低。",
        "河流阶地，土壤呈灰褐色，质地为壤土，有机质含量高。",
        "丘陵地带，土壤呈黄棕色，质地为粉砂质壤土，有机质含量中等。",
        "沿海平原，土壤呈灰色，质地为粘土，有机质含量较低。"
    ]

    data = []
    for _ in range(n_samples):
        soil_class = np.random.choice(soil_classes)
        soil_subclass = np.random.choice([sc for sc in soil_subclasses if soil_class in sc])
        soil_group = np.random.choice([sg for sg in soil_groups if soil_subclass in sg])
        soil_type = np.random.choice([st for st in soil_types if soil_group in st])
        description = np.random.choice(descriptions)
        
        spectral_data = {
            "NDVI": np.random.uniform(0, 1),
            "EVI": np.random.uniform(0, 2),
            "红波段反射率": np.random.uniform(0, 0.3),
            "绿波段反射率": np.random.uniform(0, 0.3),
            "蓝波段反射率": np.random.uniform(0, 0.3),
            "近红外波段反射率": np.random.uniform(0.2, 0.9),
            "短波红外波段反射率": np.random.uniform(0.1, 0.5),
            "土壤含水量": np.random.uniform(5, 40),
            "土壤温度": np.random.uniform(10, 30),
            "海拔": np.random.uniform(0, 3000),
            "坡度": np.random.uniform(0, 45)
        }
        
        data.append({
            "description": description,
            "soil_class": soil_class,
            "soil_subclass": soil_subclass,
            "soil_group": soil_group,
            "soil_type": soil_type,
            **spectral_data
        })
    
    return pd.DataFrame(data)

# 2. 数据集类
class SoilDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=128):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        description = self.data.iloc[index]['description']
        encoding = self.tokenizer.encode_plus(
            description,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        spectral_data = self.data.iloc[index][['NDVI', 'EVI', '红波段反射率', '绿波段反射率', '蓝波段反射率',
                                               '近红外波段反射率', '短波红外波段反射率', '土壤含水量', '土壤温度',
                                               '海拔', '坡度']].values.astype(float)

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'spectral_data': torch.FloatTensor(spectral_data),
            'soil_class': torch.tensor(self.data.iloc[index]['soil_class_encoded'], dtype=torch.long),
            'soil_subclass': torch.tensor(self.data.iloc[index]['soil_subclass_encoded'], dtype=torch.long),
            'soil_group': torch.tensor(self.data.iloc[index]['soil_group_encoded'], dtype=torch.long),
            'soil_type': torch.tensor(self.data.iloc[index]['soil_type_encoded'], dtype=torch.long)
        }

# 3. 模型定义
class SoilClassificationModel(nn.Module):
    def __init__(self, n_classes, n_spectral_features=11):
        super(SoilClassificationModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-chinese')
        self.dropout = nn.Dropout(0.1)
        self.spectral_fc = nn.Linear(n_spectral_features, 64)
        self.fc = nn.Linear(self.bert.config.hidden_size + 64, 256)
        self.soil_class = nn.Linear(256, n_classes['soil_class'])
        self.soil_subclass = nn.Linear(256 + n_classes['soil_class'], n_classes['soil_subclass'])
        self.soil_group = nn.Linear(256 + n_classes['soil_subclass'], n_classes['soil_group'])
        self.soil_type = nn.Linear(256 + n_classes['soil_group'], n_classes['soil_type'])

    def forward(self, input_ids, attention_mask, spectral_data):
        _, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        pooled_output = self.dropout(pooled_output)
        
        spectral_features = self.spectral_fc(spectral_data)
        spectral_features = torch.relu(spectral_features)
        
        combined = torch.cat((pooled_output, spectral_features), dim=1)
        x = self.fc(combined)
        x = torch.relu(x)
        
        soil_class = self.soil_class(x)
        soil_class_probs = torch.softmax(soil_class, dim=1)
        
        subclass_input = torch.cat((x, soil_class_probs), dim=1)
        soil_subclass = self.soil_subclass(subclass_input)
        soil_subclass_probs = torch.softmax(soil_subclass, dim=1)
        
        group_input = torch.cat((x, soil_subclass_probs), dim=1)
        soil_group = self.soil_group(group_input)
        soil_group_probs = torch.softmax(soil_group, dim=1)
        
        type_input = torch.cat((x, soil_group_probs), dim=1)
        soil_type = self.soil_type(type_input)
        
        return soil_class, soil_subclass, soil_group, soil_type

# 4. 训练函数
def train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs=5):
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            spectral_data = batch['spectral_data'].to(device)
            soil_class = batch['soil_class'].to(device)
            soil_subclass = batch['soil_subclass'].to(device)
            soil_group = batch['soil_group'].to(device)
            soil_type = batch['soil_type'].to(device)

            optimizer.zero_grad()
            class_out, subclass_out, group_out, type_out = model(input_ids, attention_mask, spectral_data)
            loss = (criterion(class_out, soil_class) + criterion(subclass_out, soil_subclass) +
                    criterion(group_out, soil_group) + criterion(type_out, soil_type))
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                spectral_data = batch['spectral_data'].to(device)
                soil_class = batch['soil_class'].to(device)
                soil_subclass = batch['soil_subclass'].to(device)
                soil_group = batch['soil_group'].to(device)
                soil_type = batch['soil_type'].to(device)

                class_out, subclass_out, group_out, type_out = model(input_ids, attention_mask, spectral_data)
                loss = (criterion(class_out, soil_class) + criterion(subclass_out, soil_subclass) +
                        criterion(group_out, soil_group) + criterion(type_out, soil_type))
                val_loss += loss.item()

        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}')

# 5. 预测函数
def predict(model, tokenizer, text, spectral_data, device):
    model.eval()
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    spectral_tensor = torch.FloatTensor(spectral_data).unsqueeze(0).to(device)

    with torch.no_grad():
        class_out, subclass_out, group_out, type_out = model(input_ids, attention_mask, spectral_tensor)
        
    soil_class = torch.argmax(class_out, dim=1).item()
    soil_subclass = torch.argmax(subclass_out, dim=1).item()
    soil_group = torch.argmax(group_out, dim=1).item()
    soil_type = torch.argmax(type_out, dim=1).item()
    
    return soil_class, soil_subclass, soil_group, soil_type

# 6. 主函数
def main():
    # 生成数据
    data = generate_soil_data(n_samples=1000)
    
    # 编码标签
    label_encoders = {}
    for column in ['soil_class', 'soil_subclass', 'soil_group', 'soil_type']:
        le = LabelEncoder()
        data[f'{column}_encoded'] = le.fit_transform(data[column])
        label_encoders[column] = le
    
    # 划分训练集和测试集
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
    
    # 初始化tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    
    # 创建数据集和数据加载器
    train_dataset = SoilDataset(train_data, tokenizer)
    test_dataset = SoilDataset(test_data, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=16)
    
    # 设置设备
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # 初始化模型
    n_classes = {
        'soil_class': len(label_encoders['soil_class'].classes_),
        'soil_subclass': len(label_encoders['soil_subclass'].classes_),
        'soil_group': len(label_encoders['soil_group'].classes_),
        'soil_type': len(label_encoders['soil_type'].classes_)
    }
    model = SoilClassificationModel(n_classes).to(device)
    
    # 设置损失函数和优化器
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=2e-5)
    
    # 训练模型
    train_model(model, train_loader, test_loader, criterion, optimizer, device, num_epochs=5)
    
    # 预测示例
    sample_text = "位于山地坡地，土壤呈红色，质地粘重，有机质含量较低。"
    sample_spectral_data = [0.5, 1.2, 0.1, 0.2, 0.15, 0.6, 0.3, 20.5, 22.3, 1500, 15.5]
    
    soil_class, soil_subclass, soil_group, soil_type = predict(model, tokenizer, sample_text, sample_spectral_data, device)
    
    print("预测结果：")
    print(f"土类: {label_encoders['soil_class'].inverse_transform([soil_class])[0]}")
    print(f"亚类: {label_encoders['soil_subclass'].inverse_transform([soil_subclass])[0]}")
    print(f"土属: {label_encoders['soil_group'].inverse_transform([soil_group])[0]}")
    print(f"土种: {label_encoders['soil_type'].inverse_transform([soil_type])[0]}")

if __name__ == "__main__":
    main()

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

Epoch 1/5, Train Loss: 49.0507, Val Loss: 30.7745
Epoch 2/5, Train Loss: 25.0238, Val Loss: 19.1953
Epoch 3/5, Train Loss: 17.2239, Val Loss: 14.9792
Epoch 4/5, Train Loss: 14.1246, Val Loss: 12.8167
Epoch 5/5, Train Loss: 12.5131, Val Loss: 11.9536
预测结果：
土类: 赤红壤
亚类: 赤红壤亚类2
土属: 潮土亚类2土属2
土种: 赤红壤亚类2土属2土种1
