In [1]:
import os
import json
import pandas as pd
import numpy as np
from autogluon.tabular import TabularPredictor
from pykrige.ok import OrdinaryKriging
from sklearn.metrics import r2_score
from pyproj import Transformer
from tqdm import tqdm
import pickle

In [2]:
# Current-date helper
def get_dtime():
    """Return the current local date as a compact 'YYYYMMDD' string, e.g. '20240516'."""
    from datetime import datetime
    today = datetime.now()
    return today.strftime('%Y%m%d')

In [3]:
# Load the saved AutoGluon predictor from disk
# NOTE(review): "modle" in the path looks like a misspelling of "model" — confirm
# against the real folder name before changing it.
model_path = r"F:\cache_data\zone_ana\dy\modle\autogluon_type_20240618"
TL_predictor = TabularPredictor.load(model_path)

In [4]:
# Show the predictor's best model together with the names of all models it holds
TL_predictor.model_best,TL_predictor.model_names()

('WeightedEnsemble_L2',
 ['NeuralNetFastAI',
  'RandomForestGini',
  'RandomForestEntr',
  'ExtraTreesGini',
  'ExtraTreesEntr',
  'NeuralNetTorch',
  'WeightedEnsemble_L2'])

In [5]:
# Get the predictor's input feature names; they are used below to select the
# columns passed to predict_proba
TL_feature_names = TL_predictor.feature_metadata_in.get_features()
print("TL",TL_feature_names)


TL ['DLMC', 'MZ', 'XMin', 'XMax', 'YMin', 'YMax', 'Centroid_X', 'Centroid_Y', 'DEM_MIN', 'DEM_MAX', 'DEM_RANGE', 'DEM_MEAN', 'DEM_STD', 'DEM_MEDIAN', 'DEM_PCT90', 'PlanCurvature_RANGE', 'PlanCurvature_MEAN', 'PlanCurvature_STD', 'RelativeSlopePosition_RANGE', 'RelativeSlopePosition_MEAN', 'RelativeSlopePosition_STD', 'Slope_RANGE', 'Slope_MEAN', 'Slope_STD', 'TopographicWetnessIndex_RANGE', 'TopographicWetnessIndex_MEAN', 'TopographicWetnessIndex_STD', 'Mean_RANGE', 'Mean_MEAN', 'Mean_STD', 'ndvi_RANGE', 'ndvi_MEAN', 'ndvi_STD', 'PCA_0_RANGE', 'PCA_0_MEAN', 'PCA_0_STD', 'PRE_MIN', 'PRE_MAX', 'PRE_RANGE', 'PRE_MEAN', 'SRA_MIN', 'SRA_MAX', 'SRA_RANGE', 'SRA_MEAN', 'TMP_MIN', 'TMP_MAX', 'TMP_RANGE', 'TMP_MEAN', 'VAP_MIN', 'VAP_MAX', 'VAP_RANGE', 'VAP_MEAN', 'WIND_MIN', 'WIND_MAX', 'WIND_RANGE', 'WIND_MEAN', 'BIO_RANGE', 'BIO_MEAN', 'MRVBF_MEAN', 'MRVBF_STD']


In [6]:
# Directory where the prediction results are stored
result_path =  r"F:\cache_data\zone_ana\dy\prediction_result"

In [7]:
# Path of the CSV dataset to run predictions on
feature_path = r'F:\cache_data\zone_ana\dy\prediction_data\result_20240618.csv'

In [8]:
# Read the dataset into a DataFrame
feature_df = pd.read_csv(feature_path)

In [9]:
# Rename the '母质' column to 'MZ', the name that appears in the model's feature list
feature_df = feature_df.rename(columns={'母质': 'MZ'})

In [10]:
# Load the lookup dictionary; JSON object keys are always strings, so they are
# converted to int while the mapping is built
with open(r'D:\worker_code\Terrain_Test\data\soil_dict_20240618.json', 'r') as f:
    soil_dict = {int(code): label for code, label in json.load(f).items()}

In [11]:
# Model selection: name of the model passed to predict_proba below
select_model = 'RandomForestGini'

In [12]:
# Probability prediction function
def predict_top_classes(data, predictor,feature_names, model, top_n, soil_dict):
    """
    Predict the top-n most probable classes (and their probabilities) for every row.

    Class numbers are translated through ``soil_dict``; numbers missing from the
    dictionary become NaN. Two per-row summary columns are appended as well:
    ``Entropy`` (Shannon entropy of the full probability row, natural log) and
    ``Uncertainty`` (sample standard deviation of the probability row).

    Parameters:
    data (DataFrame): Input dataset. It is copied, never modified.
    predictor: Fitted model object exposing ``predict_proba(data, model=...)``
        that returns a DataFrame with one column per class.
    feature_names (list): Names of the columns fed to the predictor.
    model (str): Name of the model to use for the prediction.
    top_n (int): Number of highest-probability classes to return.
    soil_dict (dict): Mapping from class number to its description.

    Returns:
    DataFrame: ``data`` with the columns Class1..ClassN, Prob1..ProbN, Entropy
    and Uncertainty appended, aligned on the index of the predicted probabilities.

    Raises:
    ValueError: If ``top_n`` is negative or larger than the number of classes.
    """
    # Work on a copy so the caller's frame is left untouched
    data = data.copy()
    # Predicted class probabilities, one column per class
    pred_probs = predictor.predict_proba(data[feature_names], model=model)

    n_classes = pred_probs.shape[1]
    if not 0 <= top_n <= n_classes:
        raise ValueError(
            'top_n must be between 0 and the number of classes ({}), got {}'.format(n_classes, top_n)
        )

    prob_values = pred_probs.to_numpy(dtype=float)
    class_labels = pred_probs.columns.to_numpy()
    row_index = pred_probs.index

    # A stable sort on the negated probabilities ranks classes from most to
    # least probable and resolves ties in column order (same as nlargest).
    order = np.argsort(-prob_values, axis=1, kind='stable')[:, :top_n]
    top_probs = np.take_along_axis(prob_values, order, axis=1)

    columns = {}
    # Class numbers stay integers so the dictionary lookup never relies on
    # float keys; unknown numbers map to NaN.
    for rank in range(top_n):
        codes = pd.Series(class_labels[order[:, rank]], index=row_index).astype(int)
        columns['Class{}'.format(rank + 1)] = codes.map(soil_dict).to_numpy()
    for rank in range(top_n):
        columns['Prob{}'.format(rank + 1)] = top_probs[:, rank]

    # Entropy of each prediction; the small epsilon keeps log() finite at p == 0
    columns['Entropy'] = -(prob_values * np.log(prob_values + 1e-9)).sum(axis=1)
    # Spread of each probability row (sample standard deviation)
    columns['Uncertainty'] = pred_probs.std(axis=1).to_numpy()

    top_classes = pd.DataFrame(columns, index=row_index)
    # Append the new columns to the original data
    return pd.concat([data, top_classes], axis=1)

In [13]:
# TZ: top-3 class prediction for every row of feature_df with the selected model
tl_result_df = predict_top_classes(feature_df, TL_predictor,TL_feature_names, select_model, 3, soil_dict)

In [14]:
# Keep only the identifier and the prediction columns. The explicit .copy() makes
# result_df an independent frame, so adding columns to it later neither raises
# pandas' SettingWithCopyWarning nor depends on tl_result_df.
result_df = tl_result_df[['OBJECTID','Class1', 'Class2', 'Class3', 'Prob1','Prob2', 'Prob3', 'Entropy', 'Uncertainty']].copy()

In [15]:
# Display the selected prediction columns
result_df

Unnamed: 0,OBJECTID,Class1,Class2,Class3,Prob1,Prob2,Prob3,Entropy,Uncertainty
0,1,[中层灰泥质黄色石灰土],[潮泥田],[中层壤质黄色石灰土],0.266667,0.086667,0.063333,2.681243,0.041623
1,2,[中层灰泥质黄色石灰土],[潮泥田],[石灰泥田],0.286667,0.150000,0.076667,2.543663,0.045662
2,3,[中层灰泥质黄色石灰土],[潮泥田],[腐薄层壤质黄色石灰土],0.363333,0.096667,0.063333,2.524259,0.052043
3,4,[中层灰泥质黄色石灰土],[腐薄层灰泥质黄壤],[腐薄层硅质黄壤],0.276667,0.093333,0.083333,2.740639,0.042118
4,5,[潮泥田],[石灰泥田],[中层灰泥质黄色石灰土],0.313333,0.130000,0.120000,2.456341,0.049046
...,...,...,...,...,...,...,...,...,...
140116,71782,[腐厚层泥质黄壤],[腐薄层泥质黄壤],[腐薄层壤质黄色石灰土],0.246667,0.153333,0.113333,2.504245,0.043926
140117,71782,[腐厚层泥质黄壤],[腐薄层泥质黄壤],[腐薄层壤质黄色石灰土],0.246667,0.153333,0.113333,2.498165,0.043955
140118,71784,[腐厚层泥质黄壤],[腐薄层硅质黄壤],[腐薄层壤质黄色石灰土],0.376667,0.103333,0.070000,2.446990,0.054471
140119,74834,[腐薄层硅质黄壤],[腐中层壤质黄色石灰土],[腐厚层硅质黄壤],0.526667,0.103333,0.093333,1.875842,0.073934


In [None]:
# Build the "三普" soil-species lookup dictionary
# Read the Excel file
sp_file_path = r"C:\Users\Runker\Desktop\search_dict.xlsx"
df = pd.read_excel(sp_file_path)

# Forward-fill (ffill) so that cells left empty by merged cells take the value above them
df_filled = df.ffill()
# Helper that turns one table row into a single-entry dictionary
def create_dict(row):
    """Map the row's '三普土种' value to its '土类', '亚类' and '土属' values."""
    species = row['三普土种']
    levels = {}
    levels['土类'] = row['三普土类']
    levels['亚类'] = row['三普亚类']
    levels['土属'] = row['三普土属']
    return {species: levels}

# Apply create_dict to every row and merge the single-entry dictionaries into
# one; when a key occurs more than once, the last row wins
sp_soiltype_dict = {
    species: levels
    for row_entry in df_filled.apply(create_dict, axis=1)
    for species, levels in row_entry.items()
}

In [None]:
# Display the assembled lookup dictionary
sp_soiltype_dict

In [None]:
# Attach the higher-level classification of the top-ranked class (Class1):
# '土类' -> Class1_tl, '亚类' -> Class1_yl, '土属' -> Class1_ts
def _lookup_sp_level(soil_name, level):
    """Return the `level` entry stored for `soil_name` in sp_soiltype_dict, or None if absent."""
    # NOTE(review): the displayed Class1 values look like single-element lists
    # (e.g. [潮泥田]). A list is unhashable and would make dict.get raise
    # TypeError, so it is unwrapped first — confirm the value type of soil_dict.
    if isinstance(soil_name, (list, tuple)):
        soil_name = soil_name[0] if len(soil_name) > 0 else None
    return sp_soiltype_dict.get(soil_name, {}).get(level, None)

# assign() returns a new frame, so this works even when result_df is a slice of
# another DataFrame, and the cell can be re-run safely.
result_df = result_df.assign(
    Class1_tl=result_df['Class1'].apply(lambda x: _lookup_sp_level(x, '土类')),
    Class1_yl=result_df['Class1'].apply(lambda x: _lookup_sp_level(x, '亚类')),
    Class1_ts=result_df['Class1'].apply(lambda x: _lookup_sp_level(x, '土属')),
)

In [16]:
# Display the result table again before it is saved
result_df

Unnamed: 0,OBJECTID,Class1,Class2,Class3,Prob1,Prob2,Prob3,Entropy,Uncertainty
0,1,[中层灰泥质黄色石灰土],[潮泥田],[中层壤质黄色石灰土],0.266667,0.086667,0.063333,2.681243,0.041623
1,2,[中层灰泥质黄色石灰土],[潮泥田],[石灰泥田],0.286667,0.150000,0.076667,2.543663,0.045662
2,3,[中层灰泥质黄色石灰土],[潮泥田],[腐薄层壤质黄色石灰土],0.363333,0.096667,0.063333,2.524259,0.052043
3,4,[中层灰泥质黄色石灰土],[腐薄层灰泥质黄壤],[腐薄层硅质黄壤],0.276667,0.093333,0.083333,2.740639,0.042118
4,5,[潮泥田],[石灰泥田],[中层灰泥质黄色石灰土],0.313333,0.130000,0.120000,2.456341,0.049046
...,...,...,...,...,...,...,...,...,...
140116,71782,[腐厚层泥质黄壤],[腐薄层泥质黄壤],[腐薄层壤质黄色石灰土],0.246667,0.153333,0.113333,2.504245,0.043926
140117,71782,[腐厚层泥质黄壤],[腐薄层泥质黄壤],[腐薄层壤质黄色石灰土],0.246667,0.153333,0.113333,2.498165,0.043955
140118,71784,[腐厚层泥质黄壤],[腐薄层硅质黄壤],[腐薄层壤质黄色石灰土],0.376667,0.103333,0.070000,2.446990,0.054471
140119,74834,[腐薄层硅质黄壤],[腐中层壤质黄色石灰土],[腐厚层硅质黄壤],0.526667,0.103333,0.093333,1.875842,0.073934


In [17]:
# Save the results as prediction_class_<model>_<YYYYMMDD>.csv; the output
# directory is created first so to_csv does not fail when it is missing
os.makedirs(result_path, exist_ok=True)
result_df.to_csv(os.path.join(result_path, f'prediction_class_{select_model}_{get_dtime()}.csv'),index=False)