In [1]:
import os
import json
import pandas as pd
import numpy as np
from autogluon.tabular import TabularPredictor
from pykrige.ok import OrdinaryKriging
from sklearn.metrics import r2_score
from pyproj import Transformer
from tqdm import tqdm
import pickle

In [29]:
# 获取当前日期
def get_dtime():
    # 例如20240516
    import datetime
    dtime = datetime.datetime.now().strftime('%Y%m%d')
    return dtime

In [30]:
# 导入autogluon模型
model_path = r"F:\cache_data\zone_ana\dy\modle\autogluon_type_20240619"
TL_predictor = TabularPredictor.load(model_path)

In [31]:
TL_predictor.model_best,TL_predictor.model_names()

('WeightedEnsemble_L2',
 ['NeuralNetFastAI',
  'RandomForestGini',
  'RandomForestEntr',
  'ExtraTreesGini',
  'ExtraTreesEntr',
  'NeuralNetTorch',
  'WeightedEnsemble_L2'])

In [32]:
# 获取特征
TL_feature_names = TL_predictor.feature_metadata_in.get_features()
print("TL",TL_feature_names)


TL ['DLMC', 'MZ', 'Centroid_X', 'Centroid_Y', 'DEM_MIN', 'DEM_MAX', 'DEM_RANGE', 'DEM_MEAN', 'DEM_STD', 'DEM_MEDIAN', 'DEM_PCT90', 'PlanCurvature_RANGE', 'PlanCurvature_MEAN', 'PlanCurvature_STD', 'PH_MIN', 'PH_MAX', 'PH_MEAN', 'Slope_RANGE', 'Slope_MEAN', 'Slope_STD', 'TopographicWetnessIndex_RANGE', 'TopographicWetnessIndex_MEAN', 'TopographicWetnessIndex_STD', 'Mean_RANGE', 'Mean_MEAN', 'Mean_STD', 'ndvi_RANGE', 'ndvi_MEAN', 'ndvi_STD', 'PCA_0_RANGE', 'PCA_0_MEAN', 'PCA_0_STD', 'PRE_MIN', 'PRE_MAX', 'PRE_RANGE', 'PRE_MEAN', 'SRA_MIN', 'SRA_MAX', 'SRA_RANGE', 'SRA_MEAN', 'TMP_MIN', 'TMP_MAX', 'TMP_RANGE', 'TMP_MEAN', 'VAP_MIN', 'VAP_MAX', 'VAP_RANGE', 'VAP_MEAN', 'WIND_MIN', 'WIND_MAX', 'WIND_RANGE', 'WIND_MEAN', 'BIO_RANGE', 'BIO_MEAN', 'MRVBF_MEAN', 'MRVBF_STD']


In [33]:
# 存放预测结果的路径
result_path =  r"F:\cache_data\zone_ana\dy\prediction_result"

In [34]:
# 读取数据集
feature_path = r'F:\cache_data\zone_ana\dy\prediction_data\result_20240618.csv'

In [35]:
feature_df = pd.read_csv(feature_path)

In [36]:
feature_df.rename(columns={'母质':'MZ'}, inplace=True)

In [37]:
# 获取对照字典
with open(r'D:\worker_code\Terrain_Test\data\soil_dict_20240618.json', 'r') as f:
    soil_dict = json.load(f)
# 将键转为int
soil_dict = {int(k):v for k, v in soil_dict.items()}

In [38]:
# 模型选择
select_model = 'NeuralNetFastAI'

In [39]:
# 概率预测函数
def predict_top_classes(data, predictor,feature_names, model, top_n, soil_dict):
    """
    根据给定的AutoGluon模型，预测数据集中的前n个最可能的类别及其概率，并根据提供的字典转换类别编号为描述字符串。
    参数:
    data (DataFrame): 输入的数据集。
    predictor（model）: 对应的模型
    feature_names (list): 用于预测的特征名列表。
    model (str): 选择的模型名称。
    top_n (int): 需要返回的最高概率的类别数量。
    soil_dict (dict): 类别编号到描述字符串的映射字典。
    
    返回:
    DataFrame: 原始数据与预测结果合并后的DataFrame。
    """
    # 复制数据
    data = data.copy()
    # 预测概率
    pred_probs = predictor.predict_proba(data[feature_names], model=model)
    # 获取概率最高的前n个类别及其概率
    top_classes = pred_probs.apply(lambda x: pd.Series(x.nlargest(top_n).index.astype(int).tolist() + x.nlargest(top_n).values.tolist()), axis=1)
    # 重命名列
    class_cols = ['Class{}'.format(i+1) for i in range(top_n)]
    prob_cols = ['Prob{}'.format(i+1) for i in range(top_n)]
    top_classes.columns = class_cols + prob_cols
    # 转换类别编号为描述字符串
    for col in class_cols:
        top_classes[col] = top_classes[col].map(soil_dict)
    # 计算每个预测的熵
    entropy = pred_probs.apply(lambda x: -np.sum(x * np.log(x + 1e-9)), axis=1)
    top_classes['Entropy'] = entropy
    # 计算每个预测的不确定性（标准差）
    uncertainty = pred_probs.std(axis=1)
    top_classes['Uncertainty'] = uncertainty
    # 将新列添加到原有的DataFrame中
    return pd.concat([data, top_classes], axis=1)

In [40]:
# TZ
tl_result_df = predict_top_classes(feature_df, TL_predictor,TL_feature_names, select_model, 3, soil_dict)

In [41]:
# 获取指定列
result_df = tl_result_df[['OBJECTID','Class1', 'Class2', 'Class3', 'Prob1','Prob2', 'Prob3', 'Entropy', 'Uncertainty']]

In [42]:
result_df

Unnamed: 0,OBJECTID,Class1,Class2,Class3,Prob1,Prob2,Prob3,Entropy,Uncertainty
0,1,浅灰泥田,灰泥质黄色石灰土,壤质黄色石灰土,0.811990,0.060935,0.057241,0.812538,0.155702
1,2,潮泥田,石灰泥田,浅灰泥田,0.504683,0.327412,0.122216,1.135644,0.114589
2,3,浅灰泥田,灰泥质黄色石灰土,壤质黄色石灰土,0.628393,0.189196,0.088144,1.191434,0.124668
3,4,浅灰泥田,石灰泥田,壤质黄色石灰土,0.519425,0.211330,0.140093,1.390924,0.107974
4,5,石灰泥田,浅灰泥田,壤质黄色石灰土,0.732037,0.230015,0.016281,0.743263,0.145738
...,...,...,...,...,...,...,...,...,...
140116,71782,泥质黄壤,红泥质黄壤,硅质黄壤,0.998783,0.000667,0.000186,0.011204,0.192207
140117,71782,泥质黄壤,红泥质黄壤,硅质黄壤,0.998789,0.000671,0.000179,0.011139,0.192208
140118,71784,泥质黄壤,黏质黄色石灰土,红泥质黄壤,0.998148,0.000936,0.000357,0.016408,0.192080
140119,74834,硅质黄壤,砾硅质黄壤,潮砂泥田,0.996847,0.002115,0.000302,0.025294,0.191820


In [43]:
result_df['Class1'].value_counts(),len(list(result_df['Class1'].value_counts()))

(Class1
 硅质黄壤         24532
 红泥质黄壤        21440
 泥质黄壤         21086
 灰泥质黄色石灰土     13590
 壤质黄色石灰土      13336
 潮泥田          11591
 灰泥质黄壤         8547
 壤质酸性紫色土       4386
 石灰泥田          4355
 红泥田           3702
 泥砂质黄壤         2756
 壤质黑色石灰土       1955
 砾硅质黄壤         1886
 紫泥田           1757
 浅灰泥田          1632
 潮砂泥田          1474
 浅红泥田          1044
 黏质黄色石灰土        978
 硅质山地灌丛草甸土       74
 Name: count, dtype: int64,
 19)

In [19]:
# 获取三普土种对照字典
# 读取Excel文件
sp_file_path = r"C:\Users\Runker\Desktop\search_dict.xlsx"
df = pd.read_excel(sp_file_path)

# 使用前向填充（ffill）处理合并单元格的情况
df_filled = df.ffill()
# 定义一个函数来为每一行生成一个字典
def create_dict(row):
    return {
        row['三普土种']: {
            '土类': row['三普土类'],
            '亚类': row['三普亚类'],
            '土属': row['三普土属'],
        }
    }

# 使用apply方法为每一行应用这个函数，并将结果合并到一个字典中
sp_soiltype_dict = {}
for d in df_filled.apply(create_dict, axis=1):
    sp_soiltype_dict.update(d)

In [23]:
# 获取三普土种对照字典2
# 读取Excel文件
sp_file_path = r"C:\Users\Runker\Desktop\search_dict2.xlsx"
df = pd.read_excel(sp_file_path)

# 使用前向填充（ffill）处理合并单元格的情况
df_filled = df.ffill()
# 定义一个函数来为每一行生成一个字典
def create_dict(row):
    return {
        row['TZ']: {
            '土类': row['TL'],
            '亚类': row['YL'],
            '土属': row['TS'],
        }
    }

# 使用apply方法为每一行应用这个函数，并将结果合并到一个字典中
sp_soiltype_dict = {}
for d in df_filled.apply(create_dict, axis=1):
    sp_soiltype_dict.update(d)

In [24]:
sp_soiltype_dict

{'黄红泥田': {'土类': '水稻土', '亚类': '潴育型水稻土', '土属': '红泥田'},
 '潮泥田': {'土类': '水稻土', '亚类': '潴育型水稻土', '土属': '潮泥田'},
 '潮砂泥田': {'土类': '水稻土', '亚类': '潴育型水稻土', '土属': '潮砂泥田'},
 '黄白粉泥田': {'土类': '水稻土', '亚类': '潜育型水稻土', '土属': '青白粉泥田'},
 '石灰泥田': {'土类': '水稻土', '亚类': '潴育型水稻土', '土属': '石灰泥田'},
 '黄浅鳝泥田': {'土类': '水稻土', '亚类': '淹育型水稻土', '土属': '浅鳝泥田'},
 '青石灰泥田': {'土类': '水稻土', '亚类': '潜育型水稻土', '土属': '青灰泥田'},
 '浅石灰泥田': {'土类': '水稻土', '亚类': '淹育型水稻土', '土属': '浅灰泥田'},
 '黄青白粉泥田': {'土类': '水稻土', '亚类': '潜育型水稻土', '土属': '青白粉泥田'},
 '黄青砂泥田': {'土类': '水稻土', '亚类': '潜育型水稻土', '土属': '青砂泥田'},
 '黄浅白粉泥田': {'土类': '水稻土', '亚类': '淹育型水稻土', '土属': '浅白粉泥田'},
 '黄浅红泥田': {'土类': '水稻土', '亚类': '淹育型水稻土', '土属': '浅红泥田'},
 '中层灰泥质黄色石灰土': {'土类': '石灰土', '亚类': '黄色石灰土', '土属': '灰泥质黄色石灰土'},
 '薄层灰泥质黄色石灰土': {'土类': '石灰土', '亚类': '黄色石灰土', '土属': '灰泥质黄色石灰土'},
 '中层壤质黄色石灰土': {'土类': '石灰土', '亚类': '黄色石灰土', '土属': '壤质黄色石灰土'},
 '薄层硅质黄壤': {'土类': '黄壤', '亚类': '典型黄壤', '土属': '硅质黄壤'},
 '薄层砾硅质黄壤': {'土类': '黄壤', '亚类': '典型黄壤', '土属': '砾硅质黄壤'},
 '中层砾壤质黑色石灰土': {'土类': '石灰土', '亚类': '黑色石灰土', '土属

In [25]:
# 使用 apply 方法结合 lambda 函数来获取对应的 '土类' 值
result_df['Class1_tl'] = result_df['Class1'].apply(lambda x: sp_soiltype_dict.get(x, {}).get('土类', None))
result_df['Class1_yl'] = result_df['Class1'].apply(lambda x: sp_soiltype_dict.get(x, {}).get('亚类', None))
result_df['Class1_ts'] = result_df['Class1'].apply(lambda x: sp_soiltype_dict.get(x, {}).get('土属', None))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['Class1_tl'] = result_df['Class1'].apply(lambda x: sp_soiltype_dict.get(x, {}).get('土类', None))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['Class1_yl'] = result_df['Class1'].apply(lambda x: sp_soiltype_dict.get(x, {}).get('亚类', None))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-

In [26]:
result_df

Unnamed: 0,OBJECTID,Class1,Class2,Class3,Prob1,Prob2,Prob3,Entropy,Uncertainty,Class1_tl,Class1_yl,Class1_ts
0,1,中层灰泥质黄色石灰土,潮泥田,石灰泥田,0.216667,0.150000,0.096667,2.682110,0.039586,石灰土,黄色石灰土,灰泥质黄色石灰土
1,2,中层灰泥质黄色石灰土,潮泥田,石灰泥田,0.293333,0.166667,0.083333,2.495103,0.047299,石灰土,黄色石灰土,灰泥质黄色石灰土
2,3,中层灰泥质黄色石灰土,潮泥田,浅石灰泥田,0.390000,0.090000,0.080000,2.405076,0.055635,石灰土,黄色石灰土,灰泥质黄色石灰土
3,4,中层灰泥质黄色石灰土,腐薄层硅质黄壤,腐薄层壤质黑色石灰土,0.283333,0.090000,0.066667,2.765042,0.042158,石灰土,黄色石灰土,灰泥质黄色石灰土
4,5,潮泥田,石灰泥田,中层灰泥质黄色石灰土,0.303333,0.176667,0.140000,2.277581,0.051627,水稻土,潴育型水稻土,潮泥田
...,...,...,...,...,...,...,...,...,...,...,...,...
140116,71782,腐薄层泥质黄壤,腐厚层泥质黄壤,腐薄层壤质黄色石灰土,0.236667,0.216667,0.130000,2.290682,0.048832,黄壤,典型黄壤,泥质黄壤
140117,71782,腐薄层泥质黄壤,腐厚层泥质黄壤,腐薄层壤质黄色石灰土,0.236667,0.216667,0.130000,2.294067,0.048736,黄壤,典型黄壤,泥质黄壤
140118,71784,腐厚层泥质黄壤,腐厚层硅质黄壤,腐薄层泥质黄壤,0.346667,0.160000,0.150000,2.179596,0.056300,黄壤,典型黄壤,泥质黄壤
140119,74834,腐薄层硅质黄壤,腐厚层硅质黄壤,腐中层壤质黄色石灰土,0.563333,0.123333,0.076667,1.670252,0.079289,黄壤,典型黄壤,硅质黄壤


In [27]:
# 保存数据
result_df.to_csv(os.path.join(result_path, f'prediction_class_{select_model}_{get_dtime()}.csv'),index=False)