In [None]:
import os
import json
import pandas as pd
import numpy as np
from autogluon.tabular import TabularPredictor
from pykrige.ok import OrdinaryKriging
from sklearn.metrics import r2_score
from pyproj import Transformer
from tqdm import tqdm
import pickle

In [None]:
# Current date helper
def get_dtime():
    """Return the current local date formatted as ``YYYYMMDD`` (e.g. ``20240516``)."""
    from datetime import datetime as _datetime

    today_stamp = _datetime.now().strftime('%Y%m%d')
    return today_stamp

In [None]:
# Load the trained AutoGluon predictors, one per soil group:
# sdt (paddy soil), fzrt (non-natural soil), zrt (natural soil).
model_path = r"F:\cache_data\zone_ana\dy\modle\autogluon_type_class"
sdt_predictor, fzrt_predictor, zrt_predictor = (
    TabularPredictor.load(os.path.join(model_path, f"{group}_model"))
    for group in ('sdt', 'fzrt', 'zrt')
)

In [None]:
# Display the SDT predictor's `model_best` attribute together with the result of `model_names()`
sdt_predictor.model_best,sdt_predictor.model_names()

In [None]:
# Display the FZRT predictor's `model_best` attribute together with the result of `model_names()`
fzrt_predictor.model_best,fzrt_predictor.model_names()

In [None]:
# Display the ZRT predictor's `model_best` attribute together with the result of `model_names()`
zrt_predictor.model_best,zrt_predictor.model_names()

In [None]:
# Collect the input feature names each predictor reports through `feature_metadata_in`
sdt_feature_names = sdt_predictor.feature_metadata_in.get_features()
fzrt_feature_names = fzrt_predictor.feature_metadata_in.get_features()
zrt_feature_names = zrt_predictor.feature_metadata_in.get_features()

# Print them side by side for a quick visual comparison
print("SDT", sdt_feature_names)
print("FZRT", fzrt_feature_names)
print("ZRT", zrt_feature_names)

In [None]:
# Directory where the prediction results are stored
result_path =  r"F:\cache_data\zone_ana\dy\prediction_result"

In [None]:
# Path of the CSV data set that is read below and fed to the predictors
feature_path = r'F:\cache_data\zone_ana\dy\prediction_data\result.csv'

In [None]:
# Load the data set at `feature_path` into a DataFrame
feature_df = pd.read_csv(feature_path)

In [None]:
# Split the data by the DLMC column
# Paddy-soil (SDT) subset: rows whose DLMC value is one of the classes listed below
sdt_data = feature_df.loc[
    feature_df['DLMC'].isin([
        '水田',
        '水浇地',
        '坑塘水面',
        '养殖坑塘',
        '内陆滩涂',
    ])
]

In [None]:
# Non-natural-soil (FZRT) subset: every row whose DLMC value is NOT one of the classes listed below
fzrt_data = feature_df.loc[
    ~feature_df['DLMC'].isin([
        '乔木林地', '灌木林地', '竹林地', '其他林地',
        '其他草地', '天然牧草地', '人工牧草地',
        '水田', '水浇地', '坑塘水面', '养殖坑塘', '内陆滩涂',
    ])
]

In [None]:
# Natural-soil (ZRT) subset: rows whose DLMC value is one of the classes listed below
zrt_data = feature_df.loc[
    feature_df['DLMC'].isin([
        '乔木林地',
        '灌木林地',
        '竹林地',
        '其他林地',
        '其他草地',
        '天然牧草地',
        '人工牧草地',
    ])
]

In [None]:
# Data-integrity check: show every shape, then whether the three subsets'
# row counts add up to the row count of feature_df (last item should be True)
(
    sdt_data.shape,
    fzrt_data.shape,
    zrt_data.shape,
    feature_df.shape,
    sum(part.shape[0] for part in (sdt_data, fzrt_data, zrt_data)) == feature_df.shape[0],
)

In [None]:
# Load the lookup dictionary used to translate predicted class numbers.
# NOTE(review): no explicit `encoding` is passed, so the platform default is used —
# confirm it matches how soil_dict.json was written.
with open(r'D:\worker_code\Terrain_Test\data\soil_dict.json', 'r') as f:
    # JSON object keys are strings; convert them to int while building the dict
    soil_dict = {int(code): label for code, label in json.load(f).items()}

In [None]:
# Model selection: name passed as `model=` to every predict_proba call below
select_model = 'RandomForestEntr'

In [None]:
# Probability prediction helper
def predict_top_classes(data, predictor,feature_names, model, top_n, soil_dict):
    """
    Predict the ``top_n`` most probable classes (with their probabilities) for every row.

    The class probabilities come from ``predictor.predict_proba``; class numbers are
    translated to descriptive strings through ``soil_dict``.

    Parameters:
    data (DataFrame): Input data set. It is copied, never modified.
    predictor: Fitted model object exposing ``predict_proba(frame, model=...)`` that
        returns one probability column per class.
    feature_names (list): Names of the columns of ``data`` passed to the predictor.
    model (str): Name of the model forwarded to ``predict_proba``.
    top_n (int): Number of highest-probability classes to report per row.
    soil_dict (dict): Mapping from integer class number to descriptive string.

    Returns:
    DataFrame: ``data`` with these columns appended, in this order:
        ``Class1..Class{top_n}``, ``Prob1..Prob{top_n}``, ``Entropy``, ``Uncertainty``.
        Ranks beyond the number of classes the predictor knows are filled with
        ``None`` (class) and ``NaN`` (probability) instead of raising.
    """
    # Work on a copy so the caller's frame is left untouched
    data = data.copy()
    # One row per sample, one column per class
    pred_probs = predictor.predict_proba(data[feature_names], model=model)

    prob_values = pred_probs.to_numpy(dtype=float)
    class_codes = pred_probs.columns.astype(int).to_numpy()
    n_rows, n_classes = prob_values.shape

    # Number of rank columns requested vs. number that can actually be filled
    n_requested = max(int(top_n), 0)
    n_available = min(n_requested, n_classes)

    # Stable sort on the negated probabilities gives descending order with ties
    # resolved by column position, i.e. the same ranking as Series.nlargest.
    order = np.argsort(-prob_values, axis=1, kind='stable')[:, :n_available]
    top_probs = np.take_along_axis(prob_values, order, axis=1)
    top_codes = class_codes[order]

    columns = {}
    # Class columns: translate class numbers to descriptive strings
    for rank in range(n_requested):
        col = 'Class{}'.format(rank + 1)
        if rank < n_available:
            columns[col] = pd.Series(top_codes[:, rank]).map(soil_dict).to_numpy()
        else:
            columns[col] = np.full(n_rows, None, dtype=object)
    # Probability columns, in the same rank order
    for rank in range(n_requested):
        col = 'Prob{}'.format(rank + 1)
        if rank < n_available:
            columns[col] = top_probs[:, rank]
        else:
            columns[col] = np.full(n_rows, np.nan)

    # Entropy of each prediction; the small epsilon avoids log(0)
    columns['Entropy'] = -np.sum(prob_values * np.log(prob_values + 1e-9), axis=1)
    # Uncertainty of each prediction: standard deviation of the class probabilities
    columns['Uncertainty'] = pred_probs.std(axis=1).to_numpy()

    top_classes = pd.DataFrame(columns, index=pred_probs.index)
    # Append the new columns to the copied input
    return pd.concat([data, top_classes], axis=1)

In [None]:
# SDT: top-3 class prediction for the paddy-soil subset
sdt_result_df = predict_top_classes(
    data=sdt_data,
    predictor=sdt_predictor,
    feature_names=sdt_feature_names,
    model=select_model,
    top_n=3,
    soil_dict=soil_dict,
)

In [None]:
# FZRT: top-3 class prediction for the non-natural-soil subset
fzrt_result_df = predict_top_classes(
    data=fzrt_data,
    predictor=fzrt_predictor,
    feature_names=fzrt_feature_names,
    model=select_model,
    top_n=3,
    soil_dict=soil_dict,
)

In [None]:
# ZRT: top-3 class prediction for the natural-soil subset
zrt_result_df = predict_top_classes(
    data=zrt_data,
    predictor=zrt_predictor,
    feature_names=zrt_feature_names,
    model=select_model,
    top_n=3,
    soil_dict=soil_dict,
)

In [None]:
# Merge the three per-group prediction tables row-wise into a single frame
result_df = pd.concat(
    [sdt_result_df, fzrt_result_df, zrt_result_df],
    axis=0,
)

In [None]:
# Inspect the column names of the merged result before selecting a subset
result_df.columns

In [None]:
# Keep only the identifier column and the prediction summary columns
result_df = result_df[[
    'OBJECTID',
    'Class1', 'Class2', 'Class3',
    'Prob1', 'Prob2', 'Prob3',
    'Entropy', 'Uncertainty',
]]

In [None]:
# Build the lookup dictionary keyed by the '三普土种' column
# Read the Excel file
sp_file_path = r"C:\Users\Runker\Desktop\search_dict.xlsx"
df = pd.read_excel(sp_file_path)

# Forward-fill so rows that were part of a merged cell inherit the value above them
df_filled = df.ffill()


def create_dict(row):
    """Return a one-entry dict mapping the row's '三普土种' value to its
    '土类' / '亚类' / '土属' values (taken from the '三普土类', '三普亚类'
    and '三普土属' columns)."""
    hierarchy = {
        '土类': row['三普土类'],
        '亚类': row['三普亚类'],
        '土属': row['三普土属'],
    }
    return {row['三普土种']: hierarchy}


# Merge the per-row dictionaries; a later row overwrites an earlier one with the same key
sp_soiltype_dict = {}
for _, record in df_filled.iterrows():
    sp_soiltype_dict.update(create_dict(record))

In [None]:
# Display the lookup dictionary for a visual check
sp_soiltype_dict

In [None]:
# Look up the '土类' / '亚类' / '土属' entries of each row's Class1 value in sp_soiltype_dict.
# Values missing from the dictionary yield None.
# `assign` builds a new frame instead of writing columns into a frame that was
# produced by column sub-selection, which keeps the cell free of chained-assignment
# warnings and safe to re-run.
result_df = result_df.assign(**{
    new_col: result_df['Class1'].apply(
        # bind `level` as a default so each lambda keeps its own key
        lambda x, level=level: sp_soiltype_dict.get(x, {}).get(level, None)
    )
    for new_col, level in (
        ('Class1_tl', '土类'),
        ('Class1_yl', '亚类'),
        ('Class1_ts', '土属'),
    )
})

In [None]:
# Display the final result table
result_df

In [None]:
# Save the result as CSV in result_path; the file name carries the model name and today's date
result_df.to_csv(
    os.path.join(result_path, f'prediction_class_{select_model}_{get_dtime()}.csv'),
    index=False,
)