In [1]:
import os
import json
import pandas as pd
import numpy as np
from autogluon.tabular import TabularPredictor
from pykrige.ok import OrdinaryKriging
from sklearn.metrics import r2_score
from pyproj import Transformer
from tqdm import tqdm
import pickle

In [2]:
# Get the current date
def get_dtime():
    """Return today's local date formatted as ``YYYYMMDD``, e.g. ``20240516``."""
    # Imported locally: the notebook's import cell does not bring in datetime.
    from datetime import datetime as _dt
    return _dt.now().strftime('%Y%m%d')

In [3]:
# Load the trained AutoGluon model from disk
model_path = r"F:\cache_data\zone_ana\dy\modle\rcnt_class"
# The model directory name is a constant, so a plain literal is used instead of
# an f-string that only interpolated the constant 'cnt'.
cnt_predictor = TabularPredictor.load(os.path.join(model_path, "cnt_model"))

In [4]:
# Show the best model together with the names of all models in the predictor
cnt_predictor.model_best, cnt_predictor.model_names()

('WeightedEnsemble_L2',
 ['RandomForestGini',
  'RandomForestEntr',
  'ExtraTreesGini',
  'ExtraTreesEntr',
  'NeuralNetFastAI',
  'NeuralNetTorch',
  'WeightedEnsemble_L2'])

In [5]:
# Collect the input feature names recorded in the predictor's feature metadata
cnt_feature_names = cnt_predictor.feature_metadata_in.get_features()
print("CNT", cnt_feature_names)


CNT ['DEM_MIN', 'DEM_MAX', 'DEM_RANGE', 'DEM_MEAN', 'DEM_STD', 'DEM_PCT90', 'PlanCurvature_RANGE', 'PlanCurvature_MEAN', 'PlanCurvature_STD', 'ProfileCurvature_RANGE', 'ProfileCurvature_MEAN', 'ProfileCurvature_STD', 'Slope_MIN', 'Slope_MAX', 'Slope_RANGE', 'Slope_MEAN', 'Slope_STD', 'ndvi_MIN', 'ndvi_MAX', 'ndvi_RANGE', 'ndvi_MEAN', 'ndvi_STD', 'PCA_0_MEAN', 'PCA_0_STD', 'MRRTF_MEAN', 'MRRTF_STD', 'MRVBF_MIN', 'MRVBF_MAX', 'MRVBF_RANGE', 'MRVBF_MEAN', 'MRVBF_STD']


In [7]:
# Directory where the prediction results are written
result_path =  r"C:\Users\Runker\Desktop\CNT_TRAIN\pre_table"

In [8]:
# Path of the CSV feature table to run predictions on (read in the next cell)
feature_path = r"C:\Users\Runker\Desktop\CNT_TRAIN\train_table\pre.csv"

In [9]:
# Read the feature table into a DataFrame.
# NOTE(review): the columns output further down lists 'Unnamed: 0', which
# suggests the CSV was written together with its index — confirm whether
# index_col=0 is wanted here.
feature_df = pd.read_csv(feature_path)

In [11]:
# Load the lookup dictionary that maps class ids to class names.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale codec.
with open(r'D:\worker_code\Terrain_Test\data\cnt_dict.json', 'r', encoding='utf-8') as f:
    # JSON object keys are always strings; convert them to int because
    # predict_top_classes casts the class labels to int before mapping them.
    cnt_dict = {int(k): v for k, v in json.load(f).items()}

In [13]:
# Name of the individual model to predict with (passed on as `model=` to predict_proba)
select_model = 'RandomForestGini'

In [14]:
# Probability prediction helper
def predict_top_classes(data, predictor, feature_names, model, top_n, cnt_dict):
    """
    Predict the ``top_n`` most probable classes and their probabilities per row.

    The class probabilities are obtained from ``predictor.predict_proba`` and
    reduced to the ``top_n`` best classes per row; class ids are translated to
    descriptive strings through ``cnt_dict``.

    Parameters:
    data (DataFrame): Input data set; it is copied and never modified.
    predictor: Object exposing ``predict_proba(frame, model=...)`` that returns
        a DataFrame with one column per class (e.g. an AutoGluon predictor).
    feature_names (list): Columns of ``data`` that are passed to the predictor.
    model (str): Name of the model forwarded to ``predict_proba``.
    top_n (int): Number of highest-probability classes to keep per row.
    cnt_dict (dict): Mapping from integer class id to descriptive string.
        Class ids missing from the mapping become NaN.

    Returns:
    DataFrame: ``data`` with these columns appended, in this order:
        ``Class1..ClassN``, ``Prob1..ProbN``, ``Entropy``, ``Uncertainty``.

    Raises:
    ValueError: If ``top_n`` is negative or larger than the number of classes.

    Notes:
    * The class labels (column labels of the probability frame) must be
      convertible to ``int``.
    * Ties between equal probabilities are broken in favour of the class that
      appears first in the probability frame.
    * ``Uncertainty`` is the standard deviation of the class probabilities of
      a row. It grows as the distribution becomes more peaked, so a larger
      value corresponds to a more confident prediction, not a less confident
      one.
    """
    # Work on a copy so the caller's frame is never touched.
    data = data.copy()
    # Class probabilities: one row per sample, one column per class.
    pred_probs = predictor.predict_proba(data[feature_names], model=model)

    n_classes = pred_probs.shape[1]
    if not 0 <= top_n <= n_classes:
        raise ValueError(
            f"top_n must be between 0 and the number of classes ({n_classes}), got {top_n}"
        )

    class_cols = ['Class{}'.format(i + 1) for i in range(top_n)]
    prob_cols = ['Prob{}'.format(i + 1) for i in range(top_n)]

    # Rank the classes of every row by descending probability in one
    # vectorized pass. The stable sort keeps the first class on ties.
    prob_values = pred_probs.to_numpy(dtype=float)
    order = np.argsort(-prob_values, axis=1, kind='stable')[:, :top_n]
    class_labels = pred_probs.columns.astype(int).to_numpy()
    top_labels = class_labels[order]
    top_probs = np.take_along_axis(prob_values, order, axis=1)

    new_columns = {}
    # Translate class ids into descriptive strings.
    for rank, col in enumerate(class_cols):
        ids = pd.Series(top_labels[:, rank], index=pred_probs.index)
        new_columns[col] = ids.map(cnt_dict)
    for rank, col in enumerate(prob_cols):
        new_columns[col] = top_probs[:, rank]

    # Shannon entropy of each row; the small epsilon guards against log(0).
    new_columns['Entropy'] = -(pred_probs * np.log(pred_probs + 1e-9)).sum(axis=1)
    # Spread (standard deviation) of the class probabilities of each row.
    new_columns['Uncertainty'] = pred_probs.std(axis=1)

    top_classes = pd.DataFrame(new_columns, index=pred_probs.index)
    # Append the new columns to the copied input frame.
    return pd.concat([data, top_classes], axis=1)

In [15]:
# CNT: predict the two most probable classes for every row of the feature table
cnt_result_df = predict_top_classes(
    data=feature_df,
    predictor=cnt_predictor,
    feature_names=cnt_feature_names,
    model=select_model,
    top_n=2,
    cnt_dict=cnt_dict,
)

In [16]:
# List the columns of the prediction result
cnt_result_df.columns

Index(['Unnamed: 0', 'OBJECTID_1', 'DEM_COUNT', 'DEM_AREA', 'DEM_MIN',
       'DEM_MAX', 'DEM_RANGE', 'DEM_MEAN', 'DEM_STD', 'DEM_SUM', 'DEM_MEDIAN',
       'DEM_PCT90', 'PlanCurvature_COUNT', 'PlanCurvature_AREA',
       'PlanCurvature_MIN', 'PlanCurvature_MAX', 'PlanCurvature_RANGE',
       'PlanCurvature_MEAN', 'PlanCurvature_STD', 'PlanCurvature_SUM',
       'PlanCurvature_MEDIAN', 'PlanCurvature_PCT90', 'ProfileCurvature_COUNT',
       'ProfileCurvature_AREA', 'ProfileCurvature_MIN', 'ProfileCurvature_MAX',
       'ProfileCurvature_RANGE', 'ProfileCurvature_MEAN',
       'ProfileCurvature_STD', 'ProfileCurvature_SUM',
       'ProfileCurvature_MEDIAN', 'ProfileCurvature_PCT90', 'Slope_COUNT',
       'Slope_AREA', 'Slope_MIN', 'Slope_MAX', 'Slope_RANGE', 'Slope_MEAN',
       'Slope_STD', 'Slope_SUM', 'Slope_MEDIAN', 'Slope_PCT90', 'ndvi_COUNT',
       'ndvi_AREA', 'ndvi_MIN', 'ndvi_MAX', 'ndvi_RANGE', 'ndvi_MEAN',
       'ndvi_STD', 'ndvi_SUM', 'ndvi_MEDIAN', 'ndvi_PCT90', 'PCA_0_CO

In [18]:
# Keep only the object id and the prediction columns
result_df = cnt_result_df[
    [
        'OBJECTID_1',
        'Class1',
        'Class2',
        'Prob1',
        'Prob2',
        'Entropy',
        'Uncertainty',
    ]
]

In [19]:
# Save the predictions as CSV; the file name carries the model name and today's date.
# Create the output directory first so the write does not fail when it is missing.
os.makedirs(result_path, exist_ok=True)
result_df.to_csv(
    os.path.join(result_path, f'prediction_class_{select_model}_{get_dtime()}.csv'),
    index=False,
)

In [20]:
# Display the selected prediction columns
result_df

Unnamed: 0,OBJECTID_1,Class1,Class2,Prob1,Prob2,Entropy,Uncertainty
0,6,NOT_CNT,IS_CNT,0.790000,0.210000,0.513957,0.410122
1,7,NOT_CNT,IS_CNT,0.893333,0.106667,0.339489,0.556257
2,8,NOT_CNT,IS_CNT,0.530000,0.470000,0.691346,0.042426
3,15,NOT_CNT,IS_CNT,0.966667,0.033333,0.146145,0.659966
4,17,NOT_CNT,IS_CNT,0.816667,0.183333,0.476411,0.447834
...,...,...,...,...,...,...,...
12372,22201,NOT_CNT,IS_CNT,0.980000,0.020000,0.098039,0.678823
12373,22202,IS_CNT,NOT_CNT,0.856667,0.143333,0.410969,0.504403
12374,22203,IS_CNT,NOT_CNT,0.863333,0.136667,0.398866,0.513831
12375,60319,NOT_CNT,IS_CNT,0.993333,0.006667,0.040049,0.697679
