In [None]:
import os
import json
import pandas as pd
import numpy as np
from autogluon.tabular import TabularPredictor
from pykrige.ok import OrdinaryKriging
from sklearn.metrics import r2_score
from pyproj import Transformer
from tqdm import tqdm
import pickle

In [None]:
# Current date helper
def get_dtime():
    """Return the current local date as a compact ``YYYYMMDD`` string, e.g. ``20240516``."""
    from datetime import datetime
    return datetime.now().strftime('%Y%m%d')

In [None]:
# Root directory of the saved AutoGluon models; the loading cell below joins
# sub-directory names onto this path.
model_path = r"F:\cache_data\zone_ana\dy\modle\autogluon_type_class_DY"

In [None]:
def _load_predictor(parent_material, soil_group):
    """Load the TabularPredictor saved under ``model_path/<parent_material>/<soil_group>``."""
    return TabularPredictor.load(os.path.join(model_path, parent_material, soil_group))


# Sandstone
sy_sdt_predictor = _load_predictor('sy', 'sdt')
sy_fzrt_predictor = _load_predictor('sy', 'fzrt')
sy_zrt_predictor = _load_predictor('sy', 'zrt')
# Carbonate rock
tsy_sdt_predictor = _load_predictor('tsy', 'sdt')
tsy_fzrt_predictor = _load_predictor('tsy', 'fzrt')
tsy_zrt_predictor = _load_predictor('tsy', 'zrt')
# Quaternary red clay
hnt_sdt_predictor = _load_predictor('hnt', 'sdt')
hnt_fzrt_predictor = _load_predictor('hnt', 'fzrt')
hnt_zrt_predictor = _load_predictor('hnt', 'zrt')
# Mudstone / shale
nyy_sdt_predictor = _load_predictor('nyy', 'sdt')
nyy_fzrt_predictor = _load_predictor('nyy', 'fzrt')
nyy_zrt_predictor = _load_predictor('nyy', 'zrt')
# Purplish-red sandy shale
zhsyy_sdt_predictor = _load_predictor('zhsyy', 'sdt')
zhsyy_fzrt_predictor = _load_predictor('zhsyy', 'fzrt')
zhsyy_zrt_predictor = _load_predictor('zhsyy', 'zrt')
# River alluvium (only an 'sdt' model is loaded for this parent material)
hlcjw_sdt_predictor = _load_predictor('hlcjw', 'sdt')
# Conglomerate
ly_sdt_predictor = _load_predictor('ly', 'sdt')
ly_fzrt_predictor = _load_predictor('ly', 'fzrt')
ly_zrt_predictor = _load_predictor('ly', 'zrt')
# Quaternary glacial alluvium
bccjw_sdt_predictor = _load_predictor('bccjw', 'sdt')
bccjw_fzrt_predictor = _load_predictor('bccjw', 'fzrt')
bccjw_zrt_predictor = _load_predictor('bccjw', 'zrt')
# Sandy shale
syy_sdt_predictor = _load_predictor('syy', 'sdt')
syy_fzrt_predictor = _load_predictor('syy', 'fzrt')
syy_zrt_predictor = _load_predictor('syy', 'zrt')

In [None]:
# Feature-name helper

def get_feature_names(predictor):
    """Return the feature list reported by ``predictor.feature_metadata_in.get_features()``."""
    return predictor.feature_metadata_in.get_features()
# Feature lists read from the three 'syy' predictors, printed for inspection.
sdt_feature_names = get_feature_names(syy_sdt_predictor)
print("SDT",sdt_feature_names)
fzrt_feature_names = get_feature_names(syy_fzrt_predictor)
print("FZRT",fzrt_feature_names)
zrt_feature_names = get_feature_names(syy_zrt_predictor)
print("ZRT",zrt_feature_names)

In [None]:
# Directory where the prediction results are stored
result_path =  r"F:\cache_data\zone_ana\dy\prediction_result"

In [None]:
# Path of the CSV dataset to read
feature_path = r'F:\cache_data\zone_ana\dy\prediction_data\result_20240619.csv'

In [None]:
# Load the CSV at feature_path into a DataFrame
feature_df = pd.read_csv(feature_path)

In [None]:
# Display the column labels of the loaded table
feature_df.columns

In [None]:
# Rename the '母质' column to 'MZ', the name the filtering cells use
feature_df = feature_df.rename(columns={'母质': 'MZ'})

In [None]:
# `dataset` is bound to the same DataFrame object as `feature_df` (no copy is made)
dataset = feature_df

In [None]:
# Row masks on land-use class (DLMC) and on parent material (MZ == '砂岩', sandstone)
paddy_land = dataset['DLMC'].isin(['水田','水浇地','坑塘水面','养殖坑塘','内陆滩涂'])
natural_land = dataset['DLMC'].isin(['乔木林地','灌木林地','竹林地','其他林地','其他草地','天然牧草地','人工牧草地'])
is_sy = dataset['MZ'] == '砂岩'

# Sandstone, paddy-soil rows
sy_sdt_data = dataset[paddy_land & is_sy]
# Sandstone, non-natural-soil rows (land use in neither list)
sy_fzrt_data = dataset[~(natural_land | paddy_land) & is_sy]
# Sandstone, natural-soil rows
sy_zrt_data = dataset[natural_land & is_sy]

In [None]:
# Row masks on land-use class (DLMC) and on parent material (MZ == '碳酸岩', carbonate rock)
paddy_land = dataset['DLMC'].isin(['水田','水浇地','坑塘水面','养殖坑塘','内陆滩涂'])
natural_land = dataset['DLMC'].isin(['乔木林地','灌木林地','竹林地','其他林地','其他草地','天然牧草地','人工牧草地'])
is_tsy = dataset['MZ'] == '碳酸岩'

# Carbonate rock, paddy-soil rows
tsy_sdt_data = dataset[paddy_land & is_tsy]

# Carbonate rock, non-natural-soil rows (land use in neither list).
# The parent-material test must be an equality: a `!=` here would pull in the
# rows of every other parent material and feed them to the carbonate model.
tsy_fzrt_data = dataset[~(natural_land | paddy_land) & is_tsy]

# Carbonate rock, natural-soil rows
tsy_zrt_data = dataset[natural_land & is_tsy]


In [None]:
# Row masks on land-use class (DLMC) and on parent material (MZ == '第四系红粘土', Quaternary red clay)
paddy_land = dataset['DLMC'].isin(['水田','水浇地','坑塘水面','养殖坑塘','内陆滩涂'])
natural_land = dataset['DLMC'].isin(['乔木林地','灌木林地','竹林地','其他林地','其他草地','天然牧草地','人工牧草地'])
is_hnt = dataset['MZ'] == '第四系红粘土'

# Quaternary red clay, paddy-soil rows
hnt_sdt_data = dataset[paddy_land & is_hnt]

# Quaternary red clay, non-natural-soil rows (land use in neither list)
hnt_fzrt_data = dataset[~(natural_land | paddy_land) & is_hnt]

# Quaternary red clay, natural-soil rows
hnt_zrt_data = dataset[natural_land & is_hnt]


In [None]:
# Row masks on land-use class (DLMC) and on parent material (MZ == '泥(页)岩', mudstone/shale)
paddy_land = dataset['DLMC'].isin(['水田','水浇地','坑塘水面','养殖坑塘','内陆滩涂'])
natural_land = dataset['DLMC'].isin(['乔木林地','灌木林地','竹林地','其他林地','其他草地','天然牧草地','人工牧草地'])
is_nyy = dataset['MZ'] == '泥(页)岩'

# Mudstone/shale, paddy-soil rows
nyy_sdt_data = dataset[paddy_land & is_nyy]

# Mudstone/shale, non-natural-soil rows (land use in neither list)
nyy_fzrt_data = dataset[~(natural_land | paddy_land) & is_nyy]

# Mudstone/shale, natural-soil rows
nyy_zrt_data = dataset[natural_land & is_nyy]


In [None]:
# Row masks on land-use class (DLMC) and on parent material (MZ == '紫红色砂页岩', purplish-red sandy shale)
paddy_land = dataset['DLMC'].isin(['水田','水浇地','坑塘水面','养殖坑塘','内陆滩涂'])
natural_land = dataset['DLMC'].isin(['乔木林地','灌木林地','竹林地','其他林地','其他草地','天然牧草地','人工牧草地'])
is_zhsyy = dataset['MZ'] == '紫红色砂页岩'

# Purplish-red sandy shale, paddy-soil rows
zhsyy_sdt_data = dataset[paddy_land & is_zhsyy]

# Purplish-red sandy shale, non-natural-soil rows (land use in neither list)
zhsyy_fzrt_data = dataset[~(natural_land | paddy_land) & is_zhsyy]

# Purplish-red sandy shale, natural-soil rows
zhsyy_zrt_data = dataset[natural_land & is_zhsyy]


In [None]:
# River alluvium (MZ == '河流冲积物'), paddy-soil rows
paddy_land = dataset['DLMC'].isin(['水田','水浇地','坑塘水面','养殖坑塘','内陆滩涂'])
is_hlcjw = dataset['MZ'] == '河流冲积物'
hlcjw_sdt_data = dataset[paddy_land & is_hlcjw]


In [None]:
# Row masks on land-use class (DLMC) and on parent material (MZ == '砾岩', conglomerate)
paddy_land = dataset['DLMC'].isin(['水田','水浇地','坑塘水面','养殖坑塘','内陆滩涂'])
natural_land = dataset['DLMC'].isin(['乔木林地','灌木林地','竹林地','其他林地','其他草地','天然牧草地','人工牧草地'])
is_ly = dataset['MZ'] == '砾岩'

# Conglomerate, paddy-soil rows
ly_sdt_data = dataset[paddy_land & is_ly]

# Conglomerate, non-natural-soil rows (land use in neither list)
ly_fzrt_data = dataset[~(natural_land | paddy_land) & is_ly]

# Conglomerate, natural-soil rows. This must select the natural land-use
# classes, like every other *_zrt_data subset; reusing the non-natural filter
# would leave natural-land rows unpredicted and duplicate the fzrt rows.
ly_zrt_data = dataset[natural_land & is_ly]


In [None]:
# Row masks on land-use class (DLMC) and on parent material (MZ == '第四纪冰川冲积物', Quaternary glacial alluvium)
paddy_land = dataset['DLMC'].isin(['水田','水浇地','坑塘水面','养殖坑塘','内陆滩涂'])
natural_land = dataset['DLMC'].isin(['乔木林地','灌木林地','竹林地','其他林地','其他草地','天然牧草地','人工牧草地'])
is_bccjw = dataset['MZ'] == '第四纪冰川冲积物'

# Quaternary glacial alluvium, paddy-soil rows
bccjw_sdt_data = dataset[paddy_land & is_bccjw]

# Quaternary glacial alluvium, non-natural-soil rows (land use in neither list)
bccjw_fzrt_data = dataset[~(natural_land | paddy_land) & is_bccjw]

# Quaternary glacial alluvium, natural-soil rows
bccjw_zrt_data = dataset[natural_land & is_bccjw]


In [None]:
# Row masks on land-use class (DLMC) and on parent material (MZ == '砂页岩', sandy shale)
paddy_land = dataset['DLMC'].isin(['水田','水浇地','坑塘水面','养殖坑塘','内陆滩涂'])
natural_land = dataset['DLMC'].isin(['乔木林地','灌木林地','竹林地','其他林地','其他草地','天然牧草地','人工牧草地'])
is_syy = dataset['MZ'] == '砂页岩'

# Sandy shale, paddy-soil rows
syy_sdt_data = dataset[paddy_land & is_syy]

# Sandy shale, non-natural-soil rows (land use in neither list)
syy_fzrt_data = dataset[~(natural_land | paddy_land) & is_syy]

# Sandy shale, natural-soil rows
syy_zrt_data = dataset[natural_land & is_syy]


In [None]:
# Print the shape of every subset, one line per parent material
for label, frames in (
    ('砂岩', (sy_sdt_data, sy_fzrt_data, sy_zrt_data)),
    ('碳酸岩', (tsy_sdt_data, tsy_fzrt_data, tsy_zrt_data)),
    ('第四系红粘土', (hnt_sdt_data, hnt_fzrt_data, hnt_zrt_data)),
    ('泥页岩', (nyy_sdt_data, nyy_fzrt_data, nyy_zrt_data)),
    ('紫红色砂页岩', (zhsyy_sdt_data, zhsyy_fzrt_data, zhsyy_zrt_data)),
    ('河流冲积物', (hlcjw_sdt_data,)),
    ('砾岩', (ly_sdt_data, ly_fzrt_data, ly_zrt_data)),
    ('第四纪冰川冲积物', (bccjw_sdt_data, bccjw_fzrt_data, bccjw_zrt_data)),
    ('砂页岩', (syy_sdt_data, syy_fzrt_data, syy_zrt_data)),
):
    print(label, *(frame.shape for frame in frames))



In [None]:
# Load the lookup dictionary from JSON
# NOTE(review): no encoding is passed to open(), so the platform default is used - confirm it matches the file.
with open(r'D:\worker_code\Terrain_Test\data\soil_dict_20240619.json', 'r') as f:
    soil_dict_raw = json.load(f)
# Convert the keys to int
soil_dict = {int(code): label for code, label in soil_dict_raw.items()}

In [None]:
# Model selection: this name is passed as `model=` to predict_proba and is
# also embedded in the output file name.
select_model = 'WeightedEnsemble_L2'
# sdt_model = 'WeightedEnsemble_L2'

In [None]:
# Probability prediction helper
def predict_top_classes(data, predictor,feature_names, model, top_n, soil_dict):
    """
    Predict the ``top_n`` most probable classes (and their probabilities) for each row.

    Parameters:
    data (DataFrame): Input rows; returned unchanged alongside the new columns.
    predictor: Object exposing ``predict_proba(frame, model=...)`` that returns a
        DataFrame of per-class probabilities whose column labels are class codes
        convertible to int.
    feature_names (list): Columns of ``data`` passed to the predictor.
    model (str): Model name forwarded to ``predict_proba``.
    top_n (int): Number of highest-probability classes to report.
    soil_dict (dict): Mapping from int class code to description string.

    Returns:
    DataFrame: A copy of ``data`` with the columns ``Class1..ClassN``,
    ``Prob1..ProbN``, ``Entropy`` and ``Uncertainty`` appended. When the model
    has fewer than ``top_n`` classes the surplus Class/Prob columns are filled
    with missing values. An empty ``data`` yields an empty frame with the same
    columns and the predictor is not called.
    """
    # Work on a copy so the caller's frame is never modified
    data = data.copy()
    class_cols = ['Class{}'.format(i + 1) for i in range(top_n)]
    prob_cols = ['Prob{}'.format(i + 1) for i in range(top_n)]
    metric_cols = ['Entropy', 'Uncertainty']

    # An empty subset is legitimate (no rows for that parent material / land use);
    # return the expected schema instead of failing on the column assignment.
    if len(data) == 0:
        blank = {col: pd.Series(index=data.index, dtype=object) for col in class_cols}
        blank.update({col: pd.Series(index=data.index, dtype=float) for col in prob_cols + metric_cols})
        return pd.concat([data, pd.DataFrame(blank, index=data.index)], axis=1)

    # Per-class probabilities, one row per input row
    pred_probs = predictor.predict_proba(data[feature_names], model=model)
    probs = pred_probs.to_numpy(dtype=float)
    class_ids = pred_probs.columns.astype(int).to_numpy()
    n_rows, n_classes = probs.shape
    n_keep = min(top_n, n_classes)

    # Rank classes per row by descending probability; the stable sort keeps the
    # first column on ties.
    order = np.argsort(-probs, axis=1, kind='stable')[:, :n_keep]
    top_ids = class_ids[order]
    top_probs = np.take_along_axis(probs, order, axis=1)

    columns = {}
    for rank, col in enumerate(class_cols):
        if rank < n_keep:
            # Translate class codes into their description strings
            columns[col] = pd.Series(top_ids[:, rank]).map(soil_dict).to_numpy()
        else:
            columns[col] = np.full(n_rows, None, dtype=object)
    for rank, col in enumerate(prob_cols):
        columns[col] = top_probs[:, rank] if rank < n_keep else np.full(n_rows, np.nan)

    # Entropy of each predicted distribution (1e-9 guards against log(0))
    columns['Entropy'] = -np.nansum(probs * np.log(probs + 1e-9), axis=1)
    # Spread of the class probabilities (sample standard deviation per row)
    columns['Uncertainty'] = pred_probs.std(axis=1).to_numpy()

    top_classes = pd.DataFrame(columns, index=pred_probs.index)
    # Append the new columns to the original rows
    return pd.concat([data, top_classes], axis=1)

In [None]:
# Model prediction
def _predict_top2(data, predictor):
    """Run predict_top_classes for the two most probable classes with `select_model`.

    The feature list is always read from the predictor being used, so each model
    is given the input features it reports for itself rather than a list taken
    from a different parent material's predictor.
    """
    return predict_top_classes(data, predictor, get_feature_names(predictor), select_model, 2, soil_dict)


sy_sdt_result_df = _predict_top2(sy_sdt_data, sy_sdt_predictor)
sy_fzrt_result_df = _predict_top2(sy_fzrt_data, sy_fzrt_predictor)
sy_zrt_result_df = _predict_top2(sy_zrt_data, sy_zrt_predictor)

tsy_sdt_result_df = _predict_top2(tsy_sdt_data, tsy_sdt_predictor)
tsy_fzrt_result_df = _predict_top2(tsy_fzrt_data, tsy_fzrt_predictor)
tsy_zrt_result_df = _predict_top2(tsy_zrt_data, tsy_zrt_predictor)

hnt_sdt_result_df = _predict_top2(hnt_sdt_data, hnt_sdt_predictor)
hnt_fzrt_result_df = _predict_top2(hnt_fzrt_data, hnt_fzrt_predictor)
hnt_zrt_result_df = _predict_top2(hnt_zrt_data, hnt_zrt_predictor)

nyy_sdt_result_df = _predict_top2(nyy_sdt_data, nyy_sdt_predictor)
nyy_fzrt_result_df = _predict_top2(nyy_fzrt_data, nyy_fzrt_predictor)
nyy_zrt_result_df = _predict_top2(nyy_zrt_data, nyy_zrt_predictor)

zhsyy_sdt_result_df = _predict_top2(zhsyy_sdt_data, zhsyy_sdt_predictor)
zhsyy_fzrt_result_df = _predict_top2(zhsyy_fzrt_data, zhsyy_fzrt_predictor)
zhsyy_zrt_result_df = _predict_top2(zhsyy_zrt_data, zhsyy_zrt_predictor)

hlcjw_sdt_result_df = _predict_top2(hlcjw_sdt_data, hlcjw_sdt_predictor)


ly_sdt_result_df = _predict_top2(ly_sdt_data, ly_sdt_predictor)
ly_fzrt_result_df = _predict_top2(ly_fzrt_data, ly_fzrt_predictor)
ly_zrt_result_df = _predict_top2(ly_zrt_data, ly_zrt_predictor)

bccjw_sdt_result_df = _predict_top2(bccjw_sdt_data, bccjw_sdt_predictor)
bccjw_fzrt_result_df = _predict_top2(bccjw_fzrt_data, bccjw_fzrt_predictor)
bccjw_zrt_result_df = _predict_top2(bccjw_zrt_data, bccjw_zrt_predictor)

syy_sdt_result_df = _predict_top2(syy_sdt_data, syy_sdt_predictor)
syy_fzrt_result_df = _predict_top2(syy_fzrt_data, syy_fzrt_predictor)
syy_zrt_result_df = _predict_top2(syy_zrt_data, syy_zrt_predictor)

In [None]:
# Collect every per-subset result DataFrame in one list
dfs = [
    sy_sdt_result_df, sy_fzrt_result_df, sy_zrt_result_df,
    tsy_sdt_result_df, tsy_fzrt_result_df, tsy_zrt_result_df,
    hnt_sdt_result_df, hnt_fzrt_result_df, hnt_zrt_result_df,
    nyy_sdt_result_df, nyy_fzrt_result_df, nyy_zrt_result_df,
    zhsyy_sdt_result_df, zhsyy_fzrt_result_df, zhsyy_zrt_result_df,
    hlcjw_sdt_result_df,
    ly_sdt_result_df, ly_fzrt_result_df, ly_zrt_result_df,
    bccjw_sdt_result_df, bccjw_fzrt_result_df, bccjw_zrt_result_df,
    syy_sdt_result_df, syy_fzrt_result_df, syy_zrt_result_df
]

# Stack them row-wise with pd.concat; ignore_index=True assigns a fresh 0..n-1 index
result_df = pd.concat(dfs, ignore_index=True)


In [None]:
# Display the column labels of the combined result
result_df.columns

In [None]:
# Keep only the identifier and the prediction columns
keep_columns = ['OBJECTID', 'Class1', 'Class2', 'Prob1', 'Prob2', 'Entropy', 'Uncertainty']
result_df = result_df[keep_columns]

In [None]:
# Source table for the third-soil-survey soil-species lookup dictionary
# Read the Excel file
sp_file_path = r"C:\Users\Runker\Desktop\search_dict3.xlsx"
df = pd.read_excel(sp_file_path)

# Forward-fill (ffill) so cells left blank by merged cells take the value above them
df_filled = df.ffill()
# Per-row dictionary builder
def create_dict(row):
    """Return ``{row['TZ']: {'土类': row['TL'], '亚类': row['YL'], '土属': row['TS']}}`` for one row."""
    key = row['TZ']
    levels = {}
    for label, column in (('土类', 'TL'), ('亚类', 'YL'), ('土属', 'TS')):
        levels[label] = row[column]
    return {key: levels}

# Apply create_dict to every row and merge the one-entry dicts into a single
# dictionary; a later row with the same 'TZ' key overwrites an earlier one.
sp_soiltype_dict = {}
for d in df_filled.apply(create_dict, axis=1):
    sp_soiltype_dict.update(d)

In [None]:
# Display the merged lookup dictionary
sp_soiltype_dict

In [None]:
# Look up the '土类' / '亚类' / '土属' values of each Class1 entry in sp_soiltype_dict;
# entries missing from the dictionary give None.
result_df = result_df.copy()
for suffix, level in (('tl', '土类'), ('yl', '亚类'), ('ts', '土属')):
    result_df[f'Class1_{suffix}'] = result_df['Class1'].apply(
        lambda x, level=level: sp_soiltype_dict.get(x, {}).get(level, None)
    )


In [None]:
# Display the combined result table
result_df

In [None]:
# Flag rows whose OBJECTID appears again later in result_df
# (keep='last' leaves only the final occurrence of each OBJECTID unflagged).
duplicates = result_df.duplicated(subset=['OBJECTID'], keep='last')
# Show the flagged rows for inspection
df_duplicates = result_df[duplicates]
df_duplicates


In [None]:
# Save the result as CSV under result_path; the file name carries the model name and today's date
output_file = os.path.join(result_path, f'prediction_class_{select_model}_{get_dtime()}.csv')
result_df.to_csv(output_file, index=False)

In [None]:
# Class1 frequency table, number of distinct Class1 values, number of distinct Class1_ts values
class1_counts = result_df['Class1'].value_counts()
pd.DataFrame(class1_counts), len(class1_counts), len(result_df['Class1_ts'].value_counts())