In [2]:
import pandas as pd
import numpy as np
from autogluon.tabular import TabularPredictor
from pykrige.ok import OrdinaryKriging
from sklearn.metrics import r2_score
import os
from tqdm import tqdm

In [3]:
# 特征表格存放文件
csv_path = r"F:\cache_data\pred_feature_table\ky\feature_table\all"

In [4]:
# 获取所有的特征表格列表
def get_all_csv_list(path):
    csv_list = []
    for root, dirs, files in os.walk(path):
        for file in files:
            if file.endswith(".csv"):
                csv_list.append(os.path.join(root, file))
    sorted_files = sorted(csv_list, key=lambda x: int(x.rsplit('_', 1)[-1].split('.')[0]))
    return sorted_files
csv_list = get_all_csv_list(csv_path)
print(len(csv_list),csv_list[0])

335 F:\cache_data\pred_feature_table\ky\feature_table\all\data_chunk_000.csv


In [5]:
# 导入训练好的模型
predictor = TabularPredictor.load(r"F:\cache_data\model_path\ky\ph\autogluon\all_feature")

In [6]:
predictor.model_best,predictor.model_names()

('WeightedEnsemble_L2',
 ['KNeighborsUnif',
  'KNeighborsDist',
  'LightGBMXT',
  'LightGBM',
  'RandomForestMSE',
  'CatBoost',
  'ExtraTreesMSE',
  'NeuralNetFastAI',
  'XGBoost',
  'NeuralNetTorch',
  'LightGBMLarge',
  'WeightedEnsemble_L2'])

In [7]:
# 获取特征
feature_names = predictor.feature_metadata_in.get_features()
print(feature_names)

['AnalyticalHillshading', 'Aspect', 'ChannelNetworkBaseLevel', 'ChannelNetworkDistance', 'ClosedDepressions', 'ConvergenceIndex', 'dem', 'LSFactor', 'PlanCurvature', 'ProfileCurvature', 'RelativeSlopePosition', 'Slope', 'TopographicWetnessIndex', 'TotalCatchmentArea', 'ValleyDepth', 'PRE', 'SRA', 'TMP', 'VAP', 'WIND', 'BIO', 'Contrast', 'Correlation', 'Dissimilarity', 'Entropy', 'Homogeneity', 'Mean', 'ndvi', 'PCA_0', 'PCA_1', 'SecondMoment', 'Variance', 'LON', 'LAT']


In [8]:
# 存放预测结果的路径
result_path =  r"F:\cache_data\pred_feature_table\ky\predict_table"
pred_df = pd.DataFrame()

In [9]:
# 使用autogluon训练
predictions = []
for idx,one_pred_csv in enumerate(csv_list):

    try:
        data_df = pd.read_csv(one_pred_csv)
        data_df = data_df.rename(columns={'DEM': 'dem'})
        data_df = data_df[feature_names]
        # 对所有列进行通用的数据清洗
        for col in data_df.columns:
            # 检查列是否为数值类型，如果不是，则尝试清洗和转换
            if not pd.api.types.is_numeric_dtype(data_df[col]):
                # 移除非数字字符
                data_df[col] = data_df[col].replace(to_replace=r'[^\d.]+', value='', regex=True)

        # 对数据进行预测
        temp_pred = predictor.predict(data_df, model='RandomForestMSE') # 用指定模型训练
        # temp_pred = predictor.predict(data_df) # 用最佳模型训练

        # 将预测结果添加到列表中
        predictions.append(pd.Series(temp_pred, name=f'prediction_{idx}'))
        print(one_pred_csv)

    except Exception as e:
        print(f"处理文件 {one_pred_csv} 时发生错误: {e}")

# 一次性将所有预测结果合并为一个 DataFrame
pred_df = pd.concat(predictions, axis=1)

# 保存 pred_df 到新的 CSV 文件
pred_df.to_csv(os.path.join(result_path, 'prediction_ph_RandomForestMSE_20240307.csv'), index=False)

F:\cache_data\pred_feature_table\ky\feature_table\all\data_chunk_000.csv
F:\cache_data\pred_feature_table\ky\feature_table\all\data_chunk_001.csv
F:\cache_data\pred_feature_table\ky\feature_table\all\data_chunk_002.csv
F:\cache_data\pred_feature_table\ky\feature_table\all\data_chunk_003.csv
F:\cache_data\pred_feature_table\ky\feature_table\all\data_chunk_004.csv
F:\cache_data\pred_feature_table\ky\feature_table\all\data_chunk_005.csv
F:\cache_data\pred_feature_table\ky\feature_table\all\data_chunk_006.csv
F:\cache_data\pred_feature_table\ky\feature_table\all\data_chunk_007.csv
F:\cache_data\pred_feature_table\ky\feature_table\all\data_chunk_008.csv
F:\cache_data\pred_feature_table\ky\feature_table\all\data_chunk_009.csv
F:\cache_data\pred_feature_table\ky\feature_table\all\data_chunk_010.csv
F:\cache_data\pred_feature_table\ky\feature_table\all\data_chunk_011.csv
F:\cache_data\pred_feature_table\ky\feature_table\all\data_chunk_012.csv
F:\cache_data\pred_feature_table\ky\feature_table\a

In [10]:
predictions = []
for idx,one_pred_csv in enumerate(csv_list):
    data_df = pd.read_csv(one_pred_csv)
    temp_pred = predictor.predict(data_df, model='RandomForestMSE')

    # 将预测结果添加到列表中
    # 如果 temp_pred 不是一个 pandas Series 或 DataFrame，可以先将其转换
    predictions.append(pd.Series(temp_pred, name=f'prediction_{idx}'))
    print(one_pred_csv)

# 一次性将所有预测结果合并为一个 DataFrame
pred_df = pd.concat(predictions, axis=1)

# 保存 pred_df 到新的 CSV 文件
pred_df.to_csv(os.path.join(result_path, 'prediction_ph.csv'), index=False)

D:\ArcGISProjects\workspace\pred_feature_table\dy\feature_table\data_chunk_0.csv
D:\ArcGISProjects\workspace\pred_feature_table\dy\feature_table\data_chunk_500000.csv
D:\ArcGISProjects\workspace\pred_feature_table\dy\feature_table\data_chunk_1000000.csv
D:\ArcGISProjects\workspace\pred_feature_table\dy\feature_table\data_chunk_1500000.csv
D:\ArcGISProjects\workspace\pred_feature_table\dy\feature_table\data_chunk_2000000.csv
D:\ArcGISProjects\workspace\pred_feature_table\dy\feature_table\data_chunk_2500000.csv
D:\ArcGISProjects\workspace\pred_feature_table\dy\feature_table\data_chunk_3000000.csv
D:\ArcGISProjects\workspace\pred_feature_table\dy\feature_table\data_chunk_3500000.csv
D:\ArcGISProjects\workspace\pred_feature_table\dy\feature_table\data_chunk_4000000.csv
D:\ArcGISProjects\workspace\pred_feature_table\dy\feature_table\data_chunk_4500000.csv
D:\ArcGISProjects\workspace\pred_feature_table\dy\feature_table\data_chunk_5000000.csv
D:\ArcGISProjects\workspace\pred_feature_table\dy\