In [1]:
import pandas as pd
import numpy as np
from autogluon.tabular import TabularPredictor
from pykrige.ok import OrdinaryKriging
from sklearn.metrics import r2_score
import os
from tqdm import tqdm
import pickle

In [2]:
# 特征表格存放文件
csv_path = r"F:\cache_data\pre_soiltype_table\dy\autogluon\feature_csv"

In [3]:
# 获取所有的特征表格列表
def get_all_csv_list(path):
    csv_list = []
    for root, dirs, files in os.walk(path):
        for file in files:
            if file.endswith(".csv"):
                csv_list.append(os.path.join(root, file))
    sorted_files = sorted(csv_list, key=lambda x: int(x.rsplit('_', 1)[-1].split('.')[0]))
    return sorted_files
csv_list = get_all_csv_list(csv_path)
print(len(csv_list),csv_list[0])

344 F:\cache_data\pre_soiltype_table\dy\autogluon\feature_csv\data_chunk_000.csv


In [11]:
# 导入训练好的模型
# predictor = TabularPredictor.load(r"F:\cache_data\model_path\dy\soil_type\cart\cart_tree.pkl")
model_path = r"F:\cache_data\model_path\dy\soil_type\cart\cart_tree.pkl"
# 加载模型
with open(model_path, 'rb') as f:
    predictor = pickle.load(f)

In [4]:
# 导入autogluon模型
predictor = TabularPredictor.load(r"F:\cache_data\model_path\dy\soil_type\autogluon\have_dz_dl")

In [5]:
predictor.model_best,predictor.model_names()

('WeightedEnsemble_L2',
 ['KNeighborsUnif',
  'KNeighborsDist',
  'NeuralNetFastAI',
  'LightGBMXT',
  'LightGBM',
  'RandomForestGini',
  'RandomForestEntr',
  'CatBoost',
  'WeightedEnsemble_L2'])

In [6]:
# 获取特征
feature_names = predictor.feature_metadata_in.get_features()
print(feature_names)

['DEM', 'Mean', 'ndvi', 'PCA_0', 'LON', 'LAT', 'PH', 'DL', 'DZ']


In [7]:
# 存放预测结果的路径
result_path =  r"F:\cache_data\pre_soiltype_table\dy\autogluon\predict_csv"
pred_df = pd.DataFrame()

In [None]:
# 使用cart训练
predictions = []
for idx,one_pred_csv in enumerate(csv_list):
    data_df = pd.read_csv(one_pred_csv)
    data_df = data_df[data_df.columns[1:]]
    temp_pred = predictor.predict(data_df)

    # 将预测结果添加到列表中
    # 如果 temp_pred 不是一个 pandas Series 或 DataFrame，可以先将其转换
    predictions.append(pd.Series(temp_pred, name=f'prediction_{idx}'))
    print(one_pred_csv)

# 一次性将所有预测结果合并为一个 DataFrame
pred_df = pd.concat(predictions, axis=1)

# 保存 pred_df 到新的 CSV 文件
pred_df.to_csv(os.path.join(result_path, 'prediction_.csv'), index=False)

In [9]:
# 使用autogluon训练
predictions = []
for idx,one_pred_csv in enumerate(csv_list):
    # data_df = pd.read_csv(one_pred_csv)
    # data_df = data_df[feature_names]
    # temp_pred = predictor.predict(data_df, model='RandomForestEntr')
    # # 将预测结果添加到列表中
    # # 如果 temp_pred 不是一个 pandas Series 或 DataFrame，可以先将其转换
    # predictions.append(pd.Series(temp_pred, name=f'prediction_{idx}'))
    # print(one_pred_csv)

    try:
        data_df = pd.read_csv(one_pred_csv)
        data_df = data_df[feature_names]
        # 对所有列进行通用的数据清洗
        for col in data_df.columns:
            # 检查列是否为数值类型，如果不是，则尝试清洗和转换
            if not pd.api.types.is_numeric_dtype(data_df[col]):
                # 移除非数字字符
                data_df[col] = data_df[col].replace(to_replace=r'[^\d.]+', value='', regex=True)
                # 尝试转换为浮点数，无法转换的设置为NaN
                # data_df[col] = pd.to_numeric(data_df[col], errors='coerce')

        # 丢弃或填充NaN值，视您的需求而定
        # data_df.dropna(inplace=True)  # 丢弃任何包含NaN的行
        # 或者
        # data_df.fillna(0, inplace=True)  # 用0填充NaN值

        # 对数据进行预测
        temp_pred = predictor.predict(data_df, model='RandomForestEntr')

        # 将预测结果添加到列表中
        predictions.append(pd.Series(temp_pred, name=f'prediction_{idx}'))
        print(one_pred_csv)

    except Exception as e:
        print(f"处理文件 {one_pred_csv} 时发生错误: {e}")

# 一次性将所有预测结果合并为一个 DataFrame
pred_df = pd.concat(predictions, axis=1)

# 保存 pred_df 到新的 CSV 文件
pred_df.to_csv(os.path.join(result_path, 'prediction_ph_RandomForestEntr_20240221.csv'), index=False)

F:\cache_data\pre_soiltype_table\dy\autogluon\feature_csv\data_chunk_000.csv
F:\cache_data\pre_soiltype_table\dy\autogluon\feature_csv\data_chunk_001.csv
F:\cache_data\pre_soiltype_table\dy\autogluon\feature_csv\data_chunk_002.csv
F:\cache_data\pre_soiltype_table\dy\autogluon\feature_csv\data_chunk_003.csv
F:\cache_data\pre_soiltype_table\dy\autogluon\feature_csv\data_chunk_004.csv
F:\cache_data\pre_soiltype_table\dy\autogluon\feature_csv\data_chunk_005.csv
F:\cache_data\pre_soiltype_table\dy\autogluon\feature_csv\data_chunk_006.csv
F:\cache_data\pre_soiltype_table\dy\autogluon\feature_csv\data_chunk_007.csv
F:\cache_data\pre_soiltype_table\dy\autogluon\feature_csv\data_chunk_008.csv
F:\cache_data\pre_soiltype_table\dy\autogluon\feature_csv\data_chunk_009.csv
F:\cache_data\pre_soiltype_table\dy\autogluon\feature_csv\data_chunk_010.csv
F:\cache_data\pre_soiltype_table\dy\autogluon\feature_csv\data_chunk_011.csv
F:\cache_data\pre_soiltype_table\dy\autogluon\feature_csv\data_chunk_012.csv

In [7]:
# 使用autogluon训练
predictions = []
for idx,one_pred_csv in enumerate(csv_list):
    data_df = pd.read_csv(one_pred_csv)
    data_df = data_df[data_df.columns[1:]]
    temp_pred = predictor.predict(data_df, model='RandomForestGini')
    # 将预测结果添加到列表中
    # 如果 temp_pred 不是一个 pandas Series 或 DataFrame，可以先将其转换
    predictions.append(pd.Series(temp_pred, name=f'prediction_{idx}'))
    print(one_pred_csv)

# 一次性将所有预测结果合并为一个 DataFrame
pred_df = pd.concat(predictions, axis=1) 

# 保存 pred_df 到新的 CSV 文件
pred_df.to_csv(os.path.join(result_path, 'prediction_ph_RandomForestGini_dldz.csv'), index=False)

F:\cache_data\pre_soiltype_table\dy\cart\features_csv\data_chunk_000.csv
F:\cache_data\pre_soiltype_table\dy\cart\features_csv\data_chunk_001.csv
F:\cache_data\pre_soiltype_table\dy\cart\features_csv\data_chunk_002.csv
F:\cache_data\pre_soiltype_table\dy\cart\features_csv\data_chunk_003.csv
F:\cache_data\pre_soiltype_table\dy\cart\features_csv\data_chunk_004.csv
F:\cache_data\pre_soiltype_table\dy\cart\features_csv\data_chunk_005.csv
F:\cache_data\pre_soiltype_table\dy\cart\features_csv\data_chunk_006.csv
F:\cache_data\pre_soiltype_table\dy\cart\features_csv\data_chunk_007.csv
F:\cache_data\pre_soiltype_table\dy\cart\features_csv\data_chunk_008.csv
F:\cache_data\pre_soiltype_table\dy\cart\features_csv\data_chunk_009.csv
F:\cache_data\pre_soiltype_table\dy\cart\features_csv\data_chunk_010.csv
F:\cache_data\pre_soiltype_table\dy\cart\features_csv\data_chunk_011.csv
F:\cache_data\pre_soiltype_table\dy\cart\features_csv\data_chunk_012.csv
F:\cache_data\pre_soiltype_table\dy\cart\features_c