In [7]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
from tqdm import tqdm
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from pykrige.rk import RegressionKriging
from pykrige.ok import OrdinaryKriging
from typing import Tuple, Dict, List

In [8]:
# Directory where prediction results are stored.
# NOTE(review): all three paths below are absolute, machine-specific Windows paths.
pre_save_path = r'F:\cache_data\pre_property_table\dy\rfrk_pre'
# Directory where the feature-table CSV files are stored.
features_path = r'F:\cache_data\pre_property_table\dy\features_table'
# Path of the CSV file holding the data used for training.
data_path = r"F:\cache_data\pre_property_table\dy\feature_ph_dy.csv"
# Read the data into a DataFrame.
data = pd.read_csv(data_path)

In [9]:
# 获取所有的特征表格列表
def get_all_csv_list(path):
    """
    Recursively collect every ``.csv`` file under ``path``, ordered by chunk index.

    The chunk index is the integer found in the file NAME after its last
    underscore and before the first following dot, e.g. ``data_chunk_012.csv``
    -> 12. A name without an underscore is parsed from its start, e.g.
    ``7.csv`` -> 7.

    :param path: root directory to walk
    :return: list of full CSV paths sorted by ascending chunk index
    :raises ValueError: if a CSV file name carries no integer chunk index
    """
    def chunk_index(csv_path):
        # Parse only the file name: underscores or dots in parent directory
        # names must not influence (or break) the sort key.
        file_name = os.path.basename(csv_path)
        index_text = file_name.rsplit('_', 1)[-1].split('.')[0]
        try:
            return int(index_text)
        except ValueError as exc:
            raise ValueError(
                f"Cannot extract a numeric chunk index from CSV file name: {csv_path!r}"
            ) from exc

    csv_paths = [
        os.path.join(root, file_name)
        for root, _dirs, files in os.walk(path)
        for file_name in files
        if file_name.endswith(".csv")
    ]
    return sorted(csv_paths, key=chunk_index)
# Collect the feature-table CSV paths under features_path, then print how many
# were found and the first one as a quick sanity check.
# Note: csv_list[0] raises IndexError when no CSV file was found.
csv_list = get_all_csv_list(features_path)
print(len(csv_list),csv_list[0])

344 F:\cache_data\pre_property_table\dy\features_table\data_chunk_000.csv


In [10]:
def compare_models_and_train_kriging(
    X: pd.DataFrame,
    y: pd.Series,
    rf_model: RandomForestRegressor,
    coord_cols: List[str],
    test_size: float = 0.3,
    random_state: int = 42
) -> Tuple[Dict[str, float], RandomForestRegressor]:
    """
    Compare a random forest (RF) with RF + ordinary kriging of its residuals (RFRK).

    The data is split into train and test parts. The RF residuals on the train
    part are interpolated with ordinary kriging (spherical variogram), and the
    R2 of plain RF and of RF + kriged residuals is computed on the test part.
    When RFRK scores strictly higher, a ``predict_with_kriging(X_new)`` callable
    is attached to ``rf_model``.

    :param X: feature frame; must contain the columns named in ``coord_cols``
    :param y: target values aligned with ``X``
    :param rf_model: already fitted random forest model; it is not re-fitted here
    :param coord_cols: names of the two coordinate columns in ``X``; the first
        is passed to kriging as x, the second as y (entries beyond the second
        are ignored)
    :param test_size: fraction of rows placed in the test split
    :param random_state: seed for the train/test split
    :return: ``(results, rf_model)`` where ``results`` holds ``"RF_R2"`` and
        ``"RFRK_R2"``, plus ``"Updated_RF_Kriging_R2"`` when RFRK is better
    :raises IndexError: if ``coord_cols`` has fewer than two entries
    """
    # Resolve the coordinate columns first so a malformed ``coord_cols`` fails
    # before any prediction or kriging work is done.
    x_col, y_col = coord_cols[0], coord_cols[1]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # NOTE(review): rf_model is used as-is. If it was fitted on rows that land
    # in this test split, both R2 scores are optimistic - confirm the split
    # matches the one used for training.
    rf_test_pred = rf_model.predict(X_test)
    rf_r2 = r2_score(y_test, rf_test_pred)

    # Residuals of the RF on the training part are the signal to be kriged.
    residuals_train = y_train - rf_model.predict(X_train)

    # NOTE(review): OrdinaryKriging defaults to Euclidean coordinates. If the
    # coordinate columns hold lon/lat degrees, coordinates_type='geographic'
    # may be more appropriate - confirm before changing, it alters results.
    OK = OrdinaryKriging(
        X_train[x_col],
        X_train[y_col],
        residuals_train,
        variogram_model='spherical'
    )
    kriged_residuals_test, _ = OK.execute('points', X_test[x_col], X_test[y_col])

    # Reuse the RF test predictions computed above instead of predicting again.
    rf_kriging_r2 = r2_score(y_test, rf_test_pred + kriged_residuals_test)

    print(f"RF R2 score: {rf_r2}")
    print(f"RF+Kriging R2 score: {rf_kriging_r2}")

    results = {"RF_R2": rf_r2, "RFRK_R2": rf_kriging_r2}

    if rf_kriging_r2 > rf_r2:
        print("RFRK性能更好，正在更新模型...")

        def updated_predict(X_new):
            """Return the RF prediction plus the kriged residual for each row of X_new."""
            rf_pred = rf_model.predict(X_new)
            kriging_pred, _ = OK.execute('points', X_new[x_col], X_new[y_col])
            return rf_pred + kriging_pred

        # NOTE: this attaches a locally defined closure to the estimator. Local
        # functions cannot be pickled, so the model can presumably no longer be
        # saved with pickle while this attribute is present.
        rf_model.predict_with_kriging = updated_predict

        # updated_predict is the same predictor that was just scored on the
        # same test rows, so its R2 is the RFRK score; no need to run the RF
        # and the kriging a second time.
        updated_r2 = rf_kriging_r2
        print(f"Updated RF+Kriging R2 score: {updated_r2}")
        results["Updated_RF_Kriging_R2"] = updated_r2

    return results, rf_model



In [11]:
# Mean-impute missing values in every numeric column of `data`.
# NOTE(review): the numeric selection presumably also covers the target columns
# used later as labels, so missing targets get mean-filled too - confirm this
# is intended.
numeric_cols = data.select_dtypes(include=[np.number])
means = numeric_cols.mean()
data[numeric_cols.columns] = data[numeric_cols.columns].fillna(means)

# Cast these columns to pandas' categorical dtype.
for categorical_col in ('DL', 'DZ', 'SlopeClass'):
    data[categorical_col] = data[categorical_col].astype("category")

In [13]:
# For every target column listed below: load its saved RF model and compare
# plain RF against RF + residual kriging on `data`.
for one_lable in ['ph', 'ylzjhl', 'yjz', 'qdan', 'qlin', 'qjia', 'qxi', 'yxlin', 'sxjia',
       'hxjia', 'yxliu', 'yxgui', 'yxtie', 'yxmeng', 'yxtong', 'yxxing',
       'yxpeng', 'yxmu', 'zgong', 'zshen', 'zqian', 'zge', 'zge2', 'znie',
       'jxzc11', 'jxzc12', 'jxzc13', 'jxzc14',]:
# Alternative loop header restricted to a subset of the targets:
# for one_lable in ['zqian', 'zge', 'zge2', 'znie',
#        'jxzc11', 'jxzc12', 'jxzc13', 'jxzc14',]:
    print(one_lable)    
    label = one_lable
    # Each target has its own folder containing "<label>_rf_model.pkl".
    model_path = os.path.join(r"F:\cache_data\model_path\dy\rfrk",label)
    model_abs_path = os.path.join(model_path,f"{label}_rf_model.pkl")

    # Load the model.
    # NOTE: pickle.load can execute arbitrary code; only load model files from
    # a trusted source.
    with open(model_abs_path, 'rb') as file:
        rf_model = pickle.load(file)

    # Feature columns recorded on the loaded model (feature_names_in_).
    model_features = rf_model.feature_names_in_
    X = data[model_features]
    y = data[label]
    # Coordinate column names handed to the kriging step. They are looked up
    # in X, so they must be among the model's feature columns.
    coordinates = ['LON', 'LAT']
    # Run the comparison. `results` is rebound on every iteration and only
    # printed, so after the loop it holds the last target's scores only.
    results, updated_model = compare_models_and_train_kriging(X, y, rf_model, coordinates)
    print(results)
    # Prediction over all feature-table chunks (currently disabled):
    # predictions_list = []
    # for idx,one_csv in tqdm(enumerate(csv_list)):
    #     temp_data = pd.read_csv(one_csv)
    #     temp_X = temp_data[model_features]
    #     # Replace +/-inf with NaN and cast to float32
    #     temp_X = temp_X.replace([np.inf, -np.inf], np.nan).astype(np.float32)
    #     temp_pred = updated_model.predict_with_kriging(temp_X) if hasattr(updated_model, 'predict_with_kriging') else updated_model.predict(temp_X)
    #     predictions_list.append(pd.Series(temp_pred, name=f'prediction_{idx}'))
    # pred_df = pd.concat(predictions_list, axis=1)
    # # Save the prediction results
    # pred_df.to_csv(os.path.join(pre_save_path, f"{label}_pred.csv"), index=False)

ph
RF R2 score: 0.33581385528492314
RF+Kriging R2 score: 0.3329569541060371
{'RF_R2': 0.33581385528492314, 'RFRK_R2': 0.3329569541060371}
ylzjhl
RF R2 score: 0.33691983718536045
RF+Kriging R2 score: 0.3232962687706491
{'RF_R2': 0.33691983718536045, 'RFRK_R2': 0.3232962687706491}
yjz
RF R2 score: 0.39130073518521036
RF+Kriging R2 score: 0.3917569718088443
RFRK性能更好，正在更新模型...
Updated RF+Kriging R2 score: 0.3917569718088443
{'RF_R2': 0.39130073518521036, 'RFRK_R2': 0.3917569718088443, 'Updated_RF_Kriging_R2': 0.3917569718088443}
qdan
RF R2 score: 0.37669903998801724
RF+Kriging R2 score: 0.3727358811810366
{'RF_R2': 0.37669903998801724, 'RFRK_R2': 0.3727358811810366}
qlin
RF R2 score: 0.39615012848067266
RF+Kriging R2 score: 0.3540743182043775
{'RF_R2': 0.39615012848067266, 'RFRK_R2': 0.3540743182043775}
qjia
RF R2 score: 0.5138803742731934
RF+Kriging R2 score: 0.5413710890382638
RFRK性能更好，正在更新模型...
Updated RF+Kriging R2 score: 0.5413710890382638
{'RF_R2': 0.5138803742731934, 'RFRK_R2': 0.54