In [2]:
import os
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt
from pykrige.ok import OrdinaryKriging
from pykrige.rk import RegressionKriging

# RF


In [3]:

def rf_classics(df, label_col, feature_cols, coord_cols, param_grid, save_model_path):
    """Tune, evaluate and save a random forest, then krige its training residuals.

    Workflow:
      1. Split ``df[feature_cols + coord_cols]`` / ``df[label_col]`` 80/20
         (``random_state=42``).
      2. Run a 100-iteration ``RandomizedSearchCV`` over ``param_grid``.
      3. Run a ``GridSearchCV`` over a small neighbourhood of the random-search
         optimum.
      4. Write the feature importances (CSV and PNG), an evaluation-score
         figure and the pickled best estimator into the directory of
         ``save_model_path``.
      5. Fit ordinary kriging (spherical variogram) on the training residuals
         and add the kriged residuals to the forest's test predictions.

    The coordinate columns are used both as forest predictors and as kriging
    locations.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the label, feature and coordinate columns.
    label_col : str
        Name of the target column.
    feature_cols : list of str
        Predictor column names (a list, because it is concatenated with
        ``coord_cols``).
    coord_cols : list of str
        Coordinate column names; the first two entries are handed to the
        kriging model as x and y.
    param_grid : dict
        Search space passed to ``RandomizedSearchCV``.  It must contain the keys
        ``'n_estimators'``, ``'max_depth'``, ``'min_samples_split'`` and
        ``'min_samples_leaf'`` because the fine grid is derived from them
        (``KeyError`` otherwise).  ``'max_depth'`` may include ``None``.
    save_model_path : str
        Destination of the pickled model; the other artefacts are written next
        to it.

    Returns
    -------
    dict
        ``"SelectedFeatures"`` (``feature_cols`` as given), ``"FeatureImportance"``
        (DataFrame sorted by importance) and two metric dictionaries,
        ``"RandomForest"`` and ``"RegressionKriging"``, each with ``R2``,
        ``MAE``, ``MSE`` and ``RMSE`` measured on the test split.

    Notes
    -----
    The two matplotlib figures are saved but left open so that a notebook still
    renders them; callers running this in a long loop may want to call
    ``plt.close('all')`` afterwards.
    """

    def _around(best, delta, floor, ceiling=None):
        # Candidates best-delta, best, best+delta with the lower one clamped to
        # `floor` and the upper one to `ceiling`.  Clamping can make candidates
        # coincide; dict.fromkeys drops the duplicates (keeping order) so the
        # same configuration is not cross-validated several times.
        low = max(floor, best - delta)
        high = best + delta if ceiling is None else min(ceiling, best + delta)
        return list(dict.fromkeys([low, best, high]))

    def _annotated_bars(ax, labels, values, title):
        # One bar per metric, each labelled with its value to two decimals.
        bars = ax.bar(labels, values)
        ax.set_ylabel('Scores')
        ax.set_title(title)
        drop = 0.05 * (max(values) - min(values))
        for bar, value in zip(bars, values):
            ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() - drop,
                    f'{value:.2f}', ha='center', va='bottom')

    out_dir = os.path.dirname(save_model_path)

    # Separate predictors and target, then hold out 20 % for testing.
    X = df[feature_cols + coord_cols]
    y = df[label_col]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Coarse hyper-parameter search.  The forest is seeded so that repeated runs
    # give the same model (the split and the search are already seeded).
    rf = RandomForestRegressor(random_state=42)
    n_iter_search = 100
    random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, n_iter=n_iter_search,
                                       cv=5, random_state=42, n_jobs=-1, verbose=2)
    random_search.fit(X_train, y_train)
    best_random_params = random_search.best_params_
    print('Best Random Parameters: \n', best_random_params)

    # Score the random-search winner (these are R^2 values from .score()).
    best_randomrf = random_search.best_estimator_
    testrandom_score = best_randomrf.score(X_test, y_test)
    trainrandom_score = best_randomrf.score(X_train, y_train)
    print('RandomSearch Test accuracy:', testrandom_score,'RandomSearch Train accuracy:',trainrandom_score)
    print(f"Random search best params: {best_random_params}")

    # Fine grid centred on the random-search optimum.  max_depth=None means
    # "unlimited depth" and has no numeric neighbourhood, so it is kept as is
    # instead of being used in arithmetic (which would raise TypeError).
    best_depth = best_random_params['max_depth']
    param_grid_fine = {
        'n_estimators': _around(best_random_params['n_estimators'], 50, 10, 1000),
        'max_depth': [None] if best_depth is None else _around(best_depth, 5, 1),
        'min_samples_split': _around(best_random_params['min_samples_split'], 2, 2),
        'min_samples_leaf': _around(best_random_params['min_samples_leaf'], 1, 1),
    }

    # Fine hyper-parameter search.
    grid_search = GridSearchCV(estimator=best_randomrf, param_grid=param_grid_fine, cv=5, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)
    best_gr_params = grid_search.best_params_
    best_gr_rf = grid_search.best_estimator_
    print('Best Grid Parameters: \n', best_gr_params)

    test_gr_score = best_gr_rf.score(X_test, y_test)
    train_gr_score = best_gr_rf.score(X_train, y_train)
    print('GridSearch Test accuracy:', test_gr_score,'GridSearch Train accuracy:',train_gr_score)

    # Feature importances, most important first.
    importance_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': best_gr_rf.feature_importances_
    }).sort_values(by='Importance', ascending=False)
    importance_df.to_csv(os.path.join(out_dir, 'feature_importance.csv'), index=False)

    fig_imp, ax_imp = plt.subplots(figsize=(10, 6))
    ax_imp.barh(importance_df['Feature'], importance_df['Importance'])
    ax_imp.set_xlabel('Importance')
    ax_imp.set_ylabel('Feature')
    ax_imp.set_title('Feature Importances')
    ax_imp.invert_yaxis()  # largest importance at the top
    fig_imp.savefig(os.path.join(out_dir, 'feature_importance_importance.png'))

    # Predictions and plain random-forest metrics on the test split.
    y_train_pred = best_gr_rf.predict(X_train)
    y_test_pred = best_gr_rf.predict(X_test)

    r2_rf = r2_score(y_test, y_test_pred)
    mae_rf = mean_absolute_error(y_test, y_test_pred)
    mse_rf = mean_squared_error(y_test, y_test_pred)
    rmse_rf = np.sqrt(mse_rf)

    print(f"Random Forest R2: {r2_rf}")
    print(f"Random Forest MAE: {mae_rf}")
    print(f"Random Forest MSE: {mse_rf}")
    print(f"Random Forest RMSE: {rmse_rf}")

    # Krige the training residuals and predict them at the test locations.
    residuals_train = y_train - y_train_pred
    OK = OrdinaryKriging(X_train[coord_cols[0]], X_train[coord_cols[1]], residuals_train, variogram_model='spherical')
    kriging_predictions_test, _ = OK.execute('points', X_test[coord_cols[0]], X_test[coord_cols[1]])

    # Final prediction = forest prediction + kriged residual.
    predictions_test = y_test_pred + kriging_predictions_test

    r2_rk = r2_score(y_test, predictions_test)
    mae_rk = mean_absolute_error(y_test, predictions_test)
    mse_rk = mean_squared_error(y_test, predictions_test)
    rmse_rk = np.sqrt(mse_rk)

    print(f"Regression Kriging R2: {r2_rk}")
    print(f"Regression Kriging MAE: {mae_rk}")
    print(f"Regression Kriging MSE: {mse_rk}")
    print(f"Regression Kriging RMSE: {rmse_rk}")

    # Side-by-side score panels: forest on top, forest + kriging below.
    metrics = ['R2', 'MAE', 'MSE', 'RMSE']
    values_rf = [r2_rf, mae_rf, mse_rf, rmse_rf]
    values_rk = [r2_rk, mae_rk, mse_rk, rmse_rk]

    fig_scores, (ax_rf, ax_rk) = plt.subplots(2, 1, figsize=(10, 6))
    _annotated_bars(ax_rf, metrics, values_rf, 'Random Forest Evaluation Scores')
    _annotated_bars(ax_rk, metrics, values_rk, 'Regression Kriging Evaluation Scores')
    ax_rk.set_xlabel('Metrics')
    fig_scores.tight_layout()
    fig_scores.savefig(os.path.join(out_dir, 'evaluation_scores.png'))

    # Persist the tuned forest.
    with open(save_model_path, 'wb') as f:
        pickle.dump(best_gr_rf, f)

    return {
        "SelectedFeatures": feature_cols,
        "FeatureImportance": importance_df,
        "RandomForest": {
            "R2": r2_rf,
            "MAE": mae_rf,
            "MSE": mse_rf,
            "RMSE": rmse_rf
        },
        "RegressionKriging": {
            "R2": r2_rk,
            "MAE": mae_rk,
            "MSE": mse_rk,
            "RMSE": rmse_rk
        }
    }

# RFRK

In [4]:

def regression_prediction_cc(df, label_col, feature_cols, coord_cols, param_grid, save_model_path):
    """Random forest with recursive feature elimination plus kriged residuals.

    Workflow:
      1. Split ``df[feature_cols + coord_cols]`` / ``df[label_col]`` 80/20
         (``random_state=42``).
      2. Select features with ``RFECV`` (seeded forest, ``step=1``, ``cv=5``)
         using ``feature_cols`` only; the coordinates are not forest inputs.
      3. Tune the forest with a 50-iteration random search followed by a grid
         search around the random-search optimum.
      4. Write the feature importances (CSV and PNG), an evaluation-score
         figure and the pickled best estimator into the directory of
         ``save_model_path``.
      5. Fit ordinary kriging (spherical variogram) on the training residuals
         and add the kriged residuals to the forest's test predictions.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the label, feature and coordinate columns.
    label_col : str
        Name of the target column.
    feature_cols : list of str
        Candidate predictor column names.
    coord_cols : list of str
        Coordinate column names; the first two entries are handed to the
        kriging model as x and y.
    param_grid : dict
        Search space passed to ``RandomizedSearchCV``.  It must contain the keys
        ``'n_estimators'``, ``'max_depth'``, ``'min_samples_split'`` and
        ``'min_samples_leaf'`` (``KeyError`` otherwise).  ``'max_depth'`` may
        include ``None``.
    save_model_path : str
        Destination of the pickled model; the other artefacts are written next
        to it.

    Returns
    -------
    dict
        ``"SelectedFeatures"`` (array of the names kept by RFECV),
        ``"FeatureImportance"`` (DataFrame sorted by importance) and two metric
        dictionaries, ``"RandomForest"`` and ``"RegressionKriging"``, each with
        ``R2``, ``MAE``, ``MSE`` and ``RMSE`` measured on the test split.

    Notes
    -----
    The two matplotlib figures are saved but left open so that a notebook still
    renders them.
    """

    def _around(best, delta, floor, ceiling=None):
        # Candidates best-delta, best, best+delta with the lower one clamped to
        # `floor` and the upper one to `ceiling`; duplicates produced by the
        # clamping are dropped so no configuration is fitted more than once.
        low = max(floor, best - delta)
        high = best + delta if ceiling is None else min(ceiling, best + delta)
        return list(dict.fromkeys([low, best, high]))

    def _annotated_bars(ax, labels, values, title):
        # One bar per metric, each labelled with its value to two decimals.
        bars = ax.bar(labels, values)
        ax.set_ylabel('Scores')
        ax.set_title(title)
        drop = 0.05 * (max(values) - min(values))
        for bar, value in zip(bars, values):
            ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() - drop,
                    f'{value:.2f}', ha='center', va='bottom')

    out_dir = os.path.dirname(save_model_path)

    # Separate predictors and target, then hold out 20 % for testing.
    X = df[feature_cols + coord_cols]
    y = df[label_col]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    rf = RandomForestRegressor(random_state=42)

    # Recursive feature elimination with cross-validation on the feature
    # columns only.
    selector = RFECV(rf, step=1, cv=5)
    selector = selector.fit(X_train[feature_cols], y_train)
    X_train_selected = selector.transform(X_train[feature_cols])
    X_test_selected = selector.transform(X_test[feature_cols])

    selected_features = np.array(feature_cols)[selector.support_]
    print(f"Selected features: {selected_features}")

    # Coarse hyper-parameter search.
    n_iter_search = 50
    random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, n_iter=n_iter_search,
                                       cv=5, random_state=42, n_jobs=-1, verbose=2)
    random_search.fit(X_train_selected, y_train)

    best_params = random_search.best_params_
    print(f"Random search best params: {best_params}")

    # Fine grid centred on the random-search optimum.  max_depth=None means
    # "unlimited depth" and has no numeric neighbourhood, so it is kept as is
    # instead of being used in arithmetic (which would raise TypeError).
    best_depth = best_params['max_depth']
    param_grid_fine = {
        'n_estimators': _around(best_params['n_estimators'], 50, 10, 1000),
        'max_depth': [None] if best_depth is None else _around(best_depth, 5, 1),
        'min_samples_split': _around(best_params['min_samples_split'], 2, 2),
        'min_samples_leaf': _around(best_params['min_samples_leaf'], 1, 1),
    }

    # Fine hyper-parameter search.
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid_fine, cv=5, n_jobs=-1, verbose=2)
    grid_search.fit(X_train_selected, y_train)
    best_rf = grid_search.best_estimator_

    # Feature importances of the selected features, most important first.
    importance_df = pd.DataFrame({
        'Feature': selected_features,
        'Importance': best_rf.feature_importances_
    }).sort_values(by='Importance', ascending=False)
    importance_df.to_csv(os.path.join(out_dir, 'feature_importance.csv'), index=False)

    fig_imp, ax_imp = plt.subplots(figsize=(10, 6))
    ax_imp.barh(importance_df['Feature'], importance_df['Importance'])
    ax_imp.set_xlabel('Importance')
    ax_imp.set_ylabel('Feature')
    ax_imp.set_title('Feature Importances')
    ax_imp.invert_yaxis()  # largest importance at the top
    fig_imp.savefig(os.path.join(out_dir, 'feature_importance_importance.png'))

    # Predictions and plain random-forest metrics on the test split.
    y_train_pred = best_rf.predict(X_train_selected)
    y_test_pred = best_rf.predict(X_test_selected)

    r2_rf = r2_score(y_test, y_test_pred)
    mae_rf = mean_absolute_error(y_test, y_test_pred)
    mse_rf = mean_squared_error(y_test, y_test_pred)
    rmse_rf = np.sqrt(mse_rf)

    print(f"Random Forest R2: {r2_rf}")
    print(f"Random Forest MAE: {mae_rf}")
    print(f"Random Forest MSE: {mse_rf}")
    print(f"Random Forest RMSE: {rmse_rf}")

    # Krige the training residuals and predict them at the test locations.
    residuals_train = y_train - y_train_pred
    OK = OrdinaryKriging(X_train[coord_cols[0]], X_train[coord_cols[1]], residuals_train, variogram_model='spherical')
    kriging_predictions_test, _ = OK.execute('points', X_test[coord_cols[0]], X_test[coord_cols[1]])

    # Final prediction = forest prediction + kriged residual.
    predictions_test = y_test_pred + kriging_predictions_test

    r2_rk = r2_score(y_test, predictions_test)
    mae_rk = mean_absolute_error(y_test, predictions_test)
    mse_rk = mean_squared_error(y_test, predictions_test)
    rmse_rk = np.sqrt(mse_rk)

    print(f"Regression Kriging R2: {r2_rk}")
    print(f"Regression Kriging MAE: {mae_rk}")
    print(f"Regression Kriging MSE: {mse_rk}")
    print(f"Regression Kriging RMSE: {rmse_rk}")

    # Side-by-side score panels: forest on top, forest + kriging below.
    metrics = ['R2', 'MAE', 'MSE', 'RMSE']
    values_rf = [r2_rf, mae_rf, mse_rf, rmse_rf]
    values_rk = [r2_rk, mae_rk, mse_rk, rmse_rk]

    fig_scores, (ax_rf, ax_rk) = plt.subplots(2, 1, figsize=(10, 6))
    _annotated_bars(ax_rf, metrics, values_rf, 'Random Forest Evaluation Scores')
    _annotated_bars(ax_rk, metrics, values_rk, 'Regression Kriging Evaluation Scores')
    ax_rk.set_xlabel('Metrics')
    fig_scores.tight_layout()
    fig_scores.savefig(os.path.join(out_dir, 'evaluation_scores.png'))

    # Persist the tuned forest.
    with open(save_model_path, 'wb') as f:
        pickle.dump(best_rf, f)

    return {
        "SelectedFeatures": selected_features,
        "FeatureImportance": importance_df,
        "RandomForest": {
            "R2": r2_rf,
            "MAE": mae_rf,
            "MSE": mse_rf,
            "RMSE": rmse_rf
        },
        "RegressionKriging": {
            "R2": r2_rk,
            "MAE": mae_rk,
            "MSE": mse_rk,
            "RMSE": rmse_rk
        }
    }





def regression_prediction(df, label_col, feature_cols, coord_cols, param_grid, save_model_path):
    """Random forest with recursive feature elimination, wrapped in PyKrige's
    ``RegressionKriging``.

    Workflow:
      1. Split ``df[feature_cols + coord_cols]`` / ``df[label_col]`` 80/20
         (``random_state=42``).
      2. Select features with ``RFECV`` (seeded forest, ``step=1``, ``cv=5``)
         using ``feature_cols`` only.
      3. Tune the forest with a 50-iteration random search followed by a grid
         search over a narrow window around the random-search optimum.
      4. Write the feature importances (CSV and PNG), an evaluation-score
         figure and the pickled forest into the directory of
         ``save_model_path``.
      5. Fit ``RegressionKriging(regression_model=<tuned forest>,
         n_closest_points=36)`` on the selected features and the coordinate
         columns, and score it on the test split.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the label, feature and coordinate columns.
    label_col : str
        Name of the target column.
    feature_cols : list of str
        Candidate predictor column names.
    coord_cols : list of str
        Coordinate column names passed to the kriging model.
    param_grid : dict
        Search space passed to ``RandomizedSearchCV``.  It must contain the keys
        ``'n_estimators'``, ``'max_depth'``, ``'min_samples_split'`` and
        ``'min_samples_leaf'`` (``KeyError`` otherwise).  The numeric values are
        expected to be integers because the fine grid is built with ``range``;
        ``'max_depth'`` may include ``None``.
    save_model_path : str
        Destination of the pickled forest; the other artefacts are written next
        to it.

    Returns
    -------
    dict
        ``"SelectedFeatures"`` (array of the names kept by RFECV),
        ``"FeatureImportance"`` (DataFrame sorted by importance) and two metric
        dictionaries, ``"RandomForest"`` and ``"RegressionKriging"``, each with
        ``R2``, ``MAE``, ``MSE`` and ``RMSE`` measured on the test split.

    Notes
    -----
    The two matplotlib figures are saved but left open so that a notebook still
    renders them.
    """

    def _window(best, below, above, floor, step=1):
        # Integer candidates in [best - below, best + above) that are valid for
        # the estimator (>= floor).  Values below the floor (e.g.
        # min_samples_leaf = 0 or -1) would only produce failed fits.  The
        # optimum itself is always included, even when `step` would skip it.
        values = {v for v in range(best - below, best + above, step) if v >= floor}
        values.add(best)
        return sorted(values)

    def _annotated_bars(ax, labels, values, title):
        # One bar per metric, each labelled with its value to two decimals.
        bars = ax.bar(labels, values)
        ax.set_ylabel('Scores')
        ax.set_title(title)
        drop = 0.05 * (max(values) - min(values))
        for bar, value in zip(bars, values):
            ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() - drop,
                    f'{value:.2f}', ha='center', va='bottom')

    out_dir = os.path.dirname(save_model_path)

    # Separate predictors and target, then hold out 20 % for testing.
    X = df[feature_cols + coord_cols]
    y = df[label_col]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    rf = RandomForestRegressor(random_state=42)

    # Recursive feature elimination with cross-validation on the feature
    # columns only.
    selector = RFECV(rf, step=1, cv=5)
    selector = selector.fit(X_train[feature_cols], y_train)
    X_train_selected = selector.transform(X_train[feature_cols])
    X_test_selected = selector.transform(X_test[feature_cols])

    selected_features = np.array(feature_cols)[selector.support_]
    print(f"Selected features: {selected_features}")

    # Coarse hyper-parameter search.
    n_iter_search = 50
    random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, n_iter=n_iter_search,
                                       cv=5, random_state=42, n_jobs=-1, verbose=2)
    random_search.fit(X_train_selected, y_train)

    best_params = random_search.best_params_
    print(f"Random search best params: {best_params}")

    # Narrow grid around the random-search optimum.  max_depth=None means
    # "unlimited depth" and has no numeric neighbourhood, so it is kept as is
    # instead of being fed to range() (which would raise TypeError).
    best_depth = best_params['max_depth']
    param_grid_fine = {
        'n_estimators': _window(best_params['n_estimators'], 5, 5, 1, step=2),
        'max_depth': [None] if best_depth is None else _window(best_depth, 2, 2, 1),
        'min_samples_split': _window(best_params['min_samples_split'], 2, 2, 2),
        'min_samples_leaf': _window(best_params['min_samples_leaf'], 2, 2, 1),
    }

    # Fine hyper-parameter search.
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid_fine, cv=5, n_jobs=-1, verbose=2)
    grid_search.fit(X_train_selected, y_train)
    best_rf = grid_search.best_estimator_

    # Feature importances of the selected features, most important first.
    importance_df = pd.DataFrame({
        'Feature': selected_features,
        'Importance': best_rf.feature_importances_
    }).sort_values(by='Importance', ascending=False)
    importance_df.to_csv(os.path.join(out_dir, 'feature_importance.csv'), index=False)

    fig_imp, ax_imp = plt.subplots(figsize=(10, 6))
    ax_imp.barh(importance_df['Feature'], importance_df['Importance'])
    ax_imp.set_xlabel('Importance')
    ax_imp.set_ylabel('Feature')
    ax_imp.set_title('Feature Importances')
    ax_imp.invert_yaxis()  # largest importance at the top
    fig_imp.savefig(os.path.join(out_dir, 'feature_importance_importance.png'))

    # Plain random-forest metrics on the test split.
    y_test_pred = best_rf.predict(X_test_selected)

    r2_rf = r2_score(y_test, y_test_pred)
    mae_rf = mean_absolute_error(y_test, y_test_pred)
    mse_rf = mean_squared_error(y_test, y_test_pred)
    rmse_rf = np.sqrt(mse_rf)

    print(f"Random Forest R2: {r2_rf}")
    print(f"Random Forest MAE: {mae_rf}")
    print(f"Random Forest MSE: {mse_rf}")
    print(f"Random Forest RMSE: {rmse_rf}")

    # Regression kriging: selected features as predictors, coordinate columns
    # as locations.
    # NOTE(review): the already-fitted forest is handed to RegressionKriging;
    # PyKrige presumably refits it on the same training data - confirm.
    rk = RegressionKriging(regression_model=best_rf,n_closest_points=36)
    rk.fit(X_train_selected, X_train[coord_cols].values, y_train)
    y_pred_rk = rk.predict(X_test_selected, X_test[coord_cols].values)

    r2_rk = r2_score(y_test, y_pred_rk)
    mae_rk = mean_absolute_error(y_test, y_pred_rk)
    mse_rk = mean_squared_error(y_test, y_pred_rk)
    rmse_rk = np.sqrt(mse_rk)

    print(f"Regression Kriging R2: {r2_rk}")
    print(f"Regression Kriging MAE: {mae_rk}")
    print(f"Regression Kriging MSE: {mse_rk}")
    print(f"Regression Kriging RMSE: {rmse_rk}")

    # Side-by-side score panels: forest on top, regression kriging below.
    metrics = ['R2', 'MAE', 'MSE', 'RMSE']
    values_rf = [r2_rf, mae_rf, mse_rf, rmse_rf]
    values_rk = [r2_rk, mae_rk, mse_rk, rmse_rk]

    fig_scores, (ax_rf, ax_rk) = plt.subplots(2, 1, figsize=(10, 6))
    _annotated_bars(ax_rf, metrics, values_rf, 'Random Forest Evaluation Scores')
    _annotated_bars(ax_rk, metrics, values_rk, 'Regression Kriging Evaluation Scores')
    ax_rk.set_xlabel('Metrics')
    fig_scores.tight_layout()
    fig_scores.savefig(os.path.join(out_dir, 'evaluation_scores.png'))

    # Persist the tuned forest.
    with open(save_model_path, 'wb') as f:
        pickle.dump(best_rf, f)

    return {
        "SelectedFeatures": selected_features,
        "FeatureImportance": importance_df,
        "RandomForest": {
            "R2": r2_rf,
            "MAE": mae_rf,
            "MSE": mse_rf,
            "RMSE": rmse_rf
        },
        "RegressionKriging": {
            "R2": r2_rk,
            "MAE": mae_rk,
            "MSE": mse_rk,
            "RMSE": rmse_rk
        }
    }




In [5]:
# Load the feature table from CSV and report how many rows it has.
data = pd.read_csv(r"F:\cache_data\pre_property_table\dy\feature_ph_dy.csv")
print(len(data))
# Dropping rows with missing values is disabled; missing numeric values are
# mean-imputed below instead.
# data.dropna(inplace=True)
# len(data),data.columns

# Select the numeric columns (numeric_cols is a DataFrame, not a list of names)
# and compute the mean of each one.
numeric_cols = data.select_dtypes(include=[np.number])
means = numeric_cols.mean()
# Fill the missing values of every numeric column with that column's mean.
data[numeric_cols.columns] = data[numeric_cols.columns].fillna(means)

1159


In [6]:
# Convert the three class-coded columns to pandas' categorical dtype, in place.
for _categorical_name in ('DL', 'DZ', 'SlopeClass'):
    data[_categorical_name] = data[_categorical_name].astype("category")

# `df` is the name the training cells use for the user-supplied DataFrame;
# it is an alias of `data`, not a copy.
df = data


In [7]:

# Names of the two coordinate columns.
coord_cols = ["LON", "LAT"]

# Label column and feature columns chosen by the user.
# label_col = "ph"
feature_cols = [
    'DEM', 'AnalyticalHillshading', 'Aspect', 'ChannelNetworkBaseLevel',
    'ChannelNetworkDistance', 'ClosedDepressions', 'ConvergenceIndex', 'LSFactor',
    'MRRTF', 'MRVBF', 'PlanCurvature', 'ProfileCurvature',
    'RelativeSlopePosition', 'Slope', 'TopographicWetnessIndex', 'TotalCatchmentArea',
    'ValleyDepth', 'NIGHT2022', 'ETP2022_mean', 'TMP2022_mean',
    'PRE2022_mean', 'PRE2022_3', 'PRE2022_11', 'ETP2022_3',
    'ETP2022_11', 'TMP2022_3', 'TMP2022_11', 'evi',
    'lswi', 'mndwi', 'ndmi', 'ndvi',
    'ndwi', 'PCA_0', 'PCA_1', 'savi',
    'vari', 'DL', 'DZ', 'SlopeClass',
]

# Hyper-parameter search space supplied by the user.
# max_depth starts with None, followed by the integers 1..99.
param_grid = dict(
    n_estimators=np.arange(10, 1000, 10),
    max_depth=[None, *np.arange(1, 100)],
    min_samples_split=np.arange(2, 100),
    min_samples_leaf=np.arange(1, 100),
)

# Model output path chosen by the user (disabled; the training cell builds one
# path per label instead).
# save_model_path = r"C:\Users\Runker\Desktop\testrf\best_model.pkl"

In [None]:
# Train and export one model per label column.
label_cols_list = ['zge', 'zge2', 'znie',
       'jxzc11', 'jxzc12', 'jxzc13', 'jxzc14']
# Root directory under which each label gets its own model folder.
rf_dir = r"F:\cache_data\model_path\dy\rfrk"
for col in label_cols_list:
    print(col)
    # Keep only the rows whose "<label>_Status" column equals 'Normal'.
    pre_data = df[df[f"{col}_Status"]=='Normal']
    X = pre_data[feature_cols+coord_cols]
    y = pre_data[col]
    # This split is only used to report the shapes; rf_classics performs its
    # own split with the same test_size and random_state.
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
    print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)
    # Output folder for this label.  exist_ok=True creates it when missing and
    # is a no-op when it already exists, without a separate existence check.
    temp_dir_path = os.path.join(rf_dir,col)
    os.makedirs(temp_dir_path, exist_ok=True)
    # Path of the pickled model inside that folder.
    save_model_path =os.path.join(temp_dir_path,f"{col}_rf_model.pkl") 
    # Train, evaluate and save the model, then show the returned summary.
    train_log = rf_classics(pre_data,col,feature_cols,coord_cols,param_grid,save_model_path)
    print(train_log)