In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pickle
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.svm import SVR,SVC
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix,roc_auc_score,roc_curve,precision_recall_curve,r2_score,mean_squared_error,mean_absolute_error,mean_absolute_percentage_error,root_mean_squared_error
from sklearn.model_selection import cross_val_score,train_test_split,GridSearchCV,RandomizedSearchCV
from pykrige.ok import OrdinaryKriging
import shutil
import graphviz
import dtreeviz


In [3]:
# Load the sample table and report how many rows it holds.
data = pd.read_csv(r"F:\cache_data\pre_property_table\dy\feature_ph_dy.csv")
print(data.shape[0])

# Rows with gaps are kept rather than dropped: every numeric column has its
# missing entries replaced by that column's mean. Non-numeric columns are not
# in `means`, so fillna leaves them untouched.
numeric_cols = data.select_dtypes(include=[np.number])
means = numeric_cols.mean()
data = data.fillna(value=means)
len(data),data.columns

1159


(1159,
 Index(['ph', 'ylzjhl', 'yjz', 'qdan', 'qlin', 'qjia', 'qxi', 'yxlin', 'sxjia',
        'hxjia', 'yxliu', 'yxgui', 'yxtie', 'yxmeng', 'yxtong', 'yxxing',
        'yxpeng', 'yxmu', 'zgong', 'zshen', 'zqian', 'zge', 'zge2', 'znie',
        'jxzc11', 'jxzc12', 'jxzc13', 'jxzc14', 'ph_Status', 'ylzjhl_Status',
        'yjz_Status', 'qdan_Status', 'qlin_Status', 'qjia_Status', 'qxi_Status',
        'yxlin_Status', 'sxjia_Status', 'hxjia_Status', 'yxliu_Status',
        'yxgui_Status', 'yxtie_Status', 'yxmeng_Status', 'yxtong_Status',
        'yxxing_Status', 'yxpeng_Status', 'yxmu_Status', 'zgong_Status',
        'zshen_Status', 'zqian_Status', 'zge_Status', 'zge2_Status',
        'znie_Status', 'jxzc11_Status', 'jxzc12_Status', 'jxzc13_Status',
        'jxzc14_Status', 'DEM', 'AnalyticalHillshading', 'Aspect',
        'ChannelNetworkBaseLevel', 'ChannelNetworkDistance',
        'ClosedDepressions', 'ConvergenceIndex', 'LSFactor', 'MRRTF', 'MRVBF',
        'PlanCurvature', 'ProfileCu

In [4]:
# Flag rows that repeat an earlier row on every column except the first one
# (the first column is skipped, so rows are compared on the remaining fields).
# NOTE(review): with keep='first' the first row of each duplicate group is NOT
# flagged, so df_duplicates holds only the later copies.
duplicates = data.duplicated(subset=data.columns[1:].tolist(), keep='first')
df_duplicates = data.loc[duplicates]
df_duplicates


Unnamed: 0,ph,ylzjhl,yjz,qdan,qlin,qjia,qxi,yxlin,sxjia,hxjia,...,ndwi,PCA_0,PCA_1,savi,vari,DL,DZ,LON,LAT,SlopeClass


In [5]:
# Remove repeated rows (compared on every column except the first), keeping
# the first occurrence of each group.
data = data.drop_duplicates(subset=data.columns[1:].tolist(), keep='first')

In [6]:
# Preview the first rows of the de-duplicated table.
data.head()

Unnamed: 0,ph,ylzjhl,yjz,qdan,qlin,qjia,qxi,yxlin,sxjia,hxjia,...,ndwi,PCA_0,PCA_1,savi,vari,DL,DZ,LON,LAT,SlopeClass
0,5.3,6.33,27.6,1.52,0.49,4.52,0.22,19.5,94.0,88.0,...,-0.593337,0.144017,0.068568,0.434746,0.174145,13,2,423585.0,2891090.0,1
1,6.0,6.4,17.4,1.09,0.28,3.52,0.15,28.7,94.1,77.7,...,-0.579832,0.164497,0.080262,0.429757,0.189109,13,7,435385.0,2901755.0,2
2,4.38,5.64,17.5,0.73,0.11,4.29,0.43,3.39,61.2,98.0,...,-0.765559,-0.068682,-0.074368,0.426341,0.417362,3,3,469330.0,2899265.0,4
3,4.68,14.9,46.8,2.28,0.32,7.9,0.86,3.99,95.1,171.0,...,-0.707025,-0.043486,-0.025812,0.414616,0.384712,3,2,457710.0,2885820.0,4
4,7.04,14.2,51.3,3.01,0.53,24.3,0.25,5.82,112.0,306.0,...,-0.589759,0.031788,0.048853,0.38916,0.181265,13,2,442105.0,2909305.0,2


In [None]:
# Split the flagged duplicate rows by position: rows 0, 2, 4, ... versus
# rows 1, 3, 5, ... and pull out their pH values.
even_index_pH = df_duplicates.iloc[::2]['ph']  # positions 0, 2, 4, ...
odd_index_pH = df_duplicates.iloc[1::2]['ph']  # positions 1, 3, 5, ...

# Draw both series against their original row labels on one set of axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(even_index_pH.index, even_index_pH, label='Even Index Rows')
ax.plot(odd_index_pH.index, odd_index_pH, label='Odd Index Rows')
ax.legend()

# Title and axis labels.
ax.set_title('Comparison of pH Values in Even and Odd Rows')
ax.set_xlabel('Index')
ax.set_ylabel('pH Value')

plt.show()

In [7]:
# Store the class-coded fields with pandas' categorical dtype.
data = data.astype({'DL': "category", 'DZ': "category", 'SlopeClass': "category"})

In [None]:
# Inspect the column dtypes (e.g. to confirm the categorical casts took effect).
data.dtypes

In [None]:
# List every column name available in the table.
data.columns

In [8]:
# Predictor matrix: the covariate columns used to model pH.
X = data.loc[:, [
    'DEM', 'AnalyticalHillshading', 'Aspect',
    'ChannelNetworkBaseLevel', 'ChannelNetworkDistance',
    'ClosedDepressions', 'ConvergenceIndex', 'LSFactor',
    'MRRTF', 'MRVBF', 'PlanCurvature', 'ProfileCurvature',
    'RelativeSlopePosition', 'Slope', 'TopographicWetnessIndex',
    'TotalCatchmentArea', 'ValleyDepth',
    'NIGHT2022',
    'ETP2022_mean', 'TMP2022_mean', 'PRE2022_mean',
    'PRE2022_3', 'PRE2022_11',
    'ETP2022_3', 'ETP2022_11',
    'TMP2022_3', 'TMP2022_11',
    'evi', 'lswi', 'mndwi', 'ndmi', 'ndvi', 'ndwi',
    'PCA_0', 'PCA_1', 'savi', 'vari',
    'DL', 'DZ', 'LON', 'LAT', 'SlopeClass',
]]
# Regression target.
y = data.loc[:, 'ph']

In [9]:
# Hold out 20% of the samples for testing; the seed fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Report the shape of each of the four pieces.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(927, 42) (232, 42) (927,) (232,)


In [10]:
# Candidate values for each random-forest hyperparameter.
param_dist = {
    'n_estimators': np.arange(10, 1000, 10),
    'max_features': [1.0],
    'max_depth': [None] + list(np.arange(1, 28)),
    'min_samples_split': np.arange(2, 21),
    'min_samples_leaf': np.arange(1, 21),
    'bootstrap': [True, False]
}

# Base regressor. It is seeded so that, together with the seeded search below,
# the cross-validation scores and the selected parameters are reproducible.
clf = RandomForestRegressor(random_state=42)

# Sample 100 parameter combinations and score each with 5-fold CV on the
# training set.
random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=100, cv=5, verbose=1, random_state=42, n_jobs=-1)
random_search.fit(X_train, y_train)

# Show the winning combination.
print('Best Parameters: \n', random_search.best_params_)

# Evaluate the refit best estimator on the held-out test set. For a regressor
# `.score` returns the coefficient of determination (R^2), not an accuracy,
# so it is labelled accordingly.
best_clf = random_search.best_estimator_
score = best_clf.score(X_test, y_test)
print('Test R2:', score)


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters: 
 {'n_estimators': 690, 'min_samples_split': 12, 'min_samples_leaf': 10, 'max_features': 1.0, 'max_depth': 26, 'bootstrap': True}
Test accuracy: 0.25507751234829357


In [19]:
# Predictions of the tuned forest on the training and the test partitions.
y_train_pred, y_test_pred = (best_clf.predict(part) for part in (X_train, X_test))

In [23]:
# R2 on the training set and on the test set, displayed as a (train, test) pair.
r2_score(y_train,y_train_pred),r2_score(y_test,y_test_pred)

(0.5779226889840031, 0.25507751234829357)

In [24]:
# Names of the two coordinate columns, in the order they are passed to kriging.
coord_cols = ['LON','LAT']

In [25]:
# Training residuals: what the forest failed to explain at each training point.
residuals_train = y_train - y_train_pred

# Fit an ordinary-kriging model (spherical variogram) to those residuals at
# the training coordinates, then interpolate them at the test coordinates.
OK = OrdinaryKriging(
    x=X_train[coord_cols[0]],
    y=X_train[coord_cols[1]],
    z=residuals_train,
    variogram_model='spherical',
)
kriging_predictions_test, _ = OK.execute(
    style='points',
    xpoints=X_test[coord_cols[0]],
    ypoints=X_test[coord_cols[1]],
)

# Final prediction = forest prediction + kriged residual.
predictions_test = y_test_pred + kriging_predictions_test

# Evaluation scores of the combined (regression-kriging) prediction.
mse_rk = mean_squared_error(y_test, predictions_test)
rmse_rk = np.sqrt(mse_rk)
mae_rk = mean_absolute_error(y_test, predictions_test)
r2_rk = r2_score(y_test, predictions_test)


In [None]:
    
    # Print the regression-kriging evaluation scores (R2, MAE, MSE, RMSE).
    # NOTE(review): every line of this cell is indented by four spaces;
    # presumably IPython strips the common leading indent before running it —
    # confirm, or dedent the cell.
    print(f"Regression Kriging R2: {r2_rk}")
    print(f"Regression Kriging MAE: {mae_rk}")
    print(f"Regression Kriging MSE: {mse_rk}")
    print(f"Regression Kriging RMSE: {rmse_rk}")

In [None]:
# Train a RandomForestRegressor with a fixed set of hyperparameters.
# NOTE(review): these values differ from the best_params_ printed by the
# randomized-search cell above (n_estimators=690, min_samples_split=12, ...);
# presumably they come from an earlier search — confirm which set is intended.
# The forest is seeded so the scores below are reproducible across runs.
rf = RandomForestRegressor(n_estimators=90,criterion='squared_error', min_samples_split=6, min_samples_leaf= 8, max_features=1.0, max_depth=21, bootstrap= True, random_state=42)
rf.fit(X_train,y_train)
# These overwrite the predictions produced earlier by the tuned estimator.
y_test_pred = rf.predict(X_test)
y_train_pred = rf.predict(X_train)
# Displayed as a (test R2, train R2) pair.
r2_score(y_test,y_test_pred),r2_score(y_train,y_train_pred)

In [None]:
r2 = r2_score(y_test,y_test_pred)

# Scatter of predicted against true values.
fig, ax = plt.subplots()
ax.scatter(y_test, y_test_pred, c='b', alpha=0.5)

# Overlay the least-squares straight line through the scatter.
fit = np.polyfit(y_test, y_test_pred,deg=1)
fit_fn = np.poly1d(fit)
ax.plot(y_test, fit_fn(y_test), c='r')

# Pad both axes by half a unit around the observed ranges.
ax.set_xlim([y_test.min()-0.5, y_test.max()+0.5])
ax.set_ylim([y_test_pred.min()-0.5, y_test_pred.max()+0.5])
ax.set_xlabel('True Values')
ax.set_ylabel('Predicted Values')
ax.set_title('R^2: %.2f' % r2)
ax.grid()

plt.show()

In [None]:
# Compare the first 100 true test values with the matching predictions.
fig, ax = plt.subplots(figsize=(10,5))
ax.plot(range(len(y_test[:100])),y_test[:100],c='r',label='True value')
ax.plot(range(len(y_test_pred[:100])),y_test_pred[:100],c='c',label = 'Prediction value')
ax.legend()
plt.show()

In [None]:
# Error metrics of the test-set predictions.
r2score = r2_score(y_true=y_test, y_pred=y_test_pred)
rmse_score = root_mean_squared_error(y_true=y_test, y_pred=y_test_pred)
mape_score = mean_absolute_percentage_error(y_true=y_test, y_pred=y_test_pred)
mae_score = mean_absolute_error(y_true=y_test, y_pred=y_test_pred)
mse_score = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
print(
    'Mse:', mse_score,
    'Mae', mae_score,
    'Mape', mape_score,
    'Rmse', rmse_score,
    'r2score', r2score,
)


In [None]:
# Per-feature importances of the fitted forest `rf`.
# NOTE(review): the name `a` is rebound to a boolean mask in a later cell, so
# cells that use `a` depend on execution order.
a = rf.feature_importances_
a

In [None]:
# 绘制特征重要性柱状图
plt.figure(figsize=(10, 8.5))
plt.barh(X.columns, a)
plt.xlabel('Feature Importance')
plt.ylabel('Features')
plt.title('Feature Importance')
plt.show()

In [None]:
# Retrain after feature scaling to see whether the choice of scaler changes
# the test score.
from sklearn.base import clone
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# The existing X_train / X_test / y_train / y_test split is reused on purpose.
# Re-splitting here with a different seed would overwrite those names, and
# later cells would then pair the new split with y_train_pred / y_test_pred
# computed on the old one.
#
# Scalers compared:
#   StandardScaler - subtract each column's mean and divide by its standard
#                    deviation (result has mean 0 and standard deviation 1).
#   MinMaxScaler   - rescale each column to [0, 1]; sensitive to outliers.
#   RobustScaler   - scale using the interquartile range; robust to outliers.
for scaler in [StandardScaler(), MinMaxScaler(), RobustScaler()]:
    # Fit the scaler on the training data only, then apply it to both sets.
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Fit an unfitted copy with rf's hyperparameters, so the shared `rf`
    # stays fitted on the unscaled training data.
    rf_scaled = clone(rf)
    rf_scaled.fit(X_train_scaled, y_train)

    # Report the test R2 together with the scaler that produced it.
    r2score = rf_scaled.score(X_test_scaled, y_test)
    print(type(scaler).__name__, "R2 Score: ", r2score)



In [None]:
# Regression kriging on the current model's residuals.
# NOTE(review): y_train_pred / y_test_pred must have been computed on the same
# split that X_train / X_test / y_train / y_test currently hold — verify that
# no cell in between has re-split the data.

# Residuals of the TRAINING predictions (these are what the kriging model is
# fitted to, hence the name).
residuals_train = y_train - y_train_pred
# Fit ordinary kriging to the training residuals at the training coordinates.
OK = OrdinaryKriging(X_train[coord_cols[0]], X_train[coord_cols[1]], residuals_train, variogram_model='spherical')  # variogram_model options tried: linear, gaussian, exponential, spherical
# Interpolate the residual at each test location and add it to the model's
# test prediction.
kriging_predictions_test, _ = OK.execute('points', X_test[coord_cols[0]], X_test[coord_cols[1]])
predictions_test = y_test_pred + kriging_predictions_test
# R2 of the combined prediction.
r2 = r2_score(y_test, predictions_test)
r2


In [None]:
# R2 of the model's test predictions alone (no kriging term added), for
# comparison with the combined prediction.
r2 = r2_score(y_test, y_test_pred)

r2

In [None]:
# Recursive feature elimination with cross-validation: pick the feature
# subset that scores best under 5-fold CV on the training set.
from sklearn.feature_selection import RFE,RFECV

selector = RFECV(
    estimator=RandomForestRegressor(n_jobs=4),
    step=1,
    cv=5,
    n_jobs=4,
).fit(X_train, y_train)

# Boolean mask over the input columns: True where the feature was kept.
selected_features = selector.support_
# Score the reduced model on the held-out test set.
y_pred = selector.predict(X_test)
r2 = r2_score(y_test, y_pred)
print("Selected Features: ", selected_features)
print("Number of Selected Features: ", selector.n_features_)
print("R2 Score: ", r2)

In [None]:
# Display the selection mask together with the number of features kept.
selected_features,selector.n_features_

In [None]:
# Sweep the number of features kept by RFE and remember the best subset.
# NOTE(review): the subset is chosen by its score on the test set, so
# best_score is an optimistic estimate — consider selecting with
# cross-validation on the training data instead.

# Start below any attainable R2 (R2 can be negative). Starting at 0 would
# leave best_features as None whenever every candidate scores <= 0.
best_score = -np.inf
best_features = None

# Try every subset size from 1 up to all columns.
for i in range(1, X_train.shape[1] + 1):
    # Eliminate one feature per step until i features remain.
    selector = RFE(RandomForestRegressor(n_jobs=4), n_features_to_select=i, step=1)
    selector = selector.fit(X_train, y_train)

    # Score the reduced model on the test set.
    y_pred = selector.predict(X_test)
    score = r2_score(y_test, y_pred)

    # Keep the mask of the best-scoring subset seen so far.
    if score > best_score:
        best_score = score
        best_features = selector.support_

print("Best R2 Score: ", best_score)
print("Best Features: ", best_features)

In [None]:
# Hard-coded boolean selection mask (True = keep the feature at that position).
# NOTE(review): the mask has 34 entries, while the table has more columns than
# that after removing the target; presumably it was copied from a run on an
# older feature table — verify it lines up with features_list before use.
a = [True,  True, False, False,  True, False, False, False,  True,  True,  True,  True,
 False, False,  True, False, False, False, False, False, False, False,  True,  True,
 False, False,  True,  True, False, False, False,  True,  True,  True]
features_list = list(data.columns)
# The target column is named 'ph' (lower case) in this table; removing 'pH'
# raises ValueError because no such column exists.
features_list.remove('ph')

In [None]:
# Names of the features whose mask entry in `a` is set.
features = [features_list[position] for position, keep in enumerate(a) if keep]


In [None]:
# Print the selected feature names, then display how many there are.
# (Writing `print(...), len(...)` on one line displays the tuple (None, n),
# because print returns None.)
print(features)
len(features)