In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# 1.载入数据

In [3]:
def load_data(clean_csv_folder, noisy_csv_folder):
    """Load paired clean/noisy CSV files and stack them into two arrays.

    Every ``.csv`` file found in ``clean_csv_folder`` is paired with the file
    of the same name in ``noisy_csv_folder``. Files are processed in sorted
    name order because ``os.listdir`` returns entries in arbitrary order;
    sorting makes the row order of the result (and therefore any seeded
    train/test split performed on it) reproducible.

    Args:
        clean_csv_folder: Directory containing the clean CSV files.
        noisy_csv_folder: Directory containing the identically named noisy
            CSV files.

    Returns:
        A tuple ``(clean_values, noisy_values)`` holding the ``.values`` of
        the row-wise concatenation of all clean and all noisy frames.

    Raises:
        FileNotFoundError: If a clean file has no counterpart in
            ``noisy_csv_folder`` (raised by ``pd.read_csv``).
        ValueError: If no ``.csv`` file is found, or if a clean/noisy pair
            has a different number of rows (which would silently misalign
            every subsequent input/target row).
    """
    csv_names = sorted(
        name for name in os.listdir(clean_csv_folder) if name.endswith('.csv')
    )
    if not csv_names:
        raise ValueError(f"No .csv files found in {clean_csv_folder!r}")

    clean_data_frames = []
    noisy_data_frames = []

    for filename in csv_names:
        clean_df = pd.read_csv(os.path.join(clean_csv_folder, filename))
        noisy_df = pd.read_csv(os.path.join(noisy_csv_folder, filename))

        # Rows are matched purely by position after concatenation, so each
        # pair must contribute the same number of rows.
        if len(clean_df) != len(noisy_df):
            raise ValueError(
                f"Row count mismatch for {filename!r}: "
                f"clean has {len(clean_df)} rows, noisy has {len(noisy_df)} rows"
            )

        clean_data_frames.append(clean_df)
        noisy_data_frames.append(noisy_df)

    # Merge all per-file frames into a single frame per side.
    clean_data = pd.concat(clean_data_frames, ignore_index=True)
    noisy_data = pd.concat(noisy_data_frames, ignore_index=True)

    return clean_data.values, noisy_data.values

# Folders with the clean (target) and noisy (input) CSV files. load_data pairs
# them by file name, so every .csv in the clean folder must also exist in the
# noisy folder.
# NOTE(review): these are absolute, machine-specific paths; adjust before
# running on another machine.
clean_csv_folder = 'E:\\wavelet\\wavelet coefficient\\source_db6\\test_source_小波系数能量自适应阈值降维'  # Update this path
noisy_csv_folder = 'E:\\wavelet\\wavelet coefficient\\Gauss_db6\\test_Gauss_小波系数能量自适应阈值降维'  # Update this path
clean_data, noisy_data = load_data(clean_csv_folder, noisy_csv_folder)

# 2.划分数据集

In [4]:
# Hold out 20% of the rows as a test set. The noisy data are the model inputs
# (X) and the clean data are the regression targets (y); random_state=42 fixes
# the shuffle.
X_train, X_test, y_train, y_test = train_test_split(noisy_data, clean_data, test_size=0.2, random_state=42)

# 3.设置网格搜索参数

In [5]:
# Hyper-parameter grid: 3 * 4 * 3 = 36 candidate combinations.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 4, 6]
}

# Initialise the random-forest regressor with a fixed seed.
rf = RandomForestRegressor(random_state=42)

# Initialise the grid search: 3-fold cross-validation scored by negative MSE
# (36 candidates x 3 folds = 108 fits); verbose=2 logs every fit.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2)

# Run the grid search on the training split.
grid_search.fit(X_train, y_train)

# Print the best parameter combination found.
print(f"Best parameters found: {grid_search.best_params_}")

# Predict on the held-out test split with the best model.
y_pred = grid_search.best_estimator_.predict(X_test)

Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=   5.9s
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=   5.5s
[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=   5.7s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=  11.7s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=  11.1s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=  11.1s
[CV] END max_depth=None, min_samples_split=2, n_estimators=300; total time=  17.7s
[CV] END max_depth=None, min_samples_split=2, n_estimators=300; total time=  16.7s
[CV] END max_depth=None, min_samples_split=2, n_estimators=300; total time=  16.8s
[CV] END max_depth=None, min_samples_split=4, n_estimators=100; total time=   5.0s
[CV] END max_depth=None, min_samples_split=4, n_estimators=100; total time=   4.8s
[CV] END max_depth=None, 

# 4.模型评估

In [6]:
# Mean squared error of the tuned model's predictions on the held-out test split.
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 3044.4978512149482


# 5.保存模型

In [8]:
import joblib

# Path the trained model is written to.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
model_save_path = 'E:\\wavelet\\trained_models\\random_forest_regressor.joblib'

# Directory part of the model path.
model_save_dir = os.path.dirname(model_save_path)

# Create the directory if it does not exist yet, and report that it was created.
if not os.path.exists(model_save_dir):
    os.makedirs(model_save_dir)
    print(f"创建目录：{model_save_dir}")

# Persist the best estimator found by the grid search.
joblib.dump(grid_search.best_estimator_, model_save_path)

print(f"模型已保存到 {model_save_path}")

模型已保存到 E:\wavelet\trained_models\random_forest_regressor.joblib


# 6.模型应用于验证集

In [9]:
import pandas as pd
import os
import joblib

# Load the previously saved model from disk.
# NOTE(review): joblib.load unpickles the file, so only load model files from
# a trusted source.
model_save_path = 'E:\\wavelet\\trained_models\\random_forest_regressor.joblib'
loaded_model = joblib.load(model_save_path)

# Folder with the (noisy) validation-set CSV files.
# NOTE(review): the training data were read from a 'Gauss_db6' folder, whereas
# this path points into 'Gauss' -- confirm this is the intended dataset.
validation_data_folder = 'E:\\wavelet\\wavelet coefficient\\Gauss\\Validation_Gauss_小波系数能量自适应阈值降维'
# 
# # 结果保存文件夹
# result_save_folder = 'E:\\wavelet\\wavelet coefficient\\predictions\\validation'
# if not os.path.exists(result_save_folder):
#     os.makedirs(result_save_folder)
# 
# def process_and_save_predictions(validation_data_folder, result_save_folder, model):
#     for filename in os.listdir(validation_data_folder):
#         if filename.endswith('.csv'):
#             file_path = os.path.join(validation_data_folder, filename)
#             df = pd.read_csv(file_path)
#             
#             # 假设模型预测基于特定的特征列
#             # X = df[['feature1', 'feature2', ...]].values
#             X = df.values  # 如果模型使用了所有列作为特征
#             
#             # 进行预测
#             predictions = model.predict(X)
#             
#             # 将预测结果保存回新的DataFrame（如果需要保留其他列，请根据需要调整）
#             result_df = pd.DataFrame(predictions, columns=['Predicted'])  # 调整列名和结构以匹配原文件
#             # result_df = pd.concat([df, result_df], axis=1) # 如果需要原始数据和预测结果一起保存
#             
#             # 保存预测结果到新文件
#             result_file_path = os.path.join(result_save_folder, f"predicted_{filename}")
#             result_df.to_csv(result_file_path, index=False)
#             print(f"预测结果已保存到 {result_file_path}")
# 
# # 处理验证集数据并保存预测结果
# process_and_save_predictions(validation_data_folder, result_save_folder, loaded_model)


def process_and_aggregate_predictions(validation_data_folder, model):
    """Predict on every CSV file in a folder and stack the predictions.

    Files are processed in sorted name order. ``os.listdir`` returns entries
    in arbitrary order, and the stacked predictions are later compared
    row-by-row against ground truth loaded from a different folder, so a
    deterministic order is required for the rows to line up.

    Args:
        validation_data_folder: Directory containing the validation CSV
            files; all columns of each file are used as features.
        model: Fitted estimator exposing ``predict(X)``.

    Returns:
        A numpy array with the predictions of all files, concatenated along
        the first axis in sorted file-name order. An empty array is returned
        when the folder contains no ``.csv`` file.
    """
    csv_names = sorted(
        name for name in os.listdir(validation_data_folder) if name.endswith('.csv')
    )

    aggregated_predictions = []  # one entry per predicted row, across all files
    for filename in csv_names:
        df = pd.read_csv(os.path.join(validation_data_folder, filename))

        # The model uses all columns as features.
        X = df.values

        aggregated_predictions.extend(model.predict(X))

    return np.array(aggregated_predictions)

# Run the loaded model over every validation file and collect all predictions
# into a single array.
y_val_pred = process_and_aggregate_predictions(validation_data_folder, loaded_model)

# 7.验证集测试评估

In [10]:
# Folder with the ground-truth (clean) CSV files for the validation set.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
true_data_folder = 'E:\\wavelet\\wavelet coefficient\\source\\Validation_source_小波系数能量自适应阈值降维'

# Loader for the validation-set ground truth.
def load_true_data(true_data_folder):
    """Load every CSV file in a folder and stack the rows into one array.

    Files are read in sorted name order. ``os.listdir`` returns entries in
    arbitrary order, and the result is compared row-by-row against
    predictions produced from another folder, so the order must be
    deterministic.

    Args:
        true_data_folder: Directory containing the ground-truth CSV files.

    Returns:
        The ``.values`` array of the row-wise concatenation of all files, in
        sorted file-name order.

    Raises:
        ValueError: If the folder contains no ``.csv`` file.
    """
    csv_names = sorted(
        name for name in os.listdir(true_data_folder) if name.endswith('.csv')
    )
    if not csv_names:
        raise ValueError(f"No .csv files found in {true_data_folder!r}")

    true_data_frames = [
        pd.read_csv(os.path.join(true_data_folder, filename))
        for filename in csv_names
    ]

    # Merge all per-file frames into a single frame.
    combined_true_data = pd.concat(true_data_frames, ignore_index=True)

    return combined_true_data.values

# Load the validation-set ground truth.
y_val_true = load_true_data(true_data_folder)

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Compute the evaluation metrics.
# NOTE(review): these are only meaningful if the rows of y_val_pred and
# y_val_true correspond. Both arrays are built by iterating separate directory
# listings -- confirm the two folders yield matching files in the same order.
mse = mean_squared_error(y_val_true, y_val_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_val_true, y_val_pred)
r2 = r2_score(y_val_true, y_val_pred)

# Print the evaluation metrics.
print(f"Validation MSE: {mse}")
print(f"Validation RMSE: {rmse}")
print(f"Validation MAE: {mae}")
print(f"Validation R²: {r2}")

Validation MSE: 1863.1722961889673
Validation RMSE: 43.164479565830135
Validation MAE: 23.88976176690688
Validation R²: 0.43927595972815037
