In [35]:
import os
import pandas as pd

# 定义主目录路径
base_dir = './hour'

# 遍历1到24文件夹
for folder_num in range(1, 25):
    folder_path = os.path.join(base_dir, str(folder_num))
    
    # 读取cost_differences.csv
    cost_diff_path = os.path.join(folder_path, 'cost_differences.csv')
    cost_diff_df = pd.read_csv(cost_diff_path)
    
    # 读取perfect_costs.csv
    perfect_cost_path = os.path.join(folder_path, 'perfect_costs.csv')
    perfect_cost_df = pd.read_csv(perfect_cost_path)
    
    # 将Change1-Change5列与Perfect_Cost列对应数据相除
    result_df = cost_diff_df.div(perfect_cost_df['Perfect_Cost'], axis=0)
    
    # 保存结果到新的CSV文件
    result_path = os.path.join(folder_path, 'divided_costs.csv')
    result_df.to_csv(result_path, index=False)
    
    print(f"处理完成: {folder_path}")

print("所有文件夹处理完成。")

处理完成: ./hour\1
处理完成: ./hour\2
处理完成: ./hour\3
处理完成: ./hour\4
处理完成: ./hour\5
处理完成: ./hour\6
处理完成: ./hour\7
处理完成: ./hour\8
处理完成: ./hour\9
处理完成: ./hour\10
处理完成: ./hour\11
处理完成: ./hour\12
处理完成: ./hour\13
处理完成: ./hour\14
处理完成: ./hour\15
处理完成: ./hour\16
处理完成: ./hour\17
处理完成: ./hour\18
处理完成: ./hour\19
处理完成: ./hour\20
处理完成: ./hour\21
处理完成: ./hour\22
处理完成: ./hour\23
处理完成: ./hour\24
所有文件夹处理完成。


In [None]:
import os
import pandas as pd

# 定义目录路径
load_variation_dir = './load_variation'
hourly_data_dir = './hourly_data'

# 遍历 1 到 24
for i in range(1, 25):
    # 构造文件路径
    load_variation_file = os.path.join(load_variation_dir, f'load_variation_{i}.csv')
    y_train_file = os.path.join(hourly_data_dir, f'Y_train_hour_{i}.csv')
    
    # 读取 load_variation 文件
    load_variation_df = pd.read_csv(load_variation_file)
    
    # 读取 Y_train_hour 文件的第一列
    y_train_df = pd.read_csv(y_train_file)
    y_train_column = y_train_df.iloc[:, 0]  # 获取第一列数据
    
    # 将 load_variation 的各列与 Y_train_hour 的第一列相除
    result_df = load_variation_df.div(y_train_column, axis=0)
    
    # 保存结果到新的 CSV 文件
    result_file = os.path.join(load_variation_dir, f'load_variation_divided_{i}.csv')
    result_df.to_csv(result_file, index=False)
    
    print(f"处理完成: {load_variation_file} -> {result_file}")

print("所有文件处理完成。")

处理完成: ./load_variation\load_variation_1.csv -> ./load_variation\load_variation_divided_1.csv
处理完成: ./load_variation\load_variation_2.csv -> ./load_variation\load_variation_divided_2.csv
处理完成: ./load_variation\load_variation_3.csv -> ./load_variation\load_variation_divided_3.csv
处理完成: ./load_variation\load_variation_4.csv -> ./load_variation\load_variation_divided_4.csv
处理完成: ./load_variation\load_variation_5.csv -> ./load_variation\load_variation_divided_5.csv
处理完成: ./load_variation\load_variation_6.csv -> ./load_variation\load_variation_divided_6.csv
处理完成: ./load_variation\load_variation_7.csv -> ./load_variation\load_variation_divided_7.csv
处理完成: ./load_variation\load_variation_8.csv -> ./load_variation\load_variation_divided_8.csv
处理完成: ./load_variation\load_variation_9.csv -> ./load_variation\load_variation_divided_9.csv
处理完成: ./load_variation\load_variation_10.csv -> ./load_variation\load_variation_divided_10.csv
处理完成: ./load_variation\load_variation_11.csv -> ./load_variation\loa

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d
from scipy.signal import savgol_filter
from scipy.integrate import quad
from scipy.interpolate import interp1d

# 定义函数将多列数据连接为一列
def concatenate_columns(df):
    return df.values.flatten()

# 计算 Savitzky-Golay 滤波后的二阶导数的 2/5 次方积分
def compute_second_derivative_integral(x, y):
    if np.any(np.diff(x) <= 0):
        print("警告: x 不是严格递增的，跳过二阶导数计算")
        return np.nan
    # 计算二阶导数（Savitzky-Golay 滤波）
    y_savgol_second_derivative = savgol_filter(y, window_length=min(31, len(y) - 1), polyorder=3, deriv=2)
    # 计算 2/5 次方的积分
    integral, _ = quad(lambda eps: np.interp(eps, x, np.abs(y_savgol_second_derivative) ** (2/5)), np.min(x), np.max(x))
    return integral ** (5/2)

# 计算最优分段数 K
def compute_optimal_segments(x, y, tau=1e-7):
    integral_value = compute_second_derivative_integral(x, y)
    if np.isnan(integral_value):
        return np.nan
    K = np.sqrt(integral_value / (np.sqrt(120) * tau))
    return int(np.ceil(K))

# 计算累积分布函数 F(e)
def compute_cumulative_distribution(x, y):
    y_savgol_second_derivative = savgol_filter(y, window_length=min(31, len(y) - 1), polyorder=3, deriv=2)
    # 计算分母积分
    denominator, _ = quad(lambda eps: np.interp(eps, x, np.abs(y_savgol_second_derivative) ** (2/5)), np.min(x), np.max(x))
    
    # 计算累积分布 F(ε)
    F_epsilon = np.zeros_like(x)
    for i, eps in enumerate(x):
        numerator, _ = quad(lambda e: np.interp(e, x, np.abs(y_savgol_second_derivative) ** (2/5)), np.min(x), eps)
        F_epsilon[i] = numerator / denominator
    
    return F_epsilon

# 计算分段点
def compute_breakpoints(x, y, K):
    if np.isnan(K) or K <= 1:
        return []
    F_epsilon = compute_cumulative_distribution(x, y)
    # 构造 F(e) 的插值函数
    F_interp = interp1d(F_epsilon, x, kind="linear", bounds_error=False, fill_value=(x.min(), x.max()))
    # 计算均匀划分的断点
    breakpoints = F_interp(np.linspace(0, 1, K + 1)[1:-1])  # 去掉 0 和 1
    # 将 0 插入到正确位置，确保 Breakpoints 严格递增
    breakpoints = np.sort(np.append(breakpoints, 0))
    return breakpoints

# 处理所有小时的数据
def process_all_hours(load_variation_dir, hour_dir, output_dir, output_csv):
    all_x = []
    all_y = []
    # 读取并合并所有小时的数据
    for hour in range(1, 25):
        # 加载 load_variation 文件，仅读取第一列
        load_variation_file = os.path.join(load_variation_dir, f'load_variation_divided_{hour}.csv')
        if not os.path.exists(load_variation_file):
            print(f"文件 {load_variation_file} 不存在，跳过。")
            continue
        load_variation_data = pd.read_csv(load_variation_file, usecols=[0])
        x = np.array(concatenate_columns(load_variation_data))
        
        # 加载 cost_differences 文件，仅读取第一列
        cost_differences_file = os.path.join(hour_dir, str(hour), 'divided_costs.csv')
        if not os.path.exists(cost_differences_file):
            print(f"文件 {cost_differences_file} 不存在，跳过。")
            continue
        cost_differences_data = pd.read_csv(cost_differences_file, usecols=[0])
        y = np.array(concatenate_columns(cost_differences_data))
        
        # 合并数据
        all_x.extend(x)
        all_y.extend(y)
    
    # 对 x 和 y 进行排序，并去重
    all_x = np.array(all_x)
    all_y = np.array(all_y)
    sort_indices = np.argsort(all_x)
    x_sorted = all_x[sort_indices]
    y_sorted = all_y[sort_indices]
    x_sorted, unique_indices = np.unique(x_sorted, return_index=True)
    y_sorted = y_sorted[unique_indices]
    
    # 计算 Savitzky-Golay 滤波后的平滑曲线
    y_savgol_smooth = savgol_filter(y_sorted, window_length=min(31, len(y_sorted) - 1), polyorder=3)
    
    # 计算最优分段数 K
    if len(x_sorted) > 3:
        K_opt = compute_optimal_segments(x_sorted, y_savgol_smooth, tau=1e-7)
    else:
        K_opt = np.nan
    
    # 计算分段点位置
    breakpoints = compute_breakpoints(x_sorted, y_savgol_smooth, K_opt)
    
    # 绘制累积分布函数 F(ε)
    F_epsilon = compute_cumulative_distribution(x_sorted, y_savgol_smooth)
    plt.figure(figsize=(10, 6))
    plt.plot(x_sorted, F_epsilon, color='blue', label='$F(\\epsilon)$')  # 修复字符串闭合问题
    for bp in breakpoints:
        plt.axvline(x=bp, color='green', linestyle='--')  # 画出分段点
    plt.title('Cumulative Distribution Function (All Hours)')
    plt.xlabel('$\\epsilon$')  # 修复字符串闭合问题
    plt.ylabel('$F(\\epsilon)$')  # 修复字符串闭合问题
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(output_dir, 'all_hours_cumulative_distribution.png'))
    plt.close()
    
    # 绘制平滑曲线并标注断点
    plt.figure(figsize=(10, 6))
    plt.plot(x_sorted, y_savgol_smooth, color='red', label='Smoothing spline')
    plt.scatter(breakpoints, np.interp(breakpoints, x_sorted, y_savgol_smooth), color='black', marker='D', label='Breakpoints')
    plt.title('Smoothing Spline and Breakpoints (All Hours)')
    plt.xlabel('$\\epsilon$')  # 修复字符串闭合问题
    plt.ylabel('$S(\\epsilon)$')  # 修复字符串闭合问题
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(output_dir, 'all_hours_smooth_spline_with_breakpoints.png'))
    plt.close()
    
    # 保存全局的 breakpoint 和 Optimal_Segments_K
    results_df = pd.DataFrame({'Optimal_Segments_K': [K_opt], 'Breakpoints': [breakpoints]})
    results_df.to_csv(output_csv, index=False)
    print(f"全局分段点计算完成，结果已保存到 {output_csv}")

# 定义路径
load_variation_dir = './load_variation'
hour_dir = './hour'
output_dir = './breakpoints_plots'
output_csv = './all_hours_optimal_segments_breakpoints.csv'

# 运行
process_all_hours(load_variation_dir, hour_dir, output_dir, output_csv)

  If increasing the limit yields no improvement it is advised to analyze 
  the integrand in order to determine the difficulties.  If the position of a 
  local difficulty can be determined (singularity, discontinuity) one will 
  probably gain from splitting up the interval and calling the integrator 
  on the subranges.  Perhaps a special-purpose integrator should be used.
  integral, _ = quad(lambda eps: np.interp(eps, x, np.abs(y_savgol_second_derivative) ** (2/5)), np.min(x), np.max(x))
  If increasing the limit yields no improvement it is advised to analyze 
  the integrand in order to determine the difficulties.  If the position of a 
  local difficulty can be determined (singularity, discontinuity) one will 
  probably gain from splitting up the interval and calling the integrator 
  on the subranges.  Perhaps a special-purpose integrator should be used.
  denominator, _ = quad(lambda eps: np.interp(eps, x, np.abs(y_savgol_second_derivative) ** (2/5)), np.min(x), np.max(x))
  I

全局分段点计算完成，结果已保存到 ./all_hours_optimal_segments_breakpoints.csv


In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# 线性拟合函数
def fit_linear_segment(x, y):
    """ 对单个分段进行线性拟合，返回斜率和截距 """
    model = LinearRegression()
    model.fit(x.reshape(-1, 1), y)
    return model.coef_[0], model.intercept_

# 约束条件下的线性拟合函数
def fit_constrained_segment(x, y, split_point, y_constraint):
    """ 对单个分段进行线性拟合，约束拟合直线通过 (split_point, y_constraint) """
    x_centered = x - split_point
    slope = np.sum(x_centered * (y - y_constraint)) / np.sum(x_centered ** 2)
    intercept = y_constraint - slope * split_point
    return slope, intercept

# 处理所有小时的数据并进行全局分段线性拟合
def process_all_hours(load_variation_dir, hour_dir, output_dir, output_csv):
    """ 读取数据 -> 合并数据集 -> 进行全局分段线性拟合 -> 存储图像和拟合参数 """
    all_x = []
    all_y = []
    
    # 读取并合并所有小时的数据
    for hour in range(1, 25):
        # 加载 load_variation 文件，仅读取第一列
        load_variation_file = os.path.join(load_variation_dir, f'load_variation_divided_{hour}.csv')
        if not os.path.exists(load_variation_file):
            print(f"文件 {load_variation_file} 不存在，跳过 Hour {hour}。")
            continue
        
        # 加载 cost_differences 文件，仅读取第一列
        cost_differences_file = os.path.join(hour_dir, str(hour), 'divided_costs.csv')
        if not os.path.exists(cost_differences_file):
            print(f"文件 {cost_differences_file} 不存在，跳过 Hour {hour}。")
            continue
        
        # 仅读取第一列
        load_variation_data = pd.read_csv(load_variation_file, usecols=[0])
        cost_differences_data = pd.read_csv(cost_differences_file, usecols=[0])
        
        # 合并数据
        all_x.extend(load_variation_data.values.flatten())
        all_y.extend(cost_differences_data.values.flatten())
    
    # 转换为 NumPy 数组并排序
    all_x = np.array(all_x)
    all_y = np.array(all_y)
    sort_indices = np.argsort(all_x)
    x_sorted = all_x[sort_indices]
    y_sorted = all_y[sort_indices]
    
    # 读取全局的分段点
    breakpoints_df = pd.read_csv('./all_hours_optimal_segments_breakpoints.csv')
    if breakpoints_df.empty or 'Breakpoints' not in breakpoints_df.columns:
        print("全局分段点不存在，跳过。")
        return
    
    # 解析分段点
    breakpoints_str = breakpoints_df['Breakpoints'].values[0]
    try:
        if isinstance(breakpoints_str, str):
            breakpoints_str = breakpoints_str.strip('[]').replace('\n', ' ').strip()
            breakpoints = list(map(float, breakpoints_str.split()))
        else:
            breakpoints = list(breakpoints_str)
    except Exception as e:
        print(f"解析 Breakpoints 失败: {e}")
        return
    
    # 添加边界，并确保第一个 breakpoint 为 -inf，最后一个为 inf
    breakpoints = [-np.inf] + breakpoints + [np.inf]
    
    # 存储拟合参数
    segment_results = []
    
    # 绘制图像
    plt.figure(figsize=(10, 6))
    plt.scatter(x_sorted, y_sorted, color='blue', label='原始数据', alpha=0.5)
    
    # 遍历分段并拟合（确保分段点连续）
    prev_y = None
    for i in range(len(breakpoints) - 1):
        x_segment = x_sorted[(x_sorted > breakpoints[i]) & (x_sorted <= breakpoints[i + 1])]
        y_segment = y_sorted[(x_sorted > breakpoints[i]) & (x_sorted <= breakpoints[i + 1])]
        if len(x_segment) > 1:  # 至少需要 2 个点进行拟合
            if prev_y is not None:
                slope, intercept = fit_constrained_segment(x_segment, y_segment, breakpoints[i], prev_y)
            else:
                slope, intercept = fit_linear_segment(x_segment, y_segment)
            # 存储拟合结果
            segment_results.append([i + 1, breakpoints[i], breakpoints[i + 1], slope, intercept])
            # 绘制分段线
            x_fit = np.linspace(breakpoints[i], breakpoints[i + 1], 100)
            y_fit = slope * x_fit + intercept
            plt.plot(x_fit, y_fit, label=f'Segment {i+1}', linewidth=2)
            # 更新 prev_y 为当前段的终点值
            prev_y = slope * breakpoints[i + 1] + intercept
        else:
            print(f"分段点 {breakpoints[i + 1]} 处点数不足，跳过该段。")
    
    # 图像设置
    plt.title('Global Piecewise Linear Fit (Continuous)')
    plt.xlabel('Load Deviation')
    plt.ylabel('Cost Deviation')
    plt.legend()
    plt.grid(True)
    
    # 保存图像
    os.makedirs(output_dir, exist_ok=True)
    plot_file = os.path.join(output_dir, 'global_piecewise_fit_continuous.png')
    plt.savefig(plot_file)
    plt.close()
    print(f"图像已保存到 {plot_file}")
    
    # 确保第一个 breakpoint 为 -inf，最后一个 breakpoint 为 inf
    if segment_results[0][1] != -np.inf:
        segment_results[0][1] = -np.inf
    if segment_results[-1][2] != np.inf:
        segment_results[-1][2] = np.inf
    
    # 保存结果到 CSV
    results_df = pd.DataFrame(segment_results, columns=['Segment', 'Breakpoint_Start', 'Breakpoint_End', 'Slope', 'Intercept'])
    results_df.to_csv(output_csv, index=False)
    print(f"全局分段线性拟合结果已保存到 {output_csv}")

# 定义路径
load_variation_dir = './load_variation'
hour_dir = './hour'
output_dir = './global_piecewise_linear_plots_continuous'
output_csv = './global_piecewise_linear_fit_results_continuous.csv'

# 运行全局分段线性拟合
process_all_hours(load_variation_dir, hour_dir, output_dir, output_csv)

图像已保存到 ./global_piecewise_linear_plots_continuous\global_piecewise_fit_continuous.png
全局分段线性拟合结果已保存到 ./global_piecewise_linear_fit_results_continuous.csv


  y *= step
  y += start
  plt.savefig(plot_file)
  plt.savefig(plot_file)
  plt.savefig(plot_file)
  plt.savefig(plot_file)


In [1]:
import numpy as np
import pandas as pd

def smooth_segment(breakpoint, slope_left, intercept_left, slope_right, intercept_right, delta=0.0001):
    """
    在分段点处进行平滑化处理，使用二次曲线近似平滑化。
    :param breakpoint: 分段点位置
    :param slope_left: 左侧线性曲线的斜率
    :param intercept_left: 左侧线性曲线的截距
    :param slope_right: 右侧线性曲线的斜率
    :param intercept_right: 右侧线性曲线的截距
    :param delta: 平滑化范围
    :return: 二次曲线的系数 a, b, c
    """
    # 左侧点位置和右侧点位置
    x_left = breakpoint - delta
    x_right = breakpoint + delta
    
    # 左侧线性曲线在 x_left 处的值和导数
    y_left = slope_left * x_left + intercept_left
    dy_left = slope_left
    
    # 右侧线性曲线在 x_right 处的值和导数
    y_right = slope_right * x_right + intercept_right
    dy_right = slope_right
    
    # 构建方程组求解二次曲线系数
    A = np.array([[x_left**2, x_left, 1],
                  [x_right**2, x_right, 1],
                  [2 * x_left, 1, 0],
                  [2 * x_right, 1, 0]])
    b = np.array([y_left, y_right, dy_left, dy_right])
    
    # 求解二次曲线系数
    a, b, c = np.linalg.lstsq(A, b, rcond=None)[0]
    
    # 验证条件是否成立
    y_left_curve = a * x_left**2 + b * x_left + c
    y_right_curve = a * x_right**2 + b * x_right + c
    dy_left_curve = 2 * a * x_left + b
    dy_right_curve = 2 * a * x_right + b
    
    assert np.isclose(y_left_curve, y_left), "Left value condition not satisfied"
    assert np.isclose(y_right_curve, y_right), "Right value condition not satisfied"
    assert np.isclose(dy_left_curve, dy_left), "Left derivative condition not satisfied"
    assert np.isclose(dy_right_curve, dy_right), "Right derivative condition not satisfied"
    
    return a, b, c

def smooth_breakpoints(results_df, delta=0.0001):
    """
    对所有分段点进行平滑化处理，并将结果存储到一个新的 CSV 文件中。
    :param results_df: 原分段线性拟合结果
    :param delta: 平滑化范围
    """
    smoothed_results = []
    
    for i in range(len(results_df) - 1):
        segment_left = results_df.iloc[i]
        segment_right = results_df.iloc[i + 1]
        
        # 分段点位置
        breakpoint_left = segment_left['Breakpoint_End']
        breakpoint_right = segment_right['Breakpoint_Start']
        
        # 如果分段点相同，则进行平滑化
        if breakpoint_left == breakpoint_right and not np.isinf(breakpoint_left):
            # 获取左侧和右侧的斜率和截距
            slope_left = segment_left['Slope']
            intercept_left = segment_left['Intercept']
            slope_right = segment_right['Slope']
            intercept_right = segment_right['Intercept']
            
            # 平滑化处理
            a, b, c = smooth_segment(breakpoint_left, slope_left, intercept_left, slope_right, intercept_right, delta)
            
            # 存储平滑化后的结果
            smoothed_results.append([breakpoint_left, a, b, c])
    
    # 将平滑化后的结果存储到新的 CSV 文件中
    smoothed_df = pd.DataFrame(smoothed_results, columns=['Breakpoint', 'A', 'B', 'C'])
    smoothed_df.to_csv('./smooth_breakpoints_results.csv', index=False)
    print("平滑化结果已保存到 smooth_breakpoints_results.csv")

# 读取原分段线性拟合结果
results_df = pd.read_csv('./global_piecewise_linear_fit_results_continuous.csv')

# 进行平滑化处理
smooth_breakpoints(results_df)

平滑化结果已保存到 smooth_breakpoints_results.csv


In [1]:
import numpy as np
import pandas as pd
import os

# 定义路径
fit_results_path = './global_piecewise_linear_fit_results_continuous.csv'  # 分段线性拟合结果
smooth_results_path = './smooth_breakpoints_results.csv'  # 平滑化结果
output_dir = './predictions'  # 预测结果输出文件夹
model_params_dir = './model_params'  # 模型参数保存文件夹
epsilon_dir = './epsilon_values'  # 保存迭代过程中的 epsilon 文件夹

# 创建输出文件夹
os.makedirs(output_dir, exist_ok=True)
os.makedirs(model_params_dir, exist_ok=True)
os.makedirs(epsilon_dir, exist_ok=True)

# 读取分段线性拟合和平滑化结果
fit_results_df = pd.read_csv(fit_results_path)
smooth_results_df = pd.read_csv(smooth_results_path)

# 读取训练和测试数据
X_train = pd.read_csv('./X_train.csv').values
y_train = pd.read_csv('./Y_train.csv').values.reshape(-1, 1)
X_test = pd.read_csv('./X_test.csv').values
y_test = pd.read_csv('./Y_test.csv').values.reshape(-1, 1)

# 对 X 进行最小-最大归一化到 [-1,1]
X_train_min = np.min(X_train, axis=0)
X_train_max = np.max(X_train, axis=0)
X_train = 2 * (X_train - X_train_min) / (X_train_max - X_train_min) - 1
X_test = 2 * (X_test - X_train_min) / (X_train_max - X_train_min) - 1

# 在 X_train 和 X_test 最后一列添加全 1 作为偏置项
X_train = np.hstack([X_train, np.ones((X_train.shape[0], 1))])
X_test = np.hstack([X_test, np.ones((X_test.shape[0], 1))])

# 超参数
delta = 0.0001  # 分段阈值
gamma = 0.001  # 学习率衰减率
max_iter = 1000  # 最大迭代次数
eps = 0  # 避免除零错误


# 计算分段梯度
def piecewise_gradient(epsilon_i, delta, a_k, breakpoints, smooth_results_df):
    """ 计算样本 i 的梯度 """
    grad = 0
    segment_index = np.digitize([epsilon_i], breakpoints)[0] - 1  # 计算该点在哪个区间
    segment_index = int(np.clip(segment_index, 0, len(a_k) - 1))  # 确保是整数索引

    # 检查是否在分段点附近
    if np.abs(epsilon_i - breakpoints[segment_index]) < delta:
        # 查询二次函数的平滑参数
        smooth_params = smooth_results_df[smooth_results_df['Breakpoint'] == breakpoints[segment_index]]
        if not smooth_params.empty:
            a_quad = smooth_params['A'].values[0]
            b_quad = smooth_params['B'].values[0]
            # 计算二次函数梯度
            grad = 2 * a_quad * epsilon_i + b_quad
    else:
        # 普通分段内，梯度取 a_k[segment_index]
        grad = a_k[segment_index]
    return grad


# 计算误差
def evaluate(y_true, y_pred):
    mse = np.mean((y_true - y_pred) ** 2)
    rmse = np.sqrt(mse)
    mae = np.mean(np.abs(y_true - y_pred))
    return mse, rmse, mae


# 训练和预测函数
def train_and_predict():
    """ 对整体数据进行训练和预测 """
    a_k = fit_results_df['Slope'].values  # 斜率 a_k
    breakpoints = fit_results_df['Breakpoint_Start'].values  # 分段点

    # 初始化参数 w
    n, d = X_train.shape
    w = np.zeros((d, 1))
    eta = 6  # 初始学习率

    # 训练过程
    for t in range(1, max_iter + 1):
        grad_w = np.zeros_like(w)
        for i in range(n):  # 遍历所有样本
            epsilon_i = (X_train[i] @ w - y_train[i]) / y_train[i]
            grad_L_i = piecewise_gradient(epsilon_i, delta, a_k, breakpoints, smooth_results_df)
            grad_w += (grad_L_i / y_train[i]) * X_train[i].reshape(-1, 1)
        w -= eta * grad_w  # 梯度更新
        eta = eta / (1 + gamma * t)  # 学习率衰减

        # 每 100 轮输出一次进程信息
        if t % 100 == 0:
            y_pred_train = X_train @ w
            mse_train, _, _ = evaluate(y_train, y_pred_train)
            print(f"迭代 {t}: 训练集 MSE={mse_train:.4f}")
            np.save(os.path.join(model_params_dir, f'w_iter_{t}.npy'), w)

    # 训练集和测试集预测
    y_pred_train = X_train @ w
    y_pred_test = X_test @ w

    # 计算最终评价指标
    mse_train, rmse_train, mae_train = evaluate(y_train, y_pred_train)
    mse_test, rmse_test, mae_test = evaluate(y_test, y_pred_test)
    print(f"最终训练集: MSE={mse_train:.4f}, RMSE={rmse_train:.4f}, MAE={mae_train:.4f}")
    print(f"最终测试集: MSE={mse_test:.4f}, RMSE={rmse_test:.4f}, MAE={mae_test:.4f}")

    # 保存预测结果
    pd.DataFrame({"y_true": y_train.flatten(), "y_pred": y_pred_train.flatten()}).to_csv(
        os.path.join(output_dir, 'predictions_train.csv'), index=False)
    pd.DataFrame({"y_true": y_test.flatten(), "y_pred": y_pred_test.flatten()}).to_csv(
        os.path.join(output_dir, '666.csv'), index=False)

    # 保存最终模型参数
    np.save(os.path.join(model_params_dir, 'w_final.npy'), w)


# 训练和预测
train_and_predict()

迭代 100: 训练集 MSE=71747.9491
迭代 200: 训练集 MSE=71724.5701
迭代 300: 训练集 MSE=71724.5701
迭代 400: 训练集 MSE=71724.5701
迭代 500: 训练集 MSE=71724.5701
迭代 600: 训练集 MSE=71724.5701
迭代 700: 训练集 MSE=71724.5701
迭代 800: 训练集 MSE=71724.5701
迭代 900: 训练集 MSE=71724.5701
迭代 1000: 训练集 MSE=71724.5701
最终训练集: MSE=71724.5701, RMSE=267.8144, MAE=218.5905
最终测试集: MSE=100576.5645, RMSE=317.1381, MAE=258.8114


In [37]:
#优化法：太慢
import pandas as pd
import numpy as np
import gurobipy as gp
from gurobipy import GRB

# 读取分段斜率和截距的文件
fit_results_df = pd.read_csv('./global_piecewise_linear_fit_results_continuous.csv')

# 读取训练数据和测试数据
X_train = pd.read_csv('./X_train.csv')
Y_train = pd.read_csv('./Y_train.csv')
X_test = pd.read_csv('./X_test.csv')
Y_test = pd.read_csv('./Y_test.csv')

# 获取全局的分段点和斜率和截距
breakpoints = fit_results_df['Breakpoint_Start'].tolist() + [fit_results_df['Breakpoint_End'].tolist()[-1]]
slopes = fit_results_df['Slope'].tolist()
intercepts = fit_results_df['Intercept'].tolist()

# 定义函数训练全局的线性回归模型
def train_model(X_train, y_train, breakpoints, slopes, intercepts):
    # 创建 Gurobi 模型
    model = gp.Model('global_linear_regression')
    #model.setParam('OutputFlag', 0)  # 关闭输出日志

    # 定义决策变量 theta 和 beta
    theta = model.addVars(X_train.shape[1], lb=-GRB.INFINITY, vtype=gp.GRB.CONTINUOUS, name='theta')
    beta = model.addVar(lb=-GRB.INFINITY, vtype=gp.GRB.CONTINUOUS, name='beta')

    # 定义中间变量 t 和分段区间指示变量 z
    t = model.addVars(len(y_train), vtype=gp.GRB.CONTINUOUS, name='t')
    z = model.addVars(len(y_train), len(breakpoints) - 1, vtype=gp.GRB.BINARY, name='z')

    # 设置目标函数：最小化 t 的均值
    model.setObjective(gp.quicksum(t[s] for s in range(len(y_train))) / len(y_train), gp.GRB.MINIMIZE)

    # 添加约束条件
    M = 1e6  # 足够大的常数
    epsilon = 1e-6  # 用于处理严格不等式的小常数
    for p in range(len(y_train)):
        # 计算线性预测值
        linear_pred = gp.quicksum(theta[i] * X_train.iloc[p, i] for i in range(X_train.shape[1])) + beta

        # 添加分段区间约束
        for seg_idx in range(len(breakpoints) - 1):
            # 约束 linear_pred >= breakpoints[seg_idx] - M * (1 - z[p, seg_idx])
            model.addConstr(
                linear_pred >= breakpoints[seg_idx] - M * (1 - z[p, seg_idx]),
                name=f'seg_lower_{p}_{seg_idx}'
            )
            # 约束 linear_pred <= breakpoints[seg_idx + 1] + M * (1 - z[p, seg_idx]) - epsilon
            model.addConstr(
                linear_pred <= breakpoints[seg_idx + 1] + M * (1 - z[p, seg_idx]) - epsilon,
                name=f'seg_upper_{p}_{seg_idx}'
            )
            # 添加分段线性约束（仅当 z[p, seg_idx] = 1 时激活）
            model.addConstr(
                slopes[seg_idx] * (linear_pred - y_train.iloc[p]) + intercepts[seg_idx] <= t[p] + M * (1 - z[p, seg_idx]),
                name=f'constraint_seg_{p}_{seg_idx}'
            )
        # 确保 linear_pred 只能属于一个分段区间
        model.addConstr(gp.quicksum(z[p, seg_idx] for seg_idx in range(len(breakpoints) - 1)) == 1, name=f'seg_assignment_{p}')

    # 优化模型
    model.optimize()

    # 获取优化后的 theta 和 beta 值
    theta_result = np.array([theta[i].x for i in range(X_train.shape[1])])
    beta_result = beta.x

    # 获取优化后的 t 值
    t_values = np.array([t[p].x for p in range(len(y_train))])

    return theta_result, beta_result, t_values

# 主函数
def train_and_predict():
    # 训练模型
    theta_result, beta_result, t_values = train_model(X_train, Y_train, breakpoints, slopes, intercepts)

    # 打印结果
    print("优化后的 theta 值：", theta_result)
    print("优化后的 beta 值：", beta_result)

    # 将 theta 和 beta 结果保存到 DataFrame
    theta_results_df = pd.DataFrame(theta_result.reshape(1, -1), columns=[f'theta_{i}' for i in range(X_train.shape[1])])
    theta_results_df.to_csv('theta_results.csv', index=False)
    print("theta 值已保存到 theta_results.csv 文件中。")

    beta_results_df = pd.DataFrame([beta_result], columns=['Beta'])
    beta_results_df.to_csv('beta_results.csv', index=False)
    print("beta 值已保存到 beta_results.csv 文件中。")

    # 将 t 值保存到 DataFrame
    t_values_df = pd.DataFrame(t_values, columns=['t_values'])
    t_values_df.to_csv('t_values.csv', index=False)
    print("t 值已保存到 t_values.csv 文件中。")

    # 预测 Y_train 和 Y_test 值
    def predict_y(X, theta, beta):
        return np.dot(X, theta) + beta

    y_train_pred = predict_y(X_train, theta_result, beta_result)
    y_test_pred = predict_y(X_test, theta_result, beta_result)

    # 将预测值保存为 CSV 文件
    y_train_pred_df = pd.DataFrame(y_train_pred, columns=['Predicted_Y'])
    y_train_pred_df.to_csv('y_train_predictions.csv', index=False)
    print("Y_train 预测值已保存到 y_train_predictions.csv 文件中。")

    y_test_pred_df = pd.DataFrame(y_test_pred, columns=['Predicted_Y'])
    y_test_pred_df.to_csv('y_test_predictions.csv', index=False)
    print("Y_test 预测值已保存到 y_test_predictions.csv 文件中。")

# 执行主函数
train_and_predict()

  slopes[seg_idx] * (linear_pred - y_train.iloc[p]) + intercepts[seg_idx] <= t[p] + M * (1 - z[p, seg_idx]),


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "d:\Conda\envs\NNLLL\lib\site-packages\IPython\core\interactiveshell.py", line 3550, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\85033\AppData\Local\Temp\ipykernel_27240\2298993246.py", line 117, in <module>
    train_and_predict()
  File "C:\Users\85033\AppData\Local\Temp\ipykernel_27240\2298993246.py", line 80, in train_and_predict
    theta_result, beta_result, t_values = train_model(X_train, Y_train, breakpoints, slopes, intercepts)
  File "C:\Users\85033\AppData\Local\Temp\ipykernel_27240\2298993246.py", line 59, in train_model
    slopes[seg_idx] * (linear_pred - y_train.iloc[p]) + intercepts[seg_idx] <= t[p] + M * (1 - z[p, seg_idx]),
  File "src\\gurobipy\\linexpr.pxi", line 506, in gurobipy.LinExpr.__sub__
  File "src\\gurobipy\\linexpr.pxi", line 481, in gurobipy.LinExpr.__add__
  File "src\\gurobipy\\linexpr.pxi", line 210, in gurobipy.LinExpr.add
  File "d:\Conda\envs\NNLLL\lib\site-packages\pa

In [92]:
import pandas as pd
import os

# 定义函数将 X_test 和 Y_test 按 24 小时分割并保存
def split_test_data_by_hour(X_test_file, Y_test_file, output_folder, start_hour=7):
    # 加载 X_test 和 Y_test 文件
    X_test = pd.read_csv(X_test_file)
    Y_test = pd.read_csv(Y_test_file)

    # 确保输出文件夹存在
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # 按 24 小时分割数据
    for hour in range(24):
        # 计算实际的小时索引（从 start_hour 开始）
        actual_hour = (start_hour - 1 + hour) % 24 + 1

        # 提取符合条件的行
        extracted_X = X_test.iloc[hour::24, :]
        extracted_Y = Y_test.iloc[hour::24, :]

        # 保存提取的数据到对应的文件
        extracted_X.to_csv(f'{output_folder}/X_test_hour_{actual_hour}.csv', index=False)
        extracted_Y.to_csv(f'{output_folder}/Y_test_hour_{actual_hour}.csv', index=False)

    print(f"X_test 和 Y_test 已按 24 小时分割并保存到 {output_folder} 文件夹中。")

# 主函数
def main():
    # 定义 X_test 和 Y_test 文件路径
    X_test_file = 'X_test.csv'
    Y_test_file = 'Y_test.csv'

    # 定义输出文件夹
    output_folder = './test_split'

    # 调用函数分割数据，指定从 7 点开始
    split_test_data_by_hour(X_test_file, Y_test_file, output_folder, start_hour=7)

# 执行主函数
main()

X_test 和 Y_test 已按 24 小时分割并保存到 ./test_split 文件夹中。


In [93]:
import pandas as pd
import numpy as np
import os

# 定义函数进行单小时预测
def predict_hourly_y(hour, X_test, theta, beta):
    # 使用 theta 和 beta 进行线性预测
    y_pred = np.dot(X_test, theta) + beta
    return y_pred

# 定义函数将预测值按 trend 列排序并重新组合
def combine_and_sort_predictions(predictions, trend_values):
    # 将所有预测值合并为一个数组
    all_predictions = np.concatenate(predictions)
    # 将所有 trend 值合并为一个数组
    all_trends = np.concatenate(trend_values)
    # 根据 trend 值排序
    sorted_indices = np.argsort(all_trends)
    # 按排序后的顺序重新组合预测值
    sorted_predictions = all_predictions[sorted_indices]
    return sorted_predictions

# 主函数
def predict_and_combine():
    # 加载训练好的 theta 和 beta 值
    theta_results_df = pd.read_csv('hourly_theta_results.csv', index_col='Hour')
    beta_results_df = pd.read_csv('hourly_beta_results.csv', index_col='Hour')

    # 初始化一个列表，用于存储所有小时的预测值
    all_predictions = []
    # 初始化一个列表，用于存储所有小时的 trend 值
    all_trends = []

    # 遍历 24 小时
    for hour in range(1, 25):
        # 加载对应小时的测试数据
        X_test_file = f'./test_split/X_test_hour_{hour}.csv'
        X_test = pd.read_csv(X_test_file)

        # 获取对应小时的 theta 和 beta 值
        theta = theta_results_df.loc[hour].values
        beta = beta_results_df.loc[hour, 'Beta']

        # 进行预测
        y_pred = predict_hourly_y(hour, X_test, theta, beta)

        # 将预测值添加到列表中
        all_predictions.append(y_pred)
        # 将 trend 值添加到列表中
        all_trends.append(X_test['trend'].values)

    # 将预测值按 trend 列排序并重新组合
    sorted_predictions = combine_and_sort_predictions(all_predictions, all_trends)

    # 将排序后的预测值保存为 CSV 文件
    sorted_predictions_df = pd.DataFrame(sorted_predictions, columns=['Predicted_Y'])
    sorted_predictions_df.to_csv('sorted_y_test_predictions.csv', index=False)
    print("按 trend 列排序后的 Y_test 预测值已保存到 sorted_y_test_predictions.csv 文件中。")

# 执行主函数
predict_and_combine()

按 trend 列排序后的 Y_test 预测值已保存到 sorted_y_test_predictions.csv 文件中。


In [None]:
import pandas as pd
import numpy as np
import gurobipy as gp
from gurobipy import GRB

# 读取分段斜率和截距的文件
fit_results_df = pd.read_csv('./global_piecewise_linear_fit_results_continuous.csv')

# 读取训练数据和测试数据
X_train = pd.read_csv('./X_train.csv')
Y_train = pd.read_csv('./Y_train.csv')
X_test = pd.read_csv('./X_test.csv')
Y_test = pd.read_csv('./Y_test.csv')

# 获取全局的分段点、斜率和截距
breakpoints = fit_results_df['Breakpoint_Start'].tolist() + [fit_results_df['Breakpoint_End'].tolist()[-1]]
slopes = fit_results_df['Slope'].tolist()
intercepts = fit_results_df['Intercept'].tolist()
num_segments = len(slopes)  # 分段数量

def train_model(X_train, y_train, breakpoints, slopes, intercepts):
    # 创建 Gurobi 模型
    model = gp.Model('global_linear_regression')
    
    # 定义决策变量 theta 和 beta
    theta = model.addVars(X_train.shape[1], lb=-GRB.INFINITY, vtype=GRB.CONTINUOUS, name='theta')
    beta = model.addVar(lb=-GRB.INFINITY, vtype=GRB.CONTINUOUS, name='beta')
    
    # 定义辅助变量 lambda 用于凸组合权重，t 作为误差变量
    lambdas = model.addVars(len(y_train), num_segments, lb=0, ub=1, vtype=GRB.CONTINUOUS, name='lambda')
    t = model.addVars(len(y_train), vtype=GRB.CONTINUOUS, name='t')

    # 设置目标函数：最小化 t 的均值
    model.setObjective(gp.quicksum(t[s] for s in range(len(y_train))) / len(y_train), GRB.MINIMIZE)

    # 约束：lambda 之和等于 1（确保每个样本在多个分段间的凸组合）
    for p in range(len(y_train)):
        model.addConstr(gp.quicksum(lambdas[p, s] for s in range(num_segments)) == 1, name=f'lambda_sum_{p}')
    
    # 约束：拟合误差 t
    for p in range(len(y_train)):
        linear_pred = gp.quicksum(theta[i] * X_train.iloc[p, i] for i in range(X_train.shape[1])) + beta
        piecewise_value = gp.quicksum(
            lambdas[p, s] * (slopes[s] * linear_pred + intercepts[s]) for s in range(num_segments)
        )
    
    # 将 y_train.iloc[p] 转换为 float 类型
        y_value = float(y_train.iloc[p])
    
    # 误差约束
        model.addConstr(piecewise_value - y_value <= t[p], name=f'err_upper_{p}')
        model.addConstr(y_value - piecewise_value <= t[p], name=f'err_lower_{p}')

    
    # 优化模型
    model.optimize()
    
    # 获取优化后的参数
    theta_result = np.array([theta[i].x for i in range(X_train.shape[1])])
    beta_result = beta.x
    t_values = np.array([t[p].x for p in range(len(y_train))])
    
    return theta_result, beta_result, t_values

# 主函数
def train_and_predict():
    # 训练模型
    theta_result, beta_result, t_values = train_model(X_train, Y_train, breakpoints, slopes, intercepts)
    
    # 预测函数
    def predict_y(X, theta, beta):
        return np.dot(X, theta) + beta
    
    y_train_pred = predict_y(X_train, theta_result, beta_result)
    y_test_pred = predict_y(X_test, theta_result, beta_result)

    # 保存结果
    pd.DataFrame(theta_result.reshape(1, -1), columns=[f'theta_{i}' for i in range(X_train.shape[1])]).to_csv('theta_results.csv', index=False)
    pd.DataFrame([beta_result], columns=['Beta']).to_csv('beta_results.csv', index=False)
    pd.DataFrame(t_values, columns=['t_values']).to_csv('t_values.csv', index=False)
    pd.DataFrame(y_train_pred, columns=['Predicted_Y']).to_csv('y_train_predictions.csv', index=False)
    pd.DataFrame(y_test_pred, columns=['Predicted_Y']).to_csv('y_test_predictions.csv', index=False)

# 运行
train_and_predict()


  y_value = float(y_train.iloc[p])


Gurobi Optimizer version 11.0.0 build v11.0.0rc2 (win64 - Windows 11+.0 (22631.2))

CPU model: 13th Gen Intel(R) Core(TM) i5-13400F, instruction set [SSE2|AVX|AVX2]
Thread count: 10 physical cores, 16 logical processors, using up to 16 threads

Optimize a model with 1638 rows, 27861 columns and 26208 nonzeros
Model fingerprint: 0x57bbfdb8
Model has 3276 quadratic constraints
Coefficient statistics:
  Matrix range     [1e+00, 1e+00]
  QMatrix range    [2e+02, 1e+07]
  QLMatrix range   [1e+00, 2e+06]
  Objective range  [6e-04, 6e-04]
  Bounds range     [1e+00, 1e+00]
  RHS range        [1e+00, 1e+00]
  QRHS range       [8e+02, 3e+03]

Continuous model is non-convex -- solving as a MIP

Presolve time: 0.60s
Presolved: 1577394 rows, 420982 columns, 4013100 nonzeros
Presolved model has 393120 bilinear constraint(s)
         in product terms.
         Presolve was not able to compute smaller bounds for these variables.
         Consider bounding these variables or reformulating the model.

V