# 电力负荷预测系统 - 完整分析

本notebook包含完整的电力负荷预测流程，包括：
1. 数据加载与探索
2. 数据预处理
3. 特征工程
4. 准备训练数据
5. 模型训练
6. 模型评估与可视化
7. 结论与改进方向

In [None]:
# 导入必要的库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')  # NOTE(review): suppresses every warning category, including deprecation notices from libraries

# Configure plot styling: the matplotlib style sheet is applied first, then the seaborn colour palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

print("所有库导入成功！")

## 1. 数据加载与探索

In [None]:
# Load the dataset; the path is relative, so it resolves against the current working directory
df = pd.read_csv('../data/synthetic_load.csv')

print(f"数据形状: {df.shape}")
print(f"\n数据前5行:")
# Left as a bare trailing expression so the notebook renders the first five rows
df.head()

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame
print("数据信息:")
df.info()

In [None]:
# Descriptive statistics; a bare trailing expression so the notebook renders the summary table
df.describe()

In [None]:
# Visualise the distribution of the power-demand column with two side-by-side panels
demand = df['nat_demand']
fig, axes = plt.subplots(1, 2, figsize=(14, 4))
hist_ax, box_ax = axes

# Left panel: histogram of demand values
hist_ax.hist(demand, bins=50, edgecolor='black', alpha=0.7)
hist_ax.set_title('Power Demand Distribution', fontsize=12, fontweight='bold')
hist_ax.set(xlabel='Demand', ylabel='Frequency')

# Right panel: box plot of the same values
box_ax.boxplot(demand)
box_ax.set_title('Power Demand Box Plot', fontsize=12, fontweight='bold')
box_ax.set(ylabel='Demand')

plt.tight_layout()
plt.show()

## 2. 数据预处理

In [None]:
# Parse the timestamp column so the .dt accessor becomes available
df['datetime'] = pd.to_datetime(df['datetime'])

# Derive one calendar column per .dt attribute, in this order
calendar_fields = ['year', 'month', 'day', 'hour', 'dayofweek', 'quarter']
for field in calendar_fields:
    df[field] = getattr(df['datetime'].dt, field)

# pandas numbers dayofweek 0 (Monday) .. 6 (Sunday), so 5 and 6 are the weekend
df['is_weekend'] = df['dayofweek'].ge(5).astype(int)

print("时间特征提取完成！")
print(f"新增特征: {calendar_fields + ['is_weekend']}")

In [None]:
# Count missing values per column once and reuse the result for both the
# report and the "anything to fix?" check.
missing_per_column = df.isnull().sum()
print("缺失值统计:")
print(missing_per_column)

# Forward-fill gaps when any exist. DataFrame.ffill() is used instead of
# fillna(method='ffill'): the `method` argument is deprecated in recent pandas
# releases, and this notebook silences warnings, so the deprecation would
# otherwise go unnoticed until the call stops working.
# Note: forward fill cannot fill NaNs at the very start of a column.
if missing_per_column.sum() > 0:
    df.ffill(inplace=True)
    print("\n缺失值已处理")

## 3. 特征工程

In [None]:
# Lagged copies of the target, shifted by a number of rows.
# Presumably rows are hourly, making 24 "one day ago" and 168 "one week ago" — confirm against the dataset.
lags = [1, 2, 3, 24, 168]
demand = df['nat_demand']
for steps_back in lags:
    df[f'nat_demand_lag_{steps_back}'] = demand.shift(steps_back)

print(f"创建了 {len(lags)} 个滞后特征")

In [None]:
# Rolling-window statistics of the target, used as model features.
#
# The series is shifted by one row BEFORE rolling so that the window ending at
# row t covers rows t-window .. t-1 only. Without the shift the window would
# include nat_demand[t] itself, which is the value the models are trained to
# predict, so the features would leak the target and inflate the test metrics.
windows = [24, 168]
past_demand = df['nat_demand'].shift(1)
for window in windows:
    rolling = past_demand.rolling(window=window)
    df[f'nat_demand_rolling_mean_{window}'] = rolling.mean()
    df[f'nat_demand_rolling_std_{window}'] = rolling.std()

print(f"创建了 {len(windows)*2} 个滚动窗口特征")

In [None]:
# Pairwise product features.
# NOTE(review): T2M_toc is presumably a temperature reading, judging by the temp_* output names — confirm.
t2m = df['T2M_toc']
hour_of_day = df['hour']
df['temp_holiday'] = t2m.mul(df['holiday'])
df['temp_hour'] = t2m.mul(hour_of_day)
df['weekend_hour'] = df['is_weekend'].mul(hour_of_day)

print("创建了3个交互特征")

In [None]:
# Drop every row that still has a NaN in any column (e.g. the leading rows
# left empty by shift/rolling) and report the row count before and after.
rows_before = len(df)
print(f"删除前: {rows_before} 行")
df.dropna(inplace=True)
rows_after = len(df)
print(f"删除后: {rows_after} 行")

## 4. 准备训练数据

In [None]:
# Choose the model inputs: every integer or float column except the target
# and the raw timestamp.
target = 'nat_demand'
exclude_cols = ['datetime', 'nat_demand']

# The dtype test is width-independent on purpose. Comparing against the
# literal names 'float64'/'int64' silently drops int32 columns, and pandas 2.x
# returns int32 from the .dt accessors (year, month, hour, ...), so the
# calendar features would never reach the models. Booleans, strings and
# datetimes are still excluded.
feature_cols = [
    col for col, dtype in df.dtypes.items()
    if col not in exclude_cols
    and (pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_float_dtype(dtype))
]

print(f"特征数量: {len(feature_cols)}")
print(f"\n特征列表:")
for i, col in enumerate(feature_cols, 1):
    print(f"{i}. {col}")

In [None]:
# Feature matrix and target vector
X, y = df[feature_cols], df[target]

# Hold out 20% of the rows. shuffle=False keeps the original row order, so the
# test set is the final slice of the frame rather than a random sample.
split_options = dict(test_size=0.2, random_state=42, shuffle=False)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"训练集大小: {len(X_train)}")
print(f"测试集大小: {len(X_test)}")

In [None]:
# Standardise the features (the scaled copies are only fed to linear regression).
# The scaler is fitted on the training split alone and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("特征标准化完成")

## 5. 模型训练

In [None]:
# Fit the linear-regression baseline on the standardised training features
print("训练线性回归模型...")
lr_model = LinearRegression().fit(X_train_scaled, y_train)
print("线性回归训练完成！")

In [None]:
# Fit the random forest on the unscaled training features
print("训练随机森林模型...")
rf_params = {
    'n_estimators': 50,
    'max_depth': 20,
    'random_state': 42,
    'n_jobs': -1,   # use all available cores
    'verbose': 1,   # print fitting progress
}
rf_model = RandomForestRegressor(**rf_params)
rf_model.fit(X_train, y_train)
print("随机森林训练完成！")

In [None]:
# Fit the XGBoost regressor on the unscaled training features
print("训练XGBoost模型...")
xgb_params = {
    'n_estimators': 50,
    'max_depth': 10,
    'learning_rate': 0.1,
    'random_state': 42,
    'n_jobs': -1,
}
xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)
print("XGBoost训练完成！")

## 6. 模型评估

In [None]:
# Predict on the held-out split; linear regression gets the standardised features
y_pred_lr = lr_model.predict(X_test_scaled)
y_pred_rf = rf_model.predict(X_test)
y_pred_xgb = xgb_model.predict(X_test)

predictions_by_model = {
    'Linear Regression': y_pred_lr,
    'Random Forest': y_pred_rf,
    'XGBoost': y_pred_xgb,
}

# Score each model and keep its metrics together with the raw predictions
models_results = {}
for name, y_pred in predictions_by_model.items():
    scores = {
        'MAE': mean_absolute_error(y_test, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'R2': r2_score(y_test, y_pred),
    }
    models_results[name] = {**scores, 'predictions': y_pred}

    print(f"\n{name}:")
    print(f"  MAE:  {scores['MAE']:.2f}")
    print(f"  RMSE: {scores['RMSE']:.2f}")
    print(f"  R²:   {scores['R2']:.4f}")

In [None]:
# Compare the models with one bar chart per metric
metric_keys = ('MAE', 'RMSE', 'R2')
metrics_df = pd.DataFrame([
    {'Model': name, **{key: result[key] for key in metric_keys}}
    for name, result in models_results.items()
])

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# (metrics_df column, panel title, y-axis label, bar colour) for each panel, left to right
panels = [
    ('MAE', 'Mean Absolute Error', 'MAE', 'skyblue'),
    ('RMSE', 'Root Mean Squared Error', 'RMSE', 'lightcoral'),
    ('R2', 'R² Score', 'R²', 'lightgreen'),
]
for ax, (column, title, ylabel, color) in zip(axes, panels):
    ax.bar(metrics_df['Model'], metrics_df[column], color=color)
    ax.set_title(title, fontweight='bold')
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Overlay actual and predicted demand for the first n_samples test rows, one panel per model
n_samples = 500
fig, axes = plt.subplots(3, 1, figsize=(14, 10))

actual = y_test.values[:n_samples]
line_style = dict(linewidth=2, alpha=0.7)

for ax, (name, metrics) in zip(axes, models_results.items()):
    ax.plot(actual, label='Actual', **line_style)
    ax.plot(metrics['predictions'][:n_samples], label='Predicted', **line_style)
    ax.set_title(f'{name} - Predictions vs Actual', fontweight='bold')
    ax.set_xlabel('Time Steps')
    ax.set_ylabel('Power Demand')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Horizontal bar charts of the 15 highest feature importances for each tree-based model
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# (fitted model, panel title, bar colour) for each panel, left to right
panels = [
    (rf_model, 'Random Forest - Top 15 Features', 'skyblue'),
    (xgb_model, 'XGBoost - Top 15 Features', 'lightcoral'),
]
for ax, (model, title, color) in zip(axes, panels):
    importances = model.feature_importances_
    # argsort is ascending, so the last 15 indices are the largest importances
    top_idx = np.argsort(importances)[-15:]
    positions = range(len(top_idx))
    ax.barh(positions, importances[top_idx], color=color)
    ax.set_yticks(positions)
    ax.set_yticklabels([feature_cols[i] for i in top_idx], fontsize=9)
    ax.set_xlabel('Importance')
    ax.set_title(title, fontweight='bold')

plt.tight_layout()
plt.show()

## 7. 结论

本项目成功构建了三个机器学习模型来预测电力负荷：
- **线性回归**: 作为基准模型
- **随机森林**: 表现良好的集成学习模型
- **XGBoost**: 高性能的梯度提升模型

通过比较三个模型的MAE、RMSE和R²指标，可以选择最适合的模型进行实际应用。

### 改进方向
1. 尝试更多的特征工程方法
2. 进行超参数调优
3. 尝试深度学习模型（LSTM、GRU等）
4. 添加更多的外部数据源