In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset, random_split

# 加载数据
file_path = 'Train data.csv'
train_data = pd.read_csv(file_path, parse_dates=['DateTime']).sort_values(by='DateTime')

# 设置异常检测的阈值
threshold = 100
train_data['is_anomaly'] = train_data['Load'] < threshold

# 定义季节划分函数
def assign_season(month):
    if month in [3, 4, 5]:
        return 0  # 春季
    elif month in [6, 7, 8]:
        return 1  # 夏季
    elif month in [9, 10, 11]:
        return 2  # 秋季
    else:
        return 3  # 冬季

# 提取时间标签特征
train_data['day_of_week'] = train_data['DateTime'].dt.dayofweek  # 0-6 表示周一到周日
train_data['hour_of_day'] = train_data['DateTime'].dt.hour       # 0-23 表示一天的小时
train_data['season'] = train_data['DateTime'].dt.month.apply(assign_season)  # 按中国大陆季节划分


# 选择特征并标准化
features = ['Temperature', 'Humidity', 'Wind_speed', 'Precipitation']
scaler = MinMaxScaler()
train_data[features + ['Load']] = scaler.fit_transform(train_data[features + ['Load']])

# 从 MinMaxScaler 中提取参数
data_min = scaler.data_min_
data_max = scaler.data_max_
scale = scaler.scale_


# 构建时间序列数据，并跳过包含异常数据的窗口
sequence_length = 24

def create_sequences_with_time_labels(data, seq_length, target_col):
    sequences = []
    targets = []
    time_labels = []
    for i in range(len(data) - seq_length):
        window = data.iloc[i:i+seq_length]
        # 检查窗口中是否有异常数据
        if window['is_anomaly'].sum() == 0:  # 如果窗口中没有异常数据
            # 获取前6个时间步的负荷和辅助特征
            sequence = window[features + [target_col]].values.flatten()
            # 添加当前时刻的辅助特征到序列末尾
            current_features = data.iloc[i + seq_length][features].values
            full_sequence = np.concatenate((sequence, current_features))
            sequences.append(full_sequence)
            targets.append(data.iloc[i + seq_length][target_col])
            # 提取时间标签
            time_labels.append(data.iloc[i + seq_length][['day_of_week', 'hour_of_day', 'season']].values)
    return np.array(sequences,dtype=np.float32), np.array(targets,dtype=np.float32), np.array(time_labels,dtype=np.int64)

# 构建数据集，包含时间标签
X, y, time_labels = create_sequences_with_time_labels(train_data, sequence_length, 'Load')
print(X)

X = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)
time_labels = np.array(time_labels, dtype=np.int64)  # 转换为整数类型
time_labels = torch.tensor(time_labels, dtype=torch.long)

# 划分训练集和验证集（80%训练集，20%验证集）
dataset = TensorDataset(X, time_labels, y)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)


# 定义MLP模型，包含时间Embedding
class MLPModelWithEmbedding(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, day_emb_dim=4, hour_emb_dim=4, season_emb_dim=4):
        super(MLPModelWithEmbedding, self).__init__()
        
        # 时间标签的Embedding层
        self.day_embedding = nn.Embedding(7, day_emb_dim)
        self.hour_embedding = nn.Embedding(24, hour_emb_dim)
        self.season_embedding = nn.Embedding(4, season_emb_dim)
        
        # 计算总输入维度
        time_embedding_dim = day_emb_dim + hour_emb_dim + season_emb_dim
        total_input_size = input_size + time_embedding_dim

        # MLP层
        self.fc1 = nn.Linear(total_input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size // 2)
        self.fc3 = nn.Linear(hidden_size // 2, output_size)
    
    def forward(self, x, time_labels):
        # 时间标签embedding
        day_emb = self.day_embedding(time_labels[:, 0])
        hour_emb = self.hour_embedding(time_labels[:, 1])
        season_emb = self.season_embedding(time_labels[:, 2])
        
        # 合并输入
        time_emb = torch.cat((day_emb, hour_emb, season_emb), dim=1)
        x = torch.cat((x, time_emb), dim=1)
        
        # MLP前向传播
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.fc3(out)
        return out

# 初始化MLP模型、损失函数和优化器
input_size = X.shape[1]
model = MLPModelWithEmbedding(input_size=input_size, hidden_size=128, output_size=1)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 定义最小验证损失和最小MAPE的初始值
best_val_loss = float('inf')
best_mape = float('inf')

# 训练模型并在每个epoch进行验证
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    for X_batch, time_labels_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch, time_labels_batch)
        loss = criterion(outputs, y_batch.view(-1, 1))
        loss.backward()
        optimizer.step()
    
    # 在验证集上评估模型
    model.eval()
    val_loss = 0
    mape = 0
    with torch.no_grad():
        for X_val, time_labels_val, y_val in val_loader:
            val_outputs = model(X_val, time_labels_val)
            val_loss += criterion(val_outputs, y_val.view(-1, 1)).item()
            
            # 计算MAPE
            y_true = y_val.view(-1, 1)
            y_pred = val_outputs
            mape += (torch.abs((y_true - y_pred) / y_true).mean().item()) * 100  # 转换为百分比

    # 计算平均的验证损失和MAPE
    val_loss /= len(val_loader)
    mape /= len(val_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {loss.item():.4f}, Val Loss: {val_loss:.4f}, MAPE: {mape:.2f}%')
    
    # 保存验证损失最小的模型
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_val_loss_model.pth')
        print(f"保存模型: 验证损失最小的模型 (Val Loss: {best_val_loss:.4f})")
        
    # 保存MAPE最小的模型
    if mape < best_mape:
        best_mape = mape
        torch.save(model.state_dict(), 'best_mape_model.pth')
        print(f"保存模型: 验证MAPE最小的模型 (MAPE: {best_mape:.2f}%)")

# 最终保存的两个模型文件：
# 'best_val_loss_model.pth' 保存验证集上损失最小的模型
# 'best_mape_model.pth' 保存验证集上MAPE最小的模型

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import MinMaxScaler

# 定义 MinMaxScaler 的参数（在训练阶段确定的参数）
data_min = scaler.data_min_
data_max = scaler.data_max_
scale = scaler.scale_

# 反向标准化函数
def inverse_min_max_scale(scaled_value, data_min, data_max):
    return scaled_value * (data_max - data_min) + data_min

# 手动标准化函数
def manual_min_max_scale(data, data_min, scale):
    return (data - data_min) * scale

# 滚动修复异常值
def rolling_repair_anomalies(original_data, model, sequence_length, data_min, data_max, scale):
    repaired_data = original_data.copy()
    model.eval()  # 设置模型为评估模式
    with torch.no_grad():
        # 找到所有异常时刻的索引
        anomaly_indices = repaired_data[repaired_data['is_anomaly']].index
        for i in anomaly_indices:
            # 提取前24个时间步的特征和负荷数据（未标准化）
            sequence = original_data.iloc[i-sequence_length:i][features + ['Load']].values

            # 对前24个时间步的数据进行手动标准化
            sequence = manual_min_max_scale(sequence, data_min, scale).astype(np.float32)  # 确保类型为 np.float32
            sequence = sequence.flatten()  # 平铺成一维数组
            
            # 获取当前异常时刻的辅助特征并手动标准化
            current_features = original_data.iloc[i][features].values
            current_features = manual_min_max_scale(current_features, data_min[:len(features)], scale[:len(features)]).astype(np.float32)
            
            # 合并前24个时间步的标准化特征和当前时刻的标准化辅助特征
            full_sequence = np.concatenate((sequence, current_features)).astype(np.float32)
            
            # 转换为 Tensor
            full_sequence = torch.tensor(full_sequence, dtype=torch.float32).unsqueeze(0)
            
            # 提取时间标签（确保整数类型 np.int32）
            time_labels = original_data.iloc[i][['day_of_week', 'hour_of_day', 'season']].values.astype(np.int32)
            time_labels = torch.tensor(time_labels, dtype=torch.long).unsqueeze(0)
            
            # 使用模型进行预测（得到的是标准化后的负荷值）
            scaled_repaired_value = model(full_sequence, time_labels).item()
            
            # 对预测值进行反向标准化
            original_repaired_value = inverse_min_max_scale(scaled_repaired_value, data_min[-1], data_max[-1])
            
            # 将反向标准化后的预测值替换到 `repaired_data` 和 `original_data` 中的异常负荷值
            repaired_data.at[i, 'Load'] = original_repaired_value
            original_data.at[i, 'Load'] = original_repaired_value  # 确保后续读取时得到已修复的值
            
    return repaired_data


# 读取原始文件数据
file_path = 'Train data.csv'
original_data = pd.read_csv(file_path, parse_dates=['DateTime']).sort_values(by='DateTime')

# 设置异常检测的阈值
threshold = 100
original_data['is_anomaly'] = original_data['Load'] < threshold

# 定义季节划分函数
def assign_season(month):
    if month in [3, 4, 5]:
        return 0  # 春季
    elif month in [6, 7, 8]:
        return 1  # 夏季
    elif month in [9, 10, 11]:
        return 2  # 秋季
    else:
        return 3  # 冬季

# 提取时间标签特征
original_data['day_of_week'] = original_data['DateTime'].dt.dayofweek  # 0-6 表示周一到周日
original_data['hour_of_day'] = original_data['DateTime'].dt.hour       # 0-23 表示一天的小时
original_data['season'] = original_data['DateTime'].dt.month.apply(assign_season)  # 按中国大陆季节划分


# 定义特征列表
features = ['Temperature', 'Humidity', 'Wind_speed', 'Precipitation']

# 加载验证集上 MSE 最小的模型
model = MLPModelWithEmbedding(input_size=input_size, hidden_size=128, output_size=1)
model.load_state_dict(torch.load('best_val_loss_model.pth'))
print("已加载验证集上 MSE 表现最好的模型。")

# 修复异常数据
sequence_length = 24  # 使用前24个时间步
repaired_data = rolling_repair_anomalies(original_data, model, sequence_length, data_min, data_max, scale)

# 保存修复后的数据
repaired_data.to_csv('Repaired_Train_Data.csv', index=False)
print("修复后的数据已保存为 'Repaired_Train_Data.csv' 文件。")



[[0.19008575 0.7848808  0.24119395 ... 0.9200101  0.37501767 0.        ]
 [0.18914689 0.78880644 0.23723298 ... 0.9255564  0.35775924 0.        ]
 [0.18889654 0.80558956 0.24458905 ... 0.9287978  0.33059838 0.        ]
 ...
 [0.21649872 0.77371603 0.11161409 ... 0.7025859  0.10991654 0.03018868]
 [0.19684547 0.82320106 0.11387749 ... 0.69178134 0.13453105 0.03018868]
 [0.18626776 0.82543397 0.11741406 ... 0.69812    0.15730655 0.03018868]]
Epoch [1/100], Train Loss: 0.0014, Val Loss: 0.0012, MAPE: 6.52%
保存模型: 验证损失最小的模型 (Val Loss: 0.0012)
保存模型: 验证MAPE最小的模型 (MAPE: 6.52%)
Epoch [2/100], Train Loss: 0.0006, Val Loss: 0.0009, MAPE: 5.56%
保存模型: 验证损失最小的模型 (Val Loss: 0.0009)
保存模型: 验证MAPE最小的模型 (MAPE: 5.56%)
Epoch [3/100], Train Loss: 0.0004, Val Loss: 0.0008, MAPE: 5.39%
保存模型: 验证损失最小的模型 (Val Loss: 0.0008)
保存模型: 验证MAPE最小的模型 (MAPE: 5.39%)
Epoch [4/100], Train Loss: 0.0003, Val Loss: 0.0009, MAPE: 5.77%
Epoch [5/100], Train Loss: 0.0009, Val Loss: 0.0005, MAPE: 4.45%
保存模型: 验证损失最小的模型 (Val Loss: 0.0