In [27]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

In [83]:


# 数据加载和预处理
file_path = "Dataset/Processed_NFLX.csv"
data = pd.read_csv(file_path)
numeric_columns = data.columns[1:]
data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric, errors="coerce")
data.dropna(inplace=True)

# 数据归一化
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data.iloc[:, 1:])
data_scaled = pd.DataFrame(data_scaled, columns=data.columns[1:])

# 数据集划分
train_size = int(len(data_scaled) * 0.7)
val_size = int(len(data_scaled) * 0.15)

train_data = data_scaled[:train_size]
val_data = data_scaled[train_size:train_size + val_size]
test_data = data_scaled[train_size + val_size:]

# 创建时间序列数据
def create_sequences(data, time_steps, target_column="Close"):
    sequences = []
    for i in range(len(data) - time_steps):
        seq = data.iloc[i:i + time_steps].values
        label = data.iloc[i + time_steps][target_column]
        sequences.append((seq, label))
    return sequences

# 使用最佳 time_steps 创建数据集
time_steps = 5
full_train_data = pd.concat([train_data, val_data])
full_train_sequences = create_sequences(full_train_data, time_steps, target_column="Close")
test_sequences = create_sequences(test_data, time_steps, target_column="Close")

full_train_loader = DataLoader(TensorDataset(
    torch.tensor([seq[0] for seq in full_train_sequences], dtype=torch.float32),
    torch.tensor([seq[1] for seq in full_train_sequences], dtype=torch.float32).unsqueeze(-1)
), batch_size=32, shuffle=True)

test_loader = DataLoader(TensorDataset(
    torch.tensor([seq[0] for seq in test_sequences], dtype=torch.float32),
    torch.tensor([seq[1] for seq in test_sequences], dtype=torch.float32).unsqueeze(-1)
), batch_size=32, shuffle=False)

# 模型定义
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, model_type="LSTM"):
        super(RNNModel, self).__init__()
        self.rnn = {
            "LSTM": nn.LSTM(input_size, hidden_size, num_layers, batch_first=True),
            "GRU": nn.GRU(input_size, hidden_size, num_layers, batch_first=True),
            "RNN": nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        }[model_type]
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        rnn_out, _ = self.rnn(x)
        return self.fc(rnn_out[:, -1, :])  # 取最后一个时间步的输出

# 模型训练函数
def train_final_model_with_logging(model, train_loader, test_loader, criterion, optimizer, epochs=200, device="cpu"):
    model.to(device)
    results = {"epoch": [], "train_loss": [], "test_loss": []}

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            y_pred = model(x_batch)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss /= len(train_loader)

        model.eval()
        test_loss = 0.0
        with torch.no_grad():
            for x_batch, y_batch in test_loader:
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)
                y_pred = model(x_batch)
                loss = criterion(y_pred, y_batch)
                test_loss += loss.item()
        test_loss /= len(test_loader)

        results["epoch"].append(epoch + 1)
        results["train_loss"].append(train_loss)
        results["test_loss"].append(test_loss)

        if (epoch + 1) % 10 == 0 or epoch == 0:
            print(f"Epoch {epoch + 1}: Train Loss = {train_loss:.4f}, Test Loss = {test_loss:.4f}")

    return results

# 模型评估函数
def evaluate_model(model, test_loader, criterion, device="cpu"):
    model.to(device)
    model.eval()
    predictions, actuals = [], []
    with torch.no_grad():
        for x_batch, y_batch in test_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            y_pred = model(x_batch)
            predictions.append(y_pred.cpu().numpy().squeeze())
            actuals.append(y_batch.cpu().numpy().squeeze())

    predictions = np.concatenate(predictions, axis=0)
    actuals = np.concatenate(actuals, axis=0)

    test_loss = mean_squared_error(actuals, predictions)
    print(f"Test Loss: {test_loss:.4f}")
    return test_loss, actuals, predictions

# 保存预测值到单一文件格式
def save_predictions_by_format(actuals, predictions_dict, output_folder, data, scaler):
    # 创建与原始数据形状一致的零矩阵
    dummy_data = np.zeros((len(actuals), scaler.min_.shape[0]))
    # 仅填充目标列的实际值
    target_index = data.columns.tolist().index("Close") - 1  # 获取目标列在归一化数据中的索引
    dummy_data[:, target_index] = actuals

    # 反归一化实际值
    actuals_actual_values = scaler.inverse_transform(dummy_data)[:, target_index]

    # 初始化保存数据
    data_dict = {"Test Day": np.arange(1, len(actuals) + 1), "Actual": actuals_actual_values}

    # 反归一化预测值
    for model_type, preds in predictions_dict.items():
        dummy_data[:, target_index] = preds
        preds_actual_values = scaler.inverse_transform(dummy_data)[:, target_index]
        data_dict[f"Predicted ({model_type})"] = preds_actual_values

    # 保存为 CSV 文件
    predictions_df = pd.DataFrame(data_dict)
    prediction_file = os.path.join(output_folder, "predictions.csv")
    predictions_df.to_csv(prediction_file, index=False)
    print(f"Predictions saved to '{prediction_file}'")

# 评估指标计算
def calculate_metrics(actual, predicted):
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    mae = mean_absolute_error(actual, predicted)
    r2 = r2_score(actual, predicted)
    rrmse = rmse / np.mean(actual)
    mda = np.mean(np.sign(actual[1:] - actual[:-1]) == np.sign(predicted[1:] - predicted[:-1]))
    return rmse, mae, r2, rrmse, mda

# 创建保存结果的文件夹
output_folder = "different_model"
os.makedirs(output_folder, exist_ok=True)

# 超参数设置
input_size = train_data.shape[1]
hidden_size = 32
num_layers = 1
output_size = 1
learning_rate = 0.0001
epochs = 200
device = "cuda" if torch.cuda.is_available() else "cpu"

metrics_list = []
predictions_dict = {}

# 训练模型并保存结果
for model_type in ["LSTM", "GRU", "RNN"]:
    print(f"\nTraining {model_type} model on full training data...")
    model = RNNModel(input_size, hidden_size, num_layers, output_size, model_type)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    results = train_final_model_with_logging(model, full_train_loader, test_loader, criterion, optimizer, epochs, device)

    results_df = pd.DataFrame(results)
    result_file = os.path.join(output_folder, f"{model_type}_training_log.csv")
    results_df.to_csv(result_file, index=False)
    print(f"Training log saved to '{result_file}'")

    _, actuals, predictions = evaluate_model(model, test_loader, criterion, device)
    predictions_dict[model_type] = predictions

    rmse, mae, r2, rrmse, mda = calculate_metrics(actuals, predictions)
    metrics_list.append({"model": model_type, "RMSE": rmse, "MAE": mae, "R²": r2, "RRMSE": rrmse, "MDA": mda})

# 保存所有预测值到单一文件
save_predictions_by_format(actuals, predictions_dict, output_folder, data, scaler)

# 保存评估指标到 CSV

      
metrics_df = pd.DataFrame(metrics_list)
metrics_file = os.path.join(output_folder, "model_metrics.csv")
metrics_df.to_csv(metrics_file, index=False)
print(f"Metrics saved to '{metrics_file}'")

print("All models trained and results saved.")



Training LSTM model on full training data...
Epoch 1: Train Loss = 0.2213, Test Loss = 0.5750
Epoch 10: Train Loss = 0.0019, Test Loss = 0.0065
Epoch 20: Train Loss = 0.0013, Test Loss = 0.0045
Epoch 30: Train Loss = 0.0013, Test Loss = 0.0042
Epoch 40: Train Loss = 0.0012, Test Loss = 0.0041
Epoch 50: Train Loss = 0.0011, Test Loss = 0.0041
Epoch 60: Train Loss = 0.0010, Test Loss = 0.0040
Epoch 70: Train Loss = 0.0010, Test Loss = 0.0039
Epoch 80: Train Loss = 0.0010, Test Loss = 0.0039
Epoch 90: Train Loss = 0.0009, Test Loss = 0.0038
Epoch 100: Train Loss = 0.0010, Test Loss = 0.0038
Epoch 110: Train Loss = 0.0009, Test Loss = 0.0038
Epoch 120: Train Loss = 0.0009, Test Loss = 0.0037
Epoch 130: Train Loss = 0.0009, Test Loss = 0.0036
Epoch 140: Train Loss = 0.0009, Test Loss = 0.0036
Epoch 150: Train Loss = 0.0009, Test Loss = 0.0037
Epoch 160: Train Loss = 0.0008, Test Loss = 0.0035
Epoch 170: Train Loss = 0.0008, Test Loss = 0.0035
Epoch 180: Train Loss = 0.0008, Test Loss = 0.0