In [1]:
import pandas as pd
import numpy as np

# ============================================
# 配置区
# ============================================

data_folder = 'data'
window_size = 12  # 输入窗口：过去12期
train_ratio = 0.8  # 训练集比例

# ============================================
# 主流程
# ============================================

# 1. 读取数据
df = pd.read_csv(f'{data_folder}/factor_longshort.csv')
print(f"原始数据形状: {df.shape}")

# 去掉日期列，只保留因子收益
factor_names = df.columns[1:].tolist()
data = df[factor_names].values
print(f"因子列表: {factor_names}")

# 2. 处理缺失值和异常值
data = np.nan_to_num(data, nan=0.0)  # 缺失值填0

# 3. 构造滑动窗口样本
X, y = [], []
for i in range(window_size, len(data)):
    X.append(data[i-window_size:i])  # 过去12期作为输入
    y.append(data[i])                 # 下一期作为输出

X = np.array(X)  # shape: (samples, window_size, num_factors)
y = np.array(y)  # shape: (samples, num_factors)

print(f"\n滑动窗口构造完成:")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

# 4. 划分训练集和测试集
split_idx = int(len(X) * train_ratio)
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

print(f"\n数据集划分:")
print(f"训练集: X_train {X_train.shape}, y_train {y_train.shape}")
print(f"测试集: X_test {X_test.shape}, y_test {y_test.shape}")

# 5. 保存数据
np.save(f'{data_folder}/X_train.npy', X_train)
np.save(f'{data_folder}/y_train.npy', y_train)
np.save(f'{data_folder}/X_test.npy', X_test)
np.save(f'{data_folder}/y_test.npy', y_test)

# 保存因子名称
with open(f'{data_folder}/factor_names.txt', 'w') as f:
    f.write(','.join(factor_names))

print(f"\n完成！已保存到 {data_folder}/")
print("- X_train.npy, y_train.npy")
print("- X_test.npy, y_test.npy")
print("- factor_names.txt")

原始数据形状: (120, 11)
因子列表: ['MOM20', 'MOM120', 'RSI', 'PB', 'PE', 'DIV', 'ROE', 'PROFIT_GR', 'VOL', 'BETA']

滑动窗口构造完成:
X shape: (108, 12, 10)
y shape: (108, 10)

数据集划分:
训练集: X_train (86, 12, 10), y_train (86, 10)
测试集: X_test (22, 12, 10), y_test (22, 10)

完成！已保存到 data/
- X_train.npy, y_train.npy
- X_test.npy, y_test.npy
- factor_names.txt


In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# ============================================
# 配置区
# ============================================

data_folder = 'data'
hidden_size = 64
num_layers = 2
learning_rate = 0.001
epochs = 100
batch_size = 16

# ============================================
# 1. 加载数据
# ============================================

X_train = np.load(f'{data_folder}/X_train.npy')
y_train = np.load(f'{data_folder}/y_train.npy')
X_test = np.load(f'{data_folder}/X_test.npy')
y_test = np.load(f'{data_folder}/y_test.npy')

with open(f'{data_folder}/factor_names.txt', 'r') as f:
    factor_names = f.read().strip().split(',')

num_factors = len(factor_names)
print(f"因子数量: {num_factors}")
print(f"训练集: {X_train.shape}, 测试集: {X_test.shape}")

# 转为Tensor
X_train_t = torch.FloatTensor(X_train)
y_train_t = torch.FloatTensor(y_train)
X_test_t = torch.FloatTensor(X_test)
y_test_t = torch.FloatTensor(y_test)

train_loader = DataLoader(
    TensorDataset(X_train_t, y_train_t), 
    batch_size=batch_size, 
    shuffle=True
)

# ============================================
# 2. 定义LSTM模型
# ============================================

class FactorLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(FactorLSTM, self).__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.2
        )
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        # x shape: (batch, seq_len, input_size)
        lstm_out, _ = self.lstm(x)
        # 取最后一个时间步
        out = self.fc(lstm_out[:, -1, :])
        return out

model = FactorLSTM(
    input_size=num_factors,
    hidden_size=hidden_size,
    num_layers=num_layers,
    output_size=num_factors
)

print(model)

# ============================================
# 3. 训练模型
# ============================================

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

print("\n开始训练...")
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    if (epoch + 1) % 10 == 0:
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

# ============================================
# 4. 测试集预测
# ============================================

model.eval()
with torch.no_grad():
    y_pred = model(X_test_t).numpy()

print(f"\n预测完成，预测结果 shape: {y_pred.shape}")

# ============================================
# 5. 保存预测结果
# ============================================

# 读取原始数据获取日期
df = pd.read_csv(f'{data_folder}/factor_longshort.csv')
window_size = 12
train_ratio = 0.8
total_samples = len(df) - window_size
split_idx = int(total_samples * train_ratio)

# 测试集对应的日期
test_dates = df['日期'].iloc[window_size + split_idx:].values

# 构建预测结果DataFrame
pred_columns = [f'{name}_pred' for name in factor_names]
pred_df = pd.DataFrame(y_pred, columns=pred_columns)
pred_df.insert(0, '日期', test_dates)

# 保存
pred_df.to_csv(f'{data_folder}/factor_prediction.csv', index=False, encoding='utf-8-sig')

print(f"\n已保存到 {data_folder}/factor_prediction.csv")
print("\n预测结果预览:")
print(pred_df.head())

# 保存模型
torch.save(model.state_dict(), f'{data_folder}/factor_lstm_model.pth')
print(f"\n模型已保存到 {data_folder}/factor_lstm_model.pth")

因子数量: 10
训练集: (86, 12, 10), 测试集: (22, 12, 10)
FactorLSTM(
  (lstm): LSTM(10, 64, num_layers=2, batch_first=True, dropout=0.2)
  (fc): Linear(in_features=64, out_features=10, bias=True)
)

开始训练...
Epoch 10/100, Loss: 32.6543
Epoch 20/100, Loss: 29.8312
Epoch 30/100, Loss: 27.0859
Epoch 40/100, Loss: 22.0712
Epoch 50/100, Loss: 18.6537
Epoch 60/100, Loss: 15.0597
Epoch 70/100, Loss: 13.3991
Epoch 80/100, Loss: 12.9479
Epoch 90/100, Loss: 12.0378
Epoch 100/100, Loss: 9.9292

预测完成，预测结果 shape: (22, 10)

已保存到 data/factor_prediction.csv

预测结果预览:
           日期  MOM20_pred  MOM120_pred  RSI_pred    PB_pred   PE_pred  \
0  2023-03-01   -3.505605    -3.394677 -2.102957   3.386605  3.749079   
1  2023-04-01   -2.381520    -2.273877 -1.335405   0.958414  1.704972   
2  2023-05-01    1.242640     1.370278  0.862291  -8.359152 -7.256452   
3  2023-06-01   -3.148259    -3.126997 -2.814463 -10.009925 -8.612885   
4  2023-07-01   -1.507943    -1.606989 -1.536146  -7.645605 -5.218080   

   DIV_pred  ROE