In [3]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

# 加载所有CSV文件
def load_data(data_folder):
    data_frames = []
    num = 0
    for file in tqdm(os.listdir(data_folder), desc="Loading CSV files"):
        if num >= 500:
            break
        if file.endswith('.csv'):
            df = pd.read_csv(os.path.join(data_folder, file), index_col=0, parse_dates=True)
            data_frames.append(df)
        num += 1
    return data_frames

# 数据预处理
def preprocess_data(df_list, time_window, future_window):
    x_data, y_data = [], []
    for df in tqdm(df_list, desc="Preprocessing data"):
        df = df[['open', 'close', 'high', 'low', 'volume', 'money', 'avg', 'high_limit', 'low_limit', 'pre_close', 'paused', 'factor', 'MA5', 'MA10', 'RSI', 'Williams %R']]
        
        # 处理 NaN 值
        df = df.ffill().bfill()
        
        scaler = MinMaxScaler()
        scaled_data = scaler.fit_transform(df)
        
        for i in range(len(scaled_data) - time_window - future_window):
            x_data.append(scaled_data[i:i + time_window])
            future_close = df.iloc[i + time_window + future_window]['close']
            current_close = df.iloc[i + time_window]['close']
            y_data.append((future_close - current_close) / current_close)  # 涨跌幅度百分比

    x_data = np.array(x_data)
    y_data = np.array(y_data)
    x_data = np.expand_dims(x_data, axis=-1)
    return x_data, y_data

# 检查数据加载和预处理部分
data_folder = '/root/autodl-tmp/processed_data'  # 数据文件夹路径
time_window = 45  # 时间窗口大小
future_window = 10  # 预测未来多少天的涨跌幅度

df_list = load_data(data_folder)
x_data, y_data = preprocess_data(df_list, time_window, future_window)

# 输出一些数据统计信息
print("x_data shape:", x_data.shape)
print("y_data shape:", y_data.shape)
print("NaN in x_data:", np.isnan(x_data).sum())
print("NaN in y_data:", np.isnan(y_data).sum())

# 如果存在 NaN 值，处理掉
if np.isnan(x_data).sum() > 0:
    x_data = x_data[~np.isnan(x_data).any(axis=(1, 2, 3))]
if np.isnan(y_data).sum() > 0:
    y_data = y_data[~np.isnan(y_data)]

print("x_data shape after removing NaN:", x_data.shape)
print("y_data shape after removing NaN:", y_data.shape)

Loading CSV files:  10%|▉         | 500/5133 [00:05<00:48, 96.50it/s] 
Preprocessing data: 100%|██████████| 500/500 [03:22<00:00,  2.47it/s]


x_data shape: (2218500, 45, 16, 1)
y_data shape: (2218500,)
NaN in x_data: 0
NaN in y_data: 0
x_data shape after removing NaN: (2218500, 45, 16, 1)
y_data shape after removing NaN: (2218500,)


In [None]:
with open('x_data.pkl', 'wb') as file:
    pickle.dump(x_data, file)
with open('y_data.pkl', 'wb') as file:
    pickle.dump(y_data, file)

In [None]:
import pickle
with open('x_data.pkl', 'rb') as file:
    x_data = pickle.load(file)
with open('y_data.pkl', 'rb') as file:
    y_data = pickle.load(file)

In [4]:
import tensorflow as tf
from tensorflow.keras import layers, models

# 残差块定义
def residual_block(x, filters, kernel_size=3, stride=1, activation='relu'):
    shortcut = x
    x = layers.Conv2D(filters, kernel_size, strides=stride, padding='same')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation(activation)(x)
    x = layers.Conv2D(filters, kernel_size, strides=1, padding='same')(x)
    x = layers.BatchNormalization()(x)
    
    # 如果输入和输出的维度不同，通过卷积调整维度
    if shortcut.shape[-1] != filters:
        shortcut = layers.Conv2D(filters, kernel_size=1, strides=stride, padding='same')(shortcut)
        shortcut = layers.BatchNormalization()(shortcut)
    
    x = layers.add([x, shortcut])
    x = layers.Activation(activation)(x)
    return x

# 构建残差网络模型
def build_resnet_model(input_shape):
    inputs = layers.Input(shape=input_shape)
    x = layers.Conv2D(32, (3, 3), activation='relu', padding='same')(inputs)
    x = layers.MaxPooling2D((2, 2), padding='same')(x)
    
    x = residual_block(x, 64)
    x = layers.MaxPooling2D((2, 2), padding='same')(x)
    
    x = residual_block(x, 128)
    x = layers.MaxPooling2D((2, 2), padding='same')(x)
    
    x = residual_block(x, 256)
    x = layers.MaxPooling2D((2, 2), padding='same')(x)
    
    x = layers.Flatten()(x)
    x = layers.Dense(64, activation='relu')(x)
    outputs = layers.Dense(1, activation='linear')(x)  # 预测涨跌幅度
    
    model = models.Model(inputs, outputs)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mse', metrics=['mae'])
    return model

input_shape = x_data.shape[1:]
model = build_resnet_model(input_shape)
model.summary()

2024-06-20 04:44:40.329265: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-20 04:44:40.387140: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-20 04:44:42.286707: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-06-20 04:44:42.326583: I tens

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 45, 16, 1)]          0         []                            
                                                                                                  
 conv2d (Conv2D)             (None, 45, 16, 32)           320       ['input_1[0][0]']             
                                                                                                  
 max_pooling2d (MaxPooling2  (None, 23, 8, 32)            0         ['conv2d[0][0]']              
 D)                                                                                               
                                                                                                  
 conv2d_1 (Conv2D)           (None, 23, 8, 64)            18496     ['max_pooling2d[0][0]']   

In [5]:
# 数据分割
split = int(0.8 * len(x_data))
x_train, x_test = x_data[:split], x_data[split:]
y_train, y_test = y_data[:split], y_data[split:]

print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

# 检查训练数据和测试数据中是否存在NaN值
print("NaN in x_train:", np.isnan(x_train).sum())
print("NaN in y_train:", np.isnan(y_train).sum())
print("NaN in x_test:", np.isnan(x_test).sum())
print("NaN in y_test:", np.isnan(y_test).sum())

# 训练模型并保存模型
history = model.fit(x_train, y_train, epochs=10, batch_size=64, validation_data=(x_test, y_test))
model.save("stock_prediction_resnet_model.h5")

x_train shape: (1774800, 45, 16, 1)
x_test shape: (443700, 45, 16, 1)
y_train shape: (1774800,)
y_test shape: (443700,)
NaN in x_train: 0
NaN in y_train: 0
NaN in x_test: 0
NaN in y_test: 0
Epoch 1/10


2024-06-20 04:45:13.621372: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8600
2024-06-20 04:45:14.150658: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:606] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2024-06-20 04:45:14.173015: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55ea8d5c9730 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-06-20 04:45:14.173046: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3090, Compute Capability 8.6
2024-06-20 04:45:14.178856: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-06-20 04:45:14.323803: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the p

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  saving_api.save_model(


OSError: [Errno 28] Can't synchronously write data (file write failed: time = Thu Jun 20 05:13:33 2024
, filename = 'stock_prediction_resnet_model.h5', file descriptor = 89, errno = 28, error message = 'No space left on device', buf = 0x55ea94aa2870, total write size = 2331808, bytes this sub-write = 2331808, bytes actually written = 18446744073709551615, offset = 0)

In [None]:
# 评估模型
test_loss, test_mae = model.evaluate(x_test, y_test)
print(f"测试损失: {test_loss}, 测试MAE: {test_mae}")

# 预测和可视化
predictions = model.predict(x_test)
plt.figure(figsize=(12, 6))
plt.plot(y_test, label='真实涨跌幅度')
plt.plot(predictions, label='预测涨跌幅度')
plt.legend()
plt.show()