In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm  # 添加 tqdm 进度条
import matplotlib.pyplot as plt

# 加载所有CSV文件
def load_data(data_folder):
    data_frames = []
    num = 0
    for file in tqdm(os.listdir(data_folder), desc="Loading CSV files"):
        if num >= 1000:
            break
        if file.endswith('.csv'):
            df = pd.read_csv(os.path.join(data_folder, file), index_col=0, parse_dates=True)
            data_frames.append(df)
        num += 1
    return data_frames

# 数据预处理
def preprocess_data(df_list, time_window, future_window):
    x_data, y_data = [], []
    for df in tqdm(df_list, desc="Preprocessing data"):
        df = df[['open', 'close', 'high', 'low', 'volume', 'money', 'avg', 'high_limit', 'low_limit', 'pre_close', 'paused', 'factor', 'MA5', 'MA10', 'RSI', 'Williams %R']]
        
        # 处理 NaN 值
        df = df.ffill().bfill()
        
        scaler = MinMaxScaler()
        scaled_data = scaler.fit_transform(df)
        
        for i in range(len(scaled_data) - time_window - future_window):
            x_data.append(scaled_data[i:i + time_window])
            future_close = df.iloc[i + time_window + future_window]['close']
            current_close = df.iloc[i + time_window]['close']
            y_data.append((future_close - current_close) / current_close)  # 涨跌幅度百分比

    x_data = np.array(x_data)
    y_data = np.array(y_data)
    x_data = np.expand_dims(x_data, axis=-1)
    return x_data, y_data

# 检查数据加载和预处理部分
data_folder = '/kaggle/input/stockchina/processed_data'  # 数据文件夹路径
time_window = 30  # 时间窗口大小
future_window = 1  # 预测未来多少天的涨跌幅度

df_list = load_data(data_folder)
x_data, y_data = preprocess_data(df_list, time_window, future_window)

# 输出一些数据统计信息
print("x_data shape:", x_data.shape)
print("y_data shape:", y_data.shape)
print("NaN in x_data:", np.isnan(x_data).sum())
print("NaN in y_data:", np.isnan(y_data).sum())

# 如果存在 NaN 值，处理掉
if np.isnan(x_data).sum() > 0:
    x_data = x_data[~np.isnan(x_data).any(axis=(1, 2, 3))]
if np.isnan(y_data).sum() > 0:
    y_data = y_data[~np.isnan(y_data)]

print("x_data shape after removing NaN:", x_data.shape)
print("y_data shape after removing NaN:", y_data.shape)

Loading CSV files:  19%|█▉        | 1000/5133 [00:22<01:34, 43.68it/s]
Preprocessing data: 100%|██████████| 1000/1000 [06:44<00:00,  2.47it/s]


x_data shape: (4461000, 30, 16, 1)
y_data shape: (4461000,)
NaN in x_data: 0
NaN in y_data: 0
x_data shape after removing NaN: (4461000, 30, 16, 1)
y_data shape after removing NaN: (4461000,)


In [2]:
import tensorflow as tf
from tensorflow.keras import layers, models

# 构建卷积神经网络模型
def build_cnn_model(input_shape):
    model = models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', padding='same', input_shape=input_shape),
        layers.MaxPooling2D((2, 2), padding='same'),
        layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2), padding='same'),
        layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2), padding='same'),
        layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2), padding='same'),
        layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2), padding='same'),
        layers.Conv2D(256, (3, 3), activation='relu', padding='same'),
        layers.MaxPooling2D((2, 2), padding='same'),
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(1, activation='linear')  # 预测涨跌幅度
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mse', metrics=['mae'])
    return model

input_shape = x_data.shape[1:]
model = build_cnn_model(input_shape)
model.summary()

2024-06-19 19:01:29.950832: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-19 19:01:29.950962: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-19 19:01:30.064107: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# 数据分割
split = int(0.8 * len(x_data))
x_train, x_test = x_data[:split], x_data[split:]
y_train, y_test = y_data[:split], y_data[split:]

print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

# 检查训练数据和测试数据中是否存在NaN值
print("NaN in x_train:", np.isnan(x_train).sum())
print("NaN in y_train:", np.isnan(y_train).sum())
print("NaN in x_test:", np.isnan(x_test).sum())
print("NaN in y_test:", np.isnan(y_test).sum())

# 训练模型并保存模型
history = model.fit(x_train, y_train, epochs=10, batch_size=8192, validation_data=(x_test, y_test))
model.save("stock_prediction_cnn_model.h5")

x_train shape: (3568800, 30, 16, 1)
x_test shape: (892200, 30, 16, 1)
y_train shape: (3568800,)
y_test shape: (892200,)
NaN in x_train: 0
NaN in y_train: 0
NaN in x_test: 0
NaN in y_test: 0


In [None]:
# 评估模型
test_loss, test_mae = model.evaluate(x_test, y_test)
print(f"测试损失: {test_loss}, 测试MAE: {test_mae}")

# 预测和可视化
predictions = model.predict(x_test)
plt.figure(figsize=(12, 6), dpi=1600)
plt.plot(y_test, label='Real Gains and Losses')
plt.plot(predictions, label='Val Gains and Losses')
plt.legend()
plt.show()