In [40]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.models import load_model
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.models import Sequential



In [18]:
# Data loading and preprocessing
def _resample_bin(times, prices, target_count):
    """Stretch or shrink one bin's trades to exactly ``target_count`` points.

    ``times`` and ``prices`` must be equal-length float arrays holding at
    least two entries. Returns the adjusted ``(times, prices)`` pair.
    """
    # Too few points: split the widest time gap with a midpoint until full.
    while len(prices) < target_count:
        widest = int(np.argmax(np.diff(times)))
        mid_time = (times[widest] + times[widest + 1]) / 2
        mid_price = (prices[widest] + prices[widest + 1]) / 2
        times = np.insert(times, widest + 1, mid_time)
        prices = np.insert(prices, widest + 1, mid_price)

    # Too many points: merge the two closest neighbours into their midpoint.
    while len(prices) > target_count:
        closest = int(np.argmin(np.diff(times)))
        mid_time = (times[closest] + times[closest + 1]) / 2
        mid_price = (prices[closest] + prices[closest + 1]) / 2
        times = np.insert(np.delete(times, [closest, closest + 1]), closest, mid_time)
        prices = np.insert(np.delete(prices, [closest, closest + 1]), closest, mid_price)

    return times, prices


def _resample_tape(tape_df, target_count, bin_width=10):
    """Resample a tape so every ``bin_width``-wide time bin has ``target_count`` rows.

    Bins start at 0 and run up to the last bin that contains a trade. A bin
    with no trade, or with a single trade, is filled with the last known
    price at evenly spaced times; other bins go through ``_resample_bin``.
    The input frame is not modified.

    Assumes rows are already in chronological order (the "last known price"
    is simply the last row of each bin) -- TODO confirm against the data.

    Raises:
        ValueError: if ``tape_df`` has no rows.
    """
    if tape_df.empty:
        raise ValueError("Tape data is empty; nothing to resample.")

    # Keep the bin key in a separate Series so the caller's frame is untouched.
    bin_start = (tape_df['Time'] // bin_width * bin_width).astype(int)
    groups = {start: rows for start, rows in tape_df.groupby(bin_start)}

    last_known_price = float(tape_df['Weighted_Price'].iloc[0])
    time_chunks, price_chunks = [], []

    for start in range(0, int(bin_start.max()) + bin_width, bin_width):
        rows = groups.get(start)
        if rows is not None:
            # Force float: np.insert casts inserted values to the array dtype,
            # so integer columns would silently truncate the midpoints.
            times = rows['Time'].to_numpy(dtype=float)
            prices = rows['Weighted_Price'].to_numpy(dtype=float)
            last_known_price = prices[-1]

        if rows is None or len(prices) < 2:
            # Nothing to interpolate between: carry the last price forward.
            times = np.linspace(start, start + bin_width - 1, num=target_count)
            prices = np.full(target_count, last_known_price)
        else:
            times, prices = _resample_bin(times, prices, target_count)

        time_chunks.append(times)
        price_chunks.append(prices)

    resampled = pd.DataFrame({
        'Time': np.concatenate(time_chunks),
        'Weighted_Price': np.concatenate(price_chunks),
    })
    # Stable sort so rows sharing a timestamp keep their original order.
    return resampled.sort_values('Time', kind='stable')


def _future_windows(series, lookback, horizon, stride):
    """Collect the ``horizon`` values that follow each ``lookback``-long window.

    ``series`` is a 2-D array whose first column is read. Window starts
    advance by ``stride``; only starts with a complete ``horizon`` of future
    values are kept, so every returned row has the same length.
    """
    last_start = len(series) - lookback - horizon
    return np.array([
        series[start + lookback:start + lookback + horizon, 0]
        for start in range(0, last_start + 1, stride)
    ])


def load_and_process_data(lob_path, tape_path):
    """Build scaled feature/target arrays for one LOB file and its tape file.

    Steps:
      1. Read the LOB CSV and drop rows containing NaN.
      2. Read the tape CSV and resample it to 6 points per 10-unit time bin.
      3. For each 360-point look-back window (stride 6) collect the next
         6 prices as the target row.
      4. Min-max scale features and targets separately and save them as
         ``<stem>_features.npy`` / ``<stem>_targets.npy`` in the current
         working directory, where ``<stem>`` is the last 14 (LOB) or 15
         (tape) characters of the file name without its extension.

    The scalers are fitted per file and are not persisted, so the saved
    arrays cannot be mapped back to price units later. No check is made
    that features and targets have the same number of rows.

    Args:
        lob_path: path to the LOB feature CSV.
        tape_path: path to the tape CSV; must contain ``Time`` and
            ``Weighted_Price`` columns.

    Returns:
        ``(lob_features, tags)``: the cleaned, unscaled LOB DataFrame and the
        unscaled target array of shape ``(n_windows, 6)``.

    Raises:
        ValueError: if the tape is empty or too short to yield one window.
    """
    points_per_bin = 6   # resampled points per time bin
    time_step = 360      # look-back length, in resampled points
    horizon = 6          # future points predicted per window
    stride = 6           # spacing between consecutive window starts

    # Load LOB data
    lob_features = pd.read_csv(lob_path).dropna().reset_index(drop=True)

    # Load and resample tape data
    tape = pd.read_csv(tape_path)
    processed_df = _resample_tape(tape, points_per_bin)

    price = processed_df['Weighted_Price'].to_numpy().reshape(-1, 1)
    tags = _future_windows(price, time_step, horizon, stride)
    if tags.size == 0:
        raise ValueError(
            f"Tape '{tape_path}' yields {len(price)} resampled points; at least "
            f"{time_step + horizon} are needed to build one target window."
        )

    # Feature scaling: independent scalers for inputs and targets.
    features = MinMaxScaler().fit_transform(lob_features)
    targets = MinMaxScaler().fit_transform(tags)

    # Name outputs after the tail of the file name (e.g. '2025-01-02LOBs').
    lob_stem = os.path.splitext(os.path.basename(lob_path))[0][-14:]
    tape_stem = os.path.splitext(os.path.basename(tape_path))[0][-15:]
    np.save(lob_stem + '_features.npy', features)
    np.save(tape_stem + '_targets.npy', targets)

    return lob_features, tags


In [17]:
def preprocess(directory0, directory1):
    """Run ``load_and_process_data`` on every LOB/tape file pair.

    Files are paired by sorted name, so both directories must contain the
    same number of input files whose names sort in the same (date) order.

    Args:
        directory0: directory holding the LOB feature files.
        directory1: directory holding the tape files.

    Raises:
        ValueError: if the two directories hold a different number of
            input files, which would make the pairing wrong.
    """
    def _input_files(directory):
        # os.listdir returns names in arbitrary order, so sort explicitly.
        # Skip sub-directories and generated .npy arrays: later cells read
        # .npy files from same-named folders, so they may sit next to the
        # raw inputs and must not be parsed as CSV on a re-run.
        return sorted(
            name for name in os.listdir(directory)
            if os.path.isfile(os.path.join(directory, name))
            and not name.endswith('.npy')
        )

    files_lob = _input_files(directory0)
    files_tape = _input_files(directory1)

    if len(files_lob) != len(files_tape):
        raise ValueError(
            f"Cannot pair files: {len(files_lob)} LOB file(s) in '{directory0}' "
            f"vs {len(files_tape)} tape file(s) in '{directory1}'."
        )

    for file_lob, file_tape in zip(files_lob, files_tape):
        file_path0 = os.path.join(directory0, file_lob)
        file_path1 = os.path.join(directory1, file_tape)
        print("Processing: ", file_path0, " and ", file_path1)
        load_and_process_data(file_path0, file_path1)


In [19]:
# Preprocess the training split (LOB features + tapes)
preprocess(
    directory0="C:\\Users\\zoec0\\Desktop\\DSMP\\data\\train\\lobs",
    directory1="C:\\Users\\zoec0\\Desktop\\DSMP\\data\\train\\tapes",
)

Processing:  C:\Users\zoec0\Desktop\DSMP\data\train\lobs\Featured_Clean_UoB_Set01_2025-01-02LOBs.txt  and  C:\Users\zoec0\Desktop\DSMP\data\train\tapes\Clean_UoB_Set01_2025-01-02tapes.csv
Processing:  C:\Users\zoec0\Desktop\DSMP\data\train\lobs\Featured_Clean_UoB_Set01_2025-01-03LOBs.txt  and  C:\Users\zoec0\Desktop\DSMP\data\train\tapes\Clean_UoB_Set01_2025-01-03tapes.csv
Processing:  C:\Users\zoec0\Desktop\DSMP\data\train\lobs\Featured_Clean_UoB_Set01_2025-01-06LOBs.txt  and  C:\Users\zoec0\Desktop\DSMP\data\train\tapes\Clean_UoB_Set01_2025-01-06tapes.csv
Processing:  C:\Users\zoec0\Desktop\DSMP\data\train\lobs\Featured_Clean_UoB_Set01_2025-01-07LOBs.txt  and  C:\Users\zoec0\Desktop\DSMP\data\train\tapes\Clean_UoB_Set01_2025-01-07tapes.csv
Processing:  C:\Users\zoec0\Desktop\DSMP\data\train\lobs\Featured_Clean_UoB_Set01_2025-01-08LOBs.txt  and  C:\Users\zoec0\Desktop\DSMP\data\train\tapes\Clean_UoB_Set01_2025-01-08tapes.csv
Processing:  C:\Users\zoec0\Desktop\DSMP\data\train\lobs\Fea

In [20]:
# Preprocess the validation split (LOB features + tapes)
preprocess(
    directory0="C:\\Users\\zoec0\\Desktop\\DSMP\\data\\validate\\lobs",
    directory1="C:\\Users\\zoec0\\Desktop\\DSMP\\data\\validate\\tapes",
)

Processing:  C:\Users\zoec0\Desktop\DSMP\data\validate\lobs\Featured_Clean_UoB_Set01_2025-04-28LOBs.txt  and  C:\Users\zoec0\Desktop\DSMP\data\validate\tapes\Clean_UoB_Set01_2025-04-28tapes.csv
Processing:  C:\Users\zoec0\Desktop\DSMP\data\validate\lobs\Featured_Clean_UoB_Set01_2025-04-29LOBs.txt  and  C:\Users\zoec0\Desktop\DSMP\data\validate\tapes\Clean_UoB_Set01_2025-04-29tapes.csv
Processing:  C:\Users\zoec0\Desktop\DSMP\data\validate\lobs\Featured_Clean_UoB_Set01_2025-04-30LOBs.txt  and  C:\Users\zoec0\Desktop\DSMP\data\validate\tapes\Clean_UoB_Set01_2025-04-30tapes.csv
Processing:  C:\Users\zoec0\Desktop\DSMP\data\validate\lobs\Featured_Clean_UoB_Set01_2025-05-01LOBs.txt  and  C:\Users\zoec0\Desktop\DSMP\data\validate\tapes\Clean_UoB_Set01_2025-05-01tapes.csv
Processing:  C:\Users\zoec0\Desktop\DSMP\data\validate\lobs\Featured_Clean_UoB_Set01_2025-05-02LOBs.txt  and  C:\Users\zoec0\Desktop\DSMP\data\validate\tapes\Clean_UoB_Set01_2025-05-02tapes.csv
Processing:  C:\Users\zoec0\De

In [21]:
# Preprocess the test split (LOB features + tapes)
preprocess(
    directory0="C:\\Users\\zoec0\\Desktop\\DSMP\\data\\test\\lobs",
    directory1="C:\\Users\\zoec0\\Desktop\\DSMP\\data\\test\\tapes",
)

Processing:  C:\Users\zoec0\Desktop\DSMP\data\test\lobs\Featured_Clean_UoB_Set01_2025-05-28LOBs.txt  and  C:\Users\zoec0\Desktop\DSMP\data\test\tapes\Clean_UoB_Set01_2025-05-28tapes.csv
Processing:  C:\Users\zoec0\Desktop\DSMP\data\test\lobs\Featured_Clean_UoB_Set01_2025-05-29LOBs.txt  and  C:\Users\zoec0\Desktop\DSMP\data\test\tapes\Clean_UoB_Set01_2025-05-29tapes.csv
Processing:  C:\Users\zoec0\Desktop\DSMP\data\test\lobs\Featured_Clean_UoB_Set01_2025-05-30LOBs.txt  and  C:\Users\zoec0\Desktop\DSMP\data\test\tapes\Clean_UoB_Set01_2025-05-30tapes.csv
Processing:  C:\Users\zoec0\Desktop\DSMP\data\test\lobs\Featured_Clean_UoB_Set01_2025-06-02LOBs.txt  and  C:\Users\zoec0\Desktop\DSMP\data\test\tapes\Clean_UoB_Set01_2025-06-02tapes.csv
Processing:  C:\Users\zoec0\Desktop\DSMP\data\test\lobs\Featured_Clean_UoB_Set01_2025-06-03LOBs.txt  and  C:\Users\zoec0\Desktop\DSMP\data\test\tapes\Clean_UoB_Set01_2025-06-03tapes.csv
Processing:  C:\Users\zoec0\Desktop\DSMP\data\test\lobs\Featured_Clean

In [22]:
# Folders holding the per-day validation arrays
valid_LOBs_path = "validate/lobs"
valid_Tapes_path = "validate/tapes"


def _stack_npy_files(directory, file_names):
    """Load each .npy file in the given order and stack the rows into one DataFrame."""
    frames = [
        pd.DataFrame(np.load(os.path.join(directory, name)))
        for name in file_names
    ]
    # A single concat avoids re-copying the accumulated frame on every file.
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()


# Collect the .npy files. os.listdir order is arbitrary, so sort both lists:
# LOB rows and tape rows must be stacked in the same day order to stay aligned.
files_valid_LOBs = sorted(f for f in os.listdir(valid_LOBs_path) if f.endswith('.npy'))
files_valid_Tapes = sorted(f for f in os.listdir(valid_Tapes_path) if f.endswith('.npy'))

valid_LOBs_df = _stack_npy_files(valid_LOBs_path, files_valid_LOBs)
valid_Tapes_df = _stack_npy_files(valid_Tapes_path, files_valid_Tapes)

# Features and targets are used together as validation data, so fail early
# if they cannot be matched row for row.
if len(valid_LOBs_df) != len(valid_Tapes_df):
    raise ValueError(
        f"Validation features ({len(valid_LOBs_df)} rows) and targets "
        f"({len(valid_Tapes_df)} rows) are not aligned."
    )

# Save the stacked frames to disk
valid_LOBs_df.to_csv('valid_LOBs_df.csv')
valid_Tapes_df.to_csv('valid_Tapes_df.csv')


In [23]:
# Model definition: two stacked LSTM layers followed by a 6-unit dense head.
# The input shape is declared with an explicit Input layer; passing
# `input_shape=` to the first layer is the deprecated form Keras warns about.
model = Sequential([
    Input(shape=(248, 1)),  # (timesteps, features) -- presumably one step per LOB feature column; TODO confirm
    LSTM(50, return_sequences=True),
    Dropout(0.2),
    LSTM(30, return_sequences=False),
    Dense(6)  # output dimension is 6
])



  super().__init__(**kwargs)


In [24]:
# Compile: Adam optimizer minimizing mean squared error
model.compile(
    loss='mean_squared_error',
    optimizer='adam',
)


In [None]:
train_lobs_dir = 'train/lobs'
train_tapes_dir = 'train/tapes'

# Collect the LOB and tape array files; sorting both lists by name pairs
# each LOB file with the tape file of the same day.
lobs_files = sorted(f for f in os.listdir(train_lobs_dir) if f.endswith('.npy'))
tapes_files = sorted(f for f in os.listdir(train_tapes_dir) if f.endswith('.npy'))

# zip() would silently drop unpaired files, and with no files at all
# `history` below would never be assigned, so validate up front.
if not lobs_files or len(lobs_files) != len(tapes_files):
    raise ValueError(
        f"Expected matching, non-empty training sets; found {len(lobs_files)} "
        f"LOB file(s) and {len(tapes_files)} tape file(s)."
    )

# Validation data (built in the validation-loading cell)
valid_LOBs = valid_LOBs_df
valid_Tapes = valid_Tapes_df

# Validation loss recorded after each full pass over the training files
val_losses = []

# Output folders, created once rather than on every pass
model_directory = "saved_models"
os.makedirs(model_directory, exist_ok=True)
os.makedirs("checkpoints", exist_ok=True)

# Repeat the full pass over all training files fifty times
for epoch in range(50):
    # The outer pass number is baked into the file name here because every
    # fit() call below runs a single epoch, so a Keras-filled `{epoch}` field
    # could not tell the passes apart. `{val_loss}` is left for Keras to fill
    # in from the logs when it saves.
    checkpoint_path = f"checkpoints/ckpt-epoch{epoch + 1:04d}-val_loss{{val_loss:.4f}}.weights.h5"
    checkpoint_callback = ModelCheckpoint(
        filepath=checkpoint_path,       # where the checkpoint is written
        monitor='val_loss',             # monitored quantity
        save_best_only=False,           # keep every checkpoint, not only the best
        save_weights_only=True,         # weights only
        mode='auto',
        verbose=1)                      # log each save

    # Train on one file at a time
    for index, (lobs_file, tapes_file) in enumerate(zip(lobs_files, tapes_files)):
        lobs_path = os.path.join(train_lobs_dir, lobs_file)
        tapes_path = os.path.join(train_tapes_dir, tapes_file)

        # Load data
        X_train = np.load(lobs_path)
        y_train = np.load(tapes_path)

        # Validate (and checkpoint) only on the last file of the pass
        if index == len(lobs_files) - 1:
            history = model.fit(X_train, y_train,
                      epochs=1, batch_size=32,
                      validation_data=(valid_LOBs, valid_Tapes),
                      callbacks=[checkpoint_callback])
            print("Used validation data.")
        else:
            model.fit(X_train, y_train,
                      epochs=1, batch_size=32)
            print("Did not use validation data.")

    # Record this pass's validation loss
    if 'val_loss' in history.history:
        current_val_loss = history.history['val_loss'][0]
        val_losses.append(current_val_loss)

    # Save the full model; the file name uses the zero-based pass index
    model_filename = f"saved_models/model-epoch{epoch:03d}.keras"
    model.save(model_filename)

    print(f"Checkpoint and model saved at epoch {epoch + 1}")
    



In [44]:
# After training has finished, write the recorded validation losses to CSV.
# The row index (zero-based training pass) is stored in the 'epoch' column.
val_loss_df = pd.DataFrame(data=val_losses, columns=['val_loss'])
val_loss_df.to_csv(
    'validation_losses.csv',
    index_label='epoch',
)
print("Validation losses saved.")

Validation losses saved.


In [None]:
# Visualize val_loss and pick the best model



In [47]:
# Sub-folders holding the test arrays
test_lobs_dir = 'test/lobs'
test_tapes_dir = 'test/tapes'

# Folder for the saved figures; create it so savefig cannot fail on a fresh run
visualizations_dir = 'predictions_visualizations'
os.makedirs(visualizations_dir, exist_ok=True)

# Load the model to evaluate.
# NOTE(review): this is hard-coded to the first saved pass, not chosen from
# the recorded validation losses -- confirm this is the intended model.
model_path = 'saved_models/model-epoch000.keras'
model = load_model(model_path)

# Collect the LOB and tape array files; sorting both lists by name pairs
# each LOB file with the tape file of the same day.
lobs_test_files = sorted(f for f in os.listdir(test_lobs_dir) if f.endswith('.npy'))
tapes_test_files = sorted(f for f in os.listdir(test_tapes_dir) if f.endswith('.npy'))

# zip() would silently ignore unpaired files
if len(lobs_test_files) != len(tapes_test_files):
    raise ValueError(
        f"Found {len(lobs_test_files)} LOB file(s) but "
        f"{len(tapes_test_files)} tape file(s) in the test set."
    )

# Per-file metrics
mse_scores = []
rmse_scores = []

for lobs_test_file, tapes_test_file in zip(lobs_test_files, tapes_test_files):
    test_lobs_path = os.path.join(test_lobs_dir, lobs_test_file)
    test_tapes_path = os.path.join(test_tapes_dir, tapes_test_file)

    # Load data
    X_test = np.load(test_lobs_path)
    y_true = np.load(test_tapes_path)  # ground-truth targets

    # Predict
    y_pred = model.predict(X_test)

    # MSE / RMSE are computed on the min-max-scaled values: the scalers
    # fitted during preprocessing are not saved, so no inverse transform
    # back to price units is possible here.
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mse_scores.append(mse)
    rmse_scores.append(rmse)

    # Plot prediction against ground truth. Each target row holds 6
    # consecutive points and rows advance by 6 (see load_and_process_data),
    # so flattening gives one continuous series per curve instead of six
    # overlapping lines with duplicated legend entries.
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(np.ravel(y_true), label='Actual Weighted Price', color='blue', linewidth=1)
    ax.plot(np.ravel(y_pred), label='Predicted Weighted Price', color='red', linestyle='--', linewidth=1)
    ax.set_title(f'Prediction vs True Data for {tapes_test_file}')
    ax.set_xlabel('Time Steps')
    ax.set_ylabel('Weighted Price')
    ax.legend()
    ax.grid(True)

    # Save the figure, then close it to free memory
    save_path = os.path.join('predictions_visualizations', f"{tapes_test_file}.png")
    fig.savefig(save_path)
    plt.close(fig)

    print(f"Saved visualization for {tapes_test_file} at {save_path}")
    print(f"MSE for {tapes_test_file}: {mse}")
    print(f"RMSE for {tapes_test_file}: {rmse}")


# Overall metrics: mean of the per-file scores (NaN when no file was evaluated)
average_mse = np.mean(mse_scores) if mse_scores else float('nan')
average_rmse = np.mean(rmse_scores) if rmse_scores else float('nan')

print(f"Average MSE: {average_mse}")
print(f"Average RMSE: {average_rmse}")



[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 42ms/step
Saved visualization for 2025-05-28tapes_targets.npy at predictions_visualizations\2025-05-28tapes_targets.npy.png
MSE for 2025-05-28tapes_targets.npy: 0.02378392219774934
RMSE for 2025-05-28tapes_targets.npy: 0.1542203689457049
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 40ms/step
Saved visualization for 2025-05-29tapes_targets.npy at predictions_visualizations\2025-05-29tapes_targets.npy.png
MSE for 2025-05-29tapes_targets.npy: 0.04798995410022899
RMSE for 2025-05-29tapes_targets.npy: 0.21906609527772433
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 44ms/step
Saved visualization for 2025-05-30tapes_targets.npy at predictions_visualizations\2025-05-30tapes_targets.npy.png
MSE for 2025-05-30tapes_targets.npy: 0.03505416516712025
RMSE for 2025-05-30tapes_targets.npy: 0.18722757587257347
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 52ms/step
Saved visualizatio