In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import warnings
from scipy.stats import pearsonr
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
import lightgbm as lgb
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from scipy.stats import norm


warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
plt.rcParams['font.sans-serif'] = ['SimHei']  # 使用黑体，或者使用你系统上可用的其他字体
plt.rcParams['axes.unicode_minus'] = False  # 解决负号显示问题

In [2]:
#利用AskPrice1和BidPrice1计算价格的平均值，进而计算分钟频的收益率
def mid_price(df):
    mid = (df['AskPrice1'] + df['BidPrice1'])/2
    mid = mid.astype(float)
    return mid

In [3]:
def std_factor(X):
    X_std = (X - X.mean())/X.std()
    return X_std

In [4]:
def calculate_ic(factors, returns):
    # 计算皮尔逊相关系数
    ic, _ = pearsonr(factors, returns)
    return ic

In [5]:
# 定义函数：获取前一个交易日
def get_previous_trading_date(current_date, trading_dates):
    idx = trading_dates.index(current_date)
    return trading_dates[idx - 1] if idx > 0 else None

def process_day(group, trading_dates):
    """
    按照日盘和夜盘时间范围划分交易数据，并对每段数据进行前后切片处理，最后返回日盘和夜盘的 baskets 和 window_size。

    :param group: 分组后的 DataFrame，每组是一个交易日的数据。
    :param prev_period: 去掉每段数据前 prev_period 条记录。
    :param back_period: 去掉每段数据后 back_period 条记录。
    :param trading_dates: 所有交易日的序列，用于查找前一个交易日。
    :param V: 每个桶的目标交易量（可以根据需要调整）
    :return: 处理后的日盘和夜盘数据分别处理后的 DataFrame。
    """
    # 获取当前交易日和前一个交易日
    trading_date = group['trading_date'].iloc[0]
    previous_trading_date = get_previous_trading_date(trading_date, trading_dates)

    # 定义时间范围
    day_start = pd.to_datetime(f"{trading_date} 09:00:00")
    day_end = pd.to_datetime(f"{trading_date} 14:57:00")
    night_start = pd.to_datetime(f"{previous_trading_date} 21:00:00") if previous_trading_date else None
    night_end = (pd.to_datetime(f"{previous_trading_date} 02:27:00") + pd.Timedelta(days=1)) if previous_trading_date else None

    # 筛选日盘数据
    day_session = group[(group['exchange_time'] >= day_start) & (group['exchange_time'] <= day_end)]

    # 筛选夜盘数据（需要判断是否有前一个交易日）
    if night_start:
        night_session = group[(group['exchange_time'] >= night_start) & (group['exchange_time'] <= night_end)]
    else:
        night_session = pd.DataFrame()  # 如果没有前一个交易日，则夜盘数据为空

    # day_session_processed = day_session.iloc[prev_period:-back_period] 

    # # 如果夜盘数据存在，则进行处理；否则跳过
    # if not night_session.empty:
    #     night_session_processed = night_session.iloc[prev_period:-back_period]
    # else:
    #     night_session_processed = pd.DataFrame()  # 为空时可以直接跳过处理

    # # 在日盘和夜盘数据上分别应用桶划分逻辑
    # def bucketize_data(session_data):
    #     # 如果 session_data 不为空并且包含 'Volume' 列
    #     if session_data.empty or 'Volume' not in session_data.columns:
    #         return session_data  # 返回原始数据，因为数据为空或者没有 'Volume' 列

    #     # session_data['current_volume'] = session_data['Volume'].diff()
    #     # session_data['current_volume'].fillna(0, inplace=True)
    #     current_basket = 0  # 当前桶的交易量
    #     window_size = 0  # 当前桶的起始索引

    #     current_basket_list = []
    #     window_size_list = []


    #     # 遍历 `current_volume` 数据，将数据划分为多个桶
    #     for volume in session_data['current_volume'].values:
    #         current_basket += volume  # 累积当前桶的交易量
    #         # current_basket_list.append(current_basket)
    #         window_size += 1  # 增加窗口大小
    #         # window_size_list.append(window_size)

    #         # 当当前桶的交易量达到或超过目标交易量时
    #         if current_basket >= V:
    #             current_basket_list.extend([current_basket]*window_size)
    #             window_size_list.extend([window_size]*window_size)
    #             window_size = 0
    #             current_basket = 0

    #     current_basket_list.extend([current_basket]*window_size)
    #     window_size_list.extend([window_size]*window_size)

    #     session_data['basket_volume'] = current_basket_list
    #     session_data['window_size'] = window_size_list


    #     return session_data

    # # 分别对日盘和夜盘数据进行桶划分处理
    # day_session_processed = bucketize_data(day_session_processed)

    # if not night_session_processed.empty:
    #     night_session_processed = bucketize_data(night_session_processed)

    # 拼接处理后的日盘和夜盘数据
    # processed_data = pd.concat([night_session,day_session], ignore_index=True)

    # 返回处理后的日盘和夜盘数据
    return night_session,day_session

In [6]:
# table = pd.read_parquet(r"C:\Ter\source\sp")
# table['trading_date'] = pd.to_datetime(table['trading_date']) 

# # 设置开始和结束时间
# start_time = pd.to_datetime('2023-07-01')
# end_time = pd.to_datetime('2024-06-30')

# table = table[(table['trading_date'] >= start_time) & (table['trading_date'] <= end_time)]

# # 当 AskPrice1 为 0 时，用 BidPrice1 替换
# table['AskPrice1'] = table['AskPrice1'].where(table['AskPrice1'] != 0, table['BidPrice1'])

# # 当 AskPrice1 为 0 时，用 AskPrice1 替换
# table['BidPrice1'] = table['BidPrice1'].where(table['BidPrice1'] != 0, table['AskPrice1'])

# # 计算一些差分数据
# table['current_volume'] = table['Volume'].diff()
# table['Position Increase'] = table['OpenInterest'].diff()
# table['current_turnover'] = table['Turnover'].diff()
# table['current_avg_price'] = table['current_turnover']/(table['current_volume']*10)
# table['mid_price'] = mid_price(table)
# table['current_volume'].fillna(0,inplace=True)
# table['return'] = -mid_price(table).diff(-120)
# table['return'].fillna(0,inplace=True)

# table['buy_sell_signal'] = 0
# table.loc[table['last'] >= table['AskPrice1'].shift(1),'buy_sell_signal'] = 1
# table.loc[table['last'] <= table['BidPrice1'].shift(1),'buy_sell_signal'] = -1

In [7]:
symbol = 'ag'
df = pd.read_parquet(fr"C:\Ter\source\{symbol}")
df['trading_date'] = pd.to_datetime(df['trading_date']) 
# 设置开始和结束时间
start_time = pd.to_datetime('2023-07-01')
end_time = pd.to_datetime('2024-06-30')

table = df[(df['trading_date'] >= start_time) & (df['trading_date'] <= end_time)]

# 当 AskPrice1 为 0 时，用 BidPrice1 替换
table['AskPrice1'] = table['AskPrice1'].where(table['AskPrice1'] != 0, table['BidPrice1'])

# 当 AskPrice1 为 0 时，用 AskPrice1 替换
table['BidPrice1'] = table['BidPrice1'].where(table['BidPrice1'] != 0, table['AskPrice1'])

# 计算一些基本信息
table['mid_price'] = (table['BidPrice1'] + table['AskPrice1']) / 2



In [8]:
# 按 'trading_date' 分组，使用 process_day 处理每个分组
# new_table = table[~table['trading_date'].isin([pd.to_datetime('2023-12-08')])].reset_index(drop=True)
new_table = table.copy()
# new_table = table[~table['trading_date'].isin([pd.to_datetime('2024-04-08'),pd.to_datetime('2024-05-20')])]
unique_trading_dates = sorted(new_table['trading_date'].unique())
result = new_table.groupby('trading_date').apply(process_day,trading_dates=unique_trading_dates)
# 处理结果：返回每一天的日盘和夜盘数据以及合并后的结果
concat_results = []
day_results = []
night_results = []

from factor_install import *


for night_df,day_df in result:
    day_df['frt_120'] = -day_df['mid_price'].diff(-120)
    day_df['frt_120'].fillna(0,inplace=True)
    factor_install(day_df,symbol)
    if not night_df.empty:
        night_df['frt_120'] = -night_df['mid_price'].diff(-120)
        night_df['frt_120'].fillna(0,inplace=True)
        factor_install(night_df,symbol)

    concat_df = pd.concat([night_df,day_df],ignore_index=True)
    # day_results.append(day_df)
    # night_results.append(night_df)
    concat_results.append(concat_df)

# day_data = pd.concat(day_results,ignore_index=True)
# night_data = pd.concat(night_results,ignore_index=True)
train_data = pd.concat(concat_results, ignore_index=True)

In [9]:
train_data

Unnamed: 0,order_book_id,exchange_time,trading_date,open,last,high,low,prev_settlement,prev_close,Volume,OpenInterest,Turnover,limit_up,limit_down,AskPrice1,AskPrice2,AskPrice3,AskPrice4,AskPrice5,BidPrice1,BidPrice2,BidPrice3,BidPrice4,BidPrice5,AskVolume1,AskVolume2,AskVolume3,AskVolume4,AskVolume5,BidVolume1,BidVolume2,BidVolume3,BidVolume4,BidVolume5,change_rate,mid_price,frt_120,BAV_diff,BAV_diff_transform,Base_factor,ratio,pending_vol_ratio_factor,BidVolume,AskVolume,Bid_ratio,Ask_ratio,relative_vol_ratio_diff,relative_vol_ratio_imbalance,Bid_submit_price,Ask_submit_price,std_factor,submit_price_imbalance,Base_factor_lag10,BAV_diff_transform_lag10,pending_vol_ratio_factor_lag10,submit_price_imbalance_lag10,relative_vol_ratio_imbalance_lag10,Base_factor_lag16,BAV_diff_transform_lag16,pending_vol_ratio_factor_lag16,submit_price_imbalance_lag16,relative_vol_ratio_imbalance_lag16,Base_factor_lag26,BAV_diff_transform_lag26,pending_vol_ratio_factor_lag26,submit_price_imbalance_lag26,relative_vol_ratio_imbalance_lag26,Base_factor_lag40,BAV_diff_transform_lag40,pending_vol_ratio_factor_lag40,submit_price_imbalance_lag40,relative_vol_ratio_imbalance_lag40,Base_factor_lag50,BAV_diff_transform_lag50,pending_vol_ratio_factor_lag50,submit_price_imbalance_lag50,relative_vol_ratio_imbalance_lag50,Base_factor_lag60,BAV_diff_transform_lag60,pending_vol_ratio_factor_lag60,submit_price_imbalance_lag60,relative_vol_ratio_imbalance_lag60
0,AG2308,2023-07-03 09:00:00.500,2023-07-03,5428.0,5491.0,5502.0,5427.0,5432.0,5439.0,322670.0,260704.0,2.646210e+10,5920.0,4943.0,5492.0,5493.0,5494.0,5495.0,5496.0,5491.0,5490.0,5489.0,5488.0,5487.0,67.0,158.0,106.0,58.0,35.0,12.0,26.0,41.0,31.0,28.0,0.010862,5491.5,-7.0,,0.000000,-0.696203,0.151899,0.000000,138.0,424.0,11.500000,6.328358,-5.171642,0.000000,5488.731884,5493.613208,-0.654908,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,AG2308,2023-07-03 09:00:01.000,2023-07-03,5428.0,5490.0,5502.0,5427.0,5432.0,5439.0,322831.0,260719.0,2.647536e+10,5920.0,4943.0,5490.0,5491.0,5492.0,5493.0,5494.0,5489.0,5488.0,5487.0,5486.0,5485.0,2.0,235.0,119.0,163.0,104.0,3.0,27.0,64.0,48.0,89.0,0.010677,5489.5,-5.0,56.0,0.476645,0.200000,0.600000,0.000000,231.0,623.0,77.000000,311.500000,234.500000,0.707107,5486.164502,5492.211878,-0.623620,0.707107,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,AG2308,2023-07-03 09:00:01.500,2023-07-03,5428.0,5487.0,5502.0,5427.0,5432.0,5439.0,323027.0,260650.0,2.649150e+10,5920.0,4943.0,5489.0,5490.0,5491.0,5492.0,5493.0,5487.0,5486.0,5485.0,5484.0,5483.0,17.0,55.0,155.0,100.0,148.0,16.0,48.0,63.0,22.0,33.0,0.010125,5488.0,-4.5,-2.0,-0.033284,-0.030303,0.484848,-0.067050,182.0,475.0,11.375000,27.941176,16.566176,-0.493405,5484.956044,5491.646316,0.602360,1.154426,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,AG2308,2023-07-03 09:00:02.000,2023-07-03,5428.0,5486.0,5502.0,5427.0,5432.0,5439.0,323121.0,260629.0,2.649923e+10,5920.0,4943.0,5488.0,5489.0,5490.0,5491.0,5492.0,5486.0,5485.0,5484.0,5483.0,5482.0,32.0,40.0,58.0,100.0,82.0,28.0,63.0,22.0,33.0,273.0,0.009941,5487.0,-3.5,-3.0,-0.049834,-0.066667,0.466667,-0.433333,419.0,312.0,14.964286,9.750000,-5.214286,-0.560413,5482.902148,5490.512821,-0.585032,-0.440421,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,AG2308,2023-07-03 09:00:02.500,2023-07-03,5428.0,5488.0,5502.0,5427.0,5432.0,5439.0,323277.0,260569.0,2.651207e+10,5920.0,4943.0,5488.0,5489.0,5490.0,5491.0,5492.0,5487.0,5486.0,5485.0,5484.0,5483.0,1.0,43.0,48.0,116.0,83.0,1.0,31.0,63.0,22.0,33.0,0.010309,5487.5,-3.0,4.0,0.066274,-0.000000,0.500000,0.015152,150.0,291.0,150.000000,291.000000,141.000000,0.602578,5484.633333,5490.814433,0.447766,0.967917,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13651520,AG2408,2024-06-28 14:56:58.000,2024-06-28,7720.0,7766.0,7769.0,7633.0,7625.0,7629.0,1001382.0,315264.0,1.157123e+11,8387.0,6862.0,7767.0,7768.0,7769.0,7770.0,7771.0,7766.0,7765.0,7764.0,7763.0,7762.0,44.0,137.0,142.0,304.0,27.0,2.0,27.0,92.0,42.0,111.0,0.018492,7766.5,0.0,-125.0,-0.532594,-0.913043,0.043478,-0.720377,274.0,654.0,137.000000,14.863636,-122.136364,-2.785048,7763.149635,7769.203364,-0.647001,-2.452708,0.346245,-0.078829,-0.064723,-1.392332,-0.177357,0.489308,-0.093912,-0.047793,-0.864388,0.215511,0.465220,-0.053800,0.006898,-0.146992,0.209120,0.265055,-0.035303,0.000925,-0.004690,0.119478,0.133190,-0.030329,-0.015191,-0.105448,-0.026964,0.158309,-0.024898,-0.001199,0.033810,0.053772
13651521,AG2408,2024-06-28 14:56:58.500,2024-06-28,7720.0,7766.0,7769.0,7633.0,7625.0,7629.0,1001402.0,315281.0,1.157146e+11,8387.0,6862.0,7767.0,7768.0,7769.0,7770.0,7771.0,7765.0,7764.0,7763.0,7762.0,7761.0,29.0,137.0,117.0,282.0,93.0,26.0,93.0,51.0,137.0,135.0,0.018492,7766.0,0.0,39.0,0.430862,-0.054545,0.472727,-0.346465,442.0,658.0,17.000000,22.689655,5.689655,-0.374200,7762.407240,7769.414894,-0.177867,-1.051565,0.290106,-0.032415,-0.089398,-1.297788,-0.196811,0.440760,-0.070097,-0.066703,-0.979246,0.152002,0.454363,-0.040404,-0.014653,-0.179122,0.191889,0.287471,-0.012838,0.003165,0.004961,0.186678,0.135324,-0.012858,-0.011777,-0.111603,-0.024542,0.154370,-0.017162,-0.009679,0.012976,0.048833
13651522,AG2408,2024-06-28 14:56:59.000,2024-06-28,7720.0,7766.0,7769.0,7633.0,7625.0,7629.0,1001402.0,315281.0,1.157146e+11,8387.0,6862.0,7767.0,7768.0,7769.0,7770.0,7771.0,7765.0,7764.0,7763.0,7762.0,7761.0,39.0,134.0,120.0,283.0,58.0,28.0,92.0,52.0,106.0,140.0,0.018492,7766.0,0.0,-8.0,-0.130260,-0.164179,0.417910,0.274432,418.0,634.0,14.928571,16.256410,1.327839,-0.421317,7762.430622,7769.294953,-0.274425,-1.041326,0.222894,-0.029365,-0.061676,-1.228349,-0.223809,0.372137,-0.089210,-0.058585,-1.092860,-0.091083,0.433319,-0.053263,-0.008258,-0.229475,0.172153,0.305426,-0.019736,0.020186,-0.022865,0.206113,0.122707,-0.020592,-0.005241,-0.118589,-0.061338,0.149721,-0.017957,-0.004294,-0.006807,0.043710
13651523,AG2408,2024-06-28 14:56:59.500,2024-06-28,7720.0,7766.0,7769.0,7633.0,7625.0,7629.0,1001414.0,315282.0,1.157160e+11,8387.0,6862.0,7767.0,7768.0,7769.0,7770.0,7771.0,7765.0,7764.0,7763.0,7762.0,7761.0,39.0,134.0,150.0,283.0,58.0,13.0,93.0,57.0,121.0,140.0,0.018492,7766.0,0.0,-15.0,-0.231059,-0.500000,0.250000,-0.222727,424.0,664.0,32.615385,17.025641,-15.589744,-0.678733,7762.334906,7769.281627,-0.383468,-1.059528,0.111549,-0.067037,-0.089278,-1.209412,-0.292431,0.288930,-0.100536,-0.075915,-1.201720,-0.185541,0.400140,-0.060233,-0.019419,-0.263093,0.143282,0.268803,-0.037927,-0.006834,-0.106935,0.124621,0.128707,-0.016596,-0.003308,-0.110753,-0.039965,0.142837,-0.018883,-0.005766,-0.025063,0.036738


In [10]:
factor_columns = ['Base_factor','BAV_diff_transform','pending_vol_ratio_factor', 'submit_price_imbalance', 'relative_vol_ratio_imbalance']
new_factor_columns = []
for i in factor_columns:
    new_factor_columns.extend([col for col in train_data.columns if (i+'_lag') in col])
    
# mean = train_data[factor_columns].mean().to_numpy().flatten().tolist()
# std = train_data[factor_columns].std().to_numpy().flatten().tolist()

# # 保留小数点后 6 位
# mean = [round(num, 6) for num in mean]
# std = [round(num, 6) for num in std]

# print(f'均值为：{mean}')
# print(f'标准差为:{std}')

# train_data[factor_columns] = (train_data[factor_columns] - mean)/std

In [None]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from tqdm import tqdm


x = train_data[new_factor_columns].to_numpy()
y = train_data['frt_120'].to_numpy()

# 将 x 和 y 转换为 torch.Tensor 类型
x = torch.tensor(x, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32).unsqueeze(1)

# 定义线性回归模型，输入维度为 35，输出维度为 1，不使用偏置项
model = nn.Linear(30, 1, bias=False)

# 自定义 Cauchy 负对数似然损失函数
def cauchy_loss(outputs, targets):
    residuals = targets - outputs
    return torch.mean(torch.log(1 + (residuals ** 2)))

# 定义优化器
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# 训练模型
num_epochs = 1000
for epoch in tqdm(range(num_epochs), desc="Training Progress"):
    # 前向传播
    outputs = model(x)
    loss = cauchy_loss(outputs, y)

    # 反向传播和优化
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 50 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

# 由于有 35 个特征，无法直接绘制 35 维的拟合图，这里可以简单查看预测值和真实值的差异
predicted = model(x).detach().numpy()
print("真实值前几个样本：", y[:5].numpy().flatten())
print("预测值前几个样本：", predicted[:5].flatten())

# 获取并打印线性系数的值
linear_coefficients = model.weight.detach().numpy().flatten()
print("线性系数的值：", linear_coefficients)
    

Training Progress:   5%|▌         | 51/1000 [00:09<03:03,  5.17it/s]

Epoch [50/1000], Loss: 1.3409


Training Progress:  10%|█         | 100/1000 [00:19<02:52,  5.22it/s]

Epoch [100/1000], Loss: 1.3386


Training Progress:  15%|█▌        | 150/1000 [00:28<02:40,  5.30it/s]

Epoch [150/1000], Loss: 1.3373


Training Progress:  20%|██        | 201/1000 [00:38<02:34,  5.17it/s]

Epoch [200/1000], Loss: 1.3365


Training Progress:  25%|██▌       | 251/1000 [00:48<02:24,  5.18it/s]

Epoch [250/1000], Loss: 1.3360


Training Progress:  30%|███       | 301/1000 [00:57<02:14,  5.19it/s]

Epoch [300/1000], Loss: 1.3357


Training Progress:  35%|███▌      | 351/1000 [01:07<02:08,  5.07it/s]

Epoch [350/1000], Loss: 1.3355


Training Progress:  40%|████      | 400/1000 [01:16<01:53,  5.27it/s]

Epoch [400/1000], Loss: 1.3353


Training Progress:  45%|████▌     | 451/1000 [01:26<01:45,  5.21it/s]

Epoch [450/1000], Loss: 1.3351


Training Progress:  50%|█████     | 500/1000 [01:36<01:34,  5.32it/s]

Epoch [500/1000], Loss: 1.3350


Training Progress:  55%|█████▌    | 551/1000 [01:45<01:25,  5.22it/s]

Epoch [550/1000], Loss: 1.3349


Training Progress:  60%|██████    | 601/1000 [01:55<01:17,  5.17it/s]

Epoch [600/1000], Loss: 1.3348


Training Progress:  65%|██████▌   | 650/1000 [02:05<01:09,  5.06it/s]

Epoch [650/1000], Loss: 1.3347


Training Progress:  70%|███████   | 700/1000 [02:14<00:57,  5.25it/s]

Epoch [700/1000], Loss: 1.3347


Training Progress:  75%|███████▌  | 751/1000 [02:24<00:49,  5.00it/s]

Epoch [750/1000], Loss: 1.3346


Training Progress:  80%|████████  | 800/1000 [02:33<00:37,  5.29it/s]

Epoch [800/1000], Loss: 1.3346


Training Progress:  85%|████████▌ | 851/1000 [02:43<00:28,  5.15it/s]

Epoch [850/1000], Loss: 1.3345


Training Progress:  90%|█████████ | 900/1000 [02:53<00:19,  5.23it/s]

Epoch [900/1000], Loss: 1.3345


Training Progress:  95%|█████████▌| 951/1000 [03:03<00:09,  5.16it/s]

Epoch [950/1000], Loss: 1.3345


Training Progress: 100%|██████████| 1000/1000 [03:12<00:00,  5.19it/s]

Epoch [1000/1000], Loss: 1.3344
真实值前几个样本： [-7.  -5.  -4.5 -3.5 -3. ]
预测值前几个样本： [0. 0. 0. 0. 0.]
线性系数的值： [ 0.21024534 -0.065294    0.00250815  0.09216918  0.13430469 -0.07276703
  0.12419012 -0.05333526 -0.09427624  0.03724391 -0.14388846 -0.16608742
  0.06781021  0.06916014  0.07404608 -0.02166027 -0.13148908 -0.1448235
  0.14159355 -0.13188267  0.04771601  0.14434318  0.14616245 -0.10105506
  0.02139213 -0.04438195  0.16326898 -0.10763139 -0.09875619 -0.05785135]





In [27]:
coefficients = linear_coefficients.tolist()
print(coefficients)

[0.21024534106254578, -0.06529400497674942, 0.0025081525091081858, 0.09216918051242828, 0.1343046873807907, -0.07276702672243118, 0.12419012188911438, -0.05333526432514191, -0.09427624195814133, 0.03724391385912895, -0.143888458609581, -0.16608741879463196, 0.06781020760536194, 0.06916014105081558, 0.07404608279466629, -0.021660272032022476, -0.1314890831708908, -0.14482350647449493, 0.1415935456752777, -0.1318826675415039, 0.047716010361909866, 0.14434318244457245, 0.14616245031356812, -0.10105506330728531, 0.021392129361629486, -0.044381946325302124, 0.16326898336410522, -0.1076313927769661, -0.09875618666410446, -0.05785134807229042]


In [None]:
# import torch
# import torch.nn as nn
# import matplotlib.pyplot as plt
# from tqdm import tqdm
# from torch.utils.data import TensorDataset, DataLoader

# batch_size = 10240

# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# x = train_data[new_factor_columns].to_numpy()
# y = train_data['frt_120'].to_numpy()

# # 将 x 和 y 转换为 torch.Tensor 类型
# x = torch.tensor(x, dtype=torch.float32)
# y = torch.tensor(y, dtype=torch.float32).unsqueeze(1)

# dataset = TensorDataset(x, y)
# dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

# # 定义线性回归模型，输入维度为 35，输出维度为 1，不使用偏置项
# model = nn.Linear(30, 1)
# mode = model.to(device)
# model.train()

# # 自定义 Cauchy 负对数似然损失函数
# def cauchy_loss(outputs, targets):
#     residuals = targets - outputs
#     return torch.mean(torch.log(1 + (residuals ** 2)))

# def some_loss(outputs, targets):
#     return torch.mean(torch.abs((outputs-targets)*targets))

# criterion = some_loss

# # 定义优化器
# optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
# print(device)

# # 训练模型
# num_epochs = 1000
# for epoch in range(num_epochs):
#     running_loss = 0.0
#     for batch_x, batch_y in dataloader:
#         batch_x, batch_y = batch_x.to(device), batch_y.to(device)
#         # 前向传播
#         optimizer.zero_grad()
#         outputs = model(batch_x)
#         loss = criterion(outputs, batch_y)

#         # 反向传播和优化
#         loss.backward()
#         optimizer.step()
#         running_loss += loss.item()
#     print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(dataloader):.4f}')

# # 由于有 35 个特征，无法直接绘制 35 维的拟合图，这里可以简单查看预测值和真实值的差异
# predicted = model(x).detach().numpy()
# print("真实值前几个样本：", y[:5].numpy().flatten())
# print("预测值前几个样本：", predicted[:5].flatten())

# # 获取并打印线性系数的值
# linear_coefficients = model.weight.detach().numpy().flatten()
# print("线性系数的值：", linear_coefficients)

cuda:0
Epoch [1/1000], Loss: 12.8345
Epoch [2/1000], Loss: 12.8192
Epoch [3/1000], Loss: 12.8161


KeyboardInterrupt: 

In [None]:
# from scipy.optimize import minimize

# # 提取因子和目标变量
# X = train_data[new_factor_columns].to_numpy()
# y = train_data['frt_120'].to_numpy()

# # Cauchy负对数似然函数
# def cauchy_loss(params, X, y):
#     y_pred = np.dot(X, params)
#     residuals = y - y_pred
#     return np.sum(np.log(1 + (residuals ** 2)))

# # 初始参数（全为0）
# initial_params = np.zeros(X.shape[1])

# # 极大似然估计
# result = minimize(cauchy_loss, initial_params, args=(X, y))

# # 回归系数
# coefficients = pd.DataFrame(result.x, index=new_factor_columns, columns=['Coefficient']).to_numpy().flatten().tolist()
# coefficients = [round(num, 6) for num in coefficients]

# print(coefficients)

# # 计算因子值
# train_data['factor'] = np.dot(X, result.x)

In [None]:
# final_data['segment'] = (final_data['window_size'] != final_data['window_size'].shift()).cumsum()

# # 计算每段的统计特征（例如均值、标准差、最大值等）
# current_volume_stats = final_data.groupby('segment')['current_volume'].agg(
#     current_volume_mean='mean',
#     current_volume_std='std',
#     current_volume_min='min',
#     current_volume_max='max',
#     current_volume_count='count'
# ).reset_index()
# return_stats = final_data.groupby('segment')['return'].agg(
#     return_mean='mean',
#     return_std='std',
#     return_min='min',
#     return_max='max',
#     return_count='count'
# ).reset_index()

# # 将每个段的统计特征加入原始数据
# final_data = final_data.merge(current_volume_stats, on='segment', how='left')
# final_data = final_data.merge(return_stats, on='segment', how='left')

In [None]:
# grouped = final_data.groupby(final_data['segment'])
# max_volume_rows = grouped.apply(lambda x: x.loc[x['current_volume'].idxmax()])
# def calculate_sign(group):
#     # 找到 current_volume 最大的行
#     max_row = group.loc[group['current_volume'].idxmax()]
    
#     # 获取该行的 current_avg_price, AskPrice1 和 BidPrice1 的值
#     current_avg_price = max_row['current_avg_price']
#     AskPrice1_prev = group['AskPrice1'].shift(1)
#     BidPrice1_prev = group['BidPrice1'].shift(1)
    
#     # 计算 sign 值
#     sign_value = np.sign(2 * current_avg_price - AskPrice1_prev - BidPrice1_prev)
    
#     # 将 sign 值赋给当前组的所有行
#     group['sign'] = sign_value
#     group['sign'].fillna(method='bfill', inplace=True)
#     return group

# final_data = final_data.groupby('segment').apply(calculate_sign)

In [None]:
# grouped = final_data.groupby(final_data['segment'])
# max_volume_rows = grouped.apply(lambda x: x.loc[x['current_volume'].idxmax()])
# def calculate_sign(group):
#     # 找到 current_volume 最大的行
#     max_row = group.loc[group['current_volume'].idxmax()]
#     # 计算 sign 值
#     factor_value = max_row['buy_sell_signal']
#     # 将 sign 值赋给当前组的所有行
#     group['factor'] = factor_value
#     group['factor'].fillna(method='bfill', inplace=True)
#     return group

# df = final_data.groupby('segment').apply(calculate_sign)
# df.rename(columns={'segment':'segment_columns'},inplace=True)
# last_rows = df.groupby('segment').tail(1)

In [None]:
# last_rows[['factor','return']].corr()

In [None]:
# final_data['factor'] = final_data['sign'] * final_data['current_volume']
# final_data[['factor','return']].corr()

In [None]:
# max_volume_rows['sign'] = np.sign(2*max_volume_rows['current_avg_price']-max_volume_rows['AskPrice1']-max_volume_rows['BidPrice1'])
# max_volume_rows['factor'] = max_volume_rows['sign']*max_volume_rows['current_volume']
# max_volume_rows[['factor','return']].corr()