# 读取数据并进行预处理

In [1]:
import torch

# 检查是否有可用的 GPU
gpu_available = torch.cuda.is_available()

if gpu_available:
    print("GPU is available.")
    print(f"Number of available GPUs: {torch.cuda.device_count()}")
    print(f"Current GPU device: {torch.cuda.current_device()}")
    print(f"GPU device name: {torch.cuda.get_device_name(torch.cuda.current_device())}")
else:
    print("GPU is not available.")

GPU is available.
Number of available GPUs: 1
Current GPU device: 0
GPU device name: NVIDIA GeForce RTX 4090 D


In [2]:
import torch
from operator import add
from functools import reduce
from itertools import chain
import numpy as np
import pandas as pd
from pprint import pprint

In [3]:
data = pd.read_csv('./output_data_with_dates.csv')
# 使用上一行的值填充缺失值
df_filled = data.ffill()
# 使用下一行的值填充仍然存在的缺失值
data = df_filled.bfill()

In [4]:
# 计算每列的缺失值数量
missing_values_per_column = data.isna().sum()

# 计算总的缺失值数量
total_missing_values = data.isna().sum().sum()

print("每列的缺失值数量:")
print(missing_values_per_column)
print("\n总的缺失值数量:")
print(total_missing_values)

每列的缺失值数量:
Date        0
Stock       0
Feature0    0
Feature1    0
Feature2    0
Feature3    0
Feature4    0
Feature5    0
Feature6    0
Feature7    0
Feature8    0
dtype: int64

总的缺失值数量:
0


In [5]:
import torch
from tqdm import tqdm

def df_to_tensor(data, n=60, batch_size=2, features=9):
    # 按date列分组
    grouped_by_date = data.groupby('Date')

    # 创建一个列表用于存储多级分组的数据
    nested_grouped_list = []
    stock_num = data['Stock'].nunique()
    
    for date_name, date_group in tqdm(grouped_by_date, desc="Processing dates"):
        grouped_by_stock = date_group.groupby('Stock')
        stock_list = []
        for stock_name, stock_group in grouped_by_stock:
            # 删除date和Stock列，只保留特征列
            stock_group = stock_group.drop(columns=['Date', 'Stock'])
            # 将每个分组的数据转换为嵌套列表，并添加到stock_list中
            stock_list.append([item for sublist in stock_group.values.tolist() for item in sublist])
        nested_grouped_list.append(stock_list)
    
    new_list = []
    
    # 使用 tqdm 为外层循环添加进度条
    for i in tqdm(range(len(nested_grouped_list) - n), desc="Processing nested grouped list"):
        new_list.append(nested_grouped_list[i:i+n])
    
    tensor = torch.tensor(new_list, dtype=torch.float32)
    # 按照 batch_size 分割 tensor
    num_batches = tensor.size(0) // batch_size
    batches = torch.split(tensor[:num_batches * batch_size], batch_size)
    
    # 将分割后的 batch 组合成新的 tensor
    batch_tensor = torch.stack(batches)
    
    return batch_tensor

In [6]:
# data = data[data['Date'] > '2022-06-01']
data['midPrice'] = (data['Feature6'] + data['Feature8']) / 2

In [7]:
# 定义一个函数来计算未来n天的收益率
def calculate_future_returns(df, n, old_name = 'Feature0', new_name = 'Future_Return'):
    df[new_name] = df.groupby('Stock')[old_name].transform(lambda x: x.shift(-n) - x)
    return df.dropna()

In [8]:
data

Unnamed: 0,Date,Stock,Feature0,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,Feature8,midPrice
0,2012-01-04,sh000300,1.029251,0.999804,0.000435,2298.750000,1.416968e+07,-0.009727,1.027297,7.894957e+10,1.000000,1.013649
1,2012-01-04,sh000903,1.026232,0.999768,0.000454,2201.890000,8.247408e+06,-0.009727,1.023416,4.350682e+10,1.000000,1.011708
2,2012-01-04,sh600006,1.025890,0.996764,0.000000,0.323625,2.823113e+00,-0.009727,1.019418,6.338548e+06,1.000000,1.009709
3,2012-01-04,sh600007,1.022346,0.998883,0.000000,0.111732,7.221252e+00,-0.009727,1.022346,4.877571e+06,1.000000,1.011173
4,2012-01-04,sh600009,1.022332,0.995864,0.000000,0.082713,1.025788e+01,-0.009727,1.020678,3.356048e+07,1.000000,1.010339
...,...,...,...,...,...,...,...,...,...,...,...,...
1708042,2022-12-30,sz300029,0.646826,0.622101,0.000000,0.079757,7.960000e+00,0.027097,0.632470,4.005062e+07,0.634862,0.633666
1708043,2022-12-30,sz300093,6.400431,6.209724,0.000000,0.146698,4.284000e+01,-0.001631,6.387228,1.472596e+07,6.284540,6.335884
1708044,2022-12-30,sz300111,0.601142,0.583197,0.179445,3.260000,0.000000e+00,-0.003058,0.590375,1.750337e+08,0.584992,0.587684
1708045,2022-12-30,sz300203,2.152766,2.106145,0.000000,0.063002,3.390000e+01,-0.002648,2.141426,3.094160e+07,2.135756,2.138591


In [9]:
new_return = calculate_future_returns(data, 21, old_name='midPrice', new_name='Return')
# 复制的次数
n = 1
for i in range(n):
    new_return[f'Return_copy_{i+1}'] = new_return['Return']
# 去除因为计算未来导致的所有包含Nan的行
new_return = new_return[new_return['Date'] < '2022-12-20']
# 计算每列的缺失值数量
missing_values_per_column = new_return.isna().sum()

# 计算总的缺失值数量
total_missing_values = new_return.isna().sum().sum()

print("每列的缺失值数量:")
print(missing_values_per_column)
print("\n总的缺失值数量:")
print(total_missing_values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_return[f'Return_copy_{i+1}'] = new_return['Return']


每列的缺失值数量:
Date             0
Stock            0
Feature0         0
Feature1         0
Feature2         0
Feature3         0
Feature4         0
Feature5         0
Feature6         0
Feature7         0
Feature8         0
midPrice         0
Return           0
Return_copy_1    0
dtype: int64

总的缺失值数量:
0


In [10]:
a = new_return[['Date', 'Stock', 'Feature0', 'Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5', 'Feature6', 'Feature7', 'Feature8']]
feature_tensors = df_to_tensor(a, batch_size = 2, n = 63)

Processing dates: 100%|██████████| 2652/2652 [07:07<00:00,  6.21it/s]
Processing nested grouped list: 100%|██████████| 2589/2589 [00:00<00:00, 417913.06it/s]


In [11]:
feature_tensors.size()
# 这里X
# 有多少个batch，batch_size，用了之前三十天的数据，639支股票，9个特征（给定，价格....)

torch.Size([1294, 2, 63, 639, 9])

In [12]:
name_list = ['Return', 'Date', 'Stock']
for i in range(n):
    name_list.append(f'Return_copy_{i+1}')
b = new_return[name_list]
stock_returns_tensor = df_to_tensor(b, features = n + 1, batch_size = 2, n = 1)

Processing dates: 100%|██████████| 2652/2652 [07:05<00:00,  6.23it/s]
Processing nested grouped list: 100%|██████████| 2651/2651 [00:00<00:00, 1216931.15it/s]


In [13]:
stock_returns_tensor.size()
# 这部分不太确定理解是不是对的，Y
# 有多少个batch，batch_size，现在的这一天（基准），639支股票，9个特征（和训练的因子数量一致）

torch.Size([1325, 2, 1, 639, 2])

In [14]:
import pickle
with open('./data.pkl', 'wb') as f:
    pickle.dump((feature_tensors, stock_returns_tensor), f)

In [15]:
# 历史数据-》未来的因子-》未来的协方差矩阵