# 读取数据并进行预处理

In [1]:
import torch

# 检查是否有可用的 GPU
gpu_available = torch.cuda.is_available()

if gpu_available:
    print("GPU is available.")
    print(f"Number of available GPUs: {torch.cuda.device_count()}")
    print(f"Current GPU device: {torch.cuda.current_device()}")
    print(f"GPU device name: {torch.cuda.get_device_name(torch.cuda.current_device())}")
else:
    print("GPU is not available.")

GPU is available.
Number of available GPUs: 1
Current GPU device: 0
GPU device name: NVIDIA GeForce RTX 4090 D


In [2]:
import torch
from operator import add
from functools import reduce
from itertools import chain
import numpy as np
import pandas as pd
from pprint import pprint

In [3]:
# 指数的值
data = pd.read_csv('../DL_new/DL_Markowitz/data/return_df.csv')
# 使用上一行的值填充缺失值
df_filled = data.ffill()
# 使用下一行的值填充仍然存在的缺失值
data = df_filled.bfill()

In [4]:
import os
import pandas as pd

def read_csv_files_to_dict(directory_path):
    csv_files = [f for f in os.listdir(directory_path) if f.endswith('.csv')]
    dataframes = {}

    for file in csv_files:
        file_path = os.path.join(directory_path, file)
        df = pd.read_csv(file_path)
        # 使用上一行的值填充缺失值
        df_filled = df.ffill()
        # 使用下一行的值填充仍然存在的缺失值
        df = df_filled.bfill()
        file_name_without_extension = os.path.splitext(file)[0]
        dataframes[file_name_without_extension] = df
    return dataframes

# 使用示例
directory_path = '../DL_new/DL_Markowitz/data/stocks/'
# 替换成实际的文件夹路径
csv_dataframes = read_csv_files_to_dict(directory_path)

# 打印字典中的keys作为验证
print(csv_dataframes.keys())

dict_keys(['AAPL', 'ABT', 'ACN', 'ADBE', 'AMZN', 'BAC', 'BMY', 'BRK-B', 'C', 'CMCSA', 'COST', 'CSCO', 'CVX', 'DHR', 'DIS', 'GS', 'HD', 'HON', 'INTC', 'INTU', 'JNJ', 'JPM', 'KO', 'LIN', 'LLY', 'MCD', 'MDT', 'MRK', 'MS', 'MSFT', 'NEE', 'NFLX', 'NKE', 'NVDA', 'ORCL', 'PEP', 'PFE', 'PG', 'QCOM', 'SBUX', 'T', 'TMO', 'TXN', 'UNH', 'UNP', 'UPS', 'VZ', 'WFC', 'WMT', 'XOM'])


In [5]:
def check_missing_values(stock_dict):
    for stock, df in stock_dict.items():
        if df.isnull().values.any():
            raise ValueError(f"DataFrame for stock {stock} contains missing values")
            
check_missing_values(csv_dataframes)

In [6]:
def align_dates(stock_dict):
    # 获取所有的股票代码
    all_stocks = list(stock_dict.keys())
    
    # 提取每个DataFrame的日期，并求交集
    common_dates = set(stock_dict[all_stocks[0]]['Date'])
    for stock in all_stocks[1:]:
        common_dates &= set(stock_dict[stock]['Date'])
    
    common_dates = sorted(list(common_dates))
    
    # 只保留共同存在的日期
    for stock in all_stocks:
        stock_dict[stock] = stock_dict[stock][stock_dict[stock]['Date'].isin(common_dates)].reset_index(drop=True)
    
    return stock_dict, common_dates

In [7]:
csv_dataframes, dates = align_dates(csv_dataframes)
print(len(dates))

5093


In [8]:
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

def create_4d_tensorX(stock_dict, n_days):
    all_stocks = list(stock_dict.keys())
    all_dates = sorted(stock_dict[all_stocks[0]]['Date'])
    print(len(all_dates))
    date_to_index = {date: idx for idx, date in enumerate(all_dates)}

    num_dates = len(all_dates)
    num_stocks = len(all_stocks)
    num_features = len(stock_dict[all_stocks[0]].columns) - 1

    tensor = np.zeros((num_dates, n_days, num_stocks, num_features))

    for stock_idx, stock in tqdm(enumerate(all_stocks), total=len(all_stocks), desc="Processing stocks"):
        df = stock_dict[stock]
        df.set_index('Date', inplace=True)
        df = df.sort_index()
        for date_idx, current_date in enumerate(all_dates):
            start_date_idx = max(0, date_idx - n_days + 1)
            relevant_dates = all_dates[start_date_idx:date_idx + 1]
            for n_day_idx, past_date in enumerate(relevant_dates):
                past_date_idx = date_to_index[past_date]
                tensor[date_idx, n_day_idx, stock_idx, :] = df.loc[past_date].values

    tensor = torch.tensor(tensor, dtype=torch.float32)
    return tensor

def add_batch_dimensionX(tensor, batch_size):
    # 获取当前张量的第一个维度的长度
    num_dates = tensor.shape[0]

    # 计算有多少个完整的批次
    num_batches = (num_dates + batch_size - 1) // batch_size

    # 初始化带有批次的新张量
    new_shape = (num_batches, batch_size) + tensor.shape[1:]
    batched_tensor = torch.zeros(new_shape, dtype=tensor.dtype)

    # 填充新的带有批次的张量
    for i in tqdm(range(num_batches), desc="Adding batch dimension"):
        start_idx = i * batch_size
        end_idx = min(start_idx + batch_size, num_dates)
        batched_tensor[i, :end_idx-start_idx] = tensor[start_idx:end_idx]

    return batched_tensor

In [9]:
import numpy as np
import torch
from tqdm import tqdm

def create_4d_tensorY(stock_dict, n_days, b):
    all_stocks = list(stock_dict.keys())
    all_dates = sorted(stock_dict[all_stocks[0]]['Date'])
    print(len(all_dates))
    date_to_index = {date: idx for idx, date in enumerate(all_dates)}

    num_dates = len(all_dates)
    num_stocks = len(all_stocks)

    tensor = np.zeros((num_dates, 1, num_stocks, b))

    def fill_nan_with_nearest(array):
        """填充NaN值，用临近的值进行填充"""
        nan_indices = np.isnan(array)
        if np.any(nan_indices):
            # 获取非NaN值的索引和值
            not_nan_indices = np.where(~nan_indices)[0]
            not_nan_values = array[not_nan_indices]
            # 使用最近的非NaN值进行填充
            nearest_values = np.interp(np.where(nan_indices)[0], not_nan_indices, not_nan_values)
            array[nan_indices] = nearest_values
        return array

    for stock_idx, stock in tqdm(enumerate(all_stocks), total=len(all_stocks), desc="Processing stocks"):
        df = stock_dict[stock]
        df.set_index('Date', inplace=True)
        df = df.sort_index()
        df['Avg'] = (df['Open'] + df['Close']) / 2
        df['Future_Return'] = (df['Avg'].shift(-n_days) - df['Avg']) / df['Avg']
        
        # 检查并填充NaN值
        df['Future_Return'] = fill_nan_with_nearest(df['Future_Return'].values)

        for date_idx, current_date in enumerate(all_dates):
            target_idx = date_idx + n_days
            if target_idx < len(df):
                future_return = df['Future_Return'].iloc[target_idx]
            else:
                future_return = df['Future_Return'].iloc[-1]
            tensor[date_idx, 0, stock_idx, :] = future_return

    tensor = torch.tensor(tensor, dtype=torch.float32)
    return tensor

def add_batch_dimensionY(tensor, batch_size):
    # 获取当前张量的第一个维度的长度
    num_dates = tensor.shape[0]

    # 计算有多少个完整的批次
    num_batches = (num_dates + batch_size - 1) // batch_size

    # 初始化带有批次的新张量
    new_shape = (num_batches, batch_size) + tensor.shape[1:]
    batched_tensor = torch.zeros(new_shape, dtype=tensor.dtype)

    # 填充新的带有批次的张量
    for I in tqdm(range(num_batches), desc="Adding batch dimension"):
        start_idx = I * batch_size
        end_idx = min(start_idx + batch_size, num_dates)
        batched_tensor[I, :end_idx-start_idx] = tensor[start_idx:end_idx]

    return batched_tensor

In [10]:
# 调整成四维张量
past_days = 63
future_days = 21
factors = 2
batch_size = 2
# 生成X的
tensorX = create_4d_tensorX(csv_dataframes, past_days)
dataX = add_batch_dimensionX(tensorX, batch_size)
# 重新读取数据
csv_dataframes = read_csv_files_to_dict(directory_path)
csv_dataframes, dates = align_dates(csv_dataframes)
# 生成Y的
tensorY = create_4d_tensorY(csv_dataframes, future_days, factors)
dataY = add_batch_dimensionY(tensorY, batch_size)

5093


Processing stocks: 100%|██████████| 50/50 [08:54<00:00, 10.69s/it]
Adding batch dimension: 100%|██████████| 2547/2547 [00:00<00:00, 63678.10it/s]


5093


Processing stocks: 100%|██████████| 50/50 [00:03<00:00, 15.12it/s]
Adding batch dimension: 100%|██████████| 2547/2547 [00:00<00:00, 231111.38it/s]


In [11]:
import pickle
with open('./new_data.pkl', 'wb') as f:
    pickle.dump((dataX, dataY), f)

In [12]:
dataX.size()

torch.Size([2547, 2, 63, 50, 6])

In [13]:
dataY.size()

torch.Size([2547, 2, 1, 50, 2])

In [14]:
csv_dataframes['AAPL']

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2002-05-23,0.436607,0.450714,0.429821,0.449643,0.381153,369398400.0
2002-05-24,0.446250,0.446250,0.427857,0.431250,0.365562,166174400.0
2002-05-27,0.446250,0.446250,0.427857,0.431250,0.364275,166174400.0
2002-05-28,0.423036,0.432143,0.418393,0.428214,0.362988,149716000.0
2002-05-29,0.427143,0.436429,0.418750,0.428214,0.362988,221793600.0
...,...,...,...,...,...,...
2021-11-23,161.119995,161.800003,159.059998,161.410004,159.586609,96041900.0
2021-11-24,160.750000,162.139999,159.639999,161.940002,160.110626,69463600.0
2021-11-25,160.750000,162.139999,159.639999,161.940002,157.574593,69463600.0
2021-11-26,159.570007,160.449997,156.360001,156.809998,155.038559,76959800.0
