In [4]:
import argparse
from datetime import datetime
import json
import numpy as np
import operator
import os
import pandas as pd

#  读取股票数据，生成特征
class EOD_Preprocessor:
    def __init__(self, data_path, market_name):
        self.data_path = data_path # /Users/liupeng/Desktop/DSAA 5020 Final Project/TGCN_with_latest_data/2023.12.14(Data acquisition& Preprocessing)
        self.date_format = '%Y-%m-%d' 
        self.market_name = market_name # S&P500

    # def _read_EOD_data(self):
    #     self.data_EOD = []
    #     for index, ticker in enumerate(self.tickers):
    #         single_EOD = np.genfromtxt(
    #             os.path.join(self.data_path,"S&P500_original_data", ticker +
    #                          '_stock_data.csv'), dtype=str, delimiter=',',
    #             skip_header=True
    #         )
    #         self.data_EOD.append(single_EOD)
    #         # if index > 99:
    #         #     break
    #     print('#stocks\' EOD data readin:', len(self.data_EOD))
    #     assert len(self.tickers) == len(self.data_EOD), 'length of tickers ' \
    #                                                     'and stocks not match'

    def _read_EOD_data(self):
        self.data_EOD = []
        for ticker in self.tickers:
            file_path = os.path.join(self.data_path, "S&P500_original_data", f"{ticker}_stock_data.csv")
            single_EOD = pd.read_csv(file_path)
            self.data_EOD.append(single_EOD.values)
        print('#stocks\' EOD data readin:', len(self.data_EOD))
        assert len(self.tickers) == len(self.data_EOD), 'length of tickers and stocks not match'

    # def _read_tickers(self, ticker_fname):
    #     self.tickers = np.genfromtxt(ticker_fname, dtype=str, delimiter='\t',
    #                                  skip_header=True)[:, 0]
    def _read_tickers(self):
        tickers_path = os.path.join(self.data_path, "Relation_Data(478)", "sp500_tickers_478.csv")
        self.tickers = pd.read_csv(tickers_path)['Ticker'].tolist()

    def _transfer_EOD_str(self, selected_EOD_str, tra_date_index):
        selected_EOD = np.zeros(selected_EOD_str.shape, dtype=float)
        for row, daily_EOD in enumerate(selected_EOD_str):
            # print('daily_EOD:', daily_EOD)
            date_str = daily_EOD[0]
            # date_str = date_str.replace('-04:00', '')
            # print('date_str:', date_str)
            # print('tra_date_index:', tra_date_index)
            selected_EOD[row][0] = tra_date_index[date_str]
            # print('selected_EOD[row][0]:', selected_EOD[row][0])
            for col in range(1, selected_EOD_str.shape[1]):
                selected_EOD[row][col] = float(daily_EOD[col])
        return selected_EOD

    '''
        Transform the original EOD data collected from Google Finance to a
        friendly format to fit machine learning model via the following steps:
            Calculate moving average (5-days, 10-days, 20-days, 30-days),
            ignoring suspension days (market open, only suspend this stock)
            Normalize features by (feature - min) / (max - min)
    '''
    def generate_feature(self, selected_tickers_fname, begin_date, opath,
                         return_days=1, pad_begin=29):
        # 新的交易日期文件路径
        trading_dates_path = os.path.join(self.data_path, 'S&P500_aver_line_dates.csv')
        trading_dates = pd.read_csv(trading_dates_path)['Date'].tolist()

        # trading_dates = np.genfromtxt(
        #     os.path.join(self.data_path,self.market_name + '_aver_line_dates_latest.csv'),
        #     dtype=str, delimiter=',', skip_header=False
        # )
        # trading_dates = trading_dates[29:]  # offset for the first 30-days average
        print('#trading dates:', len(trading_dates)) # trading dates是什么？为什么文件中显示是股票ticker的数量？ trading_dates：1275
        # begin_date = datetime.strptime(trading_dates[29], self.date_format)
        print('begin date:', begin_date)
        # transform the trading dates into a dictionary with index
        index_tra_dates = {}
        tra_dates_index = {}
        for index, date in enumerate(trading_dates):
            tra_dates_index[date] = index
            index_tra_dates[index] = date
        # read in tickers file: NASDAQ_tickers_qualify_dr-0.98_min-5_smooth.csv
        self.tickers = np.genfromtxt(
            os.path.join(self.data_path, selected_tickers_fname),
            dtype=str, delimiter='\t', skip_header=False
        )
        print('#tickers selected:', len(self.tickers))
        self._read_EOD_data()
        for stock_index, single_EOD in enumerate(self.data_EOD):
            # select data within the begin_date
            # begin_date_row = -1
            for date_index, daily_EOD in enumerate(single_EOD):
                date_str = daily_EOD[0]
                # date_str = date_str.replace('-04:00', '') #这部分代码为什么在_transfer_EOD_str中又被执行了一遍？
                cur_date = datetime.strptime(date_str, self.date_format)
                # if cur_date > begin_date:
                    # print('cur_date:', cur_date)
                    # begin_date_row = date_index
                    # print(single_EOD[date_index,:])
                    # break
            selected_EOD_str = single_EOD
            selected_EOD = self._transfer_EOD_str(selected_EOD_str,
                                                  tra_dates_index)
            # Question：数据形状对不上：selected_EOD_str.shape = (1274, 6), selected_EOD.shape = (1274, 6)
            # calculate moving average features
            begin_date_row = 29
            # for row in selected_EOD[:, 0]:
            #     row = int(row)
            #     if row >= pad_begin:   # offset for the first 30-days average
            #         begin_date_row = row
            #         break
            mov_aver_features = np.zeros(
                [selected_EOD.shape[0], 4], dtype=float
            )   # 4 columns refers to 5-, 10-, 20-, 30-days average
            for row in range(begin_date_row, selected_EOD.shape[0]):
                date_index = selected_EOD[row][0]
                aver_5 = 0.0
                aver_10 = 0.0
                aver_20 = 0.0
                aver_30 = 0.0
                count_5 = 0
                count_10 = 0
                count_20 = 0
                count_30 = 0
                for offset in range(30):
                    date_gap = date_index - selected_EOD[row - offset][0]
                    if date_gap < 5:
                        count_5 += 1
                        aver_5 += selected_EOD[row - offset][4]
                    if date_gap < 10:
                        count_10 += 1
                        aver_10 += selected_EOD[row - offset][4]
                    if date_gap < 20:
                        count_20 += 1
                        aver_20 += selected_EOD[row - offset][4]
                    if date_gap < 30:
                        count_30 += 1
                        aver_30 += selected_EOD[row - offset][4]
                mov_aver_features[row][0] = aver_5 / count_5
                mov_aver_features[row][1] = aver_10 / count_10
                mov_aver_features[row][2] = aver_20 / count_20
                mov_aver_features[row][3] = aver_30 / count_30

            '''
                normalize features by feature / max, the max price is the
                max of close prices, I give up to subtract min for easier
                return ratio calculation.
            '''
            pri_min = np.min(selected_EOD[:, 5]) # 收盘价的最小值
            price_max = np.max(selected_EOD[:, 5])# 收盘价的最大值
            print(self.tickers[stock_index], 'minimum:', pri_min,
                  'maximum:', price_max, 'ratio:', price_max / pri_min)
            if price_max / pri_min > 10:      # 价格最大值与最小值的比值大于10，说明这个股票的价格波动很大，不适合用来做预测 ?
                print('!!!!!!!!!')
            # open_high_low = (selected_EOD[:, 1:4] - price_min) / \
            #                 (price_max - price_min)
            mov_aver_features = mov_aver_features / price_max  #移动平均值除以收盘价最大值进行标准化是否合理？

            '''
                generate feature and ground truth in the following format:
                date_index, 5-day, 10-day, 20-day, 30-day, close price
                two ways to pad missing dates:
                for dates without record, pad a row [date_index, -1234 * 5]
            '''
            features = np.ones([len(trading_dates) - pad_begin, 6],
                               dtype=float) * -1234
            # features：(1275-29=1246, 6)
            # data missed at the beginning
            for row in range(len(trading_dates) - pad_begin):
                features[row][0] = row
            for row in range(begin_date_row, selected_EOD.shape[0]):
                cur_index = int(selected_EOD[row][0])
                features[cur_index - pad_begin][1:5] = mov_aver_features[row]
                # 这个if语句的作用是什么？有什么意义？
                # if cur_index - int(selected_EOD[row - return_days][0]) == return_days:
                features[cur_index - pad_begin][-1] = selected_EOD[row][4] / price_max

            # write out
            np.savetxt(os.path.join(opath, self.market_name + '_' +
                                    self.tickers[stock_index] + '_' +
                                    str(return_days) + '.csv'), features,
                       fmt='%.6f', delimiter=',')


if __name__ == '__main__':
    desc = "pre-process EOD data market by market, including listing all " \
           "trading days, all satisfied stocks (5 years & high price), " \
           "normalizing and compensating data"
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('-path', help='path of EOD data')
    parser.add_argument('-market', help='market name')
    args = parser.parse_args()

    if args.path is None:
        args.path = '/Users/liupeng/Desktop/DSAA 5020 Final Project/TGCN_with_latest_data/2023.12.14(Data acquisition& Preprocessing)'
    if args.market is None:
        args.market = 'S&P500'

    processor = EOD_Preprocessor(args.path, args.market)
    processor.generate_feature(
        'sp500_tickers_478.csv',  # 这里假设传递的是正确的tickers文件名
        datetime.strptime('2015-01-02', '%Y-%m-%d'),
        os.path.join(args.path, 'SP500_EOD'),  # 假设输出路径为Processed_Data子文件夹
        return_days=1,
        pad_begin=29
    )

usage: ipykernel_launcher.py [-h] [-path PATH] [-market MARKET]
ipykernel_launcher.py: error: unrecognized arguments: --f=/Users/liupeng/Library/Jupyter/runtime/kernel-v2-10460fYMx4InVpaSJ.json


SystemExit: 2

In [5]:
%pwd

'/Users/liupeng/Desktop/DSAA 5020 Final Project/TGCN_with_latest_data/2023.12.14(Data acquisition& Preprocessing)/Data_processing_code'

In [9]:
from datetime import datetime
import numpy as np
import os
import pandas as pd

class EOD_Preprocessor:
    def __init__(self, data_path, market_name):
        self.data_path = data_path
        self.date_format = '%Y-%m-%d'
        self.market_name = market_name

    def _read_EOD_data(self):
        self.data_EOD = []
        for ticker in self.tickers:
            file_path = os.path.join(self.data_path, "S&P500_original_data", f"{ticker}_stock_data.csv")
            single_EOD = pd.read_csv(file_path)
            self.data_EOD.append(single_EOD.values)
        print('#stocks\' EOD data readin:', len(self.data_EOD))
        assert len(self.tickers) == len(self.data_EOD), 'length of tickers and stocks not match'

    def _read_tickers(self):
        tickers_path = os.path.join(self.data_path, "Relation_Data(478)", "sp500_tickers_478.csv")
        self.tickers = pd.read_csv(tickers_path)['Ticker'].tolist()

    
    def _transfer_EOD_str(self, selected_EOD_str, tra_date_index):
        selected_EOD = np.zeros(selected_EOD_str.shape, dtype=float)
        for row, daily_EOD in enumerate(selected_EOD_str):
            # print('daily_EOD:', daily_EOD)
            date_str = daily_EOD[0]
            # date_str = date_str.replace('-04:00', '')
            # print('date_str:', date_str)
            # print('tra_date_index:', tra_date_index)
            selected_EOD[row][0] = tra_date_index[date_str]
            # print('selected_EOD[row][0]:', selected_EOD[row][0])
            for col in range(1, selected_EOD_str.shape[1]):
                selected_EOD[row][col] = float(daily_EOD[col])
        return selected_EOD

    '''
        Transform the original EOD data collected from Google Finance to a
        friendly format to fit machine learning model via the following steps:
            Calculate moving average (5-days, 10-days, 20-days, 30-days),
            ignoring suspension days (market open, only suspend this stock)
            Normalize features by (feature - min) / (max - min)
    '''

    
    def generate_feature(self, selected_tickers_fname, begin_date, opath,
                         return_days=1, pad_begin=29):
        # 新的交易日期文件路径
        trading_dates_path = os.path.join(self.data_path, 'S&P500_aver_line_dates.csv')
        trading_dates_df = pd.read_csv(trading_dates_path, header=None)
        trading_dates = trading_dates_df.iloc[:, 0].tolist()  # 假设日期数据在第一列

        # trading_dates = np.genfromtxt(
        #     os.path.join(self.data_path,self.market_name + '_aver_line_dates_latest.csv'),
        #     dtype=str, delimiter=',', skip_header=False
        # )
        # trading_dates = trading_dates[29:]  # offset for the first 30-days average
        print('#trading dates:', len(trading_dates)) # trading dates是什么？为什么文件中显示是股票ticker的数量？ trading_dates：1275
        # begin_date = datetime.strptime(trading_dates[29], self.date_format)
        print('begin date:', begin_date)
        # transform the trading dates into a dictionary with index
        index_tra_dates = {}
        tra_dates_index = {}
        for index, date in enumerate(trading_dates):
            tra_dates_index[date] = index
            index_tra_dates[index] = date
        # read in tickers file: NASDAQ_tickers_qualify_dr-0.98_min-5_smooth.csv
        self.tickers = np.genfromtxt(
            os.path.join(self.data_path, selected_tickers_fname),
            dtype=str, delimiter='\t', skip_header=False
        )
        print('#tickers selected:', len(self.tickers))
        self._read_EOD_data()
        for stock_index, single_EOD in enumerate(self.data_EOD):
            # select data within the begin_date
            # begin_date_row = -1
            for date_index, daily_EOD in enumerate(single_EOD):
                date_str = daily_EOD[0]
                # date_str = date_str.replace('-04:00', '') #这部分代码为什么在_transfer_EOD_str中又被执行了一遍？
                cur_date = datetime.strptime(date_str, self.date_format)
                # if cur_date > begin_date:
                    # print('cur_date:', cur_date)
                    # begin_date_row = date_index
                    # print(single_EOD[date_index,:])
                    # break
            selected_EOD_str = single_EOD
            selected_EOD = self._transfer_EOD_str(selected_EOD_str,
                                                  tra_dates_index)
            # Question：数据形状对不上：selected_EOD_str.shape = (1274, 6), selected_EOD.shape = (1274, 6)
            # calculate moving average features
            begin_date_row = 29
            # for row in selected_EOD[:, 0]:
            #     row = int(row)
            #     if row >= pad_begin:   # offset for the first 30-days average
            #         begin_date_row = row
            #         break
            mov_aver_features = np.zeros(
                [selected_EOD.shape[0], 4], dtype=float
            )   # 4 columns refers to 5-, 10-, 20-, 30-days average
            for row in range(begin_date_row, selected_EOD.shape[0]):
                date_index = selected_EOD[row][0]
                aver_5 = 0.0
                aver_10 = 0.0
                aver_20 = 0.0
                aver_30 = 0.0
                count_5 = 0
                count_10 = 0
                count_20 = 0
                count_30 = 0
                for offset in range(30):
                    date_gap = date_index - selected_EOD[row - offset][0]
                    if date_gap < 5:
                        count_5 += 1
                        aver_5 += selected_EOD[row - offset][4]
                    if date_gap < 10:
                        count_10 += 1
                        aver_10 += selected_EOD[row - offset][4]
                    if date_gap < 20:
                        count_20 += 1
                        aver_20 += selected_EOD[row - offset][4]
                    if date_gap < 30:
                        count_30 += 1
                        aver_30 += selected_EOD[row - offset][4]
                mov_aver_features[row][0] = aver_5 / count_5
                mov_aver_features[row][1] = aver_10 / count_10
                mov_aver_features[row][2] = aver_20 / count_20
                mov_aver_features[row][3] = aver_30 / count_30

            '''
                normalize features by feature / max, the max price is the
                max of close prices, I give up to subtract min for easier
                return ratio calculation.
            '''
            pri_min = np.min(selected_EOD[:, 5]) # 收盘价的最小值
            price_max = np.max(selected_EOD[:, 5])# 收盘价的最大值
            print(self.tickers[stock_index], 'minimum:', pri_min,
                  'maximum:', price_max, 'ratio:', price_max / pri_min)
            if price_max / pri_min > 10:      # 价格最大值与最小值的比值大于10，说明这个股票的价格波动很大，不适合用来做预测 ?
                print('!!!!!!!!!')
            # open_high_low = (selected_EOD[:, 1:4] - price_min) / \
            #                 (price_max - price_min)
            mov_aver_features = mov_aver_features / price_max  #移动平均值除以收盘价最大值进行标准化是否合理？

            '''
                generate feature and ground truth in the following format:
                date_index, 5-day, 10-day, 20-day, 30-day, close price
                two ways to pad missing dates:
                for dates without record, pad a row [date_index, -1234 * 5]
            '''
            features = np.ones([len(trading_dates) - pad_begin, 6],
                               dtype=float) * -1234
            # features：(1275-29=1246, 6)
            # data missed at the beginning
            for row in range(len(trading_dates) - pad_begin):
                features[row][0] = row
            for row in range(begin_date_row, selected_EOD.shape[0]):
                cur_index = int(selected_EOD[row][0])
                features[cur_index - pad_begin][1:5] = mov_aver_features[row]
                # 这个if语句的作用是什么？有什么意义？
                # if cur_index - int(selected_EOD[row - return_days][0]) == return_days:
                features[cur_index - pad_begin][-1] = selected_EOD[row][4] / price_max

            # write out
            np.savetxt(os.path.join(opath, self.market_name + '_' +
                                    self.tickers[stock_index] + '_' +
                                    str(return_days) + '.csv'), features,
                       fmt='%.6f', delimiter=',')

# 设置数据路径和市场名称
data_path = '/Users/liupeng/Desktop/DSAA 5020 Final Project/TGCN_with_latest_data/2023.12.14(Data acquisition& Preprocessing)'
market_name = 'S&P500'

# 创建EOD_Preprocessor实例
processor = EOD_Preprocessor(data_path, market_name)

# 调用generate_feature方法处理数据
processor.generate_feature(
    'sp500_tickers_478.csv',
    datetime.strptime('2015-01-02', '%Y-%m-%d'),
    os.path.join(data_path, 'SP500_EOD'),
    return_days=1,
    pad_begin=29
)


#trading dates: 2244
begin date: 2015-01-02 00:00:00


FileNotFoundError: /Users/liupeng/Desktop/DSAA 5020 Final Project/TGCN_with_latest_data/2023.12.14(Data acquisition& Preprocessing)/sp500_tickers_478.csv not found.