In [None]:
from math import sqrt
import numpy as np
import pandas as pd
import warnings
from datetime import date, timedelta
import psutil
import os

# Handle on the process that hosts this IPython kernel; it is used to report
# memory usage. psutil.Process() without an argument refers to the current
# process, and its pid is kept under the module-level name `pid`.
process = psutil.Process()
pid = process.pid

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Shared scaler instance: Get_Stocks_Data calls zscore.fit_transform() on it,
# so it is re-fitted on every use rather than carrying state between calls.
zscore = StandardScaler()
from cylib.apis.all_api import *
import baostock as bs
import pickle

# Silence pandas' SettingWithCopyWarning. The warning class is exposed as
# pandas.errors.SettingWithCopyWarning in newer pandas releases and as
# pandas.core.common.SettingWithCopyWarning in older ones, so it is looked up
# in both places instead of hard-coding one location (which raises
# AttributeError on the other family of versions). If neither location has it,
# there is nothing to filter.
_setting_with_copy_warning = getattr(
    pd.errors, "SettingWithCopyWarning", None
) or getattr(pd.core.common, "SettingWithCopyWarning", None)
if _setting_with_copy_warning is not None:
    warnings.filterwarnings("ignore", category=_setting_with_copy_warning)

# Run-mode switches.
# Main_bactest: when true, the data range ends today and Get_Stocks_Data
#               writes its results to disk.
# Online:       when true, the pipeline cell at the end of the notebook runs.
Main_bactest = False
Online = False

# Output locations (string prefixes; they are concatenated with file names,
# so the trailing slash matters).
path_Main = "/home/huh/"
path_HS300_Rolling = "/home/huh/Stage-2/HS300-Rolling/"
path_HS300_Rolling_pic = "/home/huh/Stage-2/HS300-Rolling/Loss-pic/"

# Date range of the data to download, as "YYYYMMDD" strings.
begin_date = "20180101"
end_date = "20240430"
# NOTE(review): str(date.today()) is formatted "YYYY-MM-DD", unlike the
# "YYYYMMDD" literals above - confirm the data API accepts both formats.
today_date = str(date.today())
if Main_bactest:
    end_date = today_date

In [None]:
'''
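# Reference list of the variables this notebook works with (inert: this whole block is a string literal and is never executed).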
begin_date = None
end_date = None
date_all = None
date_backtest = None

HS300 = None
all_stocks = None
stocks_code = None
TEMP_PRICE_pivot = None
TEMP_PRICE = None
indexes = None
price = None
benchmark = None
price_return = None
benchmark_return = None
price_return_backtest = None
benchmark_return_backtest = None
HS_300_backtest = None
CS_500_backtest = None
CS_1000_backtest = None
price_backtest = None
price_X_backtest = None
price_y_backtest = None
HS_300 = None
CS_500 = None
CS_1000 = None
Label_column_processed = None
'''

class Get_Stocks_Data():
    """Download, clean and label daily data for the HS300 constituents.

    Constructing an instance runs the whole pipeline (see ``__init__``).
    Besides its constructor arguments the class relies on names defined at
    module level outside of it: ``process`` (psutil handle read by
    ``RAM_USAGE``), ``zscore`` (the shared ``StandardScaler``), the
    ``Main_bactest`` flag that switches the file export on, ``bs``
    (baostock) and ``get_price`` / ``get_targets_info`` (not defined in this
    file; presumably supplied by the ``cylib.apis.all_api`` star import).

    Attributes available after construction:
        path, begin_date, end_date: the constructor arguments.
        HS300: every HS300 constituent code returned by baostock, reformatted.
        stocks_code: ``HS300`` without the stocks that have missing data.
        all_stocks: result of ``get_targets_info(target_type="stock")``.
        date_all, date_train, date_backtest: all trade dates and their
            chronological 80% / 20% split.
        price: long-format stock frame with raw, ``*_processed``, ``Label``
            and ``Label_processed`` columns.
        indexes: names of the feature columns that get a processed version.
        benchmark: long-format frame of the three benchmark indexes.
        price_return: open-to-close return per trade date and stock.
    """

    def __init__(self, Begin_date, End_date, Path) -> None:
        '''
        Run the complete pipeline; the order of the calls matters because
        every step consumes attributes set by the previous ones.

        Begin_date / End_date delimit the requested data range and Path is
        the string prefix used for every file written to disk.

        Variables which will be saved to local (only when Main_bactest is true):
        price, benchmark, date_all, stocks_code, price_return, price_return_backtest,
        benchmark_return_backtest, HS_300_backtest, CS_1000_backtest, CS_500_backtest
        '''
        self.RAM_USAGE()
        self.path = Path
        self.Obtain_HS300_Stocks()
        self.all_stocks = get_targets_info(target_type="stock")
        self.begin_date = Begin_date
        self.end_date = End_date
        self.Get_Trade_Calendar()
        self.Remove_Missing_Stocks()
        self.Get_Stocks()
        self.Get_Benchmark()
        self.Make_Predicting_Label()
        self.Data_Preprocessing()
        self.Get_Daily_Return()

    @staticmethod
    def _lookup_wide(wide, long_frame):
        """Map a wide (trade_date x ts_code) frame back onto long-format rows.

        Returns a Series aligned with ``long_frame.index`` whose i-th value is
        the cell of ``wide`` at the row's (``trade_date``, ``ts_code``) pair.
        Raises KeyError when a pair has no matching row or column in ``wide``.
        """
        row_pos = wide.index.get_indexer(long_frame["trade_date"])
        col_pos = wide.columns.get_indexer(long_frame["ts_code"])
        # get_indexer marks unknown labels with -1, which would otherwise be
        # silently read as "last row / last column".
        if (row_pos < 0).any() or (col_pos < 0).any():
            raise KeyError("trade_date / ts_code pair not present in the wide frame")
        return pd.Series(wide.values[row_pos, col_pos], index=long_frame.index)

    def RAM_USAGE(self):
        """Print the resident memory of the module-level ``process`` in MB."""
        # Get the memory usage of the IPython kernel in MB
        ram_usage = process.memory_info().rss / (1024 * 1024)
        print(f"RAM Usage: {ram_usage} MB")

    def Obtain_HS300_Stocks(self):
        """Query the HS300 constituents from baostock.

        Sets ``self.HS300`` and ``self.stocks_code`` to the same list of codes,
        each rebuilt as: last six characters + "." + upper-cased first two
        characters of the code returned by baostock.

        Raises:
            RuntimeError: if the query returned no rows.
        """
        # Obtain HS300 index
        print("*" * 60)
        print("Obtaining HS300 stocks...")
        # NOTE(review): the baostock session is never logged out here; it is
        # left open in case other cells reuse it - confirm.
        bs.login()
        rs = bs.query_hs300_stocks()
        rows = []
        # `and` short-circuits, so rs.next() is not called once the result
        # set reports an error.
        while rs.error_code == "0" and rs.next():
            rows.append(rs.get_row_data())
        if not rows:
            # Without this guard the failure would surface below as an
            # uninformative `KeyError: 1` on an empty DataFrame.
            raise RuntimeError(
                "baostock returned no HS300 constituents "
                f"(error_code={rs.error_code}, "
                f"error_msg={getattr(rs, 'error_msg', '')})"
            )
        hs300_stocks = pd.DataFrame(rows)
        # Column 1 holds the stock code.
        HS300 = [code[-6:] + "." + code[:2].upper() for code in hs300_stocks[1]]
        self.stocks_code = self.HS300 = HS300
        print("*" * 60)

    def Get_Trade_Calendar(self):
        """Derive the trade calendar from the constituents' "open" prices.

        Sets ``date_all`` (index of the pivoted price table), its unshuffled
        80% / 20% split ``date_train`` / ``date_backtest``, and the temporary
        ``TEMP_PRICE`` / ``TEMP_PRICE_pivot`` frames that
        ``Remove_Missing_Stocks`` consumes and deletes. When ``Main_bactest``
        is true, ``date_all.csv`` and ``stocks_code.pkl`` are written under
        ``self.path``.
        """
        print("*" * 60)
        print("Getting trade calendar...")
        # self.end_date is handed over as `trade_date`; presumably that is the
        # API's name for the end of the range - TODO confirm.
        PRICE = get_price(
            ts_code_list=self.stocks_code,
            feature_list=["open"],
            start_date=self.begin_date,
            trade_date=self.end_date,
            target_type="stock",
        )
        PRICE.reset_index(inplace=True)
        PRICE = PRICE.sort_values(by="trade_date").reset_index(drop=True)
        # Only the pivot is built from zero-filled data; PRICE itself keeps
        # its NaNs so that missing values can still be detected later.
        PRICE_pivot = PRICE.fillna(0).pivot(
            index="trade_date", columns="ts_code", values="open")

        # Attention: len(date_all) >= 60
        date_all = PRICE_pivot.index
        self.date_all = date_all
        # shuffle=False keeps the split chronological.
        self.date_train, self.date_backtest = train_test_split(
            date_all, test_size=0.2, shuffle=False)
        if Main_bactest:
            # Save the calendar as the index of an otherwise empty CSV.
            pd.DataFrame(index=date_all).to_csv(self.path + 'date_all.csv')

            # Save the stocks list to a file.
            # NOTE(review): at this point stocks_code is still the full HS300
            # list (Remove_Missing_Stocks has not run yet) - confirm that
            # this is the list meant to be persisted.
            with open(self.path + 'stocks_code.pkl', 'wb') as file:
                pickle.dump(self.stocks_code, file)
        self.TEMP_PRICE_pivot = PRICE_pivot
        self.TEMP_PRICE = PRICE
        print("*" * 60)

    def Remove_Missing_Stocks(self):
        """Drop every stock that lacks data on at least one trade date.

        A stock counts as missing when, for any date of ``date_all``, it has
        no row in ``TEMP_PRICE`` or a row containing a null. ``stocks_code``
        becomes ``HS300`` without those stocks, in the original ``HS300``
        order. The temporary frames are deleted afterwards.
        """
        print("*" * 60)
        print("Removing missing stocks...")
        # Find missing stocks: left-join the full (date, stock) grid with the
        # downloaded rows; unmatched or incomplete rows show up as nulls.
        all_combinations = pd.MultiIndex.from_product(
            [self.date_all, self.stocks_code],
            names=["trade_date", "ts_code"])
        all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()
        price_whole = pd.merge(
            all_combinations_df,
            self.TEMP_PRICE,
            on=["trade_date", "ts_code"],
            how="left")
        missing = price_whole[price_whole.isnull().any(axis=1)]
        missing_stock = missing["ts_code"].unique()
        print("The missing stock:", missing_stock)

        # Order-preserving, de-duplicating filter: a plain set difference
        # would make the order of stocks_code vary from run to run.
        missing_codes = set(missing_stock)
        self.stocks_code = [
            code for code in dict.fromkeys(self.HS300)
            if code not in missing_codes
        ]

        # Del variables to release RAM.
        del self.TEMP_PRICE, self.TEMP_PRICE_pivot
        print("*" * 60)

    def Get_Stocks(self):
        """Download the daily bars of ``stocks_code`` into ``self.price``.

        The feature columns are renamed to upper case (``avg_price`` becomes
        ``VWAP``), the frame is sorted by ``trade_date`` and ``self.indexes``
        is set to the six numeric feature names.
        """
        print("*" * 60)
        print("Getting stocks data...")
        # Get stocks
        price = get_price(
            ts_code_list=self.stocks_code,
            feature_list=[
                "open",
                "high",
                "low",
                "close",
                "avg_price",
                "volume",
                "trade_status",  # "交易"/"停牌" (trading / suspended)
            ],
            start_date=self.begin_date,
            trade_date=self.end_date,
            target_type="stock",
        )
        price = price.rename(
            columns={
                "open": "OPEN",
                "high": "HIGH",
                "low": "LOW",
                "close": "CLOSE",
                "avg_price": "VWAP",
                "volume": "VOLUME",
            }
        )
        price.reset_index(inplace=True)

        price = price.sort_values(by="trade_date").reset_index(drop=True)
        # DataFrame.info() prints by itself and returns None, so it is not
        # wrapped in print().
        price.info()
        self.price = price
        self.indexes = ["OPEN", "HIGH", "LOW", "CLOSE", "VWAP", "VOLUME"]
        self.RAM_USAGE()
        print("*" * 60)

    def Get_Benchmark(self):
        """Download the three benchmark indexes into ``self.benchmark``.

        Columns are renamed to upper case, rows are sorted by ``trade_date``
        and missing values are replaced with 0.
        """
        print("*" * 60)
        print("Getting benchmark...")
        # Get benchmark index
        # 1. the Shanghai and Shenzhen 300 index(the HS 300 index)(000300)
        # 2. the China Securities 500 index(000905)
        # 3. the China Securities 1000 index(000852)
        benchmark = get_price(
            ts_code_list=["000300.SH", "000905.SH", "000852.SH"],
            feature_list=["open", "high", "low", "close", "volume"],
            start_date=self.begin_date,
            trade_date=self.end_date,
            target_type="index"
        )
        benchmark = benchmark.rename(
            columns={
                "open": "OPEN",
                "high": "HIGH",
                "low": "LOW",
                "close": "CLOSE",
                "volume": "VOLUME"
            }
        )
        benchmark.reset_index(inplace=True)
        benchmark = benchmark.sort_values(by="trade_date").reset_index(drop=True)
        benchmark.fillna(0, inplace=True)
        self.benchmark = benchmark
        print("*" * 60)

    def Make_Predicting_Label(self):
        """Add the prediction target to ``self.price``.

        ``Label`` is the VWAP change rate between T+1 and T+11, i.e.
        ``(VWAP[T+11] - VWAP[T+1]) / VWAP[T+1]`` per stock; it is NaN where
        the shifted rows do not exist (the last trade dates). The z-scored
        version (standardised across stocks within each trade date) is kept
        in ``self.Label_column_processed`` for ``Data_Preprocessing``.
        """
        print("*" * 60)
        print("Making predicting label...")
        price = self.price
        # T+1 to T+11 VWAP return
        VWAP_df = price.pivot(index="trade_date",
                              columns="ts_code",
                              values="VWAP")

        # the change rate of T+1 to T+11
        T_begin = 1
        T_end = 11
        vwap_begin = VWAP_df.shift(-T_begin)
        Label_df = (VWAP_df.shift(-T_end) - vwap_begin) / vwap_begin
        # Attention! zscore.fit_transform() is calculated column by column, so we use Label_df.T
        Label_df_processed = pd.DataFrame(zscore.fit_transform(Label_df.T).T,
                                          index=Label_df.index,
                                          columns=Label_df.columns)

        # Write the wide tables back onto the long frame with one vectorised
        # lookup instead of a Python-level call per row.
        price["Label"] = self._lookup_wide(Label_df, price)
        self.Label_column_processed = self._lookup_wide(Label_df_processed, price)
        self.price = price
        print("*" * 60)

    def Data_Preprocessing(self):
        """Add a cleaned ``<FEATURE>_processed`` column for every feature.

        For each name in ``self.indexes`` the values are pivoted to
        trade_date x ts_code, z-scored across stocks within each trade date,
        clipped to median +/- 3 * MAD within each trade date and written back
        to ``self.price``. ``Label_processed`` is attached afterwards and all
        remaining NaN values of the frame are replaced with 0.
        """
        print("*" * 60)
        print("Data preprocessing...")
        price = self.price

        # 3MAD
        def Col_3MAD(row):
            median = row.median()
            mad = (row - median).abs().median()
            threshold = 3 * mad
            return row.clip(lower=median - threshold, upper=median + threshold)

        for feature in self.indexes:
            wide = price.pivot(index="trade_date",
                               columns="ts_code",
                               values=feature)
            # Standardize for each row (the scaler works column-wise, hence
            # the transposes).
            wide = pd.DataFrame(zscore.fit_transform(wide.values.T).T,
                                index=wide.index,
                                columns=wide.columns)
            wide = wide.apply(Col_3MAD, axis=1)  # 3 times MAD for each row
            price[feature + "_processed"] = self._lookup_wide(wide, price)

        price["Label_processed"] = self.Label_column_processed
        del self.Label_column_processed
        # TODO: It needs to be modified. It is possible to have nan values
        # except for the last 11 days of the time (T_end - T_begin + 1); for
        # now every NaN in the frame is replaced with 0.
        price.fillna(0, inplace=True)
        # DataFrame.info() prints by itself and returns None.
        price.info()
        self.price = price
        self.RAM_USAGE()
        print("*" * 60)

    def Get_Daily_Return(self):
        """Compute open-to-close returns and export the results.

        ``self.price_return`` is ``(CLOSE - OPEN) / OPEN`` per trade date and
        stock; the same ratio is computed for the benchmark indexes. The rows
        whose date belongs to ``date_backtest`` form the backtest slices.
        When ``Main_bactest`` is true all frames are written as CSV files
        under ``self.path``.
        """
        print("*" * 60)
        print("Getting daily return...")
        benchmark = self.benchmark
        price = self.price

        def wide(frame, column):
            return frame.pivot(index="trade_date", columns="ts_code", values=column)

        BUY_price = wide(price, "OPEN")
        SELL_price = wide(price, "CLOSE")
        BUY_benchmark = wide(benchmark, "OPEN")
        SELL_benchmark = wide(benchmark, "CLOSE")
        # Any OPEN that was filled with 0 upstream yields inf/NaN here.
        price_return = (SELL_price - BUY_price) / BUY_price
        self.price_return = price_return
        benchmark_return = (SELL_benchmark - BUY_benchmark) / BUY_benchmark

        # Split return rates data
        date_backtest = self.date_backtest
        price_return_backtest = price_return.loc[
            price_return.index.isin(date_backtest), :]
        # One mask built from the benchmark frame's own index serves all
        # three benchmark series (previously two of them were masked with
        # another frame's index).
        in_backtest = benchmark_return.index.isin(date_backtest)
        benchmark_return_backtest = benchmark_return.loc[in_backtest, :]
        HS_300_backtest = benchmark_return.loc[in_backtest, "000300.SH"]
        CS_500_backtest = benchmark_return.loc[in_backtest, "000905.SH"]
        CS_1000_backtest = benchmark_return.loc[in_backtest, "000852.SH"]

        if Main_bactest:
            price.to_csv(self.path + "price_processed.csv", index=False)
            price_return.to_csv(self.path + "price_return.csv")
            benchmark.to_csv(self.path + "benchmark.csv")
            price_return_backtest.to_csv(self.path + "price_return_backtest.csv")
            benchmark_return_backtest.to_csv(self.path + "benchmark_return_backtest.csv")
            HS_300_backtest.to_csv(self.path + "HS_300_backtest.csv")
            CS_1000_backtest.to_csv(self.path + "CS_1000_backtest.csv")
            CS_500_backtest.to_csv(self.path + "CS_500_backtest.csv")
        self.RAM_USAGE()
        print("Finished!")
        print("*" * 60)

In [None]:
# In online mode, run the whole download / preprocessing pipeline once and
# expose its main results as notebook-level variables.
if Online:
    GET_STOCKS_DATA = Get_Stocks_Data(
        Begin_date=begin_date,
        End_date=end_date,
        Path=path_HS300_Rolling,
    )
    price, price_return = GET_STOCKS_DATA.price, GET_STOCKS_DATA.price_return
    date_all, date_backtest = GET_STOCKS_DATA.date_all, GET_STOCKS_DATA.date_backtest
    stocks_code = GET_STOCKS_DATA.stocks_code