### HS300 backtest
1. Data collection: HS300.
2. Data cleaning: Remove stocks with missing values.
3. Label making: VWAP ROI between T and T + 11.
4. Data preprocessing: 3MAD, z-score etc.
5. ROI dataframe making: HS300, CS500, CS1000 etc.
6. Modeling: MLP, GBDT, GRU, AGRU. *(Rolling position adjustment)*
7. Ensembling: according to past 60 days' ICIR.
8. Backtesting.

In [6]:
# Modeling
import tensorflow as tf
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import load_model
from sklearn.preprocessing import StandardScaler

import xgboost as xgb

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Attention, Layer, GRU, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler

from math import sqrt
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
from datetime import date, timedelta
import psutil
import os

# PID of the current Python process (the IPython kernel when run in a notebook).
pid = os.getpid()
# psutil handle for that process; the RAM_USAGE() methods below read its
# resident set size (``memory_info().rss``) to report memory consumption.
process = psutil.Process(pid)

from cylib.apis.all_api import *
import baostock as bs
from IPython.display import clear_output
from tqdm import tqdm

# Suppress pandas' SettingWithCopyWarning.
# The class is referenced under two different locations in this notebook
# (``pd.core.common`` and ``pd.errors``) because it depends on the installed
# pandas release, so resolve it defensively instead of failing at import time.
try:
    _setting_with_copy_warning = pd.errors.SettingWithCopyWarning
except AttributeError:
    _setting_with_copy_warning = getattr(
        pd.core.common, "SettingWithCopyWarning", None)
if _setting_with_copy_warning is not None:
    warnings.filterwarnings(
        "ignore",
        category=_setting_with_copy_warning)

# Shared scaler instance; Data_preprocessing() calls ``zscore.fit_transform``
# on every invocation, so the fitted state is overwritten each time.
zscore = StandardScaler()

# Alternative location of the warning class (kept for reference):
# warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)
# NOTE(review): ``Main_bactest`` and ``Online`` are not read anywhere in the
# visible part of this file - presumably run-mode switches; confirm usage.
Main_bactest = False
Online = True
# Directory prefix prepended to every figure file name passed to plt.savefig().
path = "/home/huh/Stage-2/HS300-Single/"

### Get stocks list

In [7]:
# Obtain the HS300 constituents from baostock.
lg = bs.login()
if lg.error_code != "0":
    raise RuntimeError(f"baostock login failed: {lg.error_code} {lg.error_msg}")

rs = bs.query_hs300_stocks()
hs300_stocks = []
# Use short-circuit ``and`` so rs.next() is not called once an error is reported.
while rs.error_code == "0" and rs.next():
    hs300_stocks.append(rs.get_row_data())
if rs.error_code != "0":
    raise RuntimeError(
        f"baostock query_hs300_stocks failed: {rs.error_code} {rs.error_msg}")
hs300_stocks = pd.DataFrame(hs300_stocks)
if hs300_stocks.empty:
    raise RuntimeError("baostock returned no HS300 constituents")

# Column 1 holds the stock code (presumably 'sh.600000' style - confirm against
# the baostock field list); rewrite it as '<last 6 chars>.<upper-cased first 2>'.
HS300 = [f"{code[-6:]}.{code[:2].upper()}" for code in hs300_stocks[1]]
stocks_code = HS300

all_stocks = get_targets_info(target_type="stock")
# all_stocks.to_csv(path + 'all_stocks.csv', index=False)

factor_info = get_factors_info()
factor_info

login success!


Unnamed: 0,factor_name,describe,need_days,contributor,calc_type,data_type,from,now_available
0,adj_close,复权收盘价(元),1,market,daily_spider,float,market,1
1,adj_factor,复权因子,1,market,daily_spider,float,market,1
2,adj_high,复权最高价(元),1,market,daily_spider,float,market,1
3,adj_high_52w,52周最高价(复权),1,market,daily_spider,float,market,1
4,adj_low,复权最低价(元),1,market,daily_spider,float,market,1
...,...,...,...,...,...,...,...,...
313,tot_shr,当日总股本,1,market,daily_spider,float,market,1
314,trade_status,交易状态,1,market,daily_spider,float,market,1
315,turn,换手率,1,market,daily_spider,float,market,1
316,up_down_limit_status,涨跌停状态,1,market,daily_spider,float,market,1


In [8]:
# Names of all factors whose 'from' column equals 'article'.
factors_name = factor_info.loc[factor_info['from'] == 'article', 'factor_name'].tolist()
factors_name

['alpha101_001',
 'alpha101_002',
 'alpha101_003',
 'alpha101_004',
 'alpha101_005',
 'alpha101_006',
 'alpha101_007',
 'alpha101_008',
 'alpha101_009',
 'alpha101_010',
 'alpha101_011',
 'alpha101_012',
 'alpha101_013',
 'alpha101_014',
 'alpha101_015',
 'alpha101_016',
 'alpha101_017',
 'alpha101_018',
 'alpha101_019',
 'alpha101_020',
 'alpha101_021',
 'alpha101_022',
 'alpha101_023',
 'alpha101_024',
 'alpha101_025',
 'alpha101_026',
 'alpha101_027',
 'alpha101_028',
 'alpha101_029',
 'alpha101_030',
 'alpha101_031',
 'alpha101_032',
 'alpha101_033',
 'alpha101_034',
 'alpha101_035',
 'alpha101_037',
 'alpha101_038',
 'alpha101_039',
 'alpha101_040',
 'alpha101_041',
 'alpha101_042',
 'alpha101_043',
 'alpha101_044',
 'alpha101_045',
 'alpha101_046',
 'alpha101_047',
 'alpha101_049',
 'alpha101_050',
 'alpha101_051',
 'alpha101_052',
 'alpha101_053',
 'alpha101_054',
 'alpha101_055',
 'alpha101_057',
 'alpha101_060',
 'alpha101_061',
 'alpha101_062',
 'alpha101_064',
 'alpha101_065

In [10]:
# Number of 'article' factors selected above (displayed as the cell output).
len(factors_name)

269

In [9]:
class HS300_Single():
    def __init__(self, Factor_Name, Begin_Date, End_Date, Stocks_Code) -> None:
        """Store the run parameters and execute the data-loading pipeline.

        Args:
            Factor_Name: name of the factor passed to ``get_factor``.
            Begin_Date: start of the date range.
            End_Date: end of the date range.
            Stocks_Code: list of stock codes forming the initial universe.

        The constructor runs, in order, Get_Trade_Calendar, Data,
        Get_Daily_Return and Factor_DF. Get_Benchmark, Backtest_Main,
        Backtest_Simple and Indexes are not run here (so ``self.benchmark``
        is not populated by the constructor); call them explicitly.
        """
        print(f"Factor: {Factor_Name}")
        self.factor_name, self.begin_date = Factor_Name, Begin_Date
        self.end_date, self.stocks_code = End_Date, Stocks_Code

        pipeline = (
            self.Get_Trade_Calendar,
            self.Data,
            self.Get_Daily_Return,
            self.Factor_DF,
        )
        for stage in pipeline:
            stage()
    def RAM_USAGE(self):
        """Print the resident set size of the current process in MB."""
        bytes_per_mb = 1024 * 1024
        resident_bytes = process.memory_info().rss
        ram_usage = resident_bytes / bytes_per_mb
        print(f"RAM Usage: {ram_usage} MB")
    def Get_Trade_Calendar(self):
        """Build the trading calendar and drop stocks with incomplete coverage.

        Downloads the factor for ``self.stocks_code`` and stores the sorted
        set of dates that carry data in ``self.date_all``. Every stock that
        lacks a row for at least one of those dates is removed from
        ``self.stocks_code``.

        The universe is taken from ``self.stocks_code`` (the constructor
        argument) rather than from module-level globals, and the original
        order of the surviving codes is preserved so runs are reproducible.
        """
        print("*" * 60)
        print("Getting trade calendar...")
        # De-duplicate while keeping the caller's ordering.
        universe = list(dict.fromkeys(self.stocks_code))
        PRICE = get_factor(self.factor_name,
                           start_date=self.begin_date,
                           end_date=self.end_date,
                           ts_code_list=universe)
        PRICE.reset_index(inplace=True)
        PRICE = PRICE.sort_values(by="trade_date").reset_index(drop=True)
        # NaN factor values are replaced by 0 here, so only completely absent
        # (trade_date, ts_code) rows count as "missing" below.
        PRICE = PRICE.fillna(0)
        PRICE = PRICE.drop_duplicates(subset=['trade_date', 'ts_code'])
        PRICE_pivot = PRICE.pivot(index="trade_date",
                                  columns="ts_code",
                                  values=self.factor_name)

        # Attention: len(date_all) >= 60
        date_all = PRICE_pivot.index
        self.date_all = date_all

        # Left-join the full date x stock grid with the data actually received;
        # grid rows without a match come back as NaN and mark a missing stock.
        all_combinations = pd.MultiIndex.from_product(
            [date_all, universe],
            names=["trade_date", "ts_code"])
        all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()
        price_whole = pd.merge(
            all_combinations_df,
            PRICE,
            on=["trade_date", "ts_code"],
            how="left")
        missing = price_whole[price_whole.isnull().any(axis=1)]
        missing_stock = missing["ts_code"].unique()
        print("The missing stock:", missing_stock)

        missing_set = set(missing_stock)
        self.stocks_code = [code for code in universe if code not in missing_set]
        print("*" * 60)
    def Data(self):
        """Download factor values and open/close prices for the universe.

        Results are stored as long-format frames sorted by ``trade_date``:
        ``self.factor`` (duplicates on trade_date/ts_code removed) and
        ``self.price`` (columns renamed to ``OPEN`` / ``CLOSE``).
        """
        banner = "*" * 60
        print(banner)
        print("Get data...")

        # Factor values.
        raw_factor = get_factor(
            self.factor_name,
            start_date=self.begin_date,
            end_date=self.end_date,
            ts_code_list=self.stocks_code,
        )
        raw_factor.reset_index(inplace=True)
        deduplicated = raw_factor.drop_duplicates(subset=['trade_date', 'ts_code'])
        self.factor = deduplicated.sort_values(by="trade_date").reset_index(drop=True)

        # Stock prices.
        # NOTE(review): the range end is passed as ``trade_date`` here, unlike
        # get_factor's ``end_date`` - confirm against the get_price API.
        quotes = get_price(
            ts_code_list=self.stocks_code,
            feature_list=["open", "close"],
            start_date=self.begin_date,
            trade_date=self.end_date,
            target_type="stock",
        )
        quotes = quotes.rename(columns={"open": "OPEN", "close": "CLOSE"})
        quotes.reset_index(inplace=True)
        self.price = quotes.sort_values(by="trade_date").reset_index(drop=True)

        self.RAM_USAGE()
        print(banner)
    def Get_Benchmark(self):
        """Download open/close prices of the three benchmark indexes.

        The result, sorted by ``trade_date`` with NaNs replaced by 0, is
        stored in ``self.benchmark``. Requested index codes:

        * 000300.SH - Shanghai and Shenzhen 300 index (HS 300)
        * 000905.SH - China Securities 500 index
        * 000852.SH - China Securities 1000 index
        """
        banner = "*" * 60
        print(banner)
        print("Get benchmark...")

        index_codes = ["000300.SH", "000905.SH", "000852.SH"]
        quotes = get_price(
            ts_code_list=index_codes,
            feature_list=["open", "close"],
            start_date=self.begin_date,
            trade_date=self.end_date,
            target_type="index",
        )
        quotes = quotes.rename(columns={"open": "OPEN", "close": "CLOSE"})
        quotes.reset_index(inplace=True)
        quotes = quotes.sort_values(by="trade_date").reset_index(drop=True)
        self.benchmark = quotes.fillna(0)

        self.RAM_USAGE()
        print(banner)
    def Get_Daily_Return(self):
        print("*" * 60)
        print("Get daily return...")
        price = self.price
        benchmark = self.benchmark
        BUY_price = price.pivot(index="trade_date", columns="ts_code", values="OPEN")
        BUY_benchmark = benchmark.pivot(index="trade_date", columns="ts_code", values="OPEN")
        SELL_price = price.pivot(index="trade_date", columns="ts_code", values="CLOSE")
        SELL_benchmark = benchmark.pivot(index="trade_date", columns="ts_code", values="CLOSE")
        price_return = (SELL_price - BUY_price) / BUY_price
        self.price_return = price_return.loc[price_return.index.isin(self.date_all), :]
        benchmark_return = (SELL_benchmark - BUY_benchmark) / BUY_benchmark
        self.benchmark_return = benchmark_return.loc[benchmark_return.index.isin(self.date_all), :]

        self.HS_300 = pd.DataFrame(benchmark_return["000300.SH"])["000300.SH"]
        self.CS_500 = pd.DataFrame(benchmark_return["000905.SH"])["000905.SH"]
        self.CS_1000 = pd.DataFrame(benchmark_return["000852.SH"])["000852.SH"]
        print("*" * 60)
    def Data_preprocessing(self):
        """Add a standardised, 3-MAD-clipped copy of the factor to ``self.factor``.

        For every trade date the cross-section of factor values is z-scored
        with the module-level ``zscore`` scaler and then clipped to
        ``median +/- 3 * MAD``. The result is written to the column
        ``<factor_name>_processed``; afterwards every NaN in the frame is
        replaced by 0 (in place).

        This method is not part of the constructor pipeline.

        NOTE(review): the notebook outline lists "3MAD, z-score", but the code
        standardises first and clips second - confirm the intended order.
        """
        print("*" * 60)
        print("Data preprocessing...")
        factor = self.factor

        def Col_3MAD(row):
            # Clip one cross-section (one date) to median +/- 3 * MAD.
            median = row.median()
            mad = (row - median).abs().median()
            threshold = 3 * mad
            return row.clip(lower=median - threshold, upper=median + threshold)

        for column in [self.factor_name]:
            wide = factor.pivot(index="trade_date", columns="ts_code", values=column)
            # The scaler standardises column-wise, so transpose to standardise
            # each row (= each trade date) and transpose back.
            wide = pd.DataFrame(zscore.fit_transform(wide.values.T).T,
                                index=wide.index, columns=wide.columns)
            wide = wide.apply(Col_3MAD, axis=1)  # 3 times MAD for each row

            # Map the wide matrix back onto the long frame with one vectorised
            # positional lookup instead of a Python-level per-row apply.
            # Every (trade_date, ts_code) pair exists because ``wide`` was
            # pivoted from ``factor`` itself.
            row_pos = wide.index.get_indexer(factor["trade_date"])
            col_pos = wide.columns.get_indexer(factor["ts_code"])
            factor[column + "_processed"] = wide.to_numpy()[row_pos, col_pos]

        # It needs to be modified. It is possible to have nan values except for
        # the last 11 days of the time.
        # Remove last 11 days' missing data. (T_end - T_begin + 1)
        factor.fillna(0, inplace=True)
        self.factor = factor
        self.RAM_USAGE()
        print("*" * 60)
    def Ensure_position(self, DF):
        # We add a column named 'Monday' and change positions every Monday.
        DF = DF.shift(1)  # Move one step forward to ensure position
        DF["Monday"] = (DF.index.dayofweek == 0).astype(int)  # Shift except Monday

        # Get the columns to shift (all columns except 'Monday')
        cols_to_shift = DF.columns[DF.columns != "Monday"]

        DF.loc[DF["Monday"] == 0, cols_to_shift] = np.nan
        DF.fillna(method="ffill", 
                inplace=True)  # Forward fill, holing positions for a week.
        DF.fillna(value=0, inplace=True)  # Fill remaining NaN with 0
        return DF
    def Factor_DF(self):
        factor = self.factor
        factor_df = factor.pivot(index="trade_date", 
                                 columns="ts_code", 
                                 values=f'{self.factor_name}')
        self.factor_df = factor_df
    def Backtest_Main(self):
        """Run the layered backtest and save the equity-curve figure.

        Stocks are ranked by factor value every day (rank 1 = largest value)
        and split into ``num_layers`` equally wide rank buckets. Each bucket
        is held equal-weighted with weekly (Monday) rebalancing via
        ``Ensure_position``; its cumulative equity is plotted together with
        the benchmark indexes (when available) and a 50/50 long-short curve.

        Side effects:
            * sets ``self.profit_long`` / ``self.profit_short`` (each scaled
              by 0.5) and ``self.Long_position_df``;
            * writes ``<factor_name>_Backtest_Main.png`` under ``path``.

        Fixes relative to the previous version:
            * buckets use ``(rank - 1) // width``; before, the worst rank fell
              into a bucket that was never iterated and the first bucket was
              one stock short;
            * stocks without a factor value are left out of every bucket
              instead of being forced into layer 0;
            * the extra empty figure is no longer created.
        """
        print("*" * 60)
        print("Backtest...")
        factor_df = self.factor_df
        num_layers = 20
        fee = 0.0003
        long_layer = num_layers - 1
        short_layer = 0

        def to_growth(daily_return, sign=1):
            # Daily growth factor. A return of exactly 0 is treated as "no
            # trade" and pays no fee; otherwise the fee is deducted.
            # ``sign=-1`` makes the profit come from the stock's decline.
            return daily_return.apply(
                lambda x: x + 1 if x == 0 else sign * x + (1 - fee))

        # Calculate the ranks of factors daily.
        factor_ranks = factor_df.rank(axis=1, ascending=False)

        # Ranks start at 1, so subtract 1 before bucketing. NaN ranks stay
        # NaN and therefore never match any layer below.
        layer_width = len(factor_df.columns) / num_layers
        layer_allocation = (factor_ranks - 1) // layer_width

        plt.rcParams["axes.unicode_minus"] = False  # Display minus signs correctly
        plt.figure(figsize=(10, 5))
        plt.axhline(y=1, color="grey", linestyle="--")

        self.profit_long = self.profit_short = None

        Long_position_df = None
        for layer in range(num_layers):
            # 1 where the stock belongs to this layer, 0 elsewhere.
            hold_flag_matrix = (layer_allocation == layer).astype(int)
            hold_flag_matrix = self.Ensure_position(hold_flag_matrix)
            # NOTE(review): positions are captured from layer 0, which the
            # long/short logic below treats as the SHORT leg - confirm which
            # layer is meant to be "long".
            if layer == 0:
                Long_position_df = hold_flag_matrix.copy()

            # Delete 'Monday' to fit into yield dataframe.
            del hold_flag_matrix["Monday"]

            # Equal weights inside the layer; days without holdings give
            # 0 / 0 = NaN, which is turned into weight 0.
            stock_amount_sum = hold_flag_matrix.sum(axis=1)
            weight_allocation = hold_flag_matrix.div(stock_amount_sum, axis=0).fillna(0)

            # Daily profit rate of the layer.
            profit = (weight_allocation * self.price_return).sum(axis=1)

            # Long and short legs each receive half of the capital.
            if layer == short_layer:
                self.profit_short = to_growth(profit, sign=-1).cumprod() * 0.5
            elif layer == long_layer:
                self.profit_long = to_growth(profit).cumprod() * 0.5

            # Equity curve of the layer, coloured along the 'viridis' gradient.
            colors = cm.viridis(layer / num_layers)
            to_growth(profit).cumprod().plot(label=layer, legend=True, color=colors)

        # Benchmarks are optional: they exist only if Get_Benchmark was run.
        benchmark_curves = (
            ("HS_300", "HS 300 index", "r"),
            ("CS_500", "CS 500 index", "g"),
            ("CS_1000", "CS 1000 index", "b"),
        )
        for attr, label, color in benchmark_curves:
            series = getattr(self, attr, None)
            if series is None:
                continue
            to_growth(series).cumprod().plot(label=label, legend=True, color=color)

        (self.profit_long + self.profit_short).plot(color="orange", label="long_short", legend=True)
        plt.title(f"{num_layers}-Layered Portfolio Equity ({self.factor_name})")
        plt.legend(title="Layer", bbox_to_anchor=(1, 0.5), loc="center left")
        plt.savefig(path + f'{self.factor_name}_Backtest_Main.png', bbox_inches='tight')

        self.Long_position_df = Long_position_df
        self.RAM_USAGE()
        print("*" * 60)
    def Backtest_Simple(self, stock_num, Ascending):
        """Backtest an equal-weighted portfolio of the ``stock_num`` best-ranked stocks.

        Args:
            stock_num: number of top-ranked stocks to hold.
            Ascending: passed to ``DataFrame.rank``; ``False`` ranks the
                largest factor value first.

        Positions are rebalanced weekly through ``Ensure_position``. The
        equity curve is plotted with the benchmark indexes (when available)
        and saved as ``<factor_name>_Backtest_Simple.png`` under ``path``.
        """
        fee = 0.0003

        def to_growth(daily_return):
            # A return of exactly 0 means "no trade" and pays no fee.
            return daily_return.apply(lambda x: x + 1 if x == 0 else x + (1 - fee))

        # Calculate the ranks of factors daily.
        factor_ranks = self.factor_df.rank(axis=1, ascending=Ascending)

        # Hold every stock whose rank is within the top ``stock_num``
        # (NaN ranks compare False and are never held).
        position_df = (factor_ranks <= stock_num).astype(int)
        position_df = self.Ensure_position(position_df)

        # Delete 'Monday' to fit into yield dataframe.
        del position_df["Monday"]

        # Equal weights; days without holdings give 0 / 0 = NaN -> weight 0.
        stock_amount_sum = position_df.sum(axis=1)
        weight_allocation = position_df.div(stock_amount_sum, axis=0).fillna(0)

        # Daily profit rate of the portfolio.
        profit = (weight_allocation * self.price_return).sum(axis=1)

        plt.rcParams["axes.unicode_minus"] = False  # Display minus signs correctly
        plt.figure(figsize=(10, 5))
        plt.axhline(y=1, color="grey", linestyle="--")
        to_growth(profit).cumprod().plot(label="Stocks", legend=True, color="#800080")

        # Benchmarks are optional: they exist only if Get_Benchmark was run.
        benchmark_curves = (
            ("HS_300", "HS 300 index", "r"),
            ("CS_500", "CS 500 index", "g"),
            ("CS_1000", "CS 1000 index", "b"),
        )
        for attr, label, color in benchmark_curves:
            series = getattr(self, attr, None)
            if series is None:
                continue
            to_growth(series).cumprod().plot(label=label, legend=True, color=color)

        plt.title(f"Equity of {stock_num} stocks ({self.factor_name})")
        plt.legend(title="Index", bbox_to_anchor=(1, 0.5), loc="center left")
        plt.savefig(path + f'{self.factor_name}_Backtest_Simple.png', bbox_inches='tight')
    def Indexes(self):
        """Print and plot IC statistics and long-leg performance figures.

        Computed quantities:
            1. Rank IC mean (daily Spearman correlation of factor vs. return)
            2. ICIR (IC mean / IC standard deviation)
            3. IC winning rate (share of valid days with IC > 0)
            4. IC t-value (ICIR * sqrt(number of valid IC days))
            5. Long ROI and maximum drawdown of the long leg

        Long Sharpe ratio and weekly turnover from the original outline are
        not computed here. Four figures are saved under ``path``.

        Raises:
            RuntimeError: if ``Backtest_Main`` has not been run, because the
                long equity curve ``self.profit_long`` is required.
        """
        print("*" * 60)
        print("Indexes...")
        profit_long = getattr(self, "profit_long", None)
        if profit_long is None:
            raise RuntimeError(
                "Indexes() needs the long equity curve; run Backtest_Main() first.")

        # --- IC distribution ---
        IC_series = self.factor_df.corrwith(self.price_return, axis=1, method="spearman")
        IC_mean = IC_series.mean()
        print(f"IC mean: {IC_mean}")
        ICIR = IC_mean / IC_series.std()
        print(f"ICIR: {ICIR}")
        plt.figure()
        IC_series.hist(bins=30)
        plt.title(f"IC frequency histogram ({self.factor_name})")
        plt.savefig(path + f'{self.factor_name}_Indexes_IC.png', bbox_inches='tight')

        # --- Daily RankIC bars (left axis) with cumulative IC (right axis) ---
        _, ax = plt.subplots(figsize=(8, 4))
        ax.bar(IC_series.index, IC_series.values)
        ax.set_title(f'Histogram of Series Values ({self.factor_name})')
        ax.set_xlabel('Date')
        ax.set_ylabel('RankIC')
        ax2 = ax.twinx()
        ax2.plot(IC_series.index, IC_series.cumsum(), color='orange')
        ax2.set_ylabel('Cumulative Sum')
        plt.savefig(path + f'{self.factor_name}_Indexes_IC_Cum.png', bbox_inches='tight')

        # mean()/std() above skip NaN, so count only valid IC days here too.
        IC_valid = IC_series.dropna()
        n_obs = len(IC_valid)
        IC_winning_rate = float((IC_valid > 0).mean()) if n_obs else float("nan")
        print(f"IC winning rate: {IC_winning_rate}")

        # t = mean / (std / sqrt(n)) = ICIR * sqrt(n)
        t_value = ICIR * sqrt(n_obs)
        print(f"IC t-value: {t_value}")

        # profit_long holds half of the capital, so multiply by 2 to get the
        # equity of a long-only portfolio.
        Long_equity = profit_long * 2
        Long_ROI = Long_equity.iloc[-1] - 1
        print(f"Long ROI: {Long_ROI}")

        # Drawdown relative to the running peak.
        previous_peaks = Long_equity.cummax()
        drawdowns = (Long_equity - previous_peaks) / previous_peaks
        max_drawdown = drawdowns.min()
        print(f"Max drawdown: {max_drawdown}")

        # The drawdown begins at the peak preceding the deepest trough,
        # not at the trough itself.
        trough_date = drawdowns.idxmin()
        peak_date = Long_equity.loc[:trough_date].idxmax()
        max_drawdown_start = pd.to_datetime(peak_date).date()
        print(f"Max drawdown begin date: {max_drawdown_start}")

        # Summary panel rendered as text on an axis-free figure.
        length = 6
        width = 4
        plt.figure(figsize=(length, width))
        plt.text(0.1, 7/9, f"IC mean: {IC_mean}", fontsize=12)
        plt.text(0.1, 6/9, f"ICIR: {ICIR}", fontsize=12)
        plt.text(0.1, 5/9, f"IC winning rate: {IC_winning_rate}", fontsize=12)
        plt.text(0.1, 4/9, f"IC t-value: {t_value}", fontsize=12)
        plt.text(0.1, 3/9, f"Long ROI: {Long_ROI}", fontsize=12)
        plt.text(0.1, 2/9, f"Max drawdown: {max_drawdown}", fontsize=12)
        plt.text(0.1, 1/9, f"Max drawdown begin date: {max_drawdown_start}", fontsize=12)
        plt.title(f'Indexes of ({self.factor_name})')
        plt.axis('off')  # Turn off axis
        plt.savefig(path + f'{self.factor_name}_Indexes.png', bbox_inches='tight')

        print("*" * 60)

In [17]:
# Date range handed to the data APIs, as 'YYYYMMDD' strings.
begin_date = "20230120"
end_date = "20240430"

# Index bounds into ``factors_name``; ``end_index`` is the last valid position.
factors_len = len(factors_name)
begin_index = 32
end_index = factors_len - 1

In [55]:
# Missing: ['alpha101_032', 'alpha191_021', 'alpha191_025', 'alpha191_030', 'alpha191_061', 'alpha191_062',
# 'alpha191_063', 'alpha191_064', 'alpha191_065', 'alpha191_066', 'alpha191_067', 'alpha191_068', 'alpha191_069',
# 'alpha191_070', 'alpha191_072', 'alpha191_073', 'alpha191_074', 'alpha191_075', 'alpha191_076', 'alpha191_077',
# 'alpha191_078', 'alpha191_079', 'alpha191_080', 'alpha191_081', 'alpha191_082', 'alpha191_083', 'alpha191_084',
# 'alpha191_085', 'alpha191_086', 'alpha191_087', 'alpha191_088', 'alpha191_089', 'alpha191_090', 'alpha191_116',
# 'alpha191_143', 'alpha191_147', 'alpha191_182']

In [75]:
# Build an HS300_Single instance (data download + factor matrix) per factor.
# NOTE(review): the slice starts at the hard-coded position 261 instead of
# ``begin_index`` - presumably a manual resume point; confirm before re-running.
for factor_name in tqdm(factors_name[261: end_index+1], desc="Processing factors", ncols=100):
    Single = HS300_Single(factor_name, begin_date, end_date, stocks_code)
    # Clear the cell output between factors so the log does not grow unbounded.
    clear_output(wait=True)

`bbox_inches='tight'` is a parameter of `plt.savefig()` that makes Matplotlib compute the smallest bounding box enclosing every element of the figure before saving. The saved image therefore includes all elements of the plot — for example a legend anchored outside the axes — without cropping them.

When you specify `bbox_inches='tight'`, Matplotlib adjusts the bounding box to fit the plot content. This can be helpful to ensure that nothing is cut off when saving the plot, especially when using `plt.tight_layout()` to adjust the layout of the plot.

In the provided code snippet:

```python
plt.savefig('RankIC_combined_plot.png', dpi=300, bbox_inches='tight')
```

The `bbox_inches='tight'` ensures that all elements of the plot are included in the saved image file, and there is no unnecessary whitespace around the edges. This helps to produce a cleaner and more accurate representation of the plot.

In [None]:
class HS300_Single_Correlation():
    def __init__(self, Factor_Name, Begin_Date, End_Date, Stocks_Code) -> None:
        """Store the run parameters and load factor, price and return data.

        Args:
            Factor_Name: name of the factor passed to ``get_factor``.
            Begin_Date: start of the date range.
            End_Date: end of the date range.
            Stocks_Code: list of stock codes forming the initial universe.

        Runs, in order: Get_Trade_Calendar, Data, Get_Daily_Return, Factor_DF.
        """
        print(f"Factor: {Factor_Name}")
        self.factor_name, self.begin_date = Factor_Name, Begin_Date
        self.end_date, self.stocks_code = End_Date, Stocks_Code

        for stage in (self.Get_Trade_Calendar,
                      self.Data,
                      self.Get_Daily_Return,
                      self.Factor_DF):
            stage()
    def RAM_USAGE(self):
        """Print the resident set size of the current process in MB."""
        mem = process.memory_info()
        ram_usage = mem.rss / (1024 * 1024)
        print(f"RAM Usage: {ram_usage} MB")
    def Get_Trade_Calendar(self):
        """Build the trading calendar and drop stocks with incomplete coverage.

        Downloads the factor for ``self.stocks_code``, stores the sorted set
        of dates carrying data in ``self.date_all`` and removes from
        ``self.stocks_code`` every stock lacking a row for at least one of
        those dates.

        The universe comes from ``self.stocks_code`` (the constructor
        argument) rather than from module-level globals, and the original
        order of the surviving codes is preserved so runs are reproducible.
        """
        print("*" * 60)
        print("Getting trade calendar...")
        # De-duplicate while keeping the caller's ordering.
        universe = list(dict.fromkeys(self.stocks_code))
        PRICE = get_factor(self.factor_name,
                           start_date=self.begin_date,
                           end_date=self.end_date,
                           ts_code_list=universe)
        PRICE.reset_index(inplace=True)
        PRICE = PRICE.sort_values(by="trade_date").reset_index(drop=True)
        # NaN factor values become 0, so only completely absent
        # (trade_date, ts_code) rows are detected as missing below.
        PRICE = PRICE.fillna(0)
        PRICE = PRICE.drop_duplicates(subset=['trade_date', 'ts_code'])
        PRICE_pivot = PRICE.pivot(index="trade_date",
                                  columns="ts_code",
                                  values=self.factor_name)

        # Attention: len(date_all) >= 60
        date_all = PRICE_pivot.index
        self.date_all = date_all

        # Left-join the full date x stock grid with the received data; grid
        # rows without a match come back as NaN and mark a missing stock.
        all_combinations = pd.MultiIndex.from_product(
            [date_all, universe],
            names=["trade_date", "ts_code"])
        all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()
        price_whole = pd.merge(
            all_combinations_df,
            PRICE,
            on=["trade_date", "ts_code"],
            how="left")
        missing = price_whole[price_whole.isnull().any(axis=1)]
        missing_stock = missing["ts_code"].unique()
        print("The missing stock:", missing_stock)

        missing_set = set(missing_stock)
        self.stocks_code = [code for code in universe if code not in missing_set]
        print("*" * 60)
    def Data(self):
        """Download factor values and open/close prices for the universe.

        Stores two long-format frames sorted by ``trade_date``:
        ``self.factor`` (duplicates on trade_date/ts_code removed) and
        ``self.price`` (columns renamed to ``OPEN`` / ``CLOSE``).
        """
        separator = "*" * 60
        print(separator)
        print("Get data...")

        # Factor values.
        factor_frame = get_factor(
            self.factor_name,
            start_date=self.begin_date,
            end_date=self.end_date,
            ts_code_list=self.stocks_code,
        )
        factor_frame.reset_index(inplace=True)
        factor_frame = factor_frame.drop_duplicates(subset=['trade_date', 'ts_code'])
        factor_frame = factor_frame.sort_values(by="trade_date")
        self.factor = factor_frame.reset_index(drop=True)

        # Stock prices.
        # NOTE(review): the range end is passed as ``trade_date`` here, unlike
        # get_factor's ``end_date`` - confirm against the get_price API.
        price_frame = get_price(
            ts_code_list=self.stocks_code,
            feature_list=["open", "close"],
            start_date=self.begin_date,
            trade_date=self.end_date,
            target_type="stock",
        )
        price_frame = price_frame.rename(columns={"open": "OPEN", "close": "CLOSE"})
        price_frame.reset_index(inplace=True)
        price_frame = price_frame.sort_values(by="trade_date")
        self.price = price_frame.reset_index(drop=True)

        self.RAM_USAGE()
        print(separator)
    def Get_Daily_Return(self):
        print("*" * 60)
        print("Get daily return...")
        price = self.price
        BUY_price = price.pivot(index="trade_date", columns="ts_code", values="OPEN")
        SELL_price = price.pivot(index="trade_date", columns="ts_code", values="CLOSE")
        price_return = (SELL_price - BUY_price) / BUY_price
        self.price_return = price_return.loc[price_return.index.isin(self.date_all), :]
        print("*" * 60)
    def Data_preprocessing(self):
        """Add a standardised, 3-MAD-clipped copy of the factor to ``self.factor``.

        For every trade date the cross-section of factor values is z-scored
        with the module-level ``zscore`` scaler and then clipped to
        ``median +/- 3 * MAD``. The result is written to the column
        ``<factor_name>_processed``; afterwards every NaN in the frame is
        replaced by 0 (in place).

        This method is not part of the constructor pipeline.

        NOTE(review): the notebook outline lists "3MAD, z-score", but the code
        standardises first and clips second - confirm the intended order.
        """
        print("*" * 60)
        print("Data preprocessing...")
        factor = self.factor

        def Col_3MAD(row):
            # Clip one cross-section (one date) to median +/- 3 * MAD.
            median = row.median()
            mad = (row - median).abs().median()
            threshold = 3 * mad
            return row.clip(lower=median - threshold, upper=median + threshold)

        for column in [self.factor_name]:
            wide = factor.pivot(index="trade_date", columns="ts_code", values=column)
            # The scaler standardises column-wise, so transpose to standardise
            # each row (= each trade date) and transpose back.
            wide = pd.DataFrame(zscore.fit_transform(wide.values.T).T,
                                index=wide.index, columns=wide.columns)
            wide = wide.apply(Col_3MAD, axis=1)  # 3 times MAD for each row

            # Map the wide matrix back onto the long frame with one vectorised
            # positional lookup instead of a Python-level per-row apply.
            # Every (trade_date, ts_code) pair exists because ``wide`` was
            # pivoted from ``factor`` itself.
            row_pos = wide.index.get_indexer(factor["trade_date"])
            col_pos = wide.columns.get_indexer(factor["ts_code"])
            factor[column + "_processed"] = wide.to_numpy()[row_pos, col_pos]

        # It needs to be modified. It is possible to have nan values except for
        # the last 11 days of the time.
        # Remove last 11 days' missing data. (T_end - T_begin + 1)
        factor.fillna(0, inplace=True)
        self.factor = factor
        self.RAM_USAGE()
        print("*" * 60)
    def Factor_DF(self):
        factor = self.factor
        factor_df = factor.pivot(index="trade_date", 
                                 columns="ts_code", 
                                 values=f'{self.factor_name}')
        self.factor_df = factor_df