In [579]:
from cylib.factor_backtest.factor_exp_engine import (get_needing_basic_factors, 
                                                    exec_and_eval_exp)
import tqdm
import pandas as pd
import numpy as np

#### 所需提交一：提交因子表达式所需的算子

In [580]:
# MAX(A, B) - 在A、B中选择最大的数
def MAX(A: pd.DataFrame, B: pd.DataFrame) -> pd.DataFrame:
    if type(B) == int or float:
        max_df = pd.DataFrame(np.maximum(A.values, B), index=A.index, columns=A.columns)
    elif type(B) == pd.DataFrame:
        max_df = pd.DataFrame(np.maximum(A.values, B.values), index=A.index, columns=A.columns)
    return max_df

# MIN(A, B) - 在A、B中选择最小的数
def MIN(A: pd.DataFrame, B: pd.DataFrame) -> pd.DataFrame:
    if type(B) == int or float:
        min_df = pd.DataFrame(np.minimum(A.values, B), index=A.index, columns=A.columns)
    elif type(B) == pd.DataFrame:
        min_df = pd.DataFrame(np.minimum(A.values, B.values), index=A.index, columns=A.columns)
    return min_df

def ABS(price_df):
    return abs(price_df)

# SUM(A, n) - 序列A过去n天的求和
def SUM(A: pd.DataFrame, n: int) -> pd.DataFrame:
    sum_df = A.rolling(window=n).sum()
    return sum_df

# SIGN(A) - 符号函数
def SIGN(A: pd.DataFrame) -> pd.DataFrame:
    sign_df = np.sign(A)
    return pd.DataFrame(sign_df, index=A.index, columns=A.columns)

# CORR(A, B, n) - 序列A、B过去n天的相关系数
def CORR(A: pd.DataFrame, B: pd.DataFrame, n: int) -> pd.DataFrame:
    corr_df = A.rolling(window=n).corr(other=B)
    return corr_df

# COVARIANCE(A, B, n) - 序列A、B过去n天的协方差
def COVARIANCE(A: pd.DataFrame, B: pd.DataFrame, n: int) -> pd.DataFrame:
    cov_df = A.rolling(window=n).cov(other=B)
    return cov_df

# STD1(A) - 序列A的标准差
def STD1(A: pd.DataFrame) -> pd.DataFrame:
    std_df = A.std()
    return std_df

# STD2(A, n) - 序列A过去n天的标准差
def STD2(A: pd.DataFrame, n: int) -> pd.DataFrame:
    std_df = A.rolling(window=n).std()
    return std_df

# DELAY(A, n) - A_i-n
def DELAY(A: pd.DataFrame, n: int) -> pd.DataFrame:
    delay_df = A.shift(n)
    return delay_df

def DELTA(df, n: int):
    df_shift1 = df.shift(n)
    return df_shift1 - df

def RANK(df):
    return df.rank(axis=0)

# MEAN(A, n) - 序列A过去n天的均值
def MEAN(A: pd.DataFrame, n: int) -> pd.DataFrame:
    mean_df = A.rolling(window=n).mean()
    return mean_df

# COUNT(condition, n) - 计算前n期满足条件condition的样本个数
def COUNT(condition: pd.DataFrame, n: int) -> pd.DataFrame:
    count_df = condition.rolling(window=n).sum()
    return count_df

# LOG(A) - 自然对数函数
def LOG(A: pd.DataFrame) -> pd.DataFrame:
    A.replace(0, np.nan, inplace = True)
    log_df = np.log(A)
    df = pd.DataFrame(log_df, index=A.index, columns=A.columns)
    # df.replace(-np.inf, 0, inplace=True)
    return df

# TSMAX(A, n) - 序列A过去n天的最大值
def TSMAX(A: pd.DataFrame, n: int) -> pd.DataFrame:
    tsmax_df = A.rolling(window=n).max()
    return tsmax_df

# TSRANK(A, n) - 序列A的末位值在过去n天的顺序排位
def TSRANK(A: pd.DataFrame, n: int) -> pd.DataFrame:
    rank_df = A.rolling(window=n).apply(lambda x: x.rank(method = 'min').iloc[-1])
    return rank_df

# HIGHDAY(A, n) - 计算A前n期时间序列中最大值距离当前时点的间隔
def HIGHDAY(A: pd.DataFrame, n: int) -> pd.DataFrame:
    high_day_df = A.rolling(window=n).apply(lambda x: len(x) - np.argmax(x) - 1)
    return high_day_df

# LOWDAY(A, n) - 计算A前n期时间序列中最小值距离当前时点的间隔
def LOWDAY(A: pd.DataFrame, n: int) -> pd.DataFrame:
    low_day_df = A.rolling(window=n).apply(lambda x: len(x) - np.argmin(x) - 1)
    return low_day_df

# DECAYLINEAR(A, d) - 对A序列计算移动平均加权
def DECAYLINEAR(A: pd.DataFrame, d: int) -> pd.DataFrame:
    weights = np.arange(1, d + 1)/ np.sum(np.arange(1, d + 1))
    decay_df = A.rolling(window=d).apply(lambda x: np.dot(x, weights))
    return decay_df

'''
这是一个计算简单移动平均线（Simple Moving Average，SMA）的函数，输入
参数包括一个 Pandas DataFrame A，以及两个整数 n 和 m，输出结果也是一个
Pandas DataFrame。

具体来说，该函数首先对输入的 DataFrame A 进行了一些处理，去除其中所有
缺失值 NaN，并用 0 填充。然后创建了一个新的 DataFrame sma_df，用于保存
计算得到的 SMA 值。

接下来，通过遍历 A 的每一行数据，计算对应的 SMA 值。如果当前行的索引 i 小
于 1，则直接将该行数据添加到 sma_df 中；如果 i 大于等于 1，
则根据公式：next_sma = (A.iloc[i] * m + sma_df.iloc[i - 1] * (n - m)) / n，
计算出当前行的 SMA 值 next_sma，并将其添加到 sma_df 中。

最后，将 sma_df 的索引设置为 A 的最后 len(sma_df) 行数据的索引，并返回 sma_df。

该函数的作用是计算简单移动平均线，通过调用该函数可以方便地得到 SMA 值，并进行
进一步的分析和处理。
'''

def SMA(A: pd.DataFrame, n: int, m: int) -> pd.DataFrame:
    A=A.copy().dropna(how='all').fillna(0)
    sma_df = pd.DataFrame(columns = A.columns)
    for i in range(len(A)):
        if i < 1:
            sma_df.loc[i,:]=A.iloc[i]
        else:
            next_sma = (A.iloc[i] * m + sma_df.iloc[i - 1] * (n - m)) / n
            sma_df.loc[i,:]=next_sma
    sma_df.index = A.index[-len(sma_df):]
    return sma_df

#### 所需提交二：提交存储因子表达式的字典

In a factor expression, the factor name is followed by parentheses containing two parts: the factor expression and the required data entries.

- Factor expression: This represents how the factor is calculated or the logic behind it. It can be a mathematical operation, function call, conditional statement, etc., used to calculate and generate the factor value.

- Required data entries: This indicates the number of data entries needed to compute the factor. It specifies the size of the historical data window to be considered when calculating the factor.

For example, if we have a factor named "MA5" with a factor expression of "mean(CLOSE, 5)", the required data entries would be 5. This means that when calculating the "MA5" factor, the past 5 periods of closing price data are used as input. Therefore, the "mean" function is applied to these 5 data points to compute the average, which becomes the factor value.

The required data entries can be defined and adjusted based on the specific factor's calculation logic and requirements. It determines the size of the historical data window considered during factor calculation, thus affecting the resulting factor values.

In [581]:
# factor_def_dict = \
# {'alpha191_022': ('MEAN(((CLOSE_DF-MEAN(CLOSE_DF,6))/MEAN(CLOSE_DF,6)-DELAY((CLOSE_DF-MEAN(CLOSE_DF,6))/MEAN(CLOSE_DF,6),3)),12)', 22)}
# total loss = 0.011
# IC mean = 0.018

factor_def_dict = \
{'alpha_01': ('-1 * CORR(RANK(DELTA(LOG(VOLUME_DF), 1)), RANK(((CLOSE_DF - OPEN_DF) / OPEN_DF)), 6)', 22)}
# total loss = 0.011
# IC mean = 0.024

# factor_def_dict = \
# {'alpha_02': ('-1 * DELTA((((CLOSE_DF - LOW_DF) - (HIGH_DF - CLOSE_DF)) / (HIGH_DF - LOW_DF)), 1)', 22)}
# total loss = 0.011
# IC mean = 0.01566

# factor_def_dict = \
# {'alpha_05': ('-1 * TSMAX(CORR(TSRANK(VOLUME_DF, 5), TSRANK(HIGH_DF, 5), 5), 3)', 22)}
# total loss = 0.011
# IC mean = 0.034
# 运行时间长，13mins

# factor_def_dict = \
# {'alpha_06': ('RANK(SIGN(DELTA((((OPEN_DF * 0.85) + (HIGH_DF * 0.15))), 4))) * -1', 22)}
# total loss = 0.011
# IC mean = 0.053

# factor_def_dict = {'alpha_14': ('CLOSE_DF - DELAY(CLOSE_DF, 5)', 22)}
# total loss = 0.011
# IC mean = 0.027

# factor_def_dict = {'alpha_15': ('OPEN_DF / DELAY(CLOSE_DF, 1) - 1', 22)}
# total loss = 0.011
# IC mean = 0.023

# factor_def_dict = {'alpha_18': ('CLOSE_DF / DELAY(CLOSE_DF, 5)', 22)}
# total loss = 0.011
# IC mean = 0.033

# factor_def_dict = \
# {'alpha_20': ('(CLOSE_DF - DELAY(CLOSE_DF, 6)) / DELAY(CLOSE_DF, 6) * 100', 22)}
# total loss = 0.011
# IC mean = 0.033

# factor_def_dict = \
# {'alpha_29': ('(CLOSE_DF - DELAY(CLOSE_DF, 6)) / DELAY(CLOSE_DF, 6) * VOLUME_DF', 22)}
# total loss = 0.011
# IC mean = 0.031

# factor_def_dict = \
# {'alpha_31': ('(CLOSE_DF - MEAN(CLOSE_DF, 12)) / MEAN(CLOSE_DF, 12) * 100', 22)}
# total loss = 0.011
# IC mean = 0.039

# factor_def_dict = \
# {'alpha_32': ('-1 * SUM(RANK(CORR(RANK(HIGH_DF), RANK(VOLUME_DF), 3)), 3)', 22)}
# total loss = 0.011
# IC mean = 0.03

# factor_def_dict = {'alpha_34': ('MEAN(CLOSE_DF, 12) / CLOSE_DF', 22)}
# total loss = 0.011
# IC mean = 0.039

# factor_def_dict = \
# {'alpha_42': ('(-1 * RANK(STD2(HIGH_DF, 10))) * CORR(HIGH_DF, VOLUME_DF, 10)', 22)}
# total loss = 0.011
# IC mean = 0.061

# factor_def_dict = \
# {'alpha_46': ('(MEAN(CLOSE_DF, 3) + MEAN(CLOSE_DF, 6) + MEAN(CLOSE_DF, 12) + MEAN(CLOSE_DF, 24)) / (4 * CLOSE_DF)', 22)}
# total loss = 0.011
# IC mean = 0.043

# factor_def_dict = \
# {'alpha_53': ('COUNT(CLOSE_DF > DELAY(CLOSE_DF, 1), 12) / 12 * 100', 22)}
# total loss = 0.011
# IC mean = 0.027

# factor_def_dict = \
# {'alpha_54': ('-1 * RANK((STD1(ABS(CLOSE_DF - OPEN_DF)) + (CLOSE_DF - OPEN_DF)) + CORR(CLOSE_DF, OPEN_DF, 10))', 22)}
# total loss = 0.011
# IC mean = 0.016

# factor_def_dict = \
# {'alpha_58': ('COUNT(CLOSE_DF > DELAY(CLOSE_DF, 1), 20) / 20 * 100', 22)}
# total loss = 0.058
# IC mean = 0.0239

# factor_def_dict = \
# {'alpha_62': ('(-1 * CORR(HIGH_DF, RANK(VOLUME_DF), 5))', 22)}
# total loss = 0.012
# IC mean = 0.038

# factor_def_dict = \
# {'alpha_65': ('MEAN(CLOSE_DF, 6) / CLOSE_DF', 22)}
# total loss = 0.011
# IC mean = 0.032

# factor_def_dict = \
# {'alpha_70': ('STD2(AMOUNT_DF, 6)', 22)}
# total loss = 0.011
# IC mean = 0.088

# factor_def_dict = \
# {'alpha_71': ('(CLOSE_DF - MEAN(CLOSE_DF, 24)) / MEAN(CLOSE_DF, 24) * 100', 22)}
# total loss = 0.011
# IC mean = 0.047

# factor_def_dict = \
# {'alpha_76': ('STD2(ABS((CLOSE_DF / DELAY(CLOSE_DF, 1) - 1)) / VOLUME_DF, 20) / MEAN(ABS((CLOSE_DF / DELAY(CLOSE_DF, 1)-  1)) / VOLUME_DF, 20) ', 22)}
# total loss = 0.011
# IC mean = 0.005

# factor_def_dict = \
# {'alpha_80': ('(VOLUME_DF - DELAY(VOLUME_DF, 5)) / DELAY(VOLUME_DF, 5) * 100', 22)}
# total loss = 0.011
# IC mean = 0.026

# factor_def_dict = \
# {'alpha_83': ('-1 * RANK(COVIANCE(RANK(HIGH_DF), RANK(VOLUME_DF), 5))', 22)}
# total loss = 0.011
# IC mean = 0.0
# 运行不出

# factor_def_dict = \
# {'alpha_88': ('(CLOSE_DF - DELAY(CLOSE_DF, 20)) / DELAY(CLOSE_DF, 20) * 100', 22)}
# total loss = 0.011
# IC mean = 0.044

# factor_def_dict = {'alpha_95': ('STD2(AMOUNT_DF, 20)', 22)}
# total loss = 0.011
# IC mean = 0.081

# factor_def_dict = {'alpha_97': ('STD2(VOLUME_DF, 10)', 22)}
# total loss = 0.011
# IC mean = 0.046

# factor_def_dict = {'alpha_100': ('STD2(VOLUME_DF, 20)', 22)}
# total loss = 0.011
# IC mean = 0.043

# factor_def_dict = {'alpha_103': ('((20 - LOWDAY(LOW_DF, 20)) / 20) * 100', 22)}
# total loss = 0.011
# IC mean = 0.019

# factor_def_dict = \
# {'alpha_104': ('-1 * (DELTA(CORR(HIGH_DF, VOLUME_DF, 5), 5) * RANK(STD2(CLOSE_DF, 20)))', 22)}
# total loss = 0.011
# IC mean = 0.013

# factor_def_dict = {'alpha_105': ('(-1 * CORR(RANK(OPEN_DF), RANK(VOLUME_DF), 10)) ', 22)}
# total loss = 0.011
# IC mean = 0.022

# factor_def_dict = {'alpha_106': ('CLOSE_DF - DELAY(CLOSE_DF, 20)', 22)}
# total loss = 0.011
# IC mean = 0.044

# factor_def_dict = \
# {'alpha_107': ('(((-1 * RANK((OPEN_DF - DELAY(HIGH_DF, 1)))) * RANK((OPEN_DF - DELAY(CLOSE_DF, 1)))) * RANK((OPEN_DF - DELAY(LOW_DF, 1))))', 22)}
# total loss = 0.011
# IC mean = 0.028

# factor_def_dict = \
# {'alpha_110': ('SUM(HIGH_DF - OPEN_DF, 20) / SUM(OPEN_DF - LOW_DF, 20) * 100', 22)}
# total loss = 0.011
# IC mean = 0.055

# factor_def_dict = {'alpha_126': ('(CLOSE_DF + HIGH_DF + LOW_DF) / 3', 22)}
# total loss = 0.011
# IC mean = 0.046

# factor_def_dict = \
# {'alpha_133': ('((20-HIGHDAY(HIGH_DF, 20)) / 20) * 100 - ((20 - LOWDAY(LOW, 20)) / 20) * 100', 22)}
# total loss = 0.011
# IC mean = 0.0

# factor_def_dict = \
# {'alpha_134': ('(CLOSE_DF - DELAY(CLOSE_DF, 12)) / DELAY(CLOSE_DF, 12) * VOLUME_DF', 22)}
# total loss = 0.011
# IC mean = 0.0

# factor_def_dict = \
# {'alpha_139': ('(-1 * CORR(OPEN_DF, VOLUME_DF, 10))', 22)}
# total loss = 0.011
# IC mean = 0.0

# factor_def_dict = \
# {'alpha_150': ('(CLOSE_DF + HIGH_DF + LOW_DF) / 3 * VOLUME_DF', 22)}
# total loss = 0.011
# IC mean = 0.0

# factor_def_dict = \
# {'alpha_153': ('(MEAN(CLOSE_DF, 3) + MEAN(CLOSE_DF, 6) + MEAN(CLOSE_DF, 12) + MEAN(CLOSE_DF, 24)) / 4', 22)}
# total loss = 0.011
# IC mean = 0.0

# factor_def_dict = \
# {'alpha_168': ('(-1 * VOLUME_DF / MEAN(VOLUME_DF, 20))', 22)}
# total loss = 0.011
# IC mean = 0.0

# factor_def_dict = \
# {'alpha_175': ('MEAN(MAX(MAX((HIGH_DF - LOW_DF), ABS(DELAY(CLOSE_DF, 1) - HIGH_DF)), ABS(DELAY(CLOSE_DF, 1) - LOW_DF)), 6)', 22)}
# total loss = 0.011
# IC mean = 0.0

# factor_def_dict = \
# {'alpha_177': ('((20 - HIGHDAY(HIGH_DF, 20)) / 20) * 100', 22)}
# total loss = 0.011
# IC mean = 0.0

# factor_def_dict = \
# {'alpha_178': ('(CLOSE_DF - DELAY(CLOSE_DF, 1)) / DELAY(CLOSE_DF, 1) * VOLUME_DF', 22)}
# total loss = 0.011
# IC mean = 0.0

# factor_def_dict = \
# {'alpha_184': ('(RANK(CORR(DELAY((OPEN_DF - CLOSE_DF), 1), CLOSE_DF, 200)) + RANK((OPEN_DF - CLOSE_DF)))', 22)}
# total loss = 0.011
# IC mean = 0.0

#### 运行以下单元格衡量因子整体的覆盖率和IC表现,不要自行修改以下代码

In [582]:
basic_factors = get_needing_basic_factors(str(factor_def_dict))
basic_factors = [x[:-3].lower() for x in basic_factors]
if 'close' not in basic_factors:
    basic_factors.append('close')
basic_factors

['open', 'volume', 'close']

In [583]:
# ori_factor = get_list_factors(basic_factors, start_date='20220101', end_date = '20231230')
ori_factor = pd.read_csv('ori_factor_test.csv',
                         parse_dates=['trade_date'],
                         index_col=["trade_date", "ts_code"])
ori_factor = ori_factor[basic_factors]
ori_factor.columns = [x.upper() for x in ori_factor.columns]
ori_factor = ori_factor.sort_index(ascending=True)
ori_factor['buy_price'] = ori_factor['CLOSE']
ori_factor

Unnamed: 0_level_0,Unnamed: 1_level_0,OPEN,VOLUME,CLOSE,buy_price
trade_date,ts_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-01-04,000001.SZ,16.48,1169260.00,16.66,16.66
2022-01-04,000002.SZ,19.49,1947200.00,20.49,20.49
2022-01-04,000004.SZ,19.41,46185.90,19.99,19.99
2022-01-04,000005.SZ,2.30,146240.00,2.36,2.36
2022-01-04,000006.SZ,4.45,127024.00,4.53,4.53
...,...,...,...,...,...
2023-12-08,688799.SH,44.00,3803.73,43.75,43.75
2023-12-08,688800.SH,44.24,15357.00,42.76,42.76
2023-12-08,688819.SH,28.50,37841.60,29.15,29.15
2023-12-08,688981.SH,52.67,381398.00,52.55,52.55


In [584]:
from IPython.display import clear_output

factor_eval_dict = {}
for factor_name in tqdm.tqdm(factor_def_dict):
    factor_exp = factor_def_dict[factor_name][0] # type: ignore
    factor_need_days = factor_def_dict[factor_name][1] # type: ignore
    f, d = exec_and_eval_exp(factor_exp=factor_exp,
                            need_days=factor_need_days,
                            quantiles=None,
                            bins=10,
                            original_data=ori_factor,
                            max_loss=0.5)
    factor_eval_dict = factor_eval_dict.copy()  # 创建一个新的字典副本
    factor_eval_dict[factor_name] = d
    clear_output(wait=True)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [01:08<00:00, 68.16s/it]


In [585]:
factor_eval_dict

{'alpha_01': {'tot_loss': 0.011371210564405886,
  'fwdret_loss': 0.011371210564405886,
  'bin_loss': 0.0,
  'quantile_stats':                       min       max      mean       std   count    count %
  factor_quantile                                                           
  1               -0.999544 -0.738134 -0.868224  0.052989   65851   2.975636
  2               -0.799867 -0.543790 -0.684216  0.057514  126500   5.716207
  3               -0.601642 -0.352536 -0.487701  0.057764  168108   7.596364
  4               -0.403342 -0.159142 -0.289729  0.057716  205767   9.298077
  5               -0.205671  0.032936 -0.092072  0.057580  244673  11.056138
  6               -0.007517  0.225264  0.105240  0.057383  279635  12.635980
  7                0.187601  0.417637  0.302674  0.057410  302711  13.678725
  8                0.382960  0.609799  0.500044  0.057348  312860  14.137332
  9                0.578212  0.803639  0.696491  0.057046  303670  13.722059
  10               0.773730  

In [586]:
factor_score_list = []
for factor_name in tqdm.tqdm(factor_def_dict):
    tot_loss = factor_eval_dict[factor_name]['tot_loss']
    print(f"Total loss = {tot_loss}")
    loss_score = 1 if tot_loss < 0.1 else 0   #因子覆盖率需要大于90%
    
    ic_score = abs(factor_eval_dict[factor_name]['ic_summary_table'].loc['IC Mean', :]).mean()  #因子IC
    factor_score = loss_score * ic_score
    factor_score_list.append(factor_score)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 659.17it/s]

Total loss = 0.011371210564405886





#### An example in Python to illustrate the concept of factor coverage ratio.

Suppose we have a portfolio consisting of 10 stocks, and we are using a factor model to measure the returns of these stocks. Taking the Market Value factor as an example, we want to use it to explain the returns of these stocks.

Firstly, we need to calculate the Market Value factor values for each stock and store them in a list called "market_value_factor".

Next, we can use the following Python code to calculate the factor coverage ratio:

```python
# Assuming the market value factor values are [0, 1.2, 0.9, 1.5, 1.0, 1.3, 0.7, 1.1, 0.6, 1.4]
market_value_factor = [0, 1.2, 0.9, 1.5, 1.0, 1.3, 0.7, 1.1, 0.6, 1.4]

# Calculate the factor coverage ratio
num_stocks = len(market_value_factor)  # Number of stocks
covered_stocks = sum(1 for factor in market_value_factor if factor != 0)  # Number of stocks covered by the factor

factor_coverage_ratio = covered_stocks / num_stocks * 100  # Factor coverage ratio in percentage

print("Factor Coverage Ratio: {:.2f}%".format(factor_coverage_ratio))

# Factor Coverage Ratio: 90.00%
```

By running the above code, it will output the factor coverage ratio as a percentage. For example, if the factor covers 8 out of 10 stocks, the factor coverage ratio will be 80%.

In this example, we used the Market Value factor to explain the returns of the stocks in the portfolio and calculated the coverage ratio of the factor. This helps us assess the explanatory power and predictive accuracy of the factor model for the portfolio.

In [587]:
factor_score_list

[0.024000000000000004]

In [588]:
# eval_value > 0.015
eval_value = sum(factor_score_list) / len(factor_score_list)
eval_value

0.024000000000000004