关于各位同学的因子实现如何通过测验，参照这份jupyter文件，将实现的因子和对应的算子在这份文件相应的单元格中添加进来，并运行文件下方的测验代码，输出“通过测验”后，将修改后的文件发给我，我会进行登记。

In [21]:
from cylib.factor_backtest.factor_exp_engine import (get_needing_basic_factors, 
                                                    exec_and_eval_exp,
                                                    get_list_factors)
import tqdm
import pandas as pd
import numpy as np

#### 所需提交一：提交因子表达式所需的算子

In [6]:
# MAX(A, B) - element-wise maximum of A and B
def MAX(A: pd.DataFrame, B: "pd.DataFrame | float") -> pd.DataFrame:
    """Return the element-wise maximum of ``A`` and ``B``.

    Args:
        A: Left operand; the result carries its index and columns.
        B: Either a scalar (or anything numpy can broadcast against
            ``A.values``) or a DataFrame.

    Returns:
        A DataFrame labelled like ``A``. NaN in either operand propagates,
        as with ``np.maximum``.
    """
    if isinstance(B, pd.DataFrame):
        # Compare label-by-label: only reindex when the labels actually
        # differ, so the common identical-label case stays a plain view.
        if not (B.index.equals(A.index) and B.columns.equals(A.columns)):
            B = B.reindex(index=A.index, columns=A.columns)
        other = B.values
    else:
        other = B
    return pd.DataFrame(np.maximum(A.values, other), index=A.index, columns=A.columns)

# MIN(A, B) - element-wise minimum of A and B
def MIN(A: pd.DataFrame, B: "pd.DataFrame | float") -> pd.DataFrame:
    """Return the element-wise minimum of ``A`` and ``B``.

    Args:
        A: Left operand; the result carries its index and columns.
        B: Either a scalar (or anything numpy can broadcast against
            ``A.values``) or a DataFrame.

    Returns:
        A DataFrame labelled like ``A``. NaN in either operand propagates,
        as with ``np.minimum``.
    """
    if isinstance(B, pd.DataFrame):
        # Compare label-by-label: only reindex when the labels actually
        # differ, so the common identical-label case stays a plain view.
        if not (B.index.equals(A.index) and B.columns.equals(A.columns)):
            B = B.reindex(index=A.index, columns=A.columns)
        other = B.values
    else:
        other = B
    return pd.DataFrame(np.minimum(A.values, other), index=A.index, columns=A.columns)

def ABS(price_df):
    """Return the absolute value of ``price_df`` via the built-in ``abs``."""
    magnitude = abs(price_df)
    return magnitude

# SUM(A, n) - rolling sum of A over the past n rows
def SUM(A: pd.DataFrame, n: int) -> pd.DataFrame:
    """Return the rolling sum of each column of ``A`` over a window of ``n`` rows.

    Rows whose window holds fewer than ``n`` observations are NaN
    (pandas' default ``min_periods`` for an integer window).
    """
    return A.rolling(n).sum()

# SIGN(A) - sign function
def SIGN(A: pd.DataFrame) -> pd.DataFrame:
    """Return the element-wise sign of ``A`` (-1, 0 or 1), labelled like ``A``."""
    signs = np.sign(A)
    wrapped = pd.DataFrame(data=signs, index=A.index, columns=A.columns)
    return wrapped

# CORR(A, B, n) - rolling correlation of A and B over the past n rows
def CORR(A: pd.DataFrame, B: pd.DataFrame, n: int) -> pd.DataFrame:
    """Return the rolling correlation between ``A`` and ``B``.

    Uses a window of ``n`` rows on ``A`` and passes ``B`` as the ``other``
    operand of pandas' rolling ``corr``.
    """
    windowed = A.rolling(n)
    return windowed.corr(B)

# COVARIANCE(A, B, n) - rolling covariance of A and B over the past n rows
def COVARIANCE(A: pd.DataFrame, B: pd.DataFrame, n: int) -> pd.DataFrame:
    """Return the rolling covariance between ``A`` and ``B``.

    Uses a window of ``n`` rows on ``A`` and passes ``B`` as the ``other``
    operand of pandas' rolling ``cov``.
    """
    windowed = A.rolling(n)
    return windowed.cov(B)

# STD1(A) - standard deviation of each column of A over all rows
def STD1(A: pd.DataFrame) -> pd.Series:
    """Return the standard deviation of every column of ``A``.

    The result is a Series indexed by ``A``'s column labels (one value per
    column), not a DataFrame. When it is added to a DataFrame, pandas
    broadcasts it across rows by matching column labels.
    """
    return A.std()

# STD2(A, n) - rolling standard deviation of A over the past n rows
def STD2(A: pd.DataFrame, n: int) -> pd.DataFrame:
    """Return the rolling standard deviation of each column of ``A``.

    The window is ``n`` rows; rows whose window holds fewer than ``n``
    observations are NaN.
    """
    return A.rolling(n).std()

# DELAY(A, n) - the value of A n rows earlier (A_{i-n})
def DELAY(A: pd.DataFrame, n: int) -> pd.DataFrame:
    """Return ``A`` shifted down by ``n`` rows; vacated rows become NaN."""
    return A.shift(n)

def DELTA(df, n: int):
    """Return the change of ``df`` over ``n`` rows: ``df_i - df_{i-n}``.

    Args:
        df: Input DataFrame (or Series).
        n: Number of rows to look back.

    Returns:
        Current value minus the value ``n`` rows earlier; the first ``n``
        rows are NaN.
    """
    # Current minus lagged. The previous implementation subtracted in the
    # opposite order, which flipped the sign of every difference.
    return df - df.shift(n)

def RANK(df):
    """Rank the values of ``df`` down each column (pandas default 'average' ties).

    NOTE(review): the ranking runs along axis 0, i.e. within each column.
    If rows are dates and columns are instruments, a cross-sectional rank
    would need axis=1 instead; confirm against the engine's data layout.
    """
    ranked = df.rank(axis="index")
    return ranked

# MEAN(A, n) - rolling mean of A over the past n rows
def MEAN(A: pd.DataFrame, n: int) -> pd.DataFrame:
    """Return the rolling mean of each column of ``A`` over ``n`` rows.

    Rows whose window holds fewer than ``n`` observations are NaN.
    """
    return A.rolling(n).mean()

# COUNT(condition, n) - number of rows in the past n where condition holds
def COUNT(condition: pd.DataFrame, n: int) -> pd.DataFrame:
    """Return the rolling sum of ``condition`` over a window of ``n`` rows.

    With a 0/1 (or boolean) input this is the count of rows in the window
    where the condition is true.
    """
    return condition.rolling(n).sum()

# LOG(A) - natural logarithm
def LOG(A: pd.DataFrame) -> pd.DataFrame:
    """Return the element-wise natural logarithm of ``A``.

    Zeros are treated as missing and yield NaN instead of ``-inf``. The
    input is left untouched; the zero replacement is done on a copy so a
    DataFrame shared between several factor expressions is not altered.
    """
    # replace() without inplace returns a new object, leaving A as-is.
    without_zeros = A.replace(0, np.nan)
    return np.log(without_zeros)

# TSMAX(A, n) - rolling maximum of A over the past n rows
def TSMAX(A: pd.DataFrame, n: int) -> pd.DataFrame:
    """Return the rolling maximum of each column of ``A`` over ``n`` rows.

    Rows whose window holds fewer than ``n`` observations are NaN.
    """
    return A.rolling(n).max()

# TSRANK(A, n) - rank of the latest value of A within the past n rows
def TSRANK(A: pd.DataFrame, n: int) -> pd.DataFrame:
    """Return, per row, the rank of the newest value inside its ``n``-row window.

    Ranks are ascending with ties given the lowest rank (``method='min'``),
    so the result lies in ``1..n``.
    """
    def _rank_of_last(window):
        # Rank every value in the window, then keep the newest one's rank.
        ranks = window.rank(method = 'min')
        return ranks.iloc[-1]

    return A.rolling(n).apply(_rank_of_last)

# HIGHDAY(A, n) - rows elapsed since the maximum of A within the past n rows
def HIGHDAY(A: pd.DataFrame, n: int) -> pd.DataFrame:
    """Return the distance from the current row back to the window maximum.

    For each ``n``-row window the result is ``len(window) - argmax - 1``:
    0 when the newest row is the maximum, ``n - 1`` when the oldest is.
    ``np.argmax`` picks the first occurrence on ties.
    """
    def _rows_since_high(window):
        newest_pos = len(window) - 1
        return newest_pos - np.argmax(window)

    return A.rolling(n).apply(_rows_since_high)

# LOWDAY(A, n) - rows elapsed since the minimum of A within the past n rows
def LOWDAY(A: pd.DataFrame, n: int) -> pd.DataFrame:
    """Return the distance from the current row back to the window minimum.

    For each ``n``-row window the result is ``len(window) - argmin - 1``:
    0 when the newest row is the minimum, ``n - 1`` when the oldest is.
    ``np.argmin`` picks the first occurrence on ties.
    """
    def _rows_since_low(window):
        newest_pos = len(window) - 1
        return newest_pos - np.argmin(window)

    return A.rolling(n).apply(_rows_since_low)

# DECAYLINEAR(A, d) - linearly weighted moving average of A
def DECAYLINEAR(A: pd.DataFrame, d: int) -> pd.DataFrame:
    """Return the linearly weighted rolling average of ``A`` over ``d`` rows.

    Weights are ``1, 2, ..., d`` normalised to sum to one, so the newest
    row in each window gets the largest weight.
    """
    ramp = np.arange(1, d + 1)
    weights = ramp / np.sum(ramp)

    def _weighted(window):
        return np.dot(window, weights)

    return A.rolling(d).apply(_weighted)

#### 所需提交二：提交存储因子表达式的字典

In [18]:
# Factor-expression dictionary to submit.
# factor name: (factor expression, number of data entries required)
# The evaluation loop later in this notebook reads element [0] as the
# expression string and element [1] as `need_days`; every entry uses 22.
# NOTE(review): some expressions use windows longer than 22 rows (e.g.
# MEAN(CLOSE_DF, 24) in alpha_46 and alpha_71) - confirm that 22 is enough.
factor_def_dict = \
{'alpha191_022': ('MEAN(((CLOSE_DF-MEAN(CLOSE_DF,6))/MEAN(CLOSE_DF,6)-DELAY((CLOSE_DF-MEAN(CLOSE_DF,6))/MEAN(CLOSE_DF,6),3)),12)', 22),
'alpha_01': ('-1 * CORR(RANK(DELTA(LOG(VOLUME_DF), 1)), RANK(((CLOSE_DF - OPEN_DF) / OPEN_DF)), 6)', 22),
'alpha_02': ('-1 * DELTA((((CLOSE_DF - LOW_DF) - (HIGH_DF - CLOSE_DF)) / (HIGH_DF - LOW_DF)), 1)', 22),
'alpha_06': ('RANK(SIGN(DELTA((((OPEN_DF * 0.85) + (HIGH_DF * 0.15))), 4))) * -1', 22),
'alpha_14': ('CLOSE_DF - DELAY(CLOSE_DF, 5)', 22),
'alpha_15': ('OPEN_DF / DELAY(CLOSE_DF, 1) - 1', 22),
'alpha_18': ('CLOSE_DF / DELAY(CLOSE_DF, 5)', 22),
'alpha_20': ('(CLOSE_DF - DELAY(CLOSE_DF, 6)) / DELAY(CLOSE_DF, 6) * 100', 22),
'alpha_29': ('(CLOSE_DF - DELAY(CLOSE_DF, 6)) / DELAY(CLOSE_DF, 6) * VOLUME_DF', 22),
'alpha_31': ('(CLOSE_DF - MEAN(CLOSE_DF, 12)) / MEAN(CLOSE_DF, 12) * 100', 22),
'alpha_32': ('-1 * SUM(RANK(CORR(RANK(HIGH_DF), RANK(VOLUME_DF), 3)), 3)', 22),
'alpha_34': ('MEAN(CLOSE_DF, 12) / CLOSE_DF', 22),
'alpha_42': ('(-1 * RANK(STD2(HIGH_DF, 10))) * CORR(HIGH_DF, VOLUME_DF, 10)', 22),
'alpha_46': ('(MEAN(CLOSE_DF, 3) + MEAN(CLOSE_DF, 6) + MEAN(CLOSE_DF, 12) + MEAN(CLOSE_DF, 24)) / (4 * CLOSE_DF)', 22),
'alpha_53': ('COUNT(CLOSE_DF > DELAY(CLOSE_DF, 1), 12) / 12 * 100', 22),
'alpha_54': ('-1 * RANK((STD1(ABS(CLOSE_DF - OPEN_DF)) + (CLOSE_DF - OPEN_DF)) + CORR(CLOSE_DF, OPEN_DF, 10))', 22),
'alpha_58': ('COUNT(CLOSE_DF > DELAY(CLOSE_DF, 1), 20) / 20 * 100', 22),
'alpha_62': ('(-1 * CORR(HIGH_DF, RANK(VOLUME_DF), 5))', 22),
'alpha_65': ('MEAN(CLOSE_DF, 6) / CLOSE_DF', 22),
'alpha_70': ('STD2(AMOUNT_DF, 6)', 22),
'alpha_71': ('(CLOSE_DF - MEAN(CLOSE_DF, 24)) / MEAN(CLOSE_DF, 24) * 100', 22),
'alpha_76': ('STD2(ABS((CLOSE_DF / DELAY(CLOSE_DF, 1) - 1)) / VOLUME_DF, 20) / MEAN(ABS((CLOSE_DF / DELAY(CLOSE_DF, 1)-  1)) / VOLUME_DF, 20) ', 22),
'alpha_80': ('(VOLUME_DF - DELAY(VOLUME_DF, 5)) / DELAY(VOLUME_DF, 5) * 100', 22),
'alpha_88': ('(CLOSE_DF - DELAY(CLOSE_DF, 20)) / DELAY(CLOSE_DF, 20) * 100', 22),
'alpha_95': ('STD2(AMOUNT_DF, 20)', 22),
'alpha_97': ('STD2(VOLUME_DF, 10)', 22),
'alpha_100': ('STD2(VOLUME_DF, 20)', 22),
'alpha_103': ('((20 - LOWDAY(LOW_DF, 20)) / 20) * 100', 22),
'alpha_104': ('-1 * (DELTA(CORR(HIGH_DF, VOLUME_DF, 5), 5) * RANK(STD2(CLOSE_DF, 20)))', 22),
'alpha_105': ('(-1 * CORR(RANK(OPEN_DF), RANK(VOLUME_DF), 10)) ', 22),
'alpha_106': ('CLOSE_DF - DELAY(CLOSE_DF, 20)', 22)}

#### 运行以下单元格衡量因子整体的覆盖率和IC表现,不要自行修改以下代码

In [19]:
# Display the string form of the dictionary; this is the text that is passed
# to get_needing_basic_factors in the following cells.
str(factor_def_dict)

"{'alpha191_022': ('MEAN(((CLOSE_DF-MEAN(CLOSE_DF,6))/MEAN(CLOSE_DF,6)-DELAY((CLOSE_DF-MEAN(CLOSE_DF,6))/MEAN(CLOSE_DF,6),3)),12)', 22), 'alpha191_023': ('MEAN(((CLOSE_DF-MEAN(CLOSE_DF,6))/MEAN(CLOSE_DF,6)-DELAY((CLOSE_DF-MEAN(CLOSE_DF,6))/MEAN(CLOSE_DF,6),3)),12)', 22), 'alpha191_024': ('MEAN(((CLOSE_DF-MEAN(CLOSE_DF,6))/MEAN(CLOSE_DF,6)-DELAY((CLOSE_DF-MEAN(HIGH_DF,6))/MEAN(CLOSE_DF,6),3)),12)', 22)}"

In [20]:
# Ask the engine which basic data fields the expressions reference; the
# dictionary is handed over in its string form.
basic_factors = get_needing_basic_factors(str(factor_def_dict))
# Drop the last three characters of each name and lower-case the rest
# (presumably stripping a "_DF" suffix, e.g. "CLOSE_DF" -> "close" - confirm).
basic_factors = [x[:-3].lower() for x in basic_factors]
basic_factors

['close', 'high']

In [9]:
# Same field extraction as the previous cell: strip the last three
# characters of each required name and lower-case it.
basic_factors = get_needing_basic_factors(str(factor_def_dict))
basic_factors = [x[:-3].lower() for x in basic_factors]
# Always include 'close': a later cell copies the CLOSE column into 'buy_price'.
if 'close' not in basic_factors:
    basic_factors.append('close')

In [22]:
# Fetch a fixed list of six fields for the date range 20220101-20231230,
# rather than the computed `basic_factors` list.
name_lists = ['open', 'close', 'low', 'high', 'amount', 'volume']
ori_factor = get_list_factors(name_lists,
                              start_date='20220101',
                              end_date = '20231230')

In [23]:
# Cache the fetched data on disk; a later cell reads it back from this file.
ori_factor.to_csv('ori_factor_test.csv')

In [10]:
# ori_factor = get_list_factors(basic_factors, start_date='20220101', end_date = '20231230')
# Reload the cached data from CSV, indexed by (trade_date, ts_code) with
# trade_date parsed as dates.
ori_factor = pd.read_csv('ori_factor_test.csv',
                         parse_dates=['trade_date'],
                         index_col=["trade_date", "ts_code"])
# Upper-case every column name.
ori_factor.columns = [x.upper() for x in ori_factor.columns]
ori_factor = ori_factor.sort_index(ascending=True)
# Add a lower-case 'buy_price' column holding a copy of CLOSE.
ori_factor['buy_price'] = ori_factor['CLOSE']
ori_factor

Unnamed: 0_level_0,Unnamed: 1_level_0,CLOSE,HIGH,BUY_PRICE,buy_price
trade_date,ts_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-01-04,000001.SZ,16.66,16.66,16.66,16.66
2022-01-04,000002.SZ,20.49,20.65,20.49,20.49
2022-01-04,000004.SZ,19.99,20.15,19.99,19.99
2022-01-04,000005.SZ,2.36,2.36,2.36,2.36
2022-01-04,000006.SZ,4.53,4.55,4.53,4.53
...,...,...,...,...,...
2023-11-27,688799.SH,44.64,45.50,44.64,44.64
2023-11-27,688800.SH,44.13,44.97,44.13,44.13
2023-11-27,688819.SH,30.37,30.94,30.37,30.37
2023-11-27,688981.SH,54.61,54.79,54.61,54.61


In [11]:
# ori_factor.to_csv('ori_factor_test.csv')

#### `tqdm.tqdm(factor_def_dict)`
In Python, `tqdm` is a library that provides a fast, extensible progress bar for loops and iterable objects. When you wrap an iterable with `tqdm`, it will display a progress bar that shows the completion status of the iterable as it's being processed.

Here `factor_def_dict` is a dictionary. Passing it to `tqdm.tqdm()` returns an iterable that yields the same items as the dictionary itself — and iterating over a dictionary yields its **keys** — while displaying a progress bar as the loop advances. This is useful when you perform a time-consuming operation for each entry and want to track how far along it is. To loop over keys and values together, wrap `factor_def_dict.items()` instead, as in the example below.

Here's a simple example to illustrate how you might use `tqdm` with a dictionary:

```python
from tqdm import tqdm

# Suppose this is your dictionary
factor_def_dict = {
    'a': 1,
    'b': 2,
    'c': 3,
    # ... potentially many more items
}

# Wrap the dictionary's .items() method with tqdm to create a progress bar
for key, value in tqdm(factor_def_dict.items()):
    # Perform some operation with key and value
    print(f"Processing {key} with value {value}")
    # ... your processing logic here
```

In the above code, the progress bar will update with each iteration over the `.items()` of the dictionary, giving you a visual indication of how many items have been processed and how many remain.

Note that `tqdm` can be used in a variety of ways, not just with dictionaries. It can wrap any iterable, including lists, sets, and generators. Additionally, it has several optional parameters that allow you to customize the appearance and behavior of the progress bar.

In [3]:
# Toy dictionary used only to demonstrate the progress bar. It has its own
# name so that running this cell does not overwrite the real
# `factor_def_dict` that the evaluation cells below depend on.
demo_dict = {
    'a': 1,
    'b': 2,
    'c': 3,
    # ... potentially many more items
}

# Wrap the dictionary's .items() view with tqdm to get a progress bar
for key, value in tqdm.tqdm(demo_dict.items()):
    # Perform some operation with key and value
    print(f"Processing {key} with value {value}")
    # ... your processing logic here

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 24769.51it/s]

Processing a with value 1
Processing b with value 2
Processing c with value 3





In [11]:
from IPython.display import clear_output

# Evaluate every factor expression and keep the second value returned by
# exec_and_eval_exp, keyed by factor name.
factor_eval_dict = {}
for factor_name in tqdm.tqdm(factor_def_dict):
    # Each dictionary value is a (expression, need_days) tuple.
    factor_exp = factor_def_dict[factor_name][0] # type: ignore
    factor_need_days = factor_def_dict[factor_name][1] # type: ignore
    f, d = exec_and_eval_exp(factor_exp=factor_exp,
                            need_days=factor_need_days,
                            quantiles=None,
                            bins=10,
                            original_data=ori_factor,
                            max_loss=0.5)
    factor_eval_dict[factor_name] = d
    # Clear the cell output after each factor, waiting for new output first.
    clear_output(wait=True)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [02:12<00:00, 44.27s/it]


In [12]:
# Display the collected evaluation results (one entry per factor name).
factor_eval_dict

{'alpha191_022': {'tot_loss': 0.011874849251938357,
  'fwdret_loss': 0.011874849251938357,
  'bin_loss': 0.0,
  'quantile_stats':                       min       max      mean       std   count    count %
  factor_quantile                                                           
  1               -0.209784 -0.004911 -0.046663  0.034170    1504   0.071285
  2               -0.163690  0.016455 -0.004102  0.014980   12118   0.574359
  3               -0.130714  0.037917  0.008321  0.008985  146780   6.956959
  4               -0.096418  0.059260  0.013336  0.009659  449198  21.290720
  5               -0.068290  0.081165  0.017549  0.011076  664058  31.474479
  6               -0.039120  0.105852  0.023415  0.012659  513879  24.356417
  7               -0.007890  0.127706  0.029361  0.015021  263307  12.480010
  8                0.012258  0.177118  0.047155  0.016329   49296   2.336492
  9                0.030119  0.193251  0.068490  0.016408    7646   0.362399
  10               0.0484

In [13]:
# Score each factor: mean absolute 'IC Mean', zeroed when tot_loss >= 0.1.
factor_score_list = []
for factor_name in tqdm.tqdm(factor_def_dict):
    tot_loss = factor_eval_dict[factor_name]['tot_loss']
    loss_score = 1 if tot_loss < 0.1 else 0   # factor coverage must exceed 90%
    ic_score = abs(factor_eval_dict[factor_name]['ic_summary_table'].loc['IC Mean', :]).mean()  # factor IC
    factor_score = loss_score * ic_score
    factor_score_list.append(factor_score)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 2004.93it/s]


In [14]:
# Display the per-factor scores, in dictionary order.
factor_score_list

[0.047999999999999994, 0.047999999999999994, 0.047999999999999994]

In [16]:
# Overall score: the sum of factor scores divided by the number of factors,
# with the divisor floored at 30 (fewer than 30 factors still divide by 30).
eval_value = sum(factor_score_list)/(len(factor_def_dict) if len(factor_def_dict) > 30 else 30)
eval_value

0.0048

In [8]:
# Pass/fail check: the overall score (divisor floored at 30, as above) must
# exceed 0.015. Both branches print the same statistics.
# Note: the printed mean divides by len(factor_score_list), not by the
# floored divisor used in the condition, so the two can differ when fewer
# than 30 factors are submitted. An empty list raises ZeroDivisionError.
if sum(factor_score_list) / (len(factor_def_dict) if len(factor_def_dict) > 30 else 30) > 0.015:
    print('你提交的因子表达式字典符合测验要求！')
    print('你提交的因子数量：', len(factor_score_list))
    print('因子的IC均值：', sum(factor_score_list) / len(factor_score_list))
else:
    print('你提交的因子表达式字典不符合测验要求！')
    print('你提交的因子数量：', len(factor_score_list))
    print('因子的IC均值：', sum(factor_score_list) / len(factor_score_list))

你提交的因子表达式字典不符合测验要求！
你提交的因子数量： 3
因子的IC均值： 0.047999999999999994
