In [1]:
import os
import yaml
import pandas as pd

from utility import set_random_seed
from data import load_symbol_dfs
from combination import AlphaCombinationModel
from tokenizer import AlphaTokenizer
from alpha_generation_env import AlphaGenerationEnv
from generator import RLAlphaGenerator
from glob import glob
import warnings
warnings.filterwarnings("ignore")

# Active configuration file. "configs/config.yaml" used to be assigned here and
# was overwritten on the very next line, so only the trial config is ever read.
config_path = "configs/trial_config.yaml"

# yaml.safe_load parses plain YAML data without constructing arbitrary objects.
with open(config_path, "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

# Apply the configured seed, falling back to 42 when "random_seed" is missing.
set_random_seed(cfg.get("random_seed", 42))

# Config sub-sections referenced by the cells below.
gen_cfg = cfg["generator"]
data_cfg = cfg["data"]
envs_cfg = cfg["env"]

In [2]:
# Load data

# Unpack the "data" section of the config into the arguments used below.
n = data_cfg["n"]
symbol_dict = data_cfg["symbol"]
path = data_cfg["path"]
start_date = data_cfg["date_range"][0]
end_date = data_cfg["date_range"][1]

# The result is indexed by key below; presumably one entry per symbol —
# confirm against data.load_symbol_dfs.
data_dfs = load_symbol_dfs(
    directory=path, symbols=symbol_dict,
    start_date=start_date, end_date=end_date,
    n=n, base_fields=envs_cfg["base_fields"],
)

# Only the pieces stored under the "rb" key are concatenated and used from here on.
data = pd.concat(data_dfs["rb"])

In [3]:
# Tokenizer (used here to render each expression as infix text) and the
# combination model that evaluates every expression on `data`.
alphatokenizer = AlphaTokenizer(base_fields=envs_cfg["base_fields"])
combo = AlphaCombinationModel(max_pool_size=cfg["model"]["max_pool_size"])
combo.inject_data(data, target_col=data_cfg["target_col"])

# Expressions read from the CSV, ordered by the `ic` column, largest first.
results = pd.read_csv("output/alphas_weights.csv")
results = results.sort_values("ic", ascending=False)

alphas_list = []

for expr in results["expr"]:
    # `raw`, `norm` and `ic` keep these names on purpose: later cells display
    # whatever the final iteration leaves behind.
    # NOTE(review): _compute_alpha_and_ic is a private method of the model.
    raw, norm, ic = combo._compute_alpha_and_ic(expr, in_pool=False)
    alphas_list.append(norm)

    infix_expr = alphatokenizer.rpn_to_infix(expr)
    print(f"infix_expr: {infix_expr:<60}  ic = {ic:>6.3f}")

infix_expr: signed_log((askSize1 / (20 + bidSize1)))                      ic =  0.235
infix_expr: (signed_log(neg(bidSize1)) + signed_log(askSize1))            ic =  0.233
infix_expr: signed_log((5 * (askSize1 / bidSize1)))                       ic =  0.233
infix_expr: signed_log(((3 - bidSize1) / askSize1))                       ic =  0.226
infix_expr: signed_sqrt((10 * (askSize1 - bidSize1)))                     ic =  0.221
infix_expr: signed_sqrt(((0.5 - bidSize1) + askSize1))                    ic =  0.221
infix_expr: signed_sqrt((askSize1 + (3 - bidSize1)))                      ic =  0.221
infix_expr: (mid * signed_sqrt((askSize1 - bidSize1)))                    ic =  0.221
infix_expr: (abs_(askSize1) + (market_hit_bid - bidSize1))                ic =  0.216
infix_expr: (ask1 + (abs_(askSize1) - bidSize1))                          ic =  0.214
infix_expr: (askSize1 + (ask1 + neg(bidSize1)))                           ic =  0.214
infix_expr: ((askSize1 + abs_(mid)) - bidSize1)       

In [4]:
import pandas as pd

# --- Correlation matrix of the collected alphas -------------------------------
# One readable label per entry of `alphas_list`, in list order.
alpha_names = [f"alpha_{idx}" for idx, _ in enumerate(alphas_list)]

# Place the series side by side (they are assumed to share an index) and label
# the resulting columns.
df_alphas = pd.concat(alphas_list, axis=1)
df_alphas.columns = alpha_names

# Pairwise Pearson correlation between the alpha columns (pandas' default method).
corr_matrix = df_alphas.corr(method="pearson")

In [5]:
import numpy as np

# Boolean mask of the strict upper triangle (k=1 leaves out the diagonal), so
# each unordered pair of alphas is looked at exactly once.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)

# where() turns the unselected cells into NaN; stack() flattens the rest into a
# Series keyed by (alpha_i, alpha_j), relying on its default NaN-dropping.
corr_pairs = corr_matrix.where(mask).stack()

# Most negative correlation and the (alpha_i, alpha_j) pair that produces it.
min_value = corr_pairs.min()
min_pair = corr_pairs.idxmin()

print(f"最小相关系数的组合：{min_pair}，相关系数 = {min_value:.4f}")

# Pair whose correlation is closest to zero; the reported value keeps its sign.
abs_corr_pairs = corr_pairs.abs()
min_abs_pair = abs_corr_pairs.idxmin()
min_abs_value = corr_pairs.loc[min_abs_pair]

print(f"绝对值最小（最不相关）的组合：{min_abs_pair}，相关系数 = {min_abs_value:.4f}")


最小相关系数的组合：('alpha_13', 'alpha_39')，相关系数 = -1.0000
绝对值最小（最不相关）的组合：('alpha_7', 'alpha_46')，相关系数 = -0.7486


In [11]:
from operators import signed_log
import numpy as np

df = data.copy()

# signed_log of the ask/bid size ratio (bid size offset by 20). The original
# one-liner evaluated this expression three times; compute it once and reuse it.
size_ratio_log = signed_log(df.askSize1 / (df.bidSize1 + 20))

# Rolling z-score over a 10-row window: (value - rolling mean) / rolling std.
# The first 9 rows are NaN because the window is not yet full.
size_ratio_window = size_ratio_log.rolling(10)
df["factor"] = (size_ratio_log - size_ratio_window.mean()) / size_ratio_window.std()

# Correlation of the factor with the target column (displayed as the cell output).
df.factor.corr(df.target)

np.float64(0.18843607496073192)

In [17]:
from sklearn.linear_model import LinearRegression

# Keep only rows without any NaN (in any column), then rebind `df` to the result.
df = df.copy().dropna()

# Single-feature design matrix of shape (n_rows, 1); target scaled by 10000.
X = df[["factor"]].to_numpy()
y = df["target"].to_numpy() * 10000

# Least-squares fit through the origin (no intercept term).
model = LinearRegression(fit_intercept=False).fit(X, y)

# NOTE(review): LinearRegression.score measures R² against the mean of y even
# when fit_intercept=False, so this is not the uncentred no-intercept R².
r2_no_intercept = model.score(X, y)
print("No-intercept R²:", r2_no_intercept)


No-intercept R²: 0.04216920835467319


In [25]:
# Display `norm` as currently held by the kernel. It is assigned inside the
# alpha-evaluation loop, so this is presumably the value from the last
# expression evaluated — confirm, since the cells were executed out of order.
norm

0          0.278383
1         -0.967354
2         -1.744839
3          1.653500
4         -1.345688
             ...   
1217881   -1.744839
1217882   -1.744839
1217883    1.766496
1217884    0.602358
1217885    2.062698
Length: 1217886, dtype: float64

In [18]:
# Display `raw` as currently held by the kernel (assigned in the same loop as `norm`).
# NOTE(review): the recorded outputs show `raw` with 2424053 elements but `norm`
# with 1217886 rows — confirm whether normalization drops rows or the two
# outputs come from different runs.
raw

array([ 0.17185026, -1.25276297, -1.84305276, ..., -5.86220971,
        0.73122569,  0.77175104], shape=(2424053,))