In [18]:
import json
import os
from typing import Optional, Tuple, List
from datetime import datetime
from pathlib import Path
from openai import OpenAI
import fire
import pandas as pd

import numpy as np
from sb3_contrib.ppo_mask import MaskablePPO
from stable_baselines3.common.callbacks import BaseCallback

from alphagen.data.expression import *
from alphagen.data.parser import ExpressionParser
from alphagen.models.linear_alpha_pool import LinearAlphaPool, MseAlphaPool
from alphagen.rl.env.wrapper import AlphaEnv
from alphagen.rl.policy import LSTMSharedNet
from alphagen.utils import reseed_everything, get_logger
from alphagen.rl.env.core import AlphaEnvCore
from alphagen_qlib.calculator import QLibStockDataCalculator
from alphagen_qlib.stock_data import initialize_qlib
from alphagen_llm.client import ChatClient, OpenAIClient, ChatConfig
from alphagen_llm.prompts.system_prompt import EXPLAIN_WITH_TEXT_DESC
from alphagen_llm.prompts.interaction import InterativeSession, DefaultInteraction

In [1]:
import qlib
from qlib.data import D

# Initialise qlib against the local US data bundle, then fetch daily
# open/close/high/low/vwap for every instrument in the "sp500" universe.
qlib.init(
    provider_uri="~/.qlib/qlib_data/us_data_update",
    region="us",
)
df = D.features(
    instruments=D.instruments("sp500"),
    fields=["$open", "$close", "$high", "$low", "$vwap"],
    freq="day",
)
# Single-symbol variant kept for reference:
# df = D.features(["^GSPC"], ["$open","$close","$high","$low","$vwap"], freq="day")
df

[41028:MainThread](2025-09-02 09:02:26,384) INFO - qlib.Initialization - [config.py:420] - default_conf: client.
[41028:MainThread](2025-09-02 09:02:31,663) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[41028:MainThread](2025-09-02 09:02:31,664) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': WindowsPath('C:/Users/tywat/.qlib/qlib_data/us_data_update')}


Unnamed: 0_level_0,Unnamed: 1_level_0,$open,$close,$high,$low,$vwap
instrument,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A,2020-11-10,1.023775,1.000000,1.024592,0.989020,1.037173
A,2020-11-11,1.008530,0.992831,1.014428,0.987659,1.034228
A,2020-11-12,0.992740,0.993103,1.007350,0.988203,1.032996
A,2020-11-13,1.002359,0.999728,1.012160,0.996189,1.033480
A,2020-11-16,1.007259,1.000635,1.011706,0.993376,1.033681
...,...,...,...,...,...,...
ZTS,2025-01-10,1.038854,1.037076,1.047744,1.034282,1.126193
ZTS,2025-01-13,1.038537,1.056126,1.059618,1.033012,1.126123
ZTS,2025-01-14,1.056761,1.043998,1.061333,1.026027,1.126003
ZTS,2025-01-15,1.054729,1.061523,1.066667,1.050220,1.125926


In [3]:
import os
import shutil
import pandas as pd

# Path to your source CSVs
data_path = os.path.expanduser("~/.qlib/stock_data/source/us_data_update")

# Columns needed to build the typical-price VWAP approximation.
required_cols = {"high", "low", "close", "volume"}

# WARNING: every CSV is rewritten IN PLACE and no backup copy is made.
# Copy the directory beforehand if the original files must be preserved.
n_updated = 0
n_skipped = 0

for filename in os.listdir(data_path):
    if not filename.endswith(".csv"):
        continue

    file_path = os.path.join(data_path, filename)
    print(f"Processing {file_path}...")

    # Read CSV
    df = pd.read_csv(file_path)

    # Check required columns
    if not required_cols.issubset(df.columns):
        print(f"⚠️ Skipping {filename}, missing columns")
        n_skipped += 1
        continue

    # VWAP approximation: running sum of (typical price * volume) divided by the
    # running sum of volume, accumulated over ALL rows of the file in file order.
    # NOTE(review): assumes rows are already sorted chronologically -- confirm.
    typical_price = (df["high"] + df["low"] + df["close"]) / 3
    df["vwap"] = (typical_price * df["volume"]).cumsum() / df["volume"].cumsum()
    # Leading zero-volume rows give 0/0 = NaN; later gaps are forward-filled.
    df["vwap"] = df["vwap"].ffill().astype(float)

    # Save back (overwrite)
    df.to_csv(file_path, index=False)
    n_updated += 1

print(
    f"✅ Finished adding VWAP: {n_updated} file(s) overwritten in place, "
    f"{n_skipped} skipped. No backup copies were made."
)


Processing C:\Users\tywat/.qlib/stock_data/source/us_data_update\A.csv...
Processing C:\Users\tywat/.qlib/stock_data/source/us_data_update\AA.csv...
Processing C:\Users\tywat/.qlib/stock_data/source/us_data_update\AAA.csv...
Processing C:\Users\tywat/.qlib/stock_data/source/us_data_update\AAAU.csv...
Processing C:\Users\tywat/.qlib/stock_data/source/us_data_update\AACG.csv...
Processing C:\Users\tywat/.qlib/stock_data/source/us_data_update\AACT-UN.csv...
Processing C:\Users\tywat/.qlib/stock_data/source/us_data_update\AACT.csv...
Processing C:\Users\tywat/.qlib/stock_data/source/us_data_update\AADR.csv...
Processing C:\Users\tywat/.qlib/stock_data/source/us_data_update\AAL.csv...
Processing C:\Users\tywat/.qlib/stock_data/source/us_data_update\AAM-UN.csv...
Processing C:\Users\tywat/.qlib/stock_data/source/us_data_update\AAM.csv...
Processing C:\Users\tywat/.qlib/stock_data/source/us_data_update\AAME.csv...
Processing C:\Users\tywat/.qlib/stock_data/source/us_data_update\AAMI.csv...
Pr

In [2]:
# Imported in this cell so it does not depend on names leaking in from a
# wildcard import or from a cell that appears later in the notebook.
import torch
from alphagen_qlib.stock_data import StockData

instruments: str = "csi300"
# Prefer the first GPU, but fall back to CPU so the cell still runs on
# machines without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def get_dataset(start: str, end: str) -> StockData:
    """Build a StockData for the notebook's instrument universe over [start, end].

    Reads the module-level `instruments` and `device` at call time, so rebinding
    either name in another cell changes what later calls return.

    Args:
        start: first date of the window, passed through as `start_time`.
        end: last date of the window, passed through as `end_time`.
    """
    return StockData(
        instrument=instruments,
        start_time=start,
        end_time=end,
        device=device
    )

# (start, end) windows; index 0 is the long 2012-2019 window, the rest are
# half-year windows from 2022-01 to 2023-06.
segments = [
    ("2012-01-01", "2019-12-31"),
    ("2022-01-01", "2022-06-30"),
    ("2022-07-01", "2022-12-31"),
    ("2023-01-01", "2023-06-30")
]


# One StockData per segment, in the same order as `segments`.
datasets = [get_dataset(*s) for s in segments]

[23512:MainThread](2025-04-24 12:33:13,832) INFO - qlib.Initialization - [config.py:420] - default_conf: client.
[23512:MainThread](2025-04-24 12:33:14,950) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[23512:MainThread](2025-04-24 12:33:14,952) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': WindowsPath('C:/Users/tywat/.qlib/qlib_data/cn_data')}


In [3]:
# Target expression: close price referenced at offset -20 divided by the current
# close, minus 1. NOTE(review): presumably the 20-day forward return -- confirm
# the sign convention of Ref.
close = Feature(FeatureType.CLOSE)
target = Ref(close, -20) / close - 1
# One calculator per dataset (same order as `datasets`), all sharing `target`.
calculators = [QLibStockDataCalculator(d, target) for d in datasets]

In [4]:
from alphagen.data.expression import Operators
from alphagen.data.parser import ExpressionParser

def load_linear_alpha_pool_from_json(json_path: str, 
                                     calculator: QLibStockDataCalculator,
                                     single_alpha: bool = False) -> LinearAlphaPool | list[LinearAlphaPool]:
    """Rebuild alpha pool(s) from a saved pool JSON file.

    The JSON document must contain an ``'exprs'`` list of expression strings and
    a ``'weights'`` list of the same length.

    Args:
        json_path: Path of the pool JSON file.
        calculator: Calculator handed to every ``MseAlphaPool`` that is created.
        single_alpha: When False (default) return one pool holding all
            expressions with their weights. When True return a list with one
            capacity-1 pool per expression, in file order.

    Returns:
        A single ``MseAlphaPool`` or a list of them, depending on ``single_alpha``.

    Raises:
        ValueError: If the numbers of expressions and weights differ.
    """
    with open(json_path, 'r') as f:
        pool_data = json.load(f)

    expressions = pool_data['exprs']
    weights = pool_data['weights']
    # zip() would silently drop the surplus entries, so fail loudly instead.
    if len(expressions) != len(weights):
        raise ValueError(
            f"{json_path}: {len(expressions)} expressions but {len(weights)} weights"
        )

    parser = ExpressionParser(Operators)
    parsed_exprs = [parser.parse(expression) for expression in expressions]

    if single_alpha:
        # One independent pool per expression; no combined pool is built here.
        single_pools = []
        for expr, weight in zip(parsed_exprs, weights):
            pool = MseAlphaPool(capacity=1, calculator=calculator)
            pool.force_load_exprs([expr], [weight])
            single_pools.append(pool)
        return single_pools

    # Capacity is sized to exactly fit the saved expressions.
    combined_pool = MseAlphaPool(capacity=len(parsed_exprs), calculator=calculator)
    combined_pool.force_load_exprs(parsed_exprs, weights)
    return combined_pool

# Same checkpoint loaded twice against calculators[1]. Mind the names:
#   alpha_pools -> ONE pool holding every expression (single_alpha defaults to False)
#   alpha_pool  -> a LIST of single-expression pools (single_alpha=True)
alpha_pools = load_linear_alpha_pool_from_json('out/results/csi300_20_0_20250208124320_rl/251904_steps_pool.json', calculators[1])
alpha_pool = load_linear_alpha_pool_from_json('out/results/csi300_20_0_20250208124320_rl/251904_steps_pool.json', calculators[1], single_alpha=True)

In [5]:
# Evaluate the combined pool (`alpha_pools` is a single pool) on calculators[2]
# and print its expressions followed by the two returned metrics.
ic_value, rank_ic_value = alpha_pools.test_ensemble(calculators[2])
print(alpha_pools.exprs)
print(ic_value, rank_ic_value)

[Greater(Div(Div(-1.0,$high),EMA($open,10d)),-2.0), Delta(Log($vwap),1d), Mul($volume,Mul(Cov($close,Mul(5.0,Min(Mul($high,-30.0),40d)),40d),-0.01)), Sum(Mul(Corr(Div($vwap,-0.5),$close,5d),-10.0),10d), Abs(Sub(2.0,Div($close,Add(Greater(2.0,Delta(Log($low),5d)),30.0)))), Mad(Add(2.0,Mean($vwap,20d)),10d), Corr($close,$low,10d), Abs(Log(Mad(Sub(-0.5,$close),20d))), Mad(Log(Log($volume)),40d), Mul(0.5,Corr(Log($volume),WMA(Log($volume),40d),40d)), Mul(Mul($volume,Mul(Add(Mean($high,20d),30.0),$high)),0.5), Mul(WMA(Log(Abs(Var($low,5d))),20d),-2.0), Abs(Mul(5.0,Sub($open,30.0))), Mean(Less(Sub(-2.0,Corr($volume,$high,20d)),1.0),10d), Sub(Less(1.0,$low),5.0), Add(Corr(Sub(-1.0,$high),$volume,10d),0.01), WMA(Div(Std(WMA(Div(Div($vwap,30.0),$low),40d),20d),-5.0),10d), WMA(Sub(-1.0,Div($low,$close)),20d), Less(Div($close,$vwap),$volume), Sub(Mad(Mean(Log($low),20d),40d),5.0), None]
0.06614601612091064 0.0644562840461731


In [6]:
# Position of the single-expression pool to inspect within the `alpha_pool` list.
alpha_index = 3

# Evaluate just that one pool on calculators[2]; overwrites ic_value / rank_ic_value.
ic_value, rank_ic_value = alpha_pool[alpha_index].test_ensemble(calculators[2])
print(alpha_pool[alpha_index].exprs)
print(ic_value, rank_ic_value)

[Sum(Mul(Corr(Div($vwap,-0.5),$close,5d),-10.0),10d), None]
0.010267447680234909 0.010892813093960285


In [7]:
# `alpha_pool` is the list of single-expression pools; print each pool's exprs.
for alpha in alpha_pool:
    print(alpha.exprs)

[Greater(Div(Div(-1.0,$high),EMA($open,10d)),-2.0), None]
[Delta(Log($vwap),1d), None]
[Mul($volume,Mul(Cov($close,Mul(5.0,Min(Mul($high,-30.0),40d)),40d),-0.01)), None]
[Sum(Mul(Corr(Div($vwap,-0.5),$close,5d),-10.0),10d), None]
[Abs(Sub(2.0,Div($close,Add(Greater(2.0,Delta(Log($low),5d)),30.0)))), None]
[Mad(Add(2.0,Mean($vwap,20d)),10d), None]
[Corr($close,$low,10d), None]
[Abs(Log(Mad(Sub(-0.5,$close),20d))), None]
[Mad(Log(Log($volume)),40d), None]
[Mul(0.5,Corr(Log($volume),WMA(Log($volume),40d),40d)), None]
[Mul(Mul($volume,Mul(Add(Mean($high,20d),30.0),$high)),0.5), None]
[Mul(WMA(Log(Abs(Var($low,5d))),20d),-2.0), None]
[Abs(Mul(5.0,Sub($open,30.0))), None]
[Mean(Less(Sub(-2.0,Corr($volume,$high,20d)),1.0),10d), None]
[Sub(Less(1.0,$low),5.0), None]
[Add(Corr(Sub(-1.0,$high),$volume,10d),0.01), None]
[WMA(Div(Std(WMA(Div(Div($vwap,30.0),$low),40d),20d),-5.0),10d), None]
[WMA(Sub(-1.0,Div($low,$close)),20d), None]
[Less(Div($close,$vwap),$volume), None]
[Sub(Mad(Mean(Log($low),

In [8]:
# Score every single-expression pool on calculators[2] and tabulate the two
# metrics returned by test_ensemble next to the pool's expressions.
alphas, ics, rank_ics = [], [], []

for alpha in alpha_pool:
    ic_value, rank_ic_value = alpha.test_ensemble(calculators[2])
    ics += [ic_value]
    rank_ics += [rank_ic_value]
    alphas += [alpha.exprs]

df_ic_ind = pd.DataFrame(dict(alpha=alphas, ic=ics, rank_ic=rank_ics))
df_ic_ind

Unnamed: 0,alpha,ic,rank_ic
0,"[Greater(Div(Div(-1.0,$high),EMA($open,10d)),-...",0.055708,0.084874
1,"[Delta(Log($vwap),1d), None]",-0.02456,-0.012933
2,"[Mul($volume,Mul(Cov($close,Mul(5.0,Min(Mul($h...",-0.036502,-0.035545
3,"[Sum(Mul(Corr(Div($vwap,-0.5),$close,5d),-10.0...",0.010267,0.010893
4,"[Abs(Sub(2.0,Div($close,Add(Greater(2.0,Delta(...",-0.061593,-0.092103
5,"[Mad(Add(2.0,Mean($vwap,20d)),10d), None]",-0.006636,0.043511
6,"[Corr($close,$low,10d), None]",0.056122,0.063845
7,"[Abs(Log(Mad(Sub(-0.5,$close),20d))), None]",-0.081099,-0.097603
8,"[Mad(Log(Log($volume)),40d), None]",-0.025405,-0.042726
9,"[Mul(0.5,Corr(Log($volume),WMA(Log($volume),40...",-0.040783,-0.046877


In [9]:
# Load the "40.json" report of every GP run under out/gp. Each run directory is
# expected to be named after its integer seed; anything else is skipped.
gp_reports = {}
for p in sorted(Path("out/gp").iterdir()):
    if not p.name.isdigit():
        continue
    report_file = p / "40.json"
    if not report_file.is_file():
        continue
    with open(report_file) as f:
        gp_reports[int(p.name)] = json.load(f)

if not gp_reports:
    raise FileNotFoundError("no out/gp/<seed>/40.json report found")

# Inspect one run, chosen deterministically (highest seed) rather than whatever
# directory entry the filesystem happened to list last.
seed = max(gp_reports)
report = gp_reports[seed]


state = report["res"]["res"]["pool_state"]
state["exprs"]

['Std(EMA(Min(Mul(5.0,$high),30d),50d),10d)',
 'Std(EMA(Min(Log($vwap),30d),40d),10d)',
 'Sum(Mean(Abs(Corr($low,$high,20d)),40d),20d)',
 'Std(Med(Min(Mul(5.0,$high),30d),10d),10d)',
 'Mad(Min($low,20d),20d)',
 'Std(Std(Min(Mul($vwap,2.0),30d),10d),10d)',
 'Mad(Med($close,50d),10d)',
 'Std(Cov(Corr(Var($volume,40d),$high,20d),$close,30d),10d)',
 'Std(Med(Ref(Mul(10.0,$high),30d),10d),10d)',
 'Std(Min(Sum(Mul(10.0,$high),40d),50d),10d)',
 'Mad(Min($high,30d),10d)',
 'Std(Std(Med(Mul(0.5,$high),20d),20d),10d)',
 'Mad(Ref(Min($high,30d),10d),10d)',
 'Std(Abs(WMA(Cov(0.01,$high,50d),10d)),10d)',
 'Std(Min(WMA($high,20d),50d),10d)',
 'Log(Var(Sum($low,30d),40d))',
 'Std(EMA(Min(Mul(Std($high,10d),$high),30d),50d),10d)',
 'Std(Max(Min(Mul(5.0,$high),30d),20d),10d)',
 'Std(Min(Mean(Corr(5.0,$high,30d),50d),10d),10d)',
 'Std(EMA(WMA($vwap,10d),40d),10d)']

# main

In [36]:
# Sub-directory of out/backtests/ that every report/graph path below is built from.
ex_num = "us"

In [37]:
import pickle

# Pickled figure of the GP backtest (file prefix 2) for the selected experiment.
file_path = f'out/backtests/{ex_num}/gp/2-graph.pkl'

# NOTE(review): pickle.load can execute arbitrary code -- only open files this project produced.
with open(file_path, 'rb') as file:
    chart = pickle.load(file)
chart.show()

In [38]:
import pickle

# Pickled backtest report of the GP run (file prefix 2; the other strategies below use prefix 0).
file_path = f'out/backtests/{ex_num}/gp/2-report.pkl'

# NOTE(review): pickle.load can execute arbitrary code -- only open files this project produced.
with open(file_path, 'rb') as file:
    gp_report = pickle.load(file)
gp_report

Unnamed: 0_level_0,account,return,total_turnover,turnover,total_cost,cost,value,cash,bench
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-01-18,1.000000e+08,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000e+00,1.000000e+08,-0.018388
2022-01-19,9.985750e+07,1.752051e-16,9.499733e+07,0.949973,1.424960e+05,0.001425,9.499733e+07,4.860177e+06,-0.009690
2022-01-20,9.833080e+07,-1.494575e-02,1.178339e+08,0.228691,1.767508e+05,0.000343,9.764108e+07,6.897264e+05,-0.011037
2022-01-21,9.679967e+07,-1.528833e-02,1.363841e+08,0.188652,2.045762e+05,0.000283,9.632043e+07,4.792390e+05,-0.018915
2022-01-24,9.769176e+07,9.502685e-03,1.548913e+08,0.191191,2.323370e+05,0.000287,9.721899e+07,4.727684e+05,0.002772
...,...,...,...,...,...,...,...,...,...
2023-12-22,8.759797e+07,4.067562e-03,7.674675e+09,0.198833,1.151201e+07,0.000298,8.715507e+07,4.429039e+05,0.001660
2023-12-26,8.825846e+07,7.723604e-03,7.685395e+09,0.122381,1.152809e+07,0.000184,8.798041e+07,2.780503e+05,0.004232
2023-12-27,8.827953e+07,4.837023e-04,7.699812e+09,0.163343,1.154972e+07,0.000245,8.791321e+07,3.663143e+05,0.001431
2023-12-28,8.812663e+07,-1.436713e-03,7.717190e+09,0.196858,1.157579e+07,0.000295,8.768443e+07,4.421973e+05,0.000370


In [39]:
import pickle

# Pickled figure stored under the rl/ backtest directory for the selected experiment.
file_path = f'out/backtests/{ex_num}/rl/0-graph.pkl'

# NOTE(review): pickle.load can execute arbitrary code -- only open files this project produced.
with open(file_path, 'rb') as file:
    chart = pickle.load(file)
chart.show()

In [40]:
import pickle

# Pickled backtest report stored under rl/; kept as `alphaGen_report` for the comparison below.
file_path = f'out/backtests/{ex_num}/rl/0-report.pkl'

# NOTE(review): pickle.load can execute arbitrary code -- only open files this project produced.
with open(file_path, 'rb') as file:
    alphaGen_report = pickle.load(file)
alphaGen_report

Unnamed: 0_level_0,account,return,total_turnover,turnover,total_cost,cost,value,cash,bench
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-01-18,1.000000e+08,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000e+00,1.000000e+08,-0.018388
2022-01-19,9.985751e+07,-3.489549e-16,9.499603e+07,0.949960,1.424940e+05,0.001425,9.499603e+07,4.861473e+06,-0.009690
2022-01-20,9.841425e+07,-1.410991e-02,1.178467e+08,0.228833,1.767701e+05,0.000343,9.772334e+07,6.909098e+05,-0.011037
2022-01-21,9.626600e+07,-2.154409e-02,1.365190e+08,0.189731,2.047785e+05,0.000285,9.578397e+07,4.820240e+05,-0.018915
2022-01-24,9.756858e+07,1.385602e-02,1.573726e+08,0.216625,2.360589e+05,0.000325,9.703732e+07,5.312563e+05,0.002772
...,...,...,...,...,...,...,...,...,...
2023-12-22,8.697957e+07,2.245214e-03,8.667073e+09,0.195772,1.300061e+07,0.000294,8.654544e+07,4.341278e+05,0.001660
2023-12-26,8.739349e+07,5.065340e-03,8.684848e+09,0.204366,1.302727e+07,0.000307,8.693999e+07,4.535016e+05,0.004232
2023-12-27,8.726507e+07,-1.177316e-03,8.701867e+09,0.194735,1.305280e+07,0.000292,8.682995e+07,4.351245e+05,0.001431
2023-12-28,8.705163e+07,-2.147457e-03,8.719230e+09,0.198966,1.307884e+07,0.000298,8.660838e+07,4.432473e+05,0.000370


In [41]:
# Adds a column to the loaded report in place: running SUM of the daily `return`
# column (additive, not compounded).
alphaGen_report["cum_return"] = alphaGen_report["return"].cumsum()
alphaGen_report

Unnamed: 0_level_0,account,return,total_turnover,turnover,total_cost,cost,value,cash,bench,cum_return
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-01-18,1.000000e+08,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000e+00,1.000000e+08,-0.018388,0.000000e+00
2022-01-19,9.985751e+07,-3.489549e-16,9.499603e+07,0.949960,1.424940e+05,0.001425,9.499603e+07,4.861473e+06,-0.009690,-3.489549e-16
2022-01-20,9.841425e+07,-1.410991e-02,1.178467e+08,0.228833,1.767701e+05,0.000343,9.772334e+07,6.909098e+05,-0.011037,-1.410991e-02
2022-01-21,9.626600e+07,-2.154409e-02,1.365190e+08,0.189731,2.047785e+05,0.000285,9.578397e+07,4.820240e+05,-0.018915,-3.565400e-02
2022-01-24,9.756858e+07,1.385602e-02,1.573726e+08,0.216625,2.360589e+05,0.000325,9.703732e+07,5.312563e+05,0.002772,-2.179799e-02
...,...,...,...,...,...,...,...,...,...,...
2023-12-22,8.697957e+07,2.245214e-03,8.667073e+09,0.195772,1.300061e+07,0.000294,8.654544e+07,4.341278e+05,0.001660,6.000837e-02
2023-12-26,8.739349e+07,5.065340e-03,8.684848e+09,0.204366,1.302727e+07,0.000307,8.693999e+07,4.535016e+05,0.004232,6.507371e-02
2023-12-27,8.726507e+07,-1.177316e-03,8.701867e+09,0.194735,1.305280e+07,0.000292,8.682995e+07,4.351245e+05,0.001431,6.389640e-02
2023-12-28,8.705163e+07,-2.147457e-03,8.719230e+09,0.198966,1.307884e+07,0.000298,8.660838e+07,4.432473e+05,0.000370,6.174894e-02


In [42]:
import pickle

# Pickled figure stored under the boot/ backtest directory for the selected experiment.
file_path = f'out/backtests/{ex_num}/boot/0-graph.pkl'

# NOTE(review): pickle.load can execute arbitrary code -- only open files this project produced.
with open(file_path, 'rb') as file:
    chart = pickle.load(file)
chart.show()

In [43]:
import pickle

# Pickled backtest report stored under boot/; its `bench` column is also used as the benchmark curve later.
file_path = f'out/backtests/{ex_num}/boot/0-report.pkl'

# NOTE(review): pickle.load can execute arbitrary code -- only open files this project produced.
with open(file_path, 'rb') as file:
    boot_report = pickle.load(file)
boot_report

Unnamed: 0_level_0,account,return,total_turnover,turnover,total_cost,cost,value,cash,bench
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-01-18,1.000000e+08,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000e+00,1.000000e+08,-0.018388
2022-01-19,9.985750e+07,-4.627509e-17,9.499733e+07,0.949973,1.424960e+05,0.001425,9.499733e+07,4.860175e+06,-0.009690
2022-01-20,9.837516e+07,-1.450253e-02,1.177693e+08,0.228044,1.766539e+05,0.000342,9.768672e+07,6.884395e+05,-0.011037
2022-01-21,9.682040e+07,-1.547771e-02,1.391951e+08,0.217797,2.087927e+05,0.000327,9.626963e+07,5.507707e+05,-0.018915
2022-01-24,9.748572e+07,7.197674e-03,1.602353e+08,0.217311,2.403529e+05,0.000326,9.694793e+07,5.377937e+05,0.002772
...,...,...,...,...,...,...,...,...,...
2023-12-22,8.174414e+07,-1.857880e-03,8.287607e+09,0.200017,1.243141e+07,0.000300,8.132565e+07,4.184970e+05,0.001660
2023-12-26,8.222712e+07,6.211700e-03,8.304137e+09,0.202212,1.245621e+07,0.000303,8.180507e+07,4.220486e+05,0.004232
2023-12-27,8.220556e+07,2.887550e-05,8.320095e+09,0.194073,1.248014e+07,0.000291,8.179750e+07,4.080503e+05,0.001431
2023-12-28,8.266912e+07,5.925901e-03,8.335814e+09,0.191215,1.250372e+07,0.000287,8.226737e+07,4.017499e+05,0.000370


In [44]:
import pickle

# Pickled figure stored under the mcts/ backtest directory for the selected experiment.
file_path = f'out/backtests/{ex_num}/mcts/0-graph.pkl'

# NOTE(review): pickle.load can execute arbitrary code -- only open files this project produced.
with open(file_path, 'rb') as file:
    chart = pickle.load(file)
chart.show()

In [45]:
import pickle

# Pickled backtest report stored under mcts/; kept as `riskminer_report`
# (labelled "MCTS" in the return comparison and "RiskMiner" in the RMSE table).
file_path = f'out/backtests/{ex_num}/mcts/0-report.pkl'

# NOTE(review): pickle.load can execute arbitrary code -- only open files this project produced.
with open(file_path, 'rb') as file:
    riskminer_report = pickle.load(file)
riskminer_report

Unnamed: 0_level_0,account,return,total_turnover,turnover,total_cost,cost,value,cash,bench
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-01-18,1.000000e+08,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000e+00,1.000000e+08,-0.018388
2022-01-19,9.985751e+07,-2.887100e-16,9.499526e+07,0.949953,1.424929e+05,0.001425,9.499526e+07,4.862251e+06,-0.009690
2022-01-20,9.890479e+07,-9.196745e-03,1.178974e+08,0.229349,1.768462e+05,0.000344,9.821343e+07,6.913636e+05,-0.011037
2022-01-21,9.741589e+07,-1.471841e-02,1.400196e+08,0.223671,2.100294e+05,0.000336,9.684784e+07,5.680437e+05,-0.018915
2022-01-24,9.826339e+07,8.997998e-03,1.593843e+08,0.198784,2.390765e+05,0.000298,9.776633e+07,4.970539e+05,0.002772
...,...,...,...,...,...,...,...,...,...
2023-12-22,9.116059e+07,1.921785e-03,8.657150e+09,0.200709,1.298572e+07,0.000301,9.069380e+07,4.667910e+05,0.001660
2023-12-26,9.150410e+07,4.064130e-03,8.675138e+09,0.197322,1.301271e+07,0.000296,9.104438e+07,4.597144e+05,0.004232
2023-12-27,9.151439e+07,4.115709e-04,8.693382e+09,0.199376,1.304007e+07,0.000299,9.104863e+07,4.657612e+05,0.001431
2023-12-28,9.161266e+07,1.354008e-03,8.710475e+09,0.186788,1.306571e+07,0.000280,9.117535e+07,4.373153e+05,0.000370


In [46]:
import pickle

# Pickled backtest report stored under emcts/; kept as `eminer_report`.
file_path = f'out/backtests/{ex_num}/emcts/0-report.pkl'

# NOTE(review): pickle.load can execute arbitrary code -- only open files this project produced.
with open(file_path, 'rb') as file:
    eminer_report = pickle.load(file)
eminer_report

Unnamed: 0_level_0,account,return,total_turnover,turnover,total_cost,cost,value,cash,bench
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-01-18,1.000000e+08,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000e+00,1.000000e+08,-0.018388
2022-01-19,9.985751e+07,-3.201421e-17,9.499656e+07,0.949966,1.424948e+05,0.001425,9.499656e+07,4.860949e+06,-0.009690
2022-01-20,9.857885e+07,-1.246075e-02,1.178979e+08,0.229341,1.768469e+05,0.000344,9.788742e+07,6.914296e+05,-0.011037
2022-01-21,9.682670e+07,-1.746305e-02,1.383401e+08,0.207369,2.075102e+05,0.000311,9.630041e+07,5.262972e+05,-0.018915
2022-01-24,9.815107e+07,1.402498e-02,1.607559e+08,0.231504,2.411339e+05,0.000347,9.757972e+07,5.713536e+05,0.002772
...,...,...,...,...,...,...,...,...,...
2023-12-22,9.743654e+07,1.517489e-03,8.989926e+09,0.200871,1.348489e+07,0.000301,9.693683e+07,4.997128e+05,0.001660
2023-12-26,9.794707e+07,5.540792e-03,9.009489e+09,0.200785,1.351423e+07,0.000301,9.744746e+07,4.996030e+05,0.004232
2023-12-27,9.792943e+07,1.075130e-04,9.028266e+09,0.191703,1.354240e+07,0.000288,9.744937e+07,4.800637e+05,0.001431
2023-12-28,9.829991e+07,4.079099e-03,9.047594e+09,0.197362,1.357139e+07,0.000296,9.780667e+07,4.932337e+05,0.000370


In [47]:
import pickle

# Pickled backtest report stored under oracle/; kept as `oracle_report`.
file_path = f'out/backtests/{ex_num}/oracle/0-report.pkl'

# NOTE(review): pickle.load can execute arbitrary code -- only open files this project produced.
with open(file_path, 'rb') as file:
    oracle_report = pickle.load(file)
oracle_report

Unnamed: 0_level_0,account,return,total_turnover,turnover,total_cost,cost,value,cash,bench
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-01-18,1.000000e+08,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000e+00,1.000000e+08,-0.018388
2022-01-19,9.985751e+07,-8.207280e-17,9.499548e+07,0.949955,1.424932e+05,0.001425,9.499548e+07,4.862027e+06,-0.009690
2022-01-20,9.853824e+07,-1.286960e-02,1.177558e+08,0.227928,1.766337e+05,0.000342,9.785003e+07,6.882081e+05,-0.011037
2022-01-21,9.722083e+07,-1.307404e-02,1.371652e+08,0.196973,2.057478e+05,0.000295,9.672008e+07,5.007539e+05,-0.018915
2022-01-24,9.818221e+07,1.020076e-02,1.573972e+08,0.208104,2.360959e+05,0.000312,9.766589e+07,5.163238e+05,0.002772
...,...,...,...,...,...,...,...,...,...
2023-12-21,8.711626e+07,1.906897e-02,8.086044e+09,0.204226,1.212907e+07,0.000306,8.667076e+07,4.455068e+05,0.010301
2023-12-22,8.725208e+07,1.851606e-03,8.103038e+09,0.195071,1.215456e+07,0.000293,8.681783e+07,4.342445e+05,0.001660
2023-12-26,8.778325e+07,6.396060e-03,8.120970e+09,0.205516,1.218146e+07,0.000308,8.732598e+07,4.572676e+05,0.004232
2023-12-27,8.787559e+07,1.363376e-03,8.139198e+09,0.207652,1.220880e+07,0.000311,8.741014e+07,4.654511e+05,0.001431


In [48]:
import pandas as pd
# Running sum (additive, not compounded) of each strategy's daily `return`,
# one column per strategy. Columns are added one at a time to an initially
# empty frame, so the first report fixes the index and later ones align to it.
strategy_reports = {
    "GP": gp_report,
    "Alpha Gen": alphaGen_report,
    "Bootstrapped DQN": boot_report,
    "Oracle": oracle_report,
    "MCTS": riskminer_report,
    "EMCTS": eminer_report,
}

df_com = pd.DataFrame()
for strategy_label, strategy_report in strategy_reports.items():
    df_com[strategy_label] = strategy_report["return"].cumsum()

# The benchmark curve is taken from the `bench` column of the Bootstrapped DQN report.
df_com["Benchmark"] = boot_report["bench"].cumsum()

df_com.head()

Unnamed: 0_level_0,GP,Alpha Gen,Bootstrapped DQN,Oracle,MCTS,EMCTS,Benchmark
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-01-18,0.0,0.0,0.0,0.0,0.0,0.0,-0.018388
2022-01-19,1.752051e-16,-3.489549e-16,-4.6275090000000006e-17,-8.207280000000001e-17,-2.8871e-16,-3.201421e-17,-0.028078
2022-01-20,-0.01494575,-0.01410991,-0.01450253,-0.0128696,-0.009196745,-0.01246075,-0.039115
2022-01-21,-0.03023407,-0.035654,-0.02998023,-0.02594364,-0.02391515,-0.0299238,-0.05803
2022-01-24,-0.02073139,-0.02179799,-0.02278256,-0.01574288,-0.01491715,-0.01589881,-0.055258


In [49]:
import pickle

rmse_files = []
df_rmse = pd.DataFrame()

# Directory name of each model -> column label shown in the table and plot.
model_labels = {"boot": "Bootstrapped DQN", "gp": "GP", "rl": "Alpha Gen", "mcts":"RiskMiner", "emcts":"EMCTS"}

for model in model_labels:
    # The GP run is stored under file prefix 2; every other model uses prefix 0.
    if model == "gp":
        file_path = f'out/backtests/{ex_num}/{model}/2-rmse.pkl'
    else:
        file_path = f'out/backtests/{ex_num}/{model}/0-rmse.pkl'

    # NOTE(review): pickle.load can execute arbitrary code -- only open files this project produced.
    with open(file_path, 'rb') as file:
        rmse = pickle.load(file)

    df_rmse[model] = rmse["rmse"]

df_rmse = df_rmse.rename(columns=model_labels)
df_rmse

Unnamed: 0_level_0,Bootstrapped DQN,GP,Alpha Gen,RiskMiner,EMCTS
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-01-18,181.620320,196.040396,175.970881,162.833670,177.693674
2022-01-19,162.679863,201.060958,173.806078,163.022802,161.954234
2022-01-20,183.338353,202.149893,198.899295,190.048579,194.559471
2022-01-21,188.401717,200.692523,198.073964,182.284197,190.169261
2022-01-24,171.502495,187.199886,182.996497,192.854536,190.011106
...,...,...,...,...,...
2023-12-22,178.959782,196.831140,173.138138,175.582479,174.663578
2023-12-26,159.756099,199.966748,196.665568,192.345819,181.558264
2023-12-27,180.419103,198.041651,185.557666,180.928211,195.395172
2023-12-28,161.213064,202.490904,203.857480,187.420468,186.098962


In [50]:
# 30-row rolling mean of every RMSE column, used for the lower panel of the plot.
df_rmse_ma = df_rmse.rolling(30).mean()

In [51]:
# Quick look at the available strategy columns before choosing what to plot.
df_com.columns

Index(['GP', 'Alpha Gen', 'Bootstrapped DQN', 'Oracle', 'MCTS', 'EMCTS',
       'Benchmark'],
      dtype='object')

In [52]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

def _add_line_traces(figure, frame, columns, row, legendgroup):
    """Add one line trace per entry of `columns` (taken from `frame`) to subplot (row, 1)."""
    for col in columns:
        figure.add_trace(
            go.Scatter(
                x=frame.index,
                y=frame[col],
                mode='lines',
                name=col,
                legendgroup=legendgroup,
            ),
            row=row,
            col=1
        )


fig = make_subplots(rows=2, cols=1,subplot_titles=["Cumulative Return", "RMSE"])

# Upper panel: cumulative returns ('Oracle' is present in df_com but not plotted).
_add_line_traces(
    fig,
    df_com,
    ['GP', 'Alpha Gen', 'Bootstrapped DQN', 'MCTS', 'EMCTS', 'Benchmark'],  # df_com.columns
    row=1,
    legendgroup='1',
)

# Lower panel: 30-row rolling RMSE. 'RiskMiner' here and 'MCTS' above both come
# from the mcts/ backtest directory.
_add_line_traces(
    fig,
    df_rmse_ma,
    ['GP', 'Alpha Gen', 'Bootstrapped DQN', 'RiskMiner', 'EMCTS'],  # df_rmse_ma.columns
    row=2,
    legendgroup='2',
)

# Fixed-size figure; the trace-group gap pushes the second legend group down
# next to the lower subplot.
fig.update_layout(
    template='seaborn',
    autosize=False,
    width=1200,
    height=1200,
    legend_tracegroupgap=580,
    legend_groupclick="toggleitem"
)

fig.show()


# fig = px.line(df_com, y=["Bootstrapped DQN","Alpha Gen","GP","Benchmark"], 
#             #   x="lifeExp", 
#               title='Cumulative Return',
#               template="seaborn",
#               )
# fig.show()

# Test

In [72]:
from alphagen_qlib.stock_data import StockData

# Two-year csi300 window used by the oracle / RMSE experiments below.
# No `device` argument is passed here, unlike get_dataset above.
data = StockData(
        instrument="csi300",
        start_time="2020-01-01",
        end_time="2022-01-01"
    )
data

<alphagen_qlib.stock_data.StockData at 0x17e97367da0>

In [None]:
from qlib.data import D

# NOTE: this rebinds the module-level `instruments` (a universe name string in
# the dataset-setup cell) to a list of stock ids.
instruments = data.stock_ids.tolist()

# Derive the query window from the StockData instance so the prices always
# match `data`, instead of repeating the date literals.
# NOTE(review): presumably `_dates` is padded with `max_backtrack_days` leading
# and `max_future_days` trailing entries, making these the first and last dates
# of the requested window -- confirm against StockData.
start_time = data._dates[data.max_backtrack_days].strftime("%Y-%m-%d")
end_time = data._dates[-data.max_future_days - 1].strftime("%Y-%m-%d")

# Query Qlib to get the closing price for each instrument.
# The field '$close' is used here (adjust if your field naming is different)
price_df = D.features(
    instruments=instruments,
    fields=["$close"],
    start_time=start_time,
    end_time=end_time
)

# Swap the two index levels so the frame is indexed by (datetime, instrument),
# which is the order compute_oracle_scores expects.
price_df = price_df.reorder_levels(order=[1, 0])
price_df

In [None]:
def compute_oracle_scores(price_df: pd.DataFrame) -> pd.DataFrame:
    """Return each instrument's next-step return, aligned to the prediction date.

    Args:
        price_df: Prices indexed by a (date, instrument) MultiIndex.

    Returns:
        A (date, instrument)-indexed frame where the value at date t is the
        percentage change of the price from t to the next row's date.
    """
    # Dates as rows, instruments as columns.
    price_unstacked = price_df.unstack(level=1)
    # Forward-fill gaps explicitly, then disable pct_change's own filling: this
    # reproduces the old default (fill_method='pad'), which is deprecated.
    returns = price_unstacked.ffill().pct_change(fill_method=None)
    # Shift up one row so the score on day t is the return realised from t to t+1.
    oracle_signal = returns.shift(-1)
    # Back to a (date, instrument) MultiIndex.
    return oracle_signal.stack()


# Next-step returns for every (date, instrument), used as the oracle ranking signal.
oracle_scores = compute_oracle_scores(price_df)
oracle_scores


The default fill_method='pad' in DataFrame.pct_change is deprecated and will be removed in a future version. Either fill in any non-leading NA values prior to calling pct_change or specify 'fill_method=None' to not fill NA values.





Unnamed: 0_level_0,Unnamed: 1_level_0,$close
datetime,instrument,Unnamed: 2_level_1
2020-01-02,SH600000,0.010406
2020-01-02,SH600004,-0.007961
2020-01-02,SH600009,-0.000765
2020-01-02,SH600010,0.008123
2020-01-02,SH600011,0.000000
...,...,...
2021-12-30,SZ300782,-0.024249
2021-12-30,SZ300866,-0.033949
2021-12-30,SZ300888,0.008192
2021-12-30,SZ300896,-0.012722


In [17]:
from alphagen_qlib.utils import load_alpha_pool_by_path

calc = QLibStockDataCalculator(data, None)

# Start from a clean slate so a stale `exprs` / `weights` left in the kernel by
# an earlier run can never be picked up silently.
exprs, weights = None, None

for p in sorted(Path("out/boot_dqn").iterdir()):
    # Run directories are named <inst>_<size>_<seed>_<time>_<ver>; skip anything else.
    name_parts = p.name.split('_', 4)
    if len(name_parts) != 5:
        continue
    inst, size, seed, timestamp, ver = name_parts
    try:
        size, seed = int(size), int(seed)
    except ValueError:
        continue
    if inst != "csi300" or size != 20 or timestamp < "20240923" or ver == "llm_d5":
        continue
    try:
        exprs, weights = load_alpha_pool_by_path(str(p / "249500_steps_pool.json"))
    except Exception as err:
        # Best effort: runs without a usable checkpoint are skipped, but visibly.
        print(f"Skipping {p.name}: {err!r}")
        continue

if exprs is None:
    raise FileNotFoundError("no usable 249500_steps_pool.json found under out/boot_dqn")

# The last run (in sorted directory order) that loaded successfully is used.
boot_score = data.make_dataframe(calc.make_ensemble_alpha(exprs, weights))
boot_score

Unnamed: 0_level_0,Unnamed: 1_level_0,0
datetime,instrument,Unnamed: 2_level_1
2020-01-02,SH600000,0.029499
2020-01-02,SH600004,0.065036
2020-01-02,SH600009,-0.070193
2020-01-02,SH600010,0.009689
2020-01-02,SH600011,0.194273
...,...,...
2021-12-31,SZ300782,-0.121243
2021-12-31,SZ300866,-0.022676
2021-12-31,SZ300888,0.029904
2021-12-31,SZ300896,-0.026693


In [18]:
def normalize_series(series: pd.Series) -> pd.Series:
    """Centre `series` on its mean and scale it by its standard deviation."""
    centered = series - series.mean()
    return centered / series.std()

def rank_series_per_date(series: pd.Series) -> pd.Series:
    """
    Rank values within each group of the first index level (assumed to be the date).

    Ranking is descending, so the largest value of a date gets rank 1; tied
    values share the smallest rank of their group.
    """
    per_date = series.groupby(level=0)
    return per_date.rank(method='min', ascending=False)
def compute_rmse_per_date(model_scores: pd.Series, oracle_scores: pd.Series) -> pd.DataFrame:
    """
    Per-date RMSE between the cross-sectional RANKS of model and oracle scores.

    Both inputs are first converted to per-date ranks with rank_series_per_date,
    so the result measures rank disagreement rather than a distance between raw
    scores.

    Parameters:
      model_scores: pd.Series with MultiIndex (date, instrument) holding the model's scores.
      oracle_scores: pd.Series with MultiIndex (date, instrument) holding the oracle's scores.

    Returns:
      A DataFrame indexed by "date" with one column, 'rmse'.
    """
    # Each series is ranked on its own before the two are aligned in one frame.
    # NOTE(review): if the inputs cover different instruments on a date, the ranks
    # come from different universes -- confirm that this is intended.
    ranked = pd.DataFrame({
        "model": rank_series_per_date(model_scores),
        "oracle": rank_series_per_date(oracle_scores),
    })

    def _rank_rmse(day: pd.DataFrame) -> float:
        # Root of the mean squared rank gap across that day's instruments.
        rank_gap = day["oracle"] - day["model"]
        return np.sqrt((rank_gap ** 2).mean())

    # Group on the first index level (the date).
    rmse_df = ranked.groupby(level=0).apply(_rank_rmse).to_frame(name="rmse")
    rmse_df.index.name = "date"
    return rmse_df

# Both inputs are single-column frames; take column 0 of each as a Series.
rmse_df = compute_rmse_per_date(boot_score.iloc[:,0], oracle_scores.iloc[:,0])
rmse_df

Unnamed: 0_level_0,rmse
date,Unnamed: 1_level_1
2020-01-02,175.373923
2020-01-03,172.032975
2020-01-06,158.534962
2020-01-07,143.087359
2020-01-08,158.674098
...,...
2021-12-27,165.069690
2021-12-28,162.478913
2021-12-29,165.498930
2021-12-30,164.577087


In [19]:
import plotly.express as px
# Line chart of the per-date rank RMSE (x axis = the frame's "date" index).
fig = px.line(
    rmse_df,
    y=["rmse"],
    title='RMSE',
    template="seaborn",
)
fig.show()