<a href="https://colab.research.google.com/github/microsoft/qlib/blob/main/examples/workflow_by_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import sys
sys.path.insert(0, "../..")

import qlib
import pandas as pd
from qlib.config import REG_CN
from qlib.contrib.model.gbdt import LGBModel
from qlib.contrib.data.handler import Alpha158
from qlib.contrib.evaluate import (
    backtest as normal_backtest,
    risk_analysis,
)
from qlib.utils import exists_qlib_data, init_instance_by_config
from qlib.workflow import R
from qlib.workflow.record_temp import SignalRecord, PortAnaRecord
from qlib.utils import flatten_dict


Please install necessary libs for CatBoostModel.


In [2]:
provider_uri = "../../data/qlib_data_v2"
scripts_dir = "../../scripts"
qlib.init(provider_uri=provider_uri, region=REG_CN)

[14801:MainThread](2021-11-25 16:17:24,986) INFO - qlib.Initialization - [config.py:393] - default_conf: client.
[14801:MainThread](2021-11-25 16:17:24,992) INFO - qlib.Initialization - [__init__.py:57] - qlib successfully initialized based on client settings.
[14801:MainThread](2021-11-25 16:17:24,993) INFO - qlib.Initialization - [__init__.py:59] - data_path={'__DEFAULT_FREQ': PosixPath('/data3/xujianjin/qlib/data/qlib_data_v2')}


# train model

In [3]:
market = "csi300"
benchmark = "SH000300"

###################################
# train model
###################################
data_handler_config = {
    "start_time": "2008-01-01",
    "end_time": "2020-08-01",
    "fit_start_time": "2008-01-01",
    "fit_end_time": "2014-12-31",
    "instruments": market,
}
task = {
    "model": {
        "class": "XGBModel",
        "module_path": "qlib.contrib.model.xgboost",
        "kwargs": {
            "eval_metric": "rmse",
            "colsample_bytree": 0.8879,
            "eta": 0.0421,
            "subsample": 0.8789,
            "max_depth": 8,
            "n_estimators": 647,
            "nthread": 20,
        },
    },
    "dataset": {
        "class": "DatasetH",
        "module_path": "qlib.data.dataset",
        "kwargs": {
            "handler": {
                "class": "Alpha158",
                "module_path": "qlib.contrib.data.handler",
                "kwargs": data_handler_config,
            },
            "segments": {
                "train": ("2008-01-01", "2014-12-31"),
                "valid": ("2015-01-01", "2016-12-31"),
                "test": ("2017-01-01", "2020-08-01"),
            },
        },
    },
}

# model initiaiton
model = init_instance_by_config(task["model"])
dataset = init_instance_by_config(task["dataset"])

# start exp to train model
with R.start(experiment_name="train_model"):
    R.log_params(**flatten_dict(task))
    model.fit(dataset)
    R.save_objects(trained_model=model)
    rid = R.get_recorder().id


[14801:MainThread](2021-11-25 16:17:53,329) INFO - qlib.timer - [log.py:113] - Time cost: 28.241s | Loading data Done
[14801:MainThread](2021-11-25 16:17:54,323) INFO - qlib.timer - [log.py:113] - Time cost: 0.758s | DropnaLabel Done
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
[14801:MainThread](2021-11-25 16:18:02,338) INFO - qlib.timer - [log.py:113] - Time cost: 8.012s | CSZScoreNorm Done
[14801:MainThread](2021-11-25 16:18:02,341) INFO - qlib.timer - [log.py:113] - Time cost: 9.010s | fit & process data Done
[14801:MainThread](2021-11-25 16:18:02,342) INFO - qlib.timer - [log.py:113] - Time cost: 37.255s | Init data Done
[14801:MainThread](2021-11-25 16:18:02,344) INFO - qlib.workflow - [expm.py:282] - No tracking URI is provided. Use the defau

Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-rmse:1.10671	valid-rmse:1.10709
[20]	train-rmse:1.01081	valid-rmse:1.01781
[40]	train-rmse:0.98862	valid-rmse:1.00099
[60]	train-rmse:0.98172	valid-rmse:0.99828
[80]	train-rmse:0.97770	valid-rmse:0.99821
[100]	train-rmse:0.97453	valid-rmse:0.99849
[120]	train-rmse:0.97134	valid-rmse:0.99881
[124]	train-rmse:0.97082	valid-rmse:0.99885


[14801:MainThread](2021-11-25 16:23:47,550) INFO - qlib.timer - [log.py:113] - Time cost: 0.000s | waiting `async_log` Done


# prediction, backtest & analysis

In [4]:
###################################
# prediction, backtest & analysis
###################################
port_analysis_config = {
    "executor": {
        "class": "SimulatorExecutor",
        "module_path": "qlib.backtest.executor",
        "kwargs": {
            "time_per_step": "day",
            "generate_portfolio_metrics": True,
        },
    },
    "strategy": {
        "class": "TopkDropoutStrategy",
        "module_path": "qlib.contrib.strategy.signal_strategy",
        "kwargs": {
            "model": model,
            "dataset": dataset,
            "topk": 50,
            "n_drop": 5,
        },
    },
    "backtest": {
        "start_time": "2017-01-01",
        "end_time": "2020-08-01",
        "account": 100000000,
        "benchmark": benchmark,
        "exchange_kwargs": {
            "freq": "day",
            "limit_threshold": 0.095,
            "deal_price": "close",
            "open_cost": 0.0005,
            "close_cost": 0.0015,
            "min_cost": 5,
        },
    },
}

# backtest and analysis
with R.start(experiment_name="backtest_analysis"):
    recorder = R.get_recorder(recorder_id=rid, experiment_name="train_model")
    model = recorder.load_object("trained_model")

    # prediction
    recorder = R.get_recorder()
    ba_rid = recorder.id
    sr = SignalRecord(model, dataset, recorder)
    sr.generate()

    # backtest & analysis
    par = PortAnaRecord(recorder, port_analysis_config, "day")
    par.generate()

[14801:MainThread](2021-11-25 16:23:47,667) INFO - qlib.workflow - [expm.py:282] - No tracking URI is provided. Use the default tracking URI.
[14801:MainThread](2021-11-25 16:23:47,669) INFO - qlib.workflow - [expm.py:318] - <mlflow.tracking.client.MlflowClient object at 0x7f417c2c3280>
[14801:MainThread](2021-11-25 16:23:47,672) INFO - qlib.workflow - [exp.py:249] - Experiment 2 starts running ...
[14801:MainThread](2021-11-25 16:23:47,682) INFO - qlib.workflow - [recorder.py:290] - Recorder 9958fea5e3244fb682a0144b89255af6 starts running under Experiment 2 ...
[14801:MainThread](2021-11-25 16:23:48,893) INFO - qlib.workflow - [record_temp.py:191] - Signal record 'pred.pkl' has been saved as the artifact of the Experiment 2
[14801:MainThread](2021-11-25 16:23:49,086) INFO - qlib.backtest caller - [__init__.py:82] - Create new exchange


'The following are prediction results of the XGBModel model.'
                          score
datetime   instrument          
2017-01-03 SH600000   -0.061814
           SH600008    0.009864
           SH600009    0.027207
           SH600010   -0.030914
           SH600015   -0.101295


backtest loop:   0%|          | 0/871 [00:00<?, ?it/s]

  return np.nanmean(self.data)
[14801:MainThread](2021-11-25 16:24:37,429) INFO - qlib.workflow - [record_temp.py:441] - Portfolio analysis record 'port_analysis_1day.pkl' has been saved as the artifact of the Experiment 2
[14801:MainThread](2021-11-25 16:24:37,445) INFO - qlib.workflow - [record_temp.py:466] - Indicator analysis record 'indicator_analysis_1day.pkl' has been saved as the artifact of the Experiment 2
[14801:MainThread](2021-11-25 16:24:37,490) INFO - qlib.timer - [log.py:113] - Time cost: 0.013s | waiting `async_log` Done


'The following are analysis results of benchmark return(1day).'
                       risk
mean               0.000468
std                0.012299
annualized_return  0.111406
information_ratio  0.587139
max_drawdown      -0.370479
'The following are analysis results of the excess return without cost(1day).'
                       risk
mean               0.000584
std                0.005479
annualized_return  0.139016
information_ratio  1.644758
max_drawdown      -0.079476
'The following are analysis results of the excess return with cost(1day).'
                       risk
mean               0.000402
std                0.005478
annualized_return  0.095580
information_ratio  1.131041
max_drawdown      -0.088014
'The following are analysis results of indicators(1day).'
     value
ffr    1.0
pa     0.0
pos    0.0


# analyze graphs

In [None]:
from qlib.contrib.report import analysis_model, analysis_position
from qlib.data import D
recorder = R.get_recorder(recorder_id=ba_rid, experiment_name="backtest_analysis")
print(recorder)
pred_df = recorder.load_object("pred.pkl")
pred_df_dates = pred_df.index.get_level_values(level='datetime')
report_normal_df = recorder.load_object("portfolio_analysis/report_normal_1day.pkl")
positions = recorder.load_object("portfolio_analysis/positions_normal_1day.pkl")
analysis_df = recorder.load_object("portfolio_analysis/port_analysis_1day.pkl")

## analysis position

### report

In [None]:
analysis_position.report_graph(report_normal_df)

### risk analysis

In [None]:
analysis_position.risk_analysis_graph(analysis_df, report_normal_df)

## analysis model

In [None]:
label_df = dataset.prepare("test", col_set="label")
label_df.columns = ['label']

### score IC

In [None]:
pred_label = pd.concat([label_df, pred_df], axis=1, sort=True).reindex(label_df.index)
analysis_position.score_ic_graph(pred_label)

In [None]:
analysis_model.model_performance_graph

### model performance

In [None]:
analysis_model.model_performance_graph(pred_label)
# 选股的相关性 - 更高级的策略