In [1]:
import numpy as np
import pandas as pd
import os, json, datetime, sys, joblib
import lightgbm
import torch


from pytrend.numerai import (
    create_era_index,
    load_numerai_data,
    dynamic_feature_neutralisation,
    run_numerai_models_performances,
    save_model_performance_test,
)
from pytrend.util import dynamic_model_selection_masks, walk_forward_dynamic_models
from pytrend.optimisation import numerai_optimisation_pipeline_optuna

### Generate Model Performances

In [2]:
### Hyper-Parameter Space
# Each tunable entry is a [kind, kwargs] pair; plain scalars are fixed settings.


def _int_range(low, high, step):
    """Stepped integer search range."""
    return ["int", {"low": low, "high": high, "step": step}]


def _float_steps(low, high, step):
    """Float search range sampled on a fixed step."""
    return ["float", {"low": low, "high": high, "step": step}]


def _float_log(low, high):
    """Float search range sampled on a log scale."""
    return ["float", {"low": low, "high": high, "log": True}]


def _grouped_time_series_split():
    """Fresh copy of the split settings shared by the train and validate stages."""
    return {
        "test_size": 52 * 4,
        "valid_splits": 1,
        "max_train_size": None,
        "gap": 52,
        "cross_validation": "GroupedTimeSeriesSplit",
    }


_lightgbm_parameters = {
    # Tuned parameters
    "num_iterations": _int_range(50, 1000, 50),
    "learning_rate": _float_log(0.005, 0.1),
    "min_data_in_leaf": _int_range(2500, 40000, 2500),
    "feature_fraction": _float_steps(0.1, 1, 0.05),
    "lambda_l1": _float_log(0.01, 1),
    "lambda_l2": _float_log(0.01, 1),
    "bagging_fraction": _float_steps(0.5, 1, 0.05),
    "bagging_freq": _int_range(10, 50, 5),
    "drop_rate": _float_steps(0.1, 0.3, 0.05),
    "skip_drop": _float_steps(0.05, 0.25, 0.05),
    "top_rate": _float_steps(0.1, 0.4, 0.05),
    "other_rate": _float_steps(0.05, 0.2, 0.05),
    "boosting": ["categorical", {"choices": ["gbdt"]}],
    "early_stopping_round": _int_range(5000, 6000, 500),
    # Fixed parameters
    "objective": "regression",
    "device_type": "gpu",
    "num_threads": 0,
    "verbosity": -1,
    "num_gpu": 1,
    "max_bin": 7,
    "gpu_use_dp": False,
}

_model_params = {
    "train": _grouped_time_series_split(),
    "validate": _grouped_time_series_split(),
    "selection": {"proportion": 0, "criteria": "sharpe"},
    "train_targets": ["target_20d"],
    "train_endera": datetime.datetime(2015, 12, 31),
    "validate_targets": [
        "target_20d",
        "target_20d_raw_return",
        "target_20d_factor_neutral",
        "target_20d_factor_feat_neutral",
        "target_4d",
    ],
    # One validation end per calendar year, 2015 through 2019.
    "validate_enderas": [
        datetime.datetime(year, 12, 31) for year in range(2015, 2020)
    ],
    "train_resample_freq": 1,
    "validate_resample_freq": 1,
    "output_folder": "numerai-signals-models",
    "model_no_start": None,
    "no_models_per_config": 20,
    "feature_sets": "signals",
}

optuna_lightgbm_args = {
    "feature_eng": {"method": None, "parameters": {}},
    "ml_method": {"method": "lightgbm", "parameters": _lightgbm_parameters},
    "model_params": _model_params,
}

In [11]:
# Score every saved model group of every feature set against the raw-return
# target. Model numbers are laid out in blocks of 1000 per feature set; inside
# a block there is one group of `no_models_per_config` consecutive models for
# each (validate_target, validate_endera) combination.

# Folder constants do not depend on the feature set, so they are set once.
MODEL_FOLDER = "../numerai-signals-models"
PERFORMANCES_FOLDER = "../numerai-signals-performances-target-20d-raw"

feature_sets = [
    "all",
    "signature",
    "catch22",
    "stats",
    "financials",
    "ravenpack",
]

model_no_start = 0

for feature_set in feature_sets:

    numerai_files = {
        "dataset": f"../signals-data/numerai_signals_features_{feature_set}.parquet",
        "feature_metadata": f"../signals-data/numerai_signals_features_{feature_set}_metadata.json",
    }

    # Work on a copy of the shared template: assigning `model_no_start` through
    # a plain alias would silently rewrite `optuna_lightgbm_args` for every
    # other cell that reads it afterwards.
    optimisation_args = {
        **optuna_lightgbm_args,
        "model_params": {
            **optuna_lightgbm_args["model_params"],
            "model_no_start": model_no_start,
        },
    }
    model_params = optimisation_args["model_params"]
    models_per_config = model_params["no_models_per_config"]
    ml_method = optimisation_args["ml_method"]["method"]
    feature_eng_method = optimisation_args["feature_eng"]["method"]

    start = model_params["model_no_start"]
    end = start + models_per_config * len(model_params["validate_targets"]) * len(
        model_params["validate_enderas"]
    )

    # Each `seed` is the first model number of one group.
    for seed in range(start, end, models_per_config):
        Numerai_Model_Names = [
            f"{MODEL_FOLDER}/{ml_method}_{feature_eng_method}_1_{seed + seq}.parameters"
            for seq in range(models_per_config)
        ]
        run_numerai_models_performances(
            Numerai_Model_Names,
            None,
            None,
            PERFORMANCES_FOLDER,
            data_file=numerai_files["dataset"],
            data_version="signals",
            # NOTE(review): the config's validate_targets spells this target
            # "target_20d_raw_return"; confirm "raw_return_target_20d" is the
            # actual column name in the dataset.
            target_col=["raw_return_target_20d"],
        )

    model_no_start = model_no_start + 1000

Model Performances 2021-12-31 00:00:00 2099-12-31 00:00:00


KeyboardInterrupt: 

In [3]:
## Run Numerai Model Performances for both Classic and Signals tournament
def run_numerai_models_performances(
    Numerai_Model_Names,
    feature_corr,
    features_optimizer,
    PERFORMANCES_FOLDER,
    data_file="data/v4_all_int8.parquet",
    data_version="v4",
    target_col=["target"],
    gbm_start_iteration=0,
):
    """Score a group of saved models on the eras after their validation window.

    The first entry of ``Numerai_Model_Names`` represents the whole group: its
    parameter file supplies the validation end, and its file stem (plus the
    group size) names the two output files written into
    ``PERFORMANCES_FOLDER``:

    * ``{stem}_{n}.csv`` - the ``"neutralised_correlation"`` entry of the
      correlations returned by ``save_model_performance_test``, NaNs dropped.
    * ``{stem}_{n}_predictions.parquet`` - the returned predictions.

    Nothing is done when the model list is empty, when the first parameter
    file is missing, when the correlations CSV already exists, or when the
    test window is empty.

    Parameters
    ----------
    Numerai_Model_Names : list of str
        Paths to ``.parameters`` files, "/"-separated.
    feature_corr
        Forwarded to ``save_model_performance_test``. For non-signals data its
        last index value is used as the end of the test window.
    features_optimizer
        Forwarded to ``save_model_performance_test``.
    PERFORMANCES_FOLDER : str
        Output folder; created if it does not exist.
    data_file, data_version, target_col, gbm_start_iteration
        Forwarded to ``save_model_performance_test``. ``data_version`` equal
        to ``"signals"`` selects the date-based test window; any other value
        selects the era-based one.

    Returns
    -------
    None
    """
    # An empty group has no first model to derive names or dates from.
    if not Numerai_Model_Names:
        return

    ## Calculate Starting Era
    parametername = Numerai_Model_Names[0]
    no_models = len(Numerai_Model_Names)
    stem = parametername.split("/")[-1].replace(".parameters", "")
    correlations_filename = f"{PERFORMANCES_FOLDER}/{stem}_{no_models}.csv"
    preds_filename = f"{PERFORMANCES_FOLDER}/{stem}_{no_models}_predictions.parquet"

    # Groups whose first model has not been saved are skipped silently.
    if not os.path.exists(parametername):
        return

    # NOTE(security): joblib.load unpickles the file, which can execute
    # arbitrary code; only point this at parameter files you produced.
    parameters = joblib.load(parametername)
    validation_end = parameters["parameters"]["model"]["validation_end"]
    is_signals = data_version == "signals"

    if is_signals:
        test_start = validation_end
        test_end = datetime.datetime.strptime("2099-12-31", "%Y-%m-%d")
    else:
        # NOTE(review): shift_era / convert_datetime_to_era are not imported in
        # the visible notebook cells, so this path presumably raises NameError
        # unless they are brought into scope elsewhere - confirm.
        test_start = shift_era(validation_end, gap=14)
        test_end = feature_corr.index[-1]

    already_scored = os.path.exists(correlations_filename)
    if already_scored:
        scored_index = pd.read_csv(correlations_filename, index_col=0).index
        # A header-only CSV (e.g. every correlation was NaN) has no last date;
        # keep the start derived from the validation end instead of crashing.
        if len(scored_index) > 0:
            most_recent_date = scored_index[-1]
            if is_signals:
                test_start = datetime.datetime.strptime(most_recent_date, "%Y-%m-%d")
            else:
                test_start = shift_era(convert_datetime_to_era(most_recent_date), gap=1)
    print(f"Model Performances {test_start} {test_end}")

    # Existing results are never recomputed: a group is scored only once.
    if not (test_end > test_start) or already_scored:
        return

    # Create the output folder before the (potentially long) model run so a
    # missing directory cannot discard the results at write time.
    os.makedirs(PERFORMANCES_FOLDER, exist_ok=True)

    ### Get Model Predictions for the latest eras
    (
        validate_performance,
        validate_correlations,
        validate_predictions,
    ) = save_model_performance_test(
        Numerai_Model_Names,
        feature_corr,
        features_optimizer,
        startera=test_start,
        endera=test_end,
        data_file=data_file,
        data_version=data_version,
        target_col=target_col,
        gbm_start_iteration=gbm_start_iteration,
    )

    ## Update Model Performances
    output = validate_correlations["neutralised_correlation"]
    # NOTE(review): given the early return above, this merge branch only runs
    # if the CSV appeared while the models were being evaluated - confirm
    # whether incremental updates were intended.
    if os.path.exists(correlations_filename):
        old_file = pd.read_csv(correlations_filename, index_col=0)
        df = pd.concat([old_file, output.dropna()])
        df.index = pd.to_datetime(df.index)
        df[~df.index.duplicated()].sort_index().to_csv(correlations_filename)
    else:
        output.dropna().to_csv(correlations_filename)

    validate_predictions.to_parquet(preds_filename)

In [4]:
# Score the first model group of every feature set against target_20d.
# Feature sets occupy consecutive blocks of 1000 model numbers.
MODEL_FOLDER = "../numerai-signals-models"
PERFORMANCES_FOLDER = "../numerai-signals-performances-target-20d"

feature_sets = [
    "all",
    "signature",
    "catch22",
    "stats",
    "financials",
    "ravenpack",
    "price",
]

model_no_start = 0

for feature_set in feature_sets:
    dataset_stem = f"../signals-data/numerai_signals_features_{feature_set}"
    numerai_files = {
        "dataset": f"{dataset_stem}.parquet",
        "feature_metadata": f"{dataset_stem}_metadata.json",
    }
    optimisation_args = optuna_lightgbm_args

    ml_method = optimisation_args["ml_method"]["method"]
    feature_eng_method = optimisation_args["feature_eng"]["method"]
    group_size = optimisation_args["model_params"]["no_models_per_config"]

    # Only the group that starts the block is scored, so the seed is the
    # block's first model number.
    seed = model_no_start
    Numerai_Model_Names = []
    for seq in range(group_size):
        Numerai_Model_Names.append(
            f"{MODEL_FOLDER}/{ml_method}_{feature_eng_method}_1_{seed + seq}.parameters"
        )

    run_numerai_models_performances(
        Numerai_Model_Names,
        None,
        None,
        PERFORMANCES_FOLDER,
        data_file=numerai_files["dataset"],
        data_version="signals",
        target_col=["target_20d"],
    )

    model_no_start += 1000

Model Performances 2021-12-31 00:00:00 2099-12-31 00:00:00
Model Performances 2021-12-31 00:00:00 2099-12-31 00:00:00
Model Performances 2021-12-31 00:00:00 2099-12-31 00:00:00
Model Performances 2021-12-31 00:00:00 2099-12-31 00:00:00
Model Performances 2021-12-31 00:00:00 2099-12-31 00:00:00
Model Performances 2021-12-31 00:00:00 2099-12-31 00:00:00
Model Performances 2015-12-25 00:00:00 2099-12-31 00:00:00


In [9]:
# Predictions saved for the 20-model group starting at model number 5000
# (the "ravenpack" block in the feature-set loops of this notebook).
predictions_path = f"{PERFORMANCES_FOLDER}/lightgbm_None_1_5000_20_predictions.parquet"
average_prediction_df = pd.read_parquet(predictions_path)

In [10]:
# Keep the three export columns, publishing the model prediction as "signal".
output = (
    average_prediction_df.reset_index()
    .loc[:, ['bloomberg_ticker', 'prediction', 'friday_date']]
    .rename(columns={'prediction': 'signal'})
)

In [11]:
# Parse the YYYYMMDD dates and tag every row as validation data.
output = output.assign(
    friday_date=pd.to_datetime(output['friday_date'], format='%Y%m%d'),
    data_type='validation',
)

In [12]:
# Export only the rows dated on or after 2016-06-26.
on_or_after_cutoff = output['friday_date'] >= '2016-06-26'
output.loc[on_or_after_cutoff].to_csv('validate_ravenpack.csv', index=False)