In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.regression.rolling import RollingOLS
from talib import WMA, RSI, MACD, BBANDS
from itertools import product
import warnings
import tables

In [2]:
import lightgbm as lgb
from catboost import Pool, CatBoostRegressor
from sklearn.linear_model import LinearRegression
from scipy.stats import spearmanr
from collections import defaultdict
from sklearn.model_selection import TimeSeriesSplit
from time import time
from lightgbm import log_evaluation



In [12]:
def loadPrices(fn):
    global nt, nInst
    df = pd.read_csv(fn, sep="\s+", header=None, index_col=None)
    (nt, nInst) = df.shape
    return (df.values).T


pricesFile = "./prices.txt"
prcAll = loadPrices(pricesFile)

prcHistT = pd.DataFrame(prcAll.T)

variance = prcHistT.var()
stdev = np.sqrt(variance)

In [None]:
def simPrices(prcHist):
    n = 252
    t = 1
    tau = t / n
    prcHist = pd.DataFrame(prcHist.T)
    returns = prcHist.pct_change()

    sigma = returns.iloc[:-500, :].std()

    mean = returns.mean()
    mu = mean - (0.5 * sigma**2)
    print(mean)
    S = prcHist.iloc[-1, :]
    simmedPrices = np.zeros((n, len(prcHist.columns)))
    simmedPrices[0, :] = S

    for i in range(1, n):
        Z = np.random.normal(0, 1)
        # Simulate log prices
        simmedPrices[i, :] = simmedPrices[i - 1, :] + (mu*simmedPrices[i - 1, :]) + (sigma*simmedPrices[i - 1, :]*Z)

    # Convert simulated log prices back to normal prices
    simmedPrices = pd.DataFrame(simmedPrices)

    return simmedPrices


simulated_prices = simPrices(prcAll).T
prcFull = pd.concat([pd.DataFrame(prcAll), pd.DataFrame(simulated_prices)], axis=1, ignore_index=True)


In [13]:
def momentum_features(prcSoFar): 
  data = pd.DataFrame()
  lags = [1, 5, 21, 42, 63, 126, 189, 252]
  for lag in lags:
    data[f'return_{lag}d'] = (prcSoFar
      .pct_change(lag, fill_method=None)
      .stack()
      .pipe(lambda x: x.clip(lower=x.quantile(0.01),
        upper=x.quantile(0.99)))
    )

  for lag in [42,63,126,189,252]: ##momentum diff indicators
    data[f'momentum_{lag//21}'] = data[f'return_{lag}d'].sub(data.return_21d)
  data[f'momentum_3_12'] = data[f'return_252d'].sub(data.return_63d)

  for t in [1, 5, 21]:  # target returns
    data[f'r{t}_fwd'] = data.groupby(level=0)[f'return_{t}d'].shift(-t)
  return data


In [8]:
def rsi(prices):
    return prices.apply(RSI)


def bollinger(prices):
    # Initialize dictionaries to hold the high and low Bollinger Bands for each column
    bb_high = {}
    bb_low = {}

    # Apply BBANDS to each column in the DataFrame
    for column in prices.columns:
        high, mid, low = BBANDS(prices[column].values, timeperiod=20)
        bb_high[column] = high
        bb_low[column] = low

    # Convert the dictionaries to DataFrames
    bb_high_df = pd.DataFrame(bb_high, index=prices.index)
    bb_low_df = pd.DataFrame(bb_low, index=prices.index)

    # Combine the high and low Bollinger Bands into a single DataFrame
    bollinger_bands = pd.concat(
        [bb_high_df, bb_low_df], axis=1, keys=["bb_high", "bb_low"]
    )

    return bollinger_bands


def macd(prices):
    def compute_macd(close):
        macd, macd_signal, macd_hist = MACD(close)
        return (macd - np.nanmean(macd)) / np.nanstd(macd)

    macd_df = prices.apply(lambda col: compute_macd(col.values), axis=0)
    return macd_df

In [5]:
def log(df):
    return np.log1p(df)
def sign(df):
    return np.sign(df)
def power(df, exp):
    return df.pow(exp)

In [6]:
def rank(df: pd.DataFrame)-> pd.DataFrame:
  return df.rank(axis=1, pct=True)

def scale(df: pd.DataFrame)-> pd.DataFrame:
  return df.div(df.abs().sum(axis=1), axis=0)

def lagged_ts(df: pd.DataFrame, t: int = 1)-> pd.DataFrame:
  return df.shift(t)

def diff_ts(df: pd.DataFrame, period: int = 1)-> pd.DataFrame:
  return df.diff(period)

def rollingsum_ts(df: pd.DataFrame, window: int = 10)-> pd.DataFrame:
  return df.rolling(window).sum()

def rollingmean_ts(df: pd.DataFrame, window: int = 10)-> pd.DataFrame:
  return df.rolling(window).mean()

def rollingweightedmean_ts(df: pd.DataFrame, period: int = 10)-> pd.DataFrame:
  return df.apply(lambda x: WMA(x, timeperiod=period))

def rollingstd_ts(df: pd.DataFrame, window: int = 10)-> pd.DataFrame:
  return df.rolling(window).std()

def rollingrank_ts(df: pd.DataFrame, window: int = 10)-> pd.DataFrame:
  return df.rolling(window).apply(lambda x: x.rank().iloc[-1])

def rollingproduct_ts(df: pd.DataFrame, window: int = 10)-> pd.DataFrame:
  return df.rolling(window).apply(np.prod)

def rollingmin_ts(df: pd.DataFrame, window: int = 10)-> pd.DataFrame:
  return df.rolling(window).min()

def rollingmax_ts(df: pd.DataFrame, window: int = 10)-> pd.DataFrame:
  return df.rolling(window).max()

def maxdate_ts(df: pd.DataFrame, window: int = 10)-> pd.DataFrame:
  return df.rolling(window).apply(np.argmax).add(1)

def mindate_ts(df: pd.DataFrame, window: int = 10)->pd.DataFrame:
  return df.rolling(window).apply(np.argmin).add(1)

def rollingcorr_ts(x: pd.Series, y: pd.Series, window: int = 10)-> pd.DataFrame:
    return x.rolling(window).corr(y)

def rollingcov_ts(x: pd.Series, y: pd.Series, window: int = 10) -> pd.DataFrame:
    return x.rolling(window).cov(y)

In [14]:
def alpha1(prices, returns): #1
  prices = prices.copy()
  returns = returns.shift(1)
  prices[returns < 0] = rollingstd_ts(returns, 20)
  return (rank(maxdate_ts(power(prices, 2), 5)).mul(-0.5))

def alpha2(prices): #4
  return (-1* rollingrank_ts(rank(prices), 9))

def alpha3(prices): #9
  prices_diff = diff_ts(prices, 1)
  return prices_diff.where(rollingmin_ts(prices_diff, 5) > 0, prices_diff.where(rollingmax_ts(prices_diff, 5) < 0,-prices_diff))

def alpha4(prices): #10
  prices_diff = diff_ts(prices, 1)
  return prices_diff.where(rollingmin_ts(prices_diff, 4) > 0, prices_diff.where(rollingmin_ts(prices_diff, 4) > 0, -prices_diff))

def alpha5(prices): #23
  return diff_ts(prices, 2).mul(-1).where(rollingmean_ts(prices, 20) < prices, 0)

def alpha6(prices): #24
  cond = diff_ts(rollingmean_ts(prices, 100), 100) / lagged_ts(prices, 100) <= 0.05
  return prices.sub(rollingmin_ts(prices, 100)).mul(-1).where(cond, -diff_ts(prices, 3))

def alpha7(prices, returns): #29
  return rollingmin_ts(rank(rank(scale(log(rollingsum_ts(rank(rank(-rank(diff_ts((prices - 1), 5)))), 2))))), 5).add(rollingrank_ts(lagged_ts((-1*returns), 6), 5))

def alpha8(prices, returns): #34
  return rank(rank(rollingstd_ts(returns, 2).div(rollingstd_ts(returns, 5)).replace([-np.inf, np.inf], np.nan)).mul(-1).sub(rank(diff_ts(prices, 1))).add(2))
 
def alpha9(prices): #46
  cond = lagged_ts(diff_ts(prices, 10), 10).div(10).sub(diff_ts(prices, 10).div(10))
  alpha = pd.DataFrame(-np.ones_like(cond), index=prices.index, columns=prices.columns)
  alpha[cond.isnull()] = np.nan
  return cond.where(cond > 0.25, -alpha.where(cond < 0, -diff_ts(prices, 1)))

def alpha10(prices): #49
    cond = diff_ts(lagged_ts(prices, 10), 10).div(10).sub(diff_ts(prices, 10).div(10)) >= -0.1 * prices
    return -diff_ts(prices, 1).where(cond, 1) 


def alpha11(prices): #51
  cond = diff_ts(diff_ts(prices, 10), 10).div(10).sub(diff_ts(prices, 10).div(10)) >= -0.05 * prices
  return -diff_ts(prices, 1).where(cond, 1)


In [8]:
data = momentum_features(prcHistT)


returns = (data["return_1d"].unstack()).copy()

display(data)

Unnamed: 0,Unnamed: 1,return_1d,return_5d,return_21d,return_42d,return_63d,return_126d,return_189d,return_252d,momentum_2,momentum_3,momentum_6,momentum_9,momentum_12,momentum_3_12,r1_fwd,r5_fwd,r21_fwd
1,0,0.001486,,,,,,,,,,,,,,0.006281,,
1,1,0.006281,,,,,,,,,,,,,,0.001238,,
1,2,0.001238,,,,,,,,,,,,,,-0.000396,,
1,3,-0.000396,,,,,,,,,,,,,,-0.000768,,
1,4,-0.000768,,,,,,,,,,,,,,-0.003846,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1249,45,-0.006003,-0.009890,-0.016450,-0.026018,-0.021813,-0.040348,-0.038633,-0.029312,-0.009568,-0.005364,-0.023898,-0.022184,-0.012863,-0.007499,-0.006398,,
1249,46,-0.006398,0.013443,0.000773,0.010541,0.014370,0.102357,0.078472,-0.005380,0.009768,0.013596,0.101583,0.077699,-0.006153,-0.019749,-0.005321,,
1249,47,-0.005321,0.000000,0.014586,0.045804,0.012183,0.003018,-0.079126,-0.115089,0.031218,-0.002403,-0.011568,-0.093712,-0.129675,-0.127271,-0.007388,,
1249,48,-0.007388,0.003565,-0.037134,-0.024261,-0.069416,-0.042281,0.149300,0.288298,0.012872,-0.032282,-0.005147,0.186434,0.325432,0.357714,0.001957,,


In [15]:
# Assuming you have already defined all the alpha functions

# Calculate all alphas
alpha1_result = alpha1(prcHistT, returns)
alpha2_result = alpha2(prcHistT)
alpha3_result = alpha3(prcHistT)
alpha4_result = alpha4(prcHistT)
alpha5_result = alpha5(prcHistT)
alpha6_result = alpha6(prcHistT)
alpha7_result = alpha7(prcHistT, returns)
alpha8_result = alpha8(prcHistT, returns)
alpha9_result = alpha9(prcHistT)
alpha10_result = alpha10(prcHistT)
alpha11_result = alpha11(prcHistT)

# Combine all alphas into a single DataFrame
all_alphas = pd.concat([
    alpha1_result, alpha2_result, alpha3_result, alpha4_result,
    alpha5_result, alpha6_result, alpha7_result, alpha8_result,
    alpha9_result, alpha10_result, alpha11_result
], axis=1, keys=['alpha1', 'alpha2', 'alpha3', 'alpha4', 'alpha5', 
                 'alpha6', 'alpha7', 'alpha8', 'alpha9', 'alpha10', 'alpha11'])

# Reshape to match the MultiIndex structure
all_alphas_stacked = all_alphas.stack()

# Create MultiIndex
time_index = range(len(prcHistT))
stock_index = prcHistT.columns
multi_index = pd.MultiIndex.from_product([time_index, stock_index], names=['time', 'stock'])

# Reindex to ensure all combinations are present
all_alphas_multi = all_alphas_stacked.reindex(multi_index)

# Optionally, you can add other features from your previous DataFrame if needed
# For example, if you want to include the return features:
return_features = data[
    ["return_1d", "return_5d", "return_21d", "return_42d", "return_63d", "return_126d", "return_189d", "return_252d",
     "momentum_2", "momentum_3", "momentum_6", "momentum_9", "momentum_12", "momentum_3_12",
     "r1_fwd", "r5_fwd", "r21_fwd"
    ]
]
all_features = pd.concat([all_alphas_multi, return_features], axis=1)

# Sort the index to ensure it's in the correct order
all_features = all_features.sort_index()

In [16]:
display(all_features)

Unnamed: 0,Unnamed: 1,alpha1,alpha2,alpha3,alpha4,alpha5,alpha6,alpha7,alpha8,alpha9,alpha10,...,return_252d,momentum_2,momentum_3,momentum_6,momentum_9,momentum_12,momentum_3_12,r1_fwd,r5_fwd,r21_fwd
0,0,,,,,0.00,,,,,-1.00,...,,,,,,,,,,
0,1,,,,,0.00,,,,,-1.00,...,,,,,,,,,,
0,2,,,,,0.00,,,,,-1.00,...,,,,,,,,,,
0,3,,,,,0.00,,,,,-1.00,...,,,,,,,,,,
0,4,,,,,0.00,,,,,-1.00,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1249,45,-0.150,-1.5,0.26,0.26,0.00,-0.00,3.69,0.34,-0.26,0.26,...,-0.029312,-0.009568,-0.005364,-0.023898,-0.022184,-0.012863,-0.007499,-0.006398,,
1249,46,-0.335,-5.0,0.50,0.50,0.02,-0.65,3.02,0.64,1.00,0.50,...,-0.005380,0.009768,0.013596,0.101583,0.077699,-0.006153,-0.019749,-0.005321,,
1249,47,-0.255,-5.0,0.16,0.16,0.16,-1.31,5.14,0.75,-0.16,0.16,...,-0.115089,0.031218,-0.002403,-0.011568,-0.093712,-0.129675,-0.127271,-0.007388,,
1249,48,-0.150,-2.0,0.44,0.44,0.00,0.39,5.08,0.93,1.00,0.44,...,0.288298,0.012872,-0.032282,-0.005147,0.186434,0.325432,0.357714,0.001957,,


In [17]:
def get_fi(model):
  fi = model.feature_importance(importance_type="gain")
  return pd.Series(fi / fi.sum(), index=model.feature_name())

In [15]:
base_params = dict(boosting='gbdt', objective='regression', verbose=-1)

max_depths = [2, 3, 5, 7]
num_leaves_opts = [2**i for i in max_depths]
min_data_in_leaf_opts = [250, 500, 1000]

learning_rate_ops = [0.01, 0.1, 0.3]
feature_fraction_opts = [0.3, 0.6, 0.95]

param_names = ["learning_rate", "num_leaves", "feature_fraction", "min_data_in_leaf"]

cv_params = list(product(learning_rate_ops, num_leaves_opts, feature_fraction_opts, min_data_in_leaf_opts))
n_params = len(cv_params)
print(f"# Parameters: {n_params}")


labels = sorted(data.filter(like="_fwd").columns)
print(labels)
features = all_features.columns.difference(labels).tolist()
print(features)
lookaheads = [1, 5, 21]  # 1 day, 1 week, 1 month
label_dict = dict(zip(lookaheads, labels))

train_lengths = [(3 * 252), 126]
test_lengths = [63]
test_params = list(product(lookaheads, train_lengths, test_lengths))
n = len(test_params)
test_param_sample = np.random.choice(list(range(n)), size=int(n), replace=False)
test_params = [test_params[i] for i in test_param_sample]
print("Train configs:", len(test_params))


def ic_lgbm(preds, train_data):
    is_higher_better = True
    return "ic", spearmanr(preds, train_data.get_label())[0], is_higher_better


num_iterations = [10, 25, 50, 75] + list(range(100, 501, 50))
num_boost_round = num_iterations[-1]

metric_cols = (
    param_names
    + ["t", "daily_ic_mean", "daily_ic_mean_n", "daily_ic_median", "daily_ic_median_n"]
    + [str(n) for n in num_iterations]
)

# Parameters: 108
['r1_fwd', 'r21_fwd', 'r5_fwd']
['alpha1', 'alpha10', 'alpha11', 'alpha2', 'alpha3', 'alpha4', 'alpha5', 'alpha6', 'alpha7', 'alpha8', 'alpha9', 'momentum_12', 'momentum_2', 'momentum_3', 'momentum_3_12', 'momentum_6', 'momentum_9', 'return_126d', 'return_189d', 'return_1d', 'return_21d', 'return_252d', 'return_42d', 'return_5d', 'return_63d']
Train configs: 6


In [21]:
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=tables.exceptions.NaturalNameWarning)

def format_time(t):
    m, s = divmod(t, 60)
    h, m = divmod(m, 60)
    return f'{h:02.0f}:{m:02.0f}:{s:02.0f}'

def get_fi(model):
    return pd.Series(model.feature_importance(), index=model.feature_name())



for lookahead, train_length, test_length in test_params:
    # randomized grid search
    cvp = np.random.choice(list(range(n_params)),
                           size=int(n_params / 2),
                           replace=False)
    cv_params_ = [cv_params[i] for i in cvp]

    # set up cross-validation
    n_splits = int(2 * 252 / test_length)  # Assuming 252 trading days in a year
    print(f'Lookahead: {lookahead:2.0f} | '
          f'Train: {train_length:3.0f} | '
          f'Test: {test_length:2.0f} | '
          f'Params: {len(cv_params_):3.0f} | '
          f'Train configs: {len(test_params)}')

    # time-series cross-validation
    cv = TimeSeriesSplit(n_splits=n_splits, test_size=test_length)

    label = label_dict[lookahead]
    feature_cols = [col for col in all_features.columns if col != label]
    outcome_data = all_features.loc[:, feature_cols + [label]].dropna()
    
    T = 0
    predictions, metrics, feature_importance, daily_ic = [], [], [], []
    
    # iterate over (shuffled) hyperparameter combinations
    for p, param_vals in enumerate(cv_params_):
        key = f'{lookahead}/{train_length}/{test_length}/' + '/'.join([str(p) for p in param_vals])
        params = dict(zip(param_names, param_vals))
        params.update(base_params)

        start = time()
        cv_preds, nrounds = [], []
        ic_cv = defaultdict(list)
        
        # iterate over folds
        for i, (train_idx, test_idx) in enumerate(cv.split(outcome_data)):
            
            # select train subset
            train_data = outcome_data.iloc[train_idx]
            test_data = outcome_data.iloc[test_idx]
            
            lgb_train = lgb.Dataset(data=train_data[feature_cols],
                                    label=train_data[label],
                                    free_raw_data=False)
            
            # train model for num_boost_round
            model = lgb.train(params=params,
                              train_set=lgb_train,
                              num_boost_round=num_boost_round,
                              callbacks=[log_evaluation(period=0)])
            
            # log feature importance
            if i == 0:
                fi = get_fi(model).to_frame()
            else:
                fi[i] = get_fi(model)

            # capture predictions
            X_test = test_data[feature_cols]
            y_test = test_data[label]
            y_pred = {str(n): model.predict(X_test, num_iteration=n) for n in num_iterations}
            
            # record predictions for each fold
            cv_preds.append(pd.DataFrame({'y_test': y_test, **y_pred, 'i': i}, index=test_data.index))
        
        # combine fold results
        cv_preds = pd.concat(cv_preds).assign(**params)
        predictions.append(cv_preds)
        
        # compute IC per day
        by_day = cv_preds.groupby(level=0)
        ic_by_day = pd.concat([by_day.apply(lambda x: spearmanr(x.y_test, x[str(n)])[0]).to_frame(n)
                               for n in num_iterations], axis=1)
        daily_ic_mean = ic_by_day.mean()
        daily_ic_mean_n = daily_ic_mean.idxmax()
        daily_ic_median = ic_by_day.median()
        daily_ic_median_n = daily_ic_median.idxmax()
        
        # compute IC across all predictions
        ic = [spearmanr(cv_preds.y_test, cv_preds[str(n)])[0] for n in num_iterations]
        t = time() - start
        T += t
        
        # collect metrics
        metrics = pd.Series(list(param_vals) +
                            [t, daily_ic_mean.max(), daily_ic_mean_n, daily_ic_median.max(), daily_ic_median_n] + ic,
                            index=metric_cols)
        msg = f'\t{p:3.0f} | {format_time(T)} ({t:3.0f}) | {params["learning_rate"]:5.2f} | '
        msg += f'{params["num_leaves"]:3.0f} | {params["feature_fraction"]:3.0%} | {params["min_data_in_leaf"]:4.0f} | '
        msg += f' {max(ic):6.2%} | {ic_by_day.mean().max(): 6.2%} | {daily_ic_mean_n: 4.0f} | {ic_by_day.median().max(): 6.2%} | {daily_ic_median_n: 4.0f}'
        print(msg)

        # persist results for given CV run and hyperparameter combination
        metrics.to_hdf('lgb_results.h5', 'metrics/' + key)
        ic_by_day.assign(**params).to_hdf('lgb_results.h5', 'daily_ic/' + key)
        fi.T.describe().T.assign(**params).to_hdf('lgb_results.h5', 'fi/' + key)
        cv_preds.to_hdf('lgb_results.h5', 'predictions/' + key)

Lookahead:  1 | Train: 756 | Test: 63 | Params:  54 | Train configs: 6
	  0 | 00:00:12 ( 12) |  0.10 | 128 | 60% |  250 |  11.85% |  10.61% |  450 |  10.39% |  100
	  1 | 00:00:19 (  7) |  0.01 |  32 | 60% |  500 |   3.68% |  0.17% |  350 |  4.34% |  350
	  2 | 00:00:25 (  6) |  0.10 |  32 | 95% | 1000 |   2.84% |  0.31% |  350 |  6.36% |  300
	  3 | 00:00:28 (  3) |  0.10 |   8 | 95% |  500 |   6.11% |  1.57% |   50 |  3.73% |   50
	  4 | 00:00:31 (  2) |  0.01 |   8 | 30% |  500 |   3.73% |  0.55% |   25 |  2.06% |  400
	  5 | 00:00:33 (  3) |  0.10 |   8 | 60% |  250 |   5.83% |  2.61% |  400 |  7.64% |  300
	  6 | 00:00:36 (  2) |  0.01 |   4 | 95% |  500 |   1.85% | -1.18% |  350 |  0.78% |   75
	  7 | 00:00:39 (  3) |  0.01 |   8 | 95% |  500 |   5.15% |  1.39% |  500 |  2.62% |   25
	  8 | 00:00:47 (  8) |  0.30 |  32 | 95% |  500 |   5.11% |  6.16% |  250 |  4.39% |  400
	  9 | 00:00:50 (  3) |  0.30 |   8 | 60% | 1000 |   4.05% |  0.65% |   10 |  1.59% |  300
	 10 | 00:00:53 (

In [18]:
import pandas as pd
from pathlib import Path

# Adjust the path to match your current working directory
results_path = Path('~/Projects/Algothon-24')
lgb_metrics = []

with pd.HDFStore(results_path / 'tuning_lgb.h5') as store:
    for key in store.keys():
        if key.startswith('/metrics'):
            parts = key.split('/')
            lookahead, train_length, test_length = parts[2:5]
            
            s = store[key]
            s['lookahead'] = lookahead
            s['train_length'] = train_length
            s['test_length'] = test_length
            
            lgb_metrics.append(s)

lgb_metrics = pd.DataFrame(lgb_metrics)

# Define the columns as before
scope_params = ['lookahead', 'train_length', 'test_length']
daily_ic_metrics = ['daily_ic_mean', 'daily_ic_mean_n', 'daily_ic_median', 'daily_ic_median_n']
lgb_train_params = ['learning_rate', 'num_leaves', 'feature_fraction', 'min_data_in_leaf']

id_vars = scope_params + lgb_train_params + daily_ic_metrics

# Melt the DataFrame
lgb_metrics_melted = pd.melt(lgb_metrics, 
                             id_vars=id_vars, 
                             value_name='ic', 
                             var_name='boost_rounds').dropna()

# Convert to numeric
lgb_metrics_melted = lgb_metrics_melted.apply(pd.to_numeric, errors='ignore')

# Save to HDF5 in the same directory
lgb_metrics_melted.to_hdf(results_path / 'model_tuning.h5', 'lgb/metrics')

# Display info
lgb_metrics_melted.info()
lgb_metrics.groupby(scope_params).size()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4536 entries, 0 to 4535
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   lookahead          4536 non-null   int64  
 1   train_length       4536 non-null   int64  
 2   test_length        4536 non-null   int64  
 3   learning_rate      4536 non-null   float64
 4   num_leaves         4536 non-null   float64
 5   feature_fraction   4536 non-null   float64
 6   min_data_in_leaf   4536 non-null   float64
 7   daily_ic_mean      4536 non-null   float64
 8   daily_ic_mean_n    4536 non-null   float64
 9   daily_ic_median    4536 non-null   float64
 10  daily_ic_median_n  4536 non-null   float64
 11  boost_rounds       4536 non-null   object 
 12  ic                 4536 non-null   float64
dtypes: float64(9), int64(3), object(1)
memory usage: 460.8+ KB


lookahead  train_length  test_length
1          126           63             54
           756           63             54
21         126           63             54
           756           63             54
5          126           63             54
           756           63             54
dtype: int64

In [19]:
int_cols = ["lookahead", "train_length", "test_length", "boost_rounds"]

lgb_ic = []
with pd.HDFStore(results_path / "tuning_lgb.h5") as store:
    keys = [k[1:] for k in store.keys()]
    for key in keys:
        _, t, train_length, test_length = key.split("/")[:4]
        if key.startswith("daily_ic"):
            df = (
                store[key]
                .drop(["boosting", "objective", "verbose"], axis=1)
                .assign(lookahead=t, train_length=train_length, test_length=test_length)
            )
            lgb_ic.append(df)
    lgb_ic = pd.concat(lgb_ic).reset_index(drop=True)

# Print columns before melt operation
print("Columns before melt operation:", lgb_ic.columns.tolist())

id_vars = scope_params + lgb_train_params
lgb_ic = pd.melt(
    lgb_ic, id_vars=id_vars, value_name="ic_value", var_name="boost_rounds"
).dropna()

# Print columns after melt operation
print("Columns after melt operation:", lgb_ic.columns.tolist())

# Ensure only the specified integer columns are converted to integers
lgb_ic[int_cols] = lgb_ic[int_cols].astype(int)

# Display info to check data types
lgb_ic.info()

Columns before melt operation: [10, 25, 50, 75, 100, 150, 200, 250, 300, 350, 400, 450, 500, 'learning_rate', 'num_leaves', 'feature_fraction', 'min_data_in_leaf', 'lookahead', 'train_length', 'test_length']
Columns after melt operation: ['lookahead', 'train_length', 'test_length', 'learning_rate', 'num_leaves', 'feature_fraction', 'min_data_in_leaf', 'boost_rounds', 'ic_value']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75816 entries, 0 to 75815
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   lookahead         75816 non-null  int64  
 1   train_length      75816 non-null  int64  
 2   test_length       75816 non-null  int64  
 3   learning_rate     75816 non-null  float64
 4   num_leaves        75816 non-null  int64  
 5   feature_fraction  75816 non-null  float64
 6   min_data_in_leaf  75816 non-null  int64  
 7   boost_rounds      75816 non-null  int64  
 8   ic_value          75816 non-null  f

In [20]:
lgb_ic.to_hdf("model_tuning.h5", "lgb/ic")
lgb_daily_ic = (
    lgb_ic.groupby(id_vars[0:] + ["boost_rounds"])["ic_value"]
    .mean()
    .to_frame("ic")
    .reset_index()
)
lgb_daily_ic.to_hdf("model_tuning.h5", "lgb/daily_ic")
lgb_daily_ic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4212 entries, 0 to 4211
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   lookahead         4212 non-null   int64  
 1   train_length      4212 non-null   int64  
 2   test_length       4212 non-null   int64  
 3   learning_rate     4212 non-null   float64
 4   num_leaves        4212 non-null   int64  
 5   feature_fraction  4212 non-null   float64
 6   min_data_in_leaf  4212 non-null   int64  
 7   boost_rounds      4212 non-null   int64  
 8   ic                4212 non-null   float64
dtypes: float64(3), int64(6)
memory usage: 296.3 KB


In [21]:
lgb_ic = pd.read_hdf("model_tuning.h5", "lgb/ic")
lgb_daily_ic = pd.read_hdf("model_tuning.h5", "lgb/daily_ic")

In [22]:
group_cols = scope_params + lgb_train_params + ["boost_rounds"]
lgb_daily_ic.groupby("lookahead", group_keys=False).apply(lambda x: x.nlargest(3, "ic"))

Unnamed: 0,lookahead,train_length,test_length,learning_rate,num_leaves,feature_fraction,min_data_in_leaf,boost_rounds,ic
1181,1,756,63,0.1,128,0.6,250,450,0.106148
626,1,126,63,0.3,32,0.3,500,50,0.099861
1328,1,756,63,0.3,32,0.3,500,50,0.099861
2595,5,756,63,0.1,128,0.6,250,300,0.353721
2764,5,756,63,0.3,32,0.95,250,300,0.352499
2599,5,756,63,0.1,128,0.6,250,500,0.351763
3443,21,126,63,0.3,32,0.95,250,450,0.205537
4145,21,756,63,0.3,32,0.95,250,450,0.205537
3441,21,126,63,0.3,32,0.95,250,350,0.203173


In [23]:
lgb_metrics.groupby('lookahead', group_keys=False).apply(lambda x: x.nlargest(3, 'daily_ic_mean'))

Unnamed: 0,learning_rate,num_leaves,feature_fraction,min_data_in_leaf,t,daily_ic_mean,daily_ic_mean_n,daily_ic_median,daily_ic_median_n,10,...,200,250,300,350,400,450,500,lookahead,train_length,test_length
247,0.1,128.0,0.6,250.0,11.985325,0.106148,450.0,0.103941,100.0,0.048763,...,0.091598,0.100927,0.100756,0.097479,0.109097,0.115345,0.118527,1,756,63
228,0.3,32.0,0.3,500.0,10.209575,0.099861,50.0,0.11643,500.0,0.028331,...,0.049328,0.064375,0.044472,0.053361,0.05704,0.056675,0.067342,1,756,63
282,0.3,32.0,0.3,500.0,7.69412,0.099861,50.0,0.11643,500.0,0.028331,...,0.049328,0.064375,0.044472,0.053361,0.05704,0.056675,0.067342,1,126,63
119,0.3,32.0,0.95,250.0,7.543531,0.205537,450.0,0.229803,350.0,0.088488,...,0.211297,0.228528,0.224222,0.249544,0.248268,0.246807,0.247978,21,756,63
173,0.3,32.0,0.95,250.0,12.974158,0.205537,450.0,0.229803,350.0,0.088488,...,0.211297,0.228528,0.224222,0.249544,0.248268,0.246807,0.247978,21,126,63
122,0.3,128.0,0.6,250.0,12.098642,0.158676,400.0,0.212562,500.0,0.053863,...,0.16531,0.178621,0.181382,0.190168,0.189948,0.193071,0.197347,21,756,63
31,0.1,128.0,0.6,250.0,14.996168,0.353721,300.0,0.394335,500.0,0.145222,...,0.310533,0.318786,0.329287,0.328029,0.3288,0.329765,0.338409,5,756,63
8,0.3,32.0,0.95,250.0,12.161077,0.352499,300.0,0.35936,300.0,0.199265,...,0.317805,0.324196,0.329401,0.310951,0.317389,0.313458,0.318716,5,756,63
9,0.3,32.0,0.95,500.0,13.084522,0.346767,300.0,0.335961,250.0,0.164549,...,0.301521,0.31804,0.333633,0.33016,0.330193,0.331536,0.332182,5,756,63


In [24]:
def get_lgb_params(data, t=5, best=0):
    param_cols = scope_params[1:] + lgb_train_params + ["boost_rounds"]
    df = data[data.lookahead == t].sort_values("ic", ascending=False).iloc[best]
    return df.loc[param_cols]

best_params = get_lgb_params(lgb_daily_ic, t=1, best=1)

print(best_params)

train_length        126.0
test_length          63.0
learning_rate         0.3
num_leaves           32.0
feature_fraction      0.3
min_data_in_leaf    500.0
boost_rounds         50.0
Name: 626, dtype: float64


In [19]:
def get_feature_importance(model, importance_type="split"):
    fi = pd.Series(
        model.feature_importance(importance_type=importance_type),
        index=model.feature_name(),
    )
    return fi / fi.sum()


feature_importance = (
    get_feature_importance(lgb_model)
    .to_frame("Split")
    .join(get_feature_importance(lgb_model, "gain").to_frame("Gain"))
)
(
    feature_importance.nlargest(20, columns="Gain")
    .sort_values("Gain", ascending=False)
    .plot.bar(
        subplots=True, layout=(2, 1), figsize=(14, 6), legend=False, sharey=True, rot=0
    )
)
plt.suptitle("Normalized Importance (Top 20 Features)", fontsize=14)
plt.tight_layout()
plt.subplots_adjust(top=0.9)

NameError: name 'lgb_model' is not defined