In [1]:
from provenquant.core.feature_selection import backward_feature_elimination, calculate_mda_feature_importances, calculate_sfi_feature_importances, orthogonal_features
from provenquant.core.labeling import filtrate_tripple_label_barrier, get_tripple_label_barrier
from provenquant.utils import get_volatility
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_sample_weight
import numpy as np
import os
import pandas as pd
import talib as ta

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
current_dir = os.getcwd()
dataframe_path = os.path.join(current_dir, '..', '..', 'data', 'btc_usdt.feather')
dataframe = pd.read_feather(dataframe_path)

# Create features

In [3]:
# Price Change Features
dataframe['open_pct_change'] = dataframe['open'].pct_change()
dataframe['high_pct_change'] = dataframe['high'].pct_change()
dataframe['low_pct_change'] = dataframe['low'].pct_change()
dataframe['close_pct_change'] = dataframe['close'].pct_change()

# Trend Indicators
dataframe['sma_50'] = ta.SMA(dataframe['close'], timeperiod=50)
dataframe['sma_200'] = ta.SMA(dataframe['close'], timeperiod=200)

# Momentum Indicators
dataframe['rsi_14'] = ta.RSI(dataframe['close'], timeperiod=14)
dataframe['stoch_slowk'], dataframe['stoch_slowd'] = ta.STOCH(dataframe['high'], dataframe['low'], dataframe['close'], fastk_period=14, slowk_period=3, slowk_matype=0, slowd_period=3, slowd_matype=0)
dataframe['macd'], dataframe['macd_signal'], dataframe['macd_hist'] = ta.MACD(dataframe['close'], fastperiod=12, slowperiod=26, signalperiod=9)

# Volatility Indicators
dataframe['atr_14'] = ta.ATR(dataframe['high'], dataframe['low'], dataframe['close'], timeperiod=14)
dataframe['bb_upper'], dataframe['bb_middle'], dataframe['bb_lower'] = ta.BBANDS(dataframe['close'], timeperiod=20, nbdevup=2, nbdevdn=2, matype=0)
dataframe['bb_width'] = (dataframe['bb_upper'] - dataframe['bb_lower']) / dataframe['bb_middle']
dataframe['adx_14'] = ta.ADX(dataframe['high'], dataframe['low'], dataframe['close'], timeperiod=14)

dataframe = dataframe.dropna()
dataframe.reset_index(drop=True, inplace=True)

In [4]:
volatility = get_volatility(
    dataframe,
    window=100,
)
threshold = volatility.mean() * 10

filtered_dataframe = filtrate_tripple_label_barrier(
    dataframe,
    cusum_threshold=threshold,
    vertical_barrier=120,
    datetime_col='date'
)
filtered_dataframe

close_series = pd.Series(data=dataframe['close'].values, index=dataframe['date'])

labeled_dataframe = get_tripple_label_barrier(
    dataframe=filtered_dataframe,
    close_series=close_series,
    tp=0.01,
    sl=0.01,
)

labeled_dataframe

Unnamed: 0,t1,open,high,low,close,volume,open_pct_change,high_pct_change,low_pct_change,close_pct_change,...,bb_upper,bb_middle,bb_lower,bb_width,adx_14,label,return,max_return,min_return,mapped_label
2025-12-01 04:13:00+00:00,2025-12-01 06:13:00+00:00,86145.16,86145.16,85962.14,86002.37,256.22965,0.000130,-0.000242,-0.001995,-0.001658,...,86437.853729,86247.7495,86057.645271,0.004408,15.614450,0,0.002606,0.002606,-0.003899,1
2025-12-01 06:01:00+00:00,2025-12-01 08:01:00+00:00,86063.10,86193.93,86063.10,86189.41,27.81431,0.000721,0.001521,0.001029,0.001469,...,86134.101949,85944.0370,85753.972051,0.004423,16.387390,0,0.003604,0.004320,-0.002197,1
2025-12-01 07:43:00+00:00,2025-12-01 09:43:00+00:00,86458.01,86487.31,86458.01,86475.42,22.11699,0.000216,-0.000012,0.000216,0.000201,...,86492.831256,86306.8920,86120.952744,0.004309,24.948019,0,0.003869,0.005059,-0.000867,1
2025-12-01 08:59:00+00:00,2025-12-01 10:59:00+00:00,86854.50,86890.77,86847.75,86869.51,26.93154,0.000338,0.000406,0.000423,0.000173,...,86881.305173,86804.1060,86726.906827,0.001779,43.592535,0,-0.002553,0.000500,-0.003885,1
2025-12-01 11:18:00+00:00,2025-12-01 13:18:00+00:00,86447.01,86466.89,86442.17,86445.44,17.55845,-0.000852,-0.000753,0.000014,-0.000018,...,86684.328291,86537.8260,86391.323709,0.003386,19.109180,-1,-0.011316,0.000518,-0.018795,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-12-30 16:18:00+00:00,2025-12-30 18:18:00+00:00,89133.39,89284.68,89117.38,89268.32,44.64159,-0.000149,0.001148,-0.000023,0.001514,...,89211.760042,89013.6555,88815.550958,0.004451,32.260257,-1,-0.010025,0.001374,-0.011927,0
2025-12-30 16:35:00+00:00,2025-12-30 18:35:00+00:00,88960.99,89040.00,88852.56,88883.34,80.35831,-0.001213,-0.000532,-0.001219,-0.000873,...,89413.549000,89182.2260,88950.903000,0.005188,24.350434,0,-0.005591,0.000073,-0.007647,1
2025-12-30 17:45:00+00:00,2025-12-30 19:45:00+00:00,88533.43,88533.43,88333.34,88373.43,92.94989,0.000044,-0.000127,-0.002196,-0.001807,...,88735.019439,88604.8085,88474.597561,0.002939,23.336862,0,-0.000938,0.000927,-0.004143,1
2025-12-30 20:52:00+00:00,2025-12-30 22:52:00+00:00,88000.00,88002.49,87978.36,87978.36,5.42977,0.000090,-0.000110,-0.000155,-0.000246,...,88172.325532,88064.0210,87955.716468,0.002460,25.686572,0,0.004338,0.005588,-0.001014,1


In [5]:
weights = compute_sample_weight(
    class_weight='balanced',
    y=labeled_dataframe['mapped_label'].values,
)
labeled_dataframe['sample_weight'] = weights

In [6]:
features = [
    'open_pct_change',
    'high_pct_change',
    'low_pct_change',
    'close_pct_change',
    'sma_50',
    'sma_200',
    'rsi_14',
    'stoch_slowk',
    'stoch_slowd',
    'macd',
    'macd_signal',
    'macd_hist',
    'atr_14',
    'bb_upper',
    'bb_middle',
    'bb_lower',
    'bb_width',
    'adx_14'
]

# Backward Feature Elimination

In [7]:
labeled_dataframe.value_counts('mapped_label')

mapped_label
1    420
2    152
0    128
Name: count, dtype: int64

In [8]:
model = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    bootstrap=False,
    class_weight='balanced_subsample',
)

features = backward_feature_elimination(
    model=model,
    dataframe=labeled_dataframe,
    feature_cols=features,
    threshold=0.65,
    verbose=True,
)

features

Score 0.7424. Removed feature: sma_200
Stopping elimination. Score 0.6125 below threshold 0.6500.


['open_pct_change',
 'high_pct_change',
 'low_pct_change',
 'close_pct_change',
 'sma_50',
 'rsi_14',
 'stoch_slowk',
 'stoch_slowd',
 'macd',
 'macd_signal',
 'macd_hist',
 'atr_14',
 'bb_upper',
 'bb_middle',
 'bb_lower',
 'bb_width',
 'adx_14']

# Feature Importance

## Mean Decrease in Accuracy

In [9]:
model = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    bootstrap=False,
    class_weight='balanced_subsample',
)

In [10]:
mda_feature_importances = calculate_mda_feature_importances(
    model=model,
    dataframe=labeled_dataframe,
    feature_cols=features,
    target_col='label',
    sample_weight_col='sample_weight',
    n_splits=20,
    purge=240,
    embargo=240,
    scoring='accuracy',
)
mda_feature_importances

Unnamed: 0,feature_importance,std,feature_sharpe,pos_ratio
bb_width,0.022846,0.011861,1.926046,0.4
open_pct_change,0.018057,0.00633,2.852709,0.45
stoch_slowd,0.014325,0.005654,2.533418,0.4
macd,0.010688,0.005046,2.118071,0.35
stoch_slowk,0.010154,0.006595,1.539716,0.4
low_pct_change,0.009647,0.006778,1.423346,0.35
high_pct_change,0.009457,0.006122,1.544586,0.3
macd_hist,0.008232,0.005071,1.623287,0.4
macd_signal,0.007948,0.008465,0.938925,0.4
bb_lower,0.006299,0.004698,1.340855,0.2


## Single Feature Importance (SFI)

In [11]:
model = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    bootstrap=False,
    class_weight='balanced_subsample',
)

sfi_feature_importances = calculate_sfi_feature_importances(
    model=model,
    dataframe=labeled_dataframe,
    feature_cols=features,
    target_col='label',
    sample_weight_col='sample_weight',
    n_splits=5,
    purge=240,
    embargo=240,
    scoring='accuracy',
    show_progress=True
)
sfi_feature_importances

Calculating SFI feature importance: 100%|██████████| 17/17 [00:33<00:00,  1.95s/it]


Unnamed: 0,mean,std
macd_hist,0.366525,0.016567
macd,0.357548,0.0188
atr_14,0.353281,0.028845
stoch_slowk,0.348659,0.020776
open_pct_change,0.345848,0.01027
close_pct_change,0.344558,0.023782
high_pct_change,0.335889,0.027685
bb_width,0.332595,0.034302
low_pct_change,0.329931,0.018233
sma_50,0.325413,0.01706


## Orthogonal Features

In [12]:
orth_features = orthogonal_features(
    df_X=dataframe[features],
    threshold=0.95
)

orth_features

array([[ 0.94612198,  2.84612653,  1.25045433, ..., -0.31765089,
         0.93881088,  0.03850242],
       [ 1.31223542,  3.01387542,  0.19324437, ..., -0.62039552,
         1.78757752, -0.31581611],
       [-0.62912153,  2.29612621,  2.83260638, ..., -0.73774109,
        -0.76685941, -0.30851215],
       ...,
       [-0.23067082,  0.5656359 , -0.44076602, ..., -0.3934236 ,
         0.14468785,  0.2506892 ],
       [ 0.54924576,  0.85677809, -0.77484408, ..., -0.3400753 ,
        -0.53979393, -0.06544639],
       [ 1.55265517,  1.23155523, -1.37949664, ..., -0.21746054,
        -0.28598656, -0.33010175]], shape=(43002, 9))

In [13]:
orth_features.shape

(43002, 9)

# Select Features

In [14]:
mda_feature_importances_mean = mda_feature_importances['feature_importances'].mean()
sfi_feature_importances_mean = sfi_feature_importances['mean'].mean()

KeyError: 'feature_importances'

In [None]:
mda_features = mda_feature_importances[
    mda_feature_importances['feature_importances'] >= mda_feature_importances_mean
].index.tolist()

sfi_features = sfi_feature_importances[
    sfi_feature_importances['mean'] >= sfi_feature_importances_mean
].index.tolist()

In [None]:
# Collect only features that are selected by all three methods
final_features = []

for i in range(len(features)):
    if features[i] in mda_features and features[i] in sfi_features:
        final_features.append(features[i])
print('Final Features:')
for feature in final_features:
    print(f'  - {feature}')

# Re-calculate Feature Importance

## Mean Decrease in Accuracy

In [None]:
model = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    bootstrap=False,
    class_weight='balanced_subsample',
)

In [None]:
mda_feature_importances = calculate_mda_feature_importances(
    model=model,
    dataframe=labeled_dataframe,
    feature_cols=final_features,
    target_col='mapped_label',
    sample_weight_col='sample_weight',
    n_splits=5,
    purge=240,
    embargo=240,
    scoring='accuracy',
)
mda_feature_importances

## Single Feature Importance (SFI)

In [None]:
model = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    bootstrap=False,
    class_weight='balanced_subsample',
)

sfi_feature_importances = calculate_sfi_feature_importances(
    model=model,
    dataframe=labeled_dataframe,
    feature_cols=final_features,
    target_col='label',
    sample_weight_col='sample_weight',
    n_splits=5,
    purge=240,
    embargo=240,
    scoring='accuracy',
    show_progress=True
)
sfi_feature_importances