In [1]:
from provenquant.utils.merge import match_merge_series
from provenquant.core.feature_selection import feature_importance_mda, feature_importance_sfi, orthogonal_features
from provenquant.core.labeling import get_horizontal_barrier_events, add_vertical_barrier_to_horizontal_barrier_events, get_binary_labels, get_triple_barrier_labels
import os
import pandas as pd
import talib as ta
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Resolve `btc_usdt.feather` relative to the kernel's working directory
# (two directories up, inside `data/`) and read it into a DataFrame.
current_dir = os.getcwd()
dataframe_path = os.path.join(
    current_dir,
    '..',
    '..',
    'data',
    'btc_usdt.feather',
)
dataframe = pd.read_feather(dataframe_path)

# Create features

In [3]:
# Price Change Features: bar-over-bar percentage change of each OHLC column.
dataframe['open_pct_change'] = dataframe['open'].pct_change()
dataframe['high_pct_change'] = dataframe['high'].pct_change()
dataframe['low_pct_change'] = dataframe['low'].pct_change()
dataframe['close_pct_change'] = dataframe['close'].pct_change()

# Trend Indicators: simple moving averages of the close over 50 and 200 bars.
dataframe['sma_50'] = ta.SMA(dataframe['close'], timeperiod=50)
dataframe['sma_200'] = ta.SMA(dataframe['close'], timeperiod=200)

# Momentum Indicators: RSI, slow stochastic (%K / %D) and MACD (line, signal, histogram).
dataframe['rsi_14'] = ta.RSI(dataframe['close'], timeperiod=14)
dataframe['stoch_slowk'], dataframe['stoch_slowd'] = ta.STOCH(
    dataframe['high'],
    dataframe['low'],
    dataframe['close'],
    fastk_period=14,
    slowk_period=3,
    slowk_matype=0,
    slowd_period=3,
    slowd_matype=0,
)
dataframe['macd'], dataframe['macd_signal'], dataframe['macd_hist'] = ta.MACD(
    dataframe['close'],
    fastperiod=12,
    slowperiod=26,
    signalperiod=9,
)

# Volatility Indicators: ATR, Bollinger Bands (plus their relative width) and ADX.
dataframe['atr_14'] = ta.ATR(
    dataframe['high'],
    dataframe['low'],
    dataframe['close'],
    timeperiod=14,
)
dataframe['bb_upper'], dataframe['bb_middle'], dataframe['bb_lower'] = ta.BBANDS(
    dataframe['close'],
    timeperiod=20,
    nbdevup=2,
    nbdevdn=2,
    matype=0,
)
# Band width is expressed relative to the middle band.
dataframe['bb_width'] = (dataframe['bb_upper'] - dataframe['bb_lower']) / dataframe['bb_middle']
dataframe['adx_14'] = ta.ADX(
    dataframe['high'],
    dataframe['low'],
    dataframe['close'],
    timeperiod=14,
)

# Drop every row that has a missing value in any column (this includes the
# first row, where `pct_change` is undefined), then renumber the rows from 0.
dataframe = dataframe.dropna().reset_index(drop=True)

In [4]:
# Step 1: build the horizontal-barrier events from the feature frame
# (threshold 0.005, timestamps taken from the `date` column).
events = get_horizontal_barrier_events(dataframe, threshold=0.005, datetime_col='date')

# Step 2: extend those events with a vertical barrier of 4 hours.
events = add_vertical_barrier_to_horizontal_barrier_events(
    dataframe, events, vertical_barrier_duration=pd.Timedelta('4h'), datetime_col='date'
)

# Step 3: turn the events into binary labels for the long side.
# NOTE(review): `min_ret=0.001` is presumably the minimum return for a
# positive label; confirm against `get_binary_labels`.
binary_labels = get_binary_labels(events, min_ret=0.001, side='long')

In [5]:
# Merge the binary labels into the feature frame as a `label` column
# (the helper is told to use `date` as the frame's datetime column).
dataframe = match_merge_series(
    from_series=binary_labels, to_df=dataframe, to_datetime_col='date', series_name='label'
)

# Keep only the rows that received a label and renumber them from 0.
dataframe = dataframe.dropna(subset=['label']).reset_index(drop=True)
dataframe

Unnamed: 0,date,open,high,low,close,volume,open_pct_change,high_pct_change,low_pct_change,close_pct_change,...,macd,macd_signal,macd_hist,atr_14,bb_upper,bb_middle,bb_lower,bb_width,adx_14,label
0,2025-12-01 04:13:00+00:00,86145.16,86145.16,85962.14,86002.37,256.22965,0.000130,-0.000242,-0.001995,-0.001658,...,-59.538386,-40.528287,-19.010099,87.670905,86437.853729,86247.7495,86057.645271,0.004408,15.614450,1.0
1,2025-12-01 06:01:00+00:00,86063.10,86193.93,86063.10,86189.41,27.81431,0.000721,0.001521,0.001029,0.001469,...,59.927903,41.309566,18.618336,57.465032,86134.101949,85944.0370,85753.972051,0.004423,16.387390,1.0
2,2025-12-01 07:38:00+00:00,86339.68,86470.60,86339.67,86450.00,48.80094,0.000074,0.001451,0.000150,0.001278,...,51.994115,30.773165,21.220950,56.691639,86420.049763,86228.1550,86036.260237,0.004451,18.994451,0.0
3,2025-12-01 08:42:00+00:00,86727.00,86767.43,86717.84,86767.43,6.95080,-0.000267,0.000199,0.000091,0.000466,...,55.798040,39.955455,15.842585,54.345392,86786.531890,86613.7230,86440.914110,0.003990,28.449032,0.0
4,2025-12-01 11:15:00+00:00,86487.27,86496.00,86458.36,86474.07,19.35051,0.000014,-0.000046,-0.000320,-0.000153,...,-42.172615,-34.382425,-7.790190,32.346868,86713.886718,86564.1250,86414.363282,0.003460,20.138364,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
779,2025-12-30 16:35:00+00:00,88960.99,89040.00,88852.56,88883.34,80.35831,-0.001213,-0.000532,-0.001219,-0.000873,...,-9.207401,38.752044,-47.959445,102.781770,89413.549000,89182.2260,88950.903000,0.005188,24.350434,0.0
780,2025-12-30 17:45:00+00:00,88533.43,88533.43,88333.34,88373.43,92.94989,0.000044,-0.000127,-0.002196,-0.001807,...,-47.327331,-30.390308,-16.937023,60.838834,88735.019439,88604.8085,88474.597561,0.002939,23.336862,0.0
781,2025-12-30 19:08:00+00:00,88022.11,88075.79,88007.27,88007.27,4.80480,-0.000139,0.000216,0.000269,-0.000169,...,-67.219082,-41.805578,-25.413504,60.858211,88401.039408,88212.4010,88023.762592,0.004277,28.385909,1.0
782,2025-12-30 20:53:00+00:00,87978.37,87978.38,87920.00,87920.01,31.21459,-0.000246,-0.000274,-0.000663,-0.000663,...,-42.039017,-32.503493,-9.535524,31.455002,88173.785657,88053.4025,87933.019343,0.002734,27.365536,1.0


In [6]:
# Candidate model inputs: every engineered column, grouped by indicator family.
features = [
    # Price changes
    'open_pct_change', 'high_pct_change', 'low_pct_change', 'close_pct_change',
    # Trend
    'sma_50', 'sma_200',
    # Momentum
    'rsi_14', 'stoch_slowk', 'stoch_slowd', 'macd', 'macd_signal', 'macd_hist',
    # Volatility
    'atr_14', 'bb_upper', 'bb_middle', 'bb_lower', 'bb_width', 'adx_14',
]

# Feature Importance

## Mean Decrease in Accuracy (MDA)

In [7]:
# Random forest used as the estimator for the MDA importance run.
# `random_state` is pinned so that Restart & Run All reproduces the same
# importances: even with `bootstrap=False`, each split still draws a random
# subset of features, so an unseeded forest gives a different ranking per run.
model = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    bootstrap=False,
    class_weight='balanced_subsample',
    random_state=42,
)

# NumPy views of the feature matrix and the target vector.
# NOTE(review): the importance helpers in this notebook take `dataframe`
# plus column names, so `X` / `y` look unused here; kept for compatibility.
X = dataframe[features].values
y = dataframe['label'].values

In [8]:
# Compute MDA importances for all candidate features with 5 splits, scored
# by accuracy, and rank them from most to least important.
# NOTE(review): `purge` and `embargo` are both 240 while the labelled frame
# has only a few hundred rows; confirm whether the helper expects these in
# rows or in minutes (the vertical barrier above is 4h).
mda_feature_importances = feature_importance_mda(
    model=model, dataframe=dataframe, feature_cols=features, target_col='label',
    n_splits=5, purge=240, embargo=240, scoring='accuracy',
).sort_values(by='feature_importances', ascending=False)
mda_feature_importances

Unnamed: 0,feature_importances,mean_score
sma_200,0.010207,0.450212
low_pct_change,0.003822,0.450212
rsi_14,0.003822,0.450212
bb_upper,0.003813,0.450212
high_pct_change,0.001274,0.450212
open_pct_change,0.001266,0.450212
bb_middle,8e-06,0.450212
sma_50,8e-06,0.450212
bb_width,-0.001258,0.450212
stoch_slowk,-0.001266,0.450212


## Single Feature Importance (SFI)

In [9]:
# A new, unfitted forest for the SFI run (same hyper-parameters as the MDA
# model). `random_state` is pinned so repeated runs give identical scores,
# whatever the helper does internally.
model = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    bootstrap=False,
    class_weight='balanced_subsample',
    random_state=42,
)

# Score the candidate features with the SFI helper using the same
# cross-validation settings as the MDA run (5 splits, purge/embargo of 240).
sfi_feature_importances = feature_importance_sfi(
    model=model,
    dataframe=dataframe,
    feature_cols=features,
    target_col='label',
    n_splits=5,
    purge=240,
    embargo=240,
    scoring='accuracy'
)
# Rank by mean score, best first.
sfi_feature_importances = sfi_feature_importances.sort_values(by='mean', ascending=False)
sfi_feature_importances

Unnamed: 0,mean,std
open_pct_change,0.52551,0.008546
rsi_14,0.520439,0.010593
macd_signal,0.519157,0.017384
stoch_slowd,0.515327,0.018868
bb_upper,0.5012,0.023997
low_pct_change,0.498751,0.014961
bb_lower,0.496105,0.02771
close_pct_change,0.494921,0.020217
stoch_slowk,0.493622,0.012802
macd,0.4923,0.011767


## Orthogonal Features

In [10]:
# Transform the candidate features into orthogonal components.
# NOTE(review): `threshold=0.95` is presumably the share of variance to
# retain; confirm against `orthogonal_features`.
orth_features = orthogonal_features(df_X=dataframe[features], threshold=0.95)
orth_features

array([[-1.03940065,  3.0586876 , -1.16193994, ..., -0.30571228,
         0.44917189, -0.55394899],
       [ 3.56923571,  2.00309867, -1.17449346, ...,  0.32619264,
         0.06962357,  0.10754384],
       [ 3.07973001,  1.92149443, -1.18054292, ...,  0.77853249,
        -0.3132772 , -0.22525806],
       ...,
       [-1.14048144,  1.24703503, -0.69514533, ..., -0.842236  ,
        -0.3579653 ,  0.27935987],
       [-1.42384108,  1.50436276, -1.39480062, ..., -0.74738507,
        -0.29237656, -0.04774114],
       [ 1.97242472,  0.30204905, -0.83932196, ...,  0.74301661,
         0.05972815, -0.22644481]], shape=(784, 8))

In [11]:
# Show the (rows, columns) shape of the orthogonal feature array.
orth_features.shape

(784, 8)

# Select Features

In [12]:
# Selection thresholds: the average importance / score across all candidate
# features, computed separately for each method.
mda_feature_importances_mean = mda_feature_importances.loc[:, 'feature_importances'].mean()
sfi_feature_importances_mean = sfi_feature_importances.loc[:, 'mean'].mean()

In [13]:
# Features whose MDA importance is at or above the MDA average.
mda_features = mda_feature_importances.loc[
    mda_feature_importances['feature_importances'].ge(mda_feature_importances_mean)
].index.tolist()

# Features whose mean SFI score is at or above the SFI average.
sfi_features = sfi_feature_importances.loc[
    sfi_feature_importances['mean'].ge(sfi_feature_importances_mean)
].index.tolist()

In [14]:
# Keep only the features selected by BOTH importance methods (MDA and SFI),
# in the same order as they appear in `features`. The orthogonal-features
# result is not part of this intersection.
final_features = [
    feature
    for feature in features
    if feature in mda_features and feature in sfi_features
]

print('Final Features:')
for feature in final_features:
    print(f'  - {feature}')

Final Features:
  - open_pct_change
  - low_pct_change
  - rsi_14
  - bb_upper


# Re-calculate Feature Importance

## Mean Decrease in Accuracy (MDA)

In [15]:
# New, unfitted forest for the MDA re-run on the reduced feature set.
# `random_state` is pinned so the re-computed importances are reproducible
# across kernel restarts (an unseeded forest draws different feature
# subsets at each split on every run).
model = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    bootstrap=False,
    class_weight='balanced_subsample',
    random_state=42,
)

# NumPy views of the reduced feature matrix and the target vector.
# NOTE(review): the importance helpers take `dataframe` plus column names,
# so `X` / `y` look unused here; kept for compatibility.
X = dataframe[final_features].values
y = dataframe['label'].values

In [16]:
# Re-run MDA restricted to `final_features` and rank the result, best first.
# This rebinds `mda_feature_importances`, replacing the table computed on
# the full candidate list, so re-running the selection cells after this
# point would use these values instead.
mda_feature_importances = feature_importance_mda(
    model=model, dataframe=dataframe, feature_cols=final_features, target_col='label',
    n_splits=5, purge=240, embargo=240, scoring='accuracy',
).sort_values(by='feature_importances', ascending=False)
mda_feature_importances

Unnamed: 0,feature_importances,mean_score
bb_upper,0.006353,0.484575
rsi_14,0.001233,0.484575
low_pct_change,-0.006435,0.484575
open_pct_change,-0.011522,0.484575


## Single Feature Importance (SFI)

In [17]:
# New, unfitted forest for the SFI re-run on the reduced feature set.
# `random_state` is pinned so repeated runs give identical scores, whatever
# the helper does internally.
model = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    bootstrap=False,
    class_weight='balanced_subsample',
    random_state=42,
)

# Re-score only `final_features` with the same cross-validation settings.
# This rebinds `sfi_feature_importances`, replacing the table computed on
# the full candidate list.
sfi_feature_importances = feature_importance_sfi(
    model=model,
    dataframe=dataframe,
    feature_cols=final_features,
    target_col='label',
    n_splits=5,
    purge=240,
    embargo=240,
    scoring='accuracy'
)
# Rank by mean score, best first.
sfi_feature_importances = sfi_feature_importances.sort_values(by='mean', ascending=False)
sfi_feature_importances

Unnamed: 0,mean,std
open_pct_change,0.52551,0.008546
rsi_14,0.520439,0.010593
bb_upper,0.5012,0.023997
low_pct_change,0.498751,0.014961
