In [1]:
from provenquant.core.feature_selection import backward_feature_elimination, calculate_mda_feature_importances, calculate_sfi_feature_importances, orthogonal_features
from provenquant.core.labeling import filtrate_tripple_label_barrier, get_tripple_label_barrier
from provenquant.utils import get_volatility
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_sample_weight
import numpy as np
import os
import pandas as pd
import talib as ta

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
current_dir = os.getcwd()
dataframe_path = os.path.join(current_dir, '..', '..', 'data', 'btc_usdt.feather')
dataframe = pd.read_feather(dataframe_path)

# Create features

In [3]:
# Price Change Features
dataframe['open_pct_change'] = dataframe['open'].pct_change()
dataframe['high_pct_change'] = dataframe['high'].pct_change()
dataframe['low_pct_change'] = dataframe['low'].pct_change()
dataframe['close_pct_change'] = dataframe['close'].pct_change()

# Trend Indicators
dataframe['sma_50'] = ta.SMA(dataframe['close'], timeperiod=50)
dataframe['sma_200'] = ta.SMA(dataframe['close'], timeperiod=200)

# Momentum Indicators
dataframe['rsi_14'] = ta.RSI(dataframe['close'], timeperiod=14)
dataframe['stoch_slowk'], dataframe['stoch_slowd'] = ta.STOCH(dataframe['high'], dataframe['low'], dataframe['close'], fastk_period=14, slowk_period=3, slowk_matype=0, slowd_period=3, slowd_matype=0)
dataframe['macd'], dataframe['macd_signal'], dataframe['macd_hist'] = ta.MACD(dataframe['close'], fastperiod=12, slowperiod=26, signalperiod=9)

# Volatility Indicators
dataframe['atr_14'] = ta.ATR(dataframe['high'], dataframe['low'], dataframe['close'], timeperiod=14)
dataframe['bb_upper'], dataframe['bb_middle'], dataframe['bb_lower'] = ta.BBANDS(dataframe['close'], timeperiod=20, nbdevup=2, nbdevdn=2, matype=0)
dataframe['bb_width'] = (dataframe['bb_upper'] - dataframe['bb_lower']) / dataframe['bb_middle']
dataframe['adx_14'] = ta.ADX(dataframe['high'], dataframe['low'], dataframe['close'], timeperiod=14)

dataframe = dataframe.dropna()
dataframe.reset_index(drop=True, inplace=True)

In [4]:
volatility = get_volatility(
    dataframe,
    window=100,
)
threshold = volatility.mean() * 10

filtered_dataframe = filtrate_tripple_label_barrier(
    dataframe,
    cusum_threshold=threshold,
    vertical_barrier=120,
    datetime_col='date'
)
filtered_dataframe

close_series = pd.Series(data=dataframe['close'].values, index=dataframe['date'])

labeled_dataframe = get_tripple_label_barrier(
    dataframe=filtered_dataframe,
    close_series=close_series,
    tp=0.01,
    sl=0.01,
)

labeled_dataframe

Unnamed: 0,t1,open,high,low,close,volume,open_pct_change,high_pct_change,low_pct_change,close_pct_change,...,macd_hist,atr_14,bb_upper,bb_middle,bb_lower,bb_width,adx_14,label,return,mapped_label
2025-12-01 04:13:00+00:00,2025-12-01 06:13:00+00:00,86145.16,86145.16,85962.14,86002.37,256.22965,0.000130,-0.000242,-0.001995,-0.001658,...,-19.010099,87.670905,86437.853729,86247.7495,86057.645271,0.004408,15.614450,0,0.002606,1
2025-12-01 06:01:00+00:00,2025-12-01 08:01:00+00:00,86063.10,86193.93,86063.10,86189.41,27.81431,0.000721,0.001521,0.001029,0.001469,...,18.618336,57.465032,86134.101949,85944.0370,85753.972051,0.004423,16.387390,0,0.003604,1
2025-12-01 07:43:00+00:00,2025-12-01 09:43:00+00:00,86458.01,86487.31,86458.01,86475.42,22.11699,0.000216,-0.000012,0.000216,0.000201,...,17.891708,59.613469,86492.831256,86306.8920,86120.952744,0.004309,24.948019,0,0.003869,1
2025-12-01 08:59:00+00:00,2025-12-01 10:59:00+00:00,86854.50,86890.77,86847.75,86869.51,26.93154,0.000338,0.000406,0.000423,0.000173,...,-2.269422,45.220887,86881.305173,86804.1060,86726.906827,0.001779,43.592535,0,-0.002553,1
2025-12-01 11:18:00+00:00,2025-12-01 13:18:00+00:00,86447.01,86466.89,86442.17,86445.44,17.55845,-0.000852,-0.000753,0.000014,-0.000018,...,-5.373153,37.579913,86684.328291,86537.8260,86391.323709,0.003386,19.109180,-1,-0.011316,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-12-30 16:18:00+00:00,2025-12-30 18:18:00+00:00,89133.39,89284.68,89117.38,89268.32,44.64159,-0.000149,0.001148,-0.000023,0.001514,...,12.112564,98.120295,89211.760042,89013.6555,88815.550958,0.004451,32.260257,-1,-0.010025,0
2025-12-30 16:35:00+00:00,2025-12-30 18:35:00+00:00,88960.99,89040.00,88852.56,88883.34,80.35831,-0.001213,-0.000532,-0.001219,-0.000873,...,-47.959445,102.781770,89413.549000,89182.2260,88950.903000,0.005188,24.350434,0,-0.005591,1
2025-12-30 17:45:00+00:00,2025-12-30 19:45:00+00:00,88533.43,88533.43,88333.34,88373.43,92.94989,0.000044,-0.000127,-0.002196,-0.001807,...,-16.937023,60.838834,88735.019439,88604.8085,88474.597561,0.002939,23.336862,0,-0.000938,1
2025-12-30 20:52:00+00:00,2025-12-30 22:52:00+00:00,88000.00,88002.49,87978.36,87978.36,5.42977,0.000090,-0.000110,-0.000155,-0.000246,...,-6.165715,29.383849,88172.325532,88064.0210,87955.716468,0.002460,25.686572,0,0.004338,1


In [5]:
weights = compute_sample_weight(
    class_weight='balanced',
    y=labeled_dataframe['mapped_label'].values,
)
labeled_dataframe['sample_weight'] = weights

In [6]:
features = [
    'open_pct_change',
    'high_pct_change',
    'low_pct_change',
    'close_pct_change',
    'sma_50',
    'sma_200',
    'rsi_14',
    'stoch_slowk',
    'stoch_slowd',
    'macd',
    'macd_signal',
    'macd_hist',
    'atr_14',
    'bb_upper',
    'bb_middle',
    'bb_lower',
    'bb_width',
    'adx_14'
]

# Backward Feature Elimination

In [7]:
labeled_dataframe.value_counts('mapped_label')

mapped_label
1    420
2    152
0    128
Name: count, dtype: int64

In [8]:
model = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    bootstrap=False,
    class_weight='balanced_subsample',
)

features = backward_feature_elimination(
    model=model,
    dataframe=labeled_dataframe,
    feature_cols=features,
    threshold=0.65,
    verbose=True,
)

features

Score 0.6782. Removed feature: sma_200
Score 0.6682. Removed feature: bb_upper
Stopping elimination. Score 0.5628 below threshold 0.6500.


['open_pct_change',
 'high_pct_change',
 'low_pct_change',
 'close_pct_change',
 'sma_50',
 'rsi_14',
 'stoch_slowk',
 'stoch_slowd',
 'macd',
 'macd_signal',
 'macd_hist',
 'atr_14',
 'bb_middle',
 'bb_lower',
 'bb_width',
 'adx_14']

# Feature Importance

## Mean Decrease in Accuracy

In [9]:
model = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    bootstrap=False,
    class_weight='balanced_subsample',
)

In [10]:
mda_feature_importances = calculate_mda_feature_importances(
    model=model,
    dataframe=labeled_dataframe,
    feature_cols=features,
    target_col='label',
    sample_weight_col='sample_weight',
    n_splits=5,
    purge=240,
    embargo=240,
    scoring='accuracy',
)
mda_feature_importances

Unnamed: 0,feature_importances,mean_score
open_pct_change,0.018694,0.367468
atr_14,0.014588,0.367468
bb_width,0.011547,0.367468
macd,0.008994,0.367468
macd_signal,0.007242,0.367468
adx_14,0.006505,0.367468
stoch_slowd,0.004932,0.367468
low_pct_change,0.004574,0.367468
sma_50,0.004172,0.367468
rsi_14,0.002209,0.367468


## Single Feature Importance (SFI)

In [11]:
model = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    bootstrap=False,
    class_weight='balanced_subsample',
)

sfi_feature_importances = calculate_sfi_feature_importances(
    model=model,
    dataframe=labeled_dataframe,
    feature_cols=features,
    target_col='label',
    sample_weight_col='sample_weight',
    n_splits=5,
    purge=240,
    embargo=240,
    scoring='accuracy',
    show_progress=True
)
sfi_feature_importances

Calculating SFI feature importance: 100%|██████████| 16/16 [00:32<00:00,  2.03s/it]


Unnamed: 0,mean,std
macd_hist,0.366525,0.016567
macd,0.357548,0.0188
atr_14,0.353281,0.028845
stoch_slowk,0.348659,0.020776
open_pct_change,0.345848,0.01027
close_pct_change,0.344558,0.023782
high_pct_change,0.335889,0.027685
bb_width,0.332595,0.034302
low_pct_change,0.329931,0.018233
sma_50,0.325413,0.01706


## Orthogonal Features

In [12]:
orth_features = orthogonal_features(
    df_X=dataframe[features],
    threshold=0.95
)

orth_features

array([[-1.74728097, -2.16854033, -1.22283719, ...,  0.33182675,
        -0.93875009, -0.03849986],
       [-2.14700592, -2.2050533 , -0.1652475 , ...,  0.63015389,
        -1.78751624,  0.31581862],
       [-0.08032335, -2.10529621, -2.80575629, ...,  0.74818463,
         0.76692171,  0.30851492],
       ...,
       [ 0.05146004, -0.51237212,  0.44656589, ...,  0.39693795,
        -0.14465962, -0.25068794],
       [-0.77940187, -0.56245734,  0.78100873, ...,  0.34419013,
         0.53982223,  0.06544772],
       [-1.84835439, -0.626989  ,  1.38625873, ...,  0.22158786,
         0.28601341,  0.33010299]], shape=(43002, 9))

In [13]:
orth_features.shape

(43002, 9)

# Select Features

In [14]:
mda_feature_importances_mean = mda_feature_importances['feature_importances'].mean()
sfi_feature_importances_mean = sfi_feature_importances['mean'].mean()

In [15]:
mda_features = mda_feature_importances[
    mda_feature_importances['feature_importances'] >= mda_feature_importances_mean
].index.tolist()

sfi_features = sfi_feature_importances[
    sfi_feature_importances['mean'] >= sfi_feature_importances_mean
].index.tolist()

In [16]:
# Collect only features that are selected by all three methods
final_features = []

for i in range(len(features)):
    if features[i] in mda_features and features[i] in sfi_features:
        final_features.append(features[i])
print('Final Features:')
for feature in final_features:
    print(f'  - {feature}')

Final Features:
  - open_pct_change
  - macd
  - atr_14
  - bb_width


# Re-calculate Feature Importance

## Mean Decrease in Accuracy

In [17]:
model = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    bootstrap=False,
    class_weight='balanced_subsample',
)

In [18]:
mda_feature_importances = calculate_mda_feature_importances(
    model=model,
    dataframe=labeled_dataframe,
    feature_cols=final_features,
    target_col='mapped_label',
    sample_weight_col='sample_weight',
    n_splits=5,
    purge=240,
    embargo=240,
    scoring='accuracy',
)
mda_feature_importances

Unnamed: 0,feature_importances,mean_score
bb_width,-0.008758,0.326574
atr_14,-0.014431,0.326574
open_pct_change,-0.016116,0.326574
macd,-0.027877,0.326574


## Single Feature Importance (SFI)

In [19]:
model = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    bootstrap=False,
    class_weight='balanced_subsample',
)

sfi_feature_importances = calculate_sfi_feature_importances(
    model=model,
    dataframe=labeled_dataframe,
    feature_cols=final_features,
    target_col='label',
    sample_weight_col='sample_weight',
    n_splits=5,
    purge=240,
    embargo=240,
    scoring='accuracy',
    show_progress=True
)
sfi_feature_importances

Calculating SFI feature importance: 100%|██████████| 4/4 [00:08<00:00,  2.01s/it]


Unnamed: 0,mean,std
macd,0.357548,0.0188
atr_14,0.353281,0.028845
open_pct_change,0.345848,0.01027
bb_width,0.332595,0.034302
