In [1]:
from provenquant.core.feature_selection import calculate_mda_feature_importances, calculate_sfi_feature_importances, orthogonal_features
from provenquant.core.labeling import filtrate_tripple_label_barrier, get_tripple_label_barrier
from provenquant.utils import get_volatility
from sklearn.ensemble import RandomForestClassifier
import os
import pandas as pd
import talib as ta

In [2]:
current_dir = os.getcwd()
dataframe_path = os.path.join(current_dir, '..', '..', 'data', 'btc_usdt.feather')
dataframe = pd.read_feather(dataframe_path)

# Create features

In [3]:
# Price Change Features
dataframe['open_pct_change'] = dataframe['open'].pct_change()
dataframe['high_pct_change'] = dataframe['high'].pct_change()
dataframe['low_pct_change'] = dataframe['low'].pct_change()
dataframe['close_pct_change'] = dataframe['close'].pct_change()

# Trend Indicators
dataframe['sma_50'] = ta.SMA(dataframe['close'], timeperiod=50)
dataframe['sma_200'] = ta.SMA(dataframe['close'], timeperiod=200)

# Momentum Indicators
dataframe['rsi_14'] = ta.RSI(dataframe['close'], timeperiod=14)
dataframe['stoch_slowk'], dataframe['stoch_slowd'] = ta.STOCH(dataframe['high'], dataframe['low'], dataframe['close'], fastk_period=14, slowk_period=3, slowk_matype=0, slowd_period=3, slowd_matype=0)
dataframe['macd'], dataframe['macd_signal'], dataframe['macd_hist'] = ta.MACD(dataframe['close'], fastperiod=12, slowperiod=26, signalperiod=9)

# Volatility Indicators
dataframe['atr_14'] = ta.ATR(dataframe['high'], dataframe['low'], dataframe['close'], timeperiod=14)
dataframe['bb_upper'], dataframe['bb_middle'], dataframe['bb_lower'] = ta.BBANDS(dataframe['close'], timeperiod=20, nbdevup=2, nbdevdn=2, matype=0)
dataframe['bb_width'] = (dataframe['bb_upper'] - dataframe['bb_lower']) / dataframe['bb_middle']
dataframe['adx_14'] = ta.ADX(dataframe['high'], dataframe['low'], dataframe['close'], timeperiod=14)

dataframe = dataframe.dropna()
dataframe.reset_index(drop=True, inplace=True)

In [4]:
volatility = get_volatility(
    dataframe,
    window=100,
)
threshold = volatility.mean() * 10

filtered_dataframe = filtrate_tripple_label_barrier(
    dataframe,
    cusum_threshold=threshold,
    vertical_barrier=20,
    datetime_col='date'
)
filtered_dataframe

close_series = pd.Series(data=dataframe['close'].values, index=dataframe['date'])

labeled_dataframe = get_tripple_label_barrier(
    dataframe=filtered_dataframe,
    close_series=close_series,
    threshold=threshold,
    pt=2,
    sl=1,
)

labeled_dataframe

Unnamed: 0,t1,open,high,low,close,volume,open_pct_change,high_pct_change,low_pct_change,close_pct_change,...,macd_signal,macd_hist,atr_14,bb_upper,bb_middle,bb_lower,bb_width,adx_14,label,return
2025-12-01 04:13:00+00:00,2025-12-01 04:33:00+00:00,86145.16,86145.16,85962.14,86002.37,256.22965,0.000130,-0.000242,-0.001995,-0.001658,...,-40.528287,-19.010099,87.670905,86437.853729,86247.7495,86057.645271,0.004408,15.614450,0,-0.001953
2025-12-01 06:01:00+00:00,2025-12-01 06:21:00+00:00,86063.10,86193.93,86063.10,86189.41,27.81431,0.000721,0.001521,0.001029,0.001469,...,41.309566,18.618336,57.465032,86134.101949,85944.0370,85753.972051,0.004423,16.387390,0,-0.001913
2025-12-01 07:43:00+00:00,2025-12-01 08:03:00+00:00,86458.01,86487.31,86458.01,86475.42,22.11699,0.000216,-0.000012,0.000216,0.000201,...,52.037300,17.891708,59.613469,86492.831256,86306.8920,86120.952744,0.004309,24.948019,0,-0.000294
2025-12-01 08:59:00+00:00,2025-12-01 09:19:00+00:00,86854.50,86890.77,86847.75,86869.51,26.93154,0.000338,0.000406,0.000423,0.000173,...,58.512013,-2.269422,45.220887,86881.305173,86804.1060,86726.906827,0.001779,43.592535,0,-0.001721
2025-12-01 11:18:00+00:00,2025-12-01 11:38:00+00:00,86447.01,86466.89,86442.17,86445.44,17.55845,-0.000852,-0.000753,0.000014,-0.000018,...,-37.821716,-5.373153,37.579913,86684.328291,86537.8260,86391.323709,0.003386,19.109180,0,-0.000335
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-12-30 16:18:00+00:00,2025-12-30 16:38:00+00:00,89133.39,89284.68,89117.38,89268.32,44.64159,-0.000149,0.001148,-0.000023,0.001514,...,61.712495,12.112564,98.120295,89211.760042,89013.6555,88815.550958,0.004451,32.260257,0,-0.004433
2025-12-30 16:35:00+00:00,2025-12-30 16:55:00+00:00,88960.99,89040.00,88852.56,88883.34,80.35831,-0.001213,-0.000532,-0.001219,-0.000873,...,38.752044,-47.959445,102.781770,89413.549000,89182.2260,88950.903000,0.005188,24.350434,0,-0.002063
2025-12-30 17:45:00+00:00,2025-12-30 18:05:00+00:00,88533.43,88533.43,88333.34,88373.43,92.94989,0.000044,-0.000127,-0.002196,-0.001807,...,-30.390308,-16.937023,60.838834,88735.019439,88604.8085,88474.597561,0.002939,23.336862,0,-0.000135
2025-12-30 20:52:00+00:00,2025-12-30 21:12:00+00:00,88000.00,88002.49,87978.36,87978.36,5.42977,0.000090,-0.000110,-0.000155,-0.000246,...,-30.119612,-6.165715,29.383849,88172.325532,88064.0210,87955.716468,0.002460,25.686572,0,0.000950


In [5]:
features = [
    'open_pct_change',
    'high_pct_change',
    'low_pct_change',
    'close_pct_change',
    'sma_50',
    'sma_200',
    'rsi_14',
    'stoch_slowk',
    'stoch_slowd',
    'macd',
    'macd_signal',
    'macd_hist',
    'atr_14',
    'bb_upper',
    'bb_middle',
    'bb_lower',
    'bb_width',
    'adx_14'
]

# Feature Importance

## Mean Decrease in Accuracy

In [6]:
model = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    bootstrap=False,
    class_weight='balanced_subsample',
)

X = labeled_dataframe[features].values
y = labeled_dataframe['label'].values

In [7]:
mda_feature_importances = calculate_mda_feature_importances(
    model=model,
    dataframe=labeled_dataframe,
    feature_cols=features,
    target_col='label',
    n_splits=5,
    purge=240,
    embargo=240,
    scoring='accuracy',
)
mda_feature_importances

Unnamed: 0,feature_importances,mean_score
low_pct_change,0.004286,0.881429
macd_signal,0.0,0.881429
atr_14,-0.001429,0.881429
bb_width,-0.001429,0.881429
rsi_14,-0.001429,0.881429
close_pct_change,-0.001429,0.881429
stoch_slowd,-0.002857,0.881429
macd_hist,-0.004286,0.881429
bb_lower,-0.004286,0.881429
sma_200,-0.004286,0.881429


## Single Feature Importance (SFI)

In [8]:
model = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    bootstrap=False,
    class_weight='balanced_subsample',
)

sfi_feature_importances = calculate_sfi_feature_importances(
    model=model,
    dataframe=labeled_dataframe,
    feature_cols=features,
    target_col='label',
    n_splits=5,
    purge=240,
    embargo=240,
    scoring='accuracy',
    show_progress=True
)
sfi_feature_importances

Calculating SFI feature importance: 100%|██████████| 18/18 [00:31<00:00,  1.75s/it]


Unnamed: 0,mean,std
bb_upper,0.874286,0.018361
bb_lower,0.848571,0.021131
bb_middle,0.848571,0.018995
close_pct_change,0.837143,0.012189
sma_200,0.837143,0.032033
bb_width,0.834286,0.012355
high_pct_change,0.83,0.017071
atr_14,0.83,0.027804
macd,0.828571,0.016782
stoch_slowk,0.825714,0.020859


## Orthogonal Features

In [9]:
orth_features = orthogonal_features(
    df_X=dataframe[features],
    threshold=0.95
)

orth_features

array([[ 1.96222716, -2.36341805, -1.28372975, ..., -4.48768089,
         0.30597138,  0.94024672],
       [ 1.91440145, -2.76447369, -0.22799368, ...,  2.42339402,
         0.61827335,  1.78950397],
       [ 2.34009181, -0.73844725, -2.86772602, ...,  0.68489646,
         0.73320296, -0.76490224],
       ...,
       [ 0.68765777, -0.13073151,  0.42368496, ...,  0.14331009,
         0.39513704,  0.14379223],
       [ 0.51402625, -0.94485524,  0.75856363, ..., -0.49524019,
         0.34053176, -0.53995033],
       [ 0.29409077, -1.99317192,  1.36505399, ..., -0.06612314,
         0.21788011, -0.28553375]], shape=(43002, 8))

In [10]:
orth_features.shape

(43002, 8)

# Select Features

In [11]:
mda_feature_importances_mean = mda_feature_importances['feature_importances'].mean()
sfi_feature_importances_mean = sfi_feature_importances['mean'].mean()

In [12]:
mda_features = mda_feature_importances[
    mda_feature_importances['feature_importances'] >= mda_feature_importances_mean
].index.tolist()

sfi_features = sfi_feature_importances[
    sfi_feature_importances['mean'] >= sfi_feature_importances_mean
].index.tolist()

In [13]:
# Collect only features that are selected by all three methods
final_features = []

for i in range(len(features)):
    if features[i] in mda_features and features[i] in sfi_features:
        final_features.append(features[i])
print('Final Features:')
for feature in final_features:
    print(f'  - {feature}')

Final Features:
  - close_pct_change
  - atr_14
  - bb_width


# Re-calculate Feature Importance

## Mean Decrease in Accuracy

In [14]:
model = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    bootstrap=False,
    class_weight='balanced_subsample',
)

X = labeled_dataframe[final_features].values
y = labeled_dataframe['label'].values

In [15]:
mda_feature_importances = calculate_mda_feature_importances(
    model=model,
    dataframe=labeled_dataframe,
    feature_cols=final_features,
    target_col='label',
    n_splits=5,
    purge=240,
    embargo=240,
    scoring='accuracy',
)
mda_feature_importances

Unnamed: 0,feature_importances,mean_score
bb_width,-0.004286,0.88
atr_14,-0.005714,0.88
close_pct_change,-0.007143,0.88


## Single Feature Importance (SFI)

In [16]:
model = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    bootstrap=False,
    class_weight='balanced_subsample',
)

sfi_feature_importances = calculate_sfi_feature_importances(
    model=model,
    dataframe=labeled_dataframe,
    feature_cols=final_features,
    target_col='label',
    n_splits=5,
    purge=240,
    embargo=240,
    scoring='accuracy',
    show_progress=True
)
sfi_feature_importances

Calculating SFI feature importance: 100%|██████████| 3/3 [00:05<00:00,  1.71s/it]


Unnamed: 0,mean,std
close_pct_change,0.837143,0.012189
bb_width,0.834286,0.012355
atr_14,0.83,0.027804
