In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='shap')

# seed
np.random.seed(42)

# import classes
from Tools import DateTimeSeriesSplit, Kraken

# model and metric for classification
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

# example 2: classification
# Build a synthetic binary-classification dataset out of three stacked parts
# that share the same features and dates but use different target formulas.
c_clf = 30000  # number of rows per dataset part (increased)

# create dataset parts
Xc1 = pd.DataFrame()

# var_1, var_2, var_3 - features that affect target
Xc1['var_1'] = np.random.rand(c_clf)
Xc1['var_2'] = np.random.rand(c_clf)
Xc1['var_3'] = np.random.rand(c_clf)

# var_4..var_49 - noise features (46 noise features instead of 6)
for col_i in range(4, 50):
    Xc1[f'var_{col_i}'] = np.random.rand(c_clf)

# date: consecutive daily timestamps, one per row
Xc1['date'] = pd.date_range(start='2005-01-01', periods=c_clf, freq='D')

# target of part 1: noisy non-linear score of var_1..var_3, binarised at 6.0
y_c1_float = 4 * Xc1['var_1'] + 5 * Xc1['var_2'] + (2*Xc1['var_3'])**2 + 1.0 * np.random.rand(c_clf)
y_c1 = (y_c1_float > 6.0).astype(int)

# parts 2 and 3 reuse the features (and dates) of part 1 with other target formulas
Xc2 = Xc1.copy()
y_c2 = ((2 * Xc2['var_1'] + 2 * Xc2['var_2'] +  (2*Xc2['var_3'])**1.9) + 1.0*np.random.rand(c_clf) > 6.0).astype(int)
Xc3 = Xc1.copy()
y_c3 = ((3 * Xc3['var_1'] + 3 * Xc3['var_2'] +  (2*Xc3['var_3'])**1.5)  + 1.0*np.random.rand(c_clf) > 6.0).astype(int)

# Stack the parts. Both frames get a fresh 0..N-1 index: previously only y_c was
# re-indexed, so Xc kept the labels 0..c_clf-1 repeated three times and any
# label-based alignment between Xc and y_c would have paired the wrong rows.
Xc = pd.concat([Xc1, Xc2, Xc3], axis=0, ignore_index=True)
y_c = pd.concat([y_c1, y_c2, y_c3], axis=0, ignore_index=True)
assert Xc.index.equals(y_c.index), "features and target must share one index"
print("Classification dataset shape:", Xc.shape)

# feature list: every column except the date / time-index bookkeeping columns
vars_for_clf = [name for name in Xc.columns if name not in ('date', 'index_time')]

# per-row dates, later passed to the selector as the grouping key (group_dt)
group_dt_clf = Xc['date']

# date-based cross-validation splitter used by the selector
cv_datetime_clf = DateTimeSeriesSplit(
    window=1500,
    n_splits=3,
    test_size=300,
    margin=0,
)

# shallow LightGBM binary classifier with a fixed seed and silenced logging
model_clf = LGBMClassifier(
    objective='binary',
    max_depth=3,
    random_state=42,
    verbosity=-1,
)

# metric - accuracy (higher is better)
def my_accuracy(y_true, y_pred_prob):
    """Accuracy of probability predictions thresholded at 0.5 (higher is better).

    Parameters
    ----------
    y_true
        Ground-truth class labels.
    y_pred_prob
        Predicted scores; must support ``>`` comparison with a float and
        ``.astype`` (e.g. a NumPy array or pandas Series).

    Returns
    -------
    The value returned by ``accuracy_score`` for the thresholded predictions.
    """
    # strictly greater than 0.5 -> class 1; a score of exactly 0.5 maps to class 0
    predicted_labels = (y_pred_prob > 0.5).astype(int)
    return accuracy_score(y_true, predicted_labels)

# feature selector: wraps the model, the CV splitter and the accuracy metric
selector_clf = Kraken(
    estimator=model_clf,
    cv=cv_datetime_clf,
    metric=my_accuracy,
    greater_is_better=True,
    task_type='classification',
    which_class_for_shap=1,
    comparison_precision=2,
    meta_info_name='example_classification',
)

# calculate SHAP importance (fills selector_clf.rank_dict)
selector_clf.get_rank_dict(Xc, y_c, vars_for_clf, group_dt_clf)
top5_ranks = dict(list(selector_clf.rank_dict.items())[:5])
print("Rank dict (classification) top-5:", top5_ranks)

# greedy feature selection
best_vars_clf = selector_clf.get_vars(
    X=Xc,
    y=y_c,
    rank_dict=selector_clf.rank_dict,
    group_dt=group_dt_clf,
    max_feature_search_rounds=15,  # reduced search limit
    top_n_for_first_step=10,  # added parameter for the first step
)
print("Selected vars (classification):", best_vars_clf)

Classification dataset shape: (90000, 50)
[get_rank_dict] Starting combined baseline evaluation and SHAP calculation...
Fold: 1/3 | Status: Done (0.28s)              | Fold Time:   0.28s | Total Time:    0.36s                              
Fold: 2/3 | Status: Done (0.28s)              | Fold Time:   0.28s | Total Time:    0.64s                              
Fold: 3/3 | Status: Done (0.21s)              | Fold Time:   0.21s | Total Time:    0.85s                              
------------------------------
[get_rank_dict] >> FINAL Baseline Performance (All Features)
    Mean CV Score: 0.83
    Fold Scores: [0.83 0.82 0.84]
[get_rank_dict] Completed calculation. Total time: 0.91 seconds.
Rank dict (classification) top-5: {'var_2': 1, 'var_3': 2, 'var_1': 3, 'var_35': 4, 'var_14': 5}
[get_vars] Evaluating initial feature set (if any)...
[get_vars] Starting feature selection procedure...
[get_vars] Starting from scratch (will check top 10 features first).

--- Starting Step: Selecting feat