In [1]:
import numpy as np
import pandas as pd
import warnings
# Silence UserWarning messages originating from modules matching 'shap'.
warnings.filterwarnings('ignore', category=UserWarning, module='shap')

# Seed NumPy's global RNG so the np.random.rand draws in this cell are reproducible.
np.random.seed(42)

# import classes
from Tools import DateTimeSeriesSplit, Kraken

# model and metric for classification
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

# Example 2: classification on a synthetic dataset
c_clf = 30000  # rows per dataset part (row count was increased)

# All feature columns are collected in a dict and the frame is built once,
# instead of inserting ~50 columns one at a time. The np.random.rand calls
# happen in the same order as before, so a seeded run draws the same values.
feature_columns = {
    # Main informative features
    'shadow_of_the_north': np.random.rand(c_clf),   # formerly var_1
    'moonlight_whisper': np.random.rand(c_clf),     # formerly var_2
    'echoes_from_abyss': np.random.rand(c_clf),     # formerly var_3
}

# Noise features with mythical names
noise_features = [
    'dragon_scale', 'phoenix_feather', 'unicorn_tear',
    'griffin_claw', 'mermaid_song', 'basilisk_gaze',
    'kraken_tentacle', 'valkyrie_helm', 'sphinx_riddle',
    'centaur_hoof', 'pixie_dust', 'werewolf_fang'
]

for idx in range(4, 50):
    if idx < 40:
        # Cycle through the base names; the numeric suffix keeps each column unique.
        feature_name = f"{noise_features[(idx - 4) % len(noise_features)]}_{idx}_lost"
    else:
        feature_name = f"forgotten_artifact_{idx}"
    feature_columns[feature_name] = np.random.rand(c_clf)

Xc1 = pd.DataFrame(feature_columns)

# Timestamp column: one consecutive calendar day per row
Xc1['scroll_of_chronicles'] = pd.date_range(start='2005-01-01', periods=c_clf, freq='D')

# Target: threshold a noisy combination of the three informative features
y_c1_float = (4 * Xc1['shadow_of_the_north'] +
              5 * Xc1['moonlight_whisper'] +
              (2 * Xc1['echoes_from_abyss']) ** 2 +
              1.0 * np.random.rand(c_clf))
y_c1 = (y_c1_float > 6.0).astype(int)

# Copies of the same features with slightly different target formulas
Xc2 = Xc1.copy()
y_c2 = ((2 * Xc2['shadow_of_the_north'] +
         2 * Xc2['moonlight_whisper'] +
         (2 * Xc2['echoes_from_abyss']) ** 1.9) +
        1.0 * np.random.rand(c_clf) > 6.0).astype(int)

Xc3 = Xc1.copy()
y_c3 = ((3 * Xc3['shadow_of_the_north'] +
         3 * Xc3['moonlight_whisper'] +
         (2 * Xc3['echoes_from_abyss']) ** 1.5) +
        1.0 * np.random.rand(c_clf) > 6.0).astype(int)

# Stack the three parts. Both X and y get a fresh 0..N-1 index so their row
# labels agree; previously only y was reset, leaving X with every label
# repeated three times and misaligned with y for any label-based operation.
# Note: because the parts share the same dates, each timestamp occurs three
# times in the stacked frame.
Xc = pd.concat([Xc1, Xc2, Xc3], axis=0, ignore_index=True)
y_c = pd.concat([y_c1, y_c2, y_c3], axis=0, ignore_index=True)
print("Classification dataset shape:", Xc.shape)

# Candidate features: every column except the timestamp. 'index_time' is also
# excluded, although no such column is created in the visible code.
vars_for_clf = [
    name for name in Xc.columns
    if name not in {'scroll_of_chronicles', 'index_time'}
]

# Timestamps handed to the selector as the grouping series for the splitter
group_dt_clf = Xc['scroll_of_chronicles']

# Date-based cross-validation splitter (project class from Tools)
cv_datetime_clf = DateTimeSeriesSplit(
    n_splits=3,
    window=1500,
    test_size=300,
    margin=0,
)

# Shallow LightGBM binary classifier with a fixed seed and silenced logging
model_clf = LGBMClassifier(
    objective='binary',
    max_depth=3,
    random_state=42,
    verbosity=-1,
)
# Метрика accuracy
def my_accuracy(y_true, y_pred_prob):
    """Accuracy of predicted scores after thresholding at 0.5.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_pred_prob : array-like
        Predicted scores; values strictly greater than 0.5 are mapped to
        class 1, everything else to class 0.

    Returns
    -------
    float
        Value returned by ``accuracy_score`` for the thresholded predictions.
    """
    # np.asarray lets plain lists/tuples work too; the original called
    # .astype directly on the input, which only works for arrays and Series.
    y_bin = (np.asarray(y_pred_prob) > 0.5).astype(int)
    return accuracy_score(y_true, y_bin)

# Configure the feature selector (project class from Tools) around the
# LightGBM model, the date-based splitter and the accuracy metric.
selector_clf = Kraken(
    estimator=model_clf, cv=cv_datetime_clf, metric=my_accuracy,
    task_type='classification', which_class_for_shap=1,
    greater_is_better=True, comparison_precision=3,
    meta_info_name='example_classification',
)

# Step 1: rank the candidate features. The result is read back from
# selector_clf.rank_dict below, so this call must run first.
selector_clf.get_rank_dict(Xc, y_c, vars_for_clf, group_dt_clf)
top5_ranked = list(selector_clf.rank_dict.items())[:5]
print("Rank dict (classification) top-5:", dict(top5_ranked))

# Step 2: greedy feature selection driven by the ranking from step 1
best_vars_clf = selector_clf.get_vars(
    X=Xc, y=y_c,
    rank_dict=selector_clf.rank_dict,
    group_dt=group_dt_clf,
    max_feature_search_rounds=15,
    top_n_for_first_step=10,
)
print("Selected vars (classification):", best_vars_clf)

Classification dataset shape: (90000, 50)
[get_rank_dict] Starting combined baseline evaluation and SHAP calculation...
Fold: 1/3 | Status: Done (0.45s)              | Fold Time:   0.45s | Total Time:    0.57s                              
Fold: 2/3 | Status: Done (0.28s)              | Fold Time:   0.28s | Total Time:    0.85s                              
Fold: 3/3 | Status: Done (0.28s)              | Fold Time:   0.28s | Total Time:    1.13s                              
------------------------------
[get_rank_dict] >> FINAL Baseline Performance (All Features)
    Mean CV Score: 0.829
    Fold Scores: [0.828 0.817 0.842]
[get_rank_dict] Completed calculation. Total time: 1.22 seconds.
Rank dict (classification) top-5: {'moonlight_whisper': 1, 'echoes_from_abyss': 2, 'shadow_of_the_north': 3, 'valkyrie_helm_35_lost': 4, 'pixie_dust_14_lost': 5}
[get_vars] Evaluating initial feature set (if any)...
[get_vars] Starting feature selection procedure...
[get_vars] Starting from scratch (