In [16]:
import numpy as np
import pandas as pd

from imblearn.over_sampling import SMOTE

In [12]:
# Load the cleaned dataset and separate the label column from the features.
X = pd.read_parquet("clear_data.parquet")

# `pop` returns the "target" column and removes it from X in the same step,
# leaving X with the feature columns only.
y = X.pop("target")

In [13]:
# A column is treated as categorical when it has at most three distinct
# values and every one of them is 0, 1 or 2.
cat_features = [
    name
    for name, distinct in ((name, X[name].unique()) for name in X.columns)
    if len(distinct) <= 3 and all(value in [0, 1, 2] for value in distinct)
]

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from catboost import CatBoostClassifier


# Step 2: stratified split into training and test sets.
# The splits are copied so the column assignments below operate on
# independent frames (no SettingWithCopyWarning, no write-through to X).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train = X_train.copy()
X_test = X_test.copy()

# Step 3: encode categorical features as integer codes, then cast the codes
# to strings so CatBoost handles them as categories rather than as numbers.
label_encoders = {}
for feature in cat_features:
    label_encoders[feature] = LabelEncoder()
    X_train[feature] = label_encoders[feature].fit_transform(X_train[feature])
    X_test[feature] = label_encoders[feature].transform(X_test[feature])

if cat_features:
    X_train[cat_features] = X_train[cat_features].astype(str)
    # The test frame must be cast from its own columns; assigning the
    # training columns here would be index-aligned and corrupt the test data.
    X_test[cat_features] = X_test[cat_features].astype(str)

# Hold out a validation fold for early stopping so that the test set is used
# only for the final accuracy report. It is split off before resampling, so
# it keeps the original class distribution.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)

# Step 4: resample the fitting fold only (oversample the minority class, then
# undersample the majority class).
oversampler = RandomOverSampler(sampling_strategy=0.5, random_state=42)
undersampler = RandomUnderSampler(sampling_strategy=0.8, random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_fit, y_fit)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_resampled, y_train_resampled)

# Step 5: standardise the numeric columns only. The results stay DataFrames,
# so column names survive and the feature-importance table in step 8 can be
# mapped back to columns; the string-typed categoricals are left untouched.
cat_feature_set = set(cat_features)
num_features = [col for col in X_train_resampled.columns if col not in cat_feature_set]
float_dtypes = {col: "float64" for col in num_features}

scaler = StandardScaler()
# `astype` returns new frames, so the unscaled frames are not modified.
X_train_resampled_scaled = X_train_resampled.astype(float_dtypes)
X_val_scaled = X_val.astype(float_dtypes)
X_test_scaled = X_test.astype(float_dtypes)
if num_features:
    X_train_resampled_scaled[num_features] = scaler.fit_transform(X_train_resampled[num_features])
    X_val_scaled[num_features] = scaler.transform(X_val[num_features])
    X_test_scaled[num_features] = scaler.transform(X_test[num_features])

# Step 6: train CatBoost, weighting the positive class by the class ratio
# that remains after resampling (assumes the target is encoded as 0/1).
scale_pos_weight = len(y_train_resampled[y_train_resampled == 0]) / len(y_train_resampled[y_train_resampled == 1])
model = CatBoostClassifier(iterations=1000, depth=6, l2_leaf_reg=1, loss_function='Logloss', scale_pos_weight=scale_pos_weight)
model.fit(
    X_train_resampled_scaled,
    y_train_resampled,
    cat_features=cat_features,
    eval_set=(X_val_scaled, y_val),
    early_stopping_rounds=50,
    verbose=100,
)

# Step 7: evaluate on the untouched test set.
accuracy = model.score(X_test_scaled, y_test)
print(f"Accuracy: {accuracy}")

# Step 8: feature selection. The prettified importance table is a DataFrame
# with "Feature Id" and "Importances" columns, so it is filtered by column
# (iterating over the DataFrame itself would yield only the column names).
feature_importance = model.get_feature_importance(prettified=True)
important_ids = set(
    feature_importance.loc[feature_importance["Importances"] > 0, "Feature Id"].astype(str)
)
# Keep the original column order of the training frame.
selected_features = [col for col in X_train_resampled.columns if str(col) in important_ids]
if not selected_features:
    raise ValueError("No feature received a positive importance; nothing to select.")

# Only categorical features that survived selection may be passed to CatBoost.
selected_cat_features = [feature for feature in cat_features if feature in selected_features]
X_train_selected = X_train_resampled[selected_features]
X_val_selected = X_val[selected_features]
X_test_selected = X_test[selected_features]

# Step 9: retrain on the selected features.
model_selected = CatBoostClassifier(iterations=1000, depth=10, l2_leaf_reg=1, loss_function='Logloss', scale_pos_weight=scale_pos_weight, bagging_temperature=0.2)
model_selected.fit(
    X_train_selected,
    y_train_resampled,
    cat_features=selected_cat_features,
    eval_set=(X_val_selected, y_val),
    early_stopping_rounds=50,
    verbose=100,
)

# Step 10: evaluate the model trained on the selected features.
accuracy_selected = model_selected.score(X_test_selected, y_test)
print(f"Accuracy after feature selection: {accuracy_selected}")


KeyboardInterrupt: 