In [None]:
!pip install catboost --no-cache-dir
!pip install numpy
!pip install pandas


In [8]:
import pandas as pd

train_cleaned = pd.read_csv('D:/Code/intensiv/train_cleaned.csv', low_memory=False) 
valid_cleaned = pd.read_csv('D:/Code/intensiv/valid_cleaned.csv', low_memory=False) 


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier

# Загрузка данных
X = train_cleaned.drop(columns=['client_id', 'target', 'report_date'])
y = train_cleaned['target']

# Определение числовых и категориальных признаков
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Преобразование категориальных данных в строковый формат
X[categorical_features] = X[categorical_features].astype(str)

# Разделение на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Создание и обучение модели CatBoost
catboost_model = CatBoostClassifier(
    iterations=500,           # Количество итераций
    learning_rate=0.05,       # Темп обучения
    depth=8,                  # Глубина деревьев
    random_state=42,
    cat_features=categorical_features,  # Указываем категориальные признаки как список
    verbose=50,               # Вывод промежуточных результатов
    auto_class_weights='Balanced'       # Балансировка классов
)

# Обучение модели
catboost_model.fit(X_train, y_train)


# Предсказание вероятностей
test_preds = catboost_model.predict_proba(X_test)[:, 1]

# Оценка модели с помощью ROC-AUC
test_score = roc_auc_score(y_test, test_preds)
print(f"ROC-AUC на тестовых данных: {test_score:.4f}")

# Сохранение результатов
submission = pd.DataFrame({'client_id': train_cleaned.loc[X_test.index, 'client_id'], 'score': test_preds})
submission.to_csv('submission_catboost.csv', index=False)
print("Результаты сохранены в submission_catboost.csv")


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Получаем ROC-кривую
fpr, tpr, thresholds = roc_curve(y_test, test_preds)
roc_auc = auc(fpr, tpr)

# Визуализация ROC-кривой
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC-кривая (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')  # Диагональ случайного классификатора
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.grid()
plt.show()


In [None]:
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))


In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Матрица ошибок
print(confusion_matrix(y_test, catboost_model.predict(X_test)))

# Полный отчет о метриках
print(classification_report(y_test, catboost_model.predict(X_test)))


In [None]:
feature_importance = catboost_model.get_feature_importance(prettified=True)
print(feature_importance)


In [None]:
import matplotlib.pyplot as plt

# Получим топ-20 признаков по важности
top_features = feature_importance.sort_values(by="Importances", ascending=False).head(20)

# Построим график
plt.figure(figsize=(12, 6))
plt.barh(top_features['Feature Id'], top_features['Importances'], color='royalblue')
plt.gca().invert_yaxis()
plt.title("Top-20 наиболее важных признаков")
plt.xlabel("Важность признака")
plt.ylabel("Признаки")
plt.show()


In [None]:
valid_cleaning.col154

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import make_scorer, f1_score

f1_scorer = make_scorer(f1_score, pos_label=1)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

f1_scores = cross_val_score(catboost_model, X, y, scoring=f1_scorer, cv=skf)
print(f"Средний F1-Score для класса '1': {f1_scores.mean():.4f}")


In [None]:
!pip install imbalanced-learn

In [22]:
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer

# Импорты библиотек
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score, classification_report, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from catboost import CatBoostClassifier

# Загрузка данных
X = train_cleaned.drop(columns=['client_id', 'target', 'report_date'])
y = train_cleaned['target']

# Определение числовых и категориальных признаков
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

# Преобразование категориальных признаков в числовой формат
label_encoders = {}
for cat_col in categorical_features:
    le = LabelEncoder()
    X[cat_col] = le.fit_transform(X[cat_col].astype(str))
    label_encoders[cat_col] = le

# Разделение на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Определение SMOTE
smote = SMOTE(sampling_strategy=0.5, random_state=42)

# Определение модели CatBoost
catboost_model = CatBoostClassifier(
    cat_features=categorical_features,
    verbose=0,
    random_state=42
)

# Создание Pipeline для SMOTE + CatBoost
pipeline = Pipeline([
    ('smote', smote),  # Применение SMOTE
    ('catboost', catboost_model)
])

# Настройка параметров для GridSearchCV
param_grid = {
    'catboost__depth': [6, 8],
    'catboost__learning_rate': [0.05, 0.1],
    'catboost__iterations': [500],
    'catboost__scale_pos_weight': [20, 30]
}

# GridSearchCV с кросс-валидацией и SMOTE
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
)

# Обучение GridSearchCV
grid_search.fit(X_train, y_train)

# Вывод лучших параметров
print("Лучшие параметры:", grid_search.best_params_)

# Обучение лучшей модели на всех обучающих данных
best_model = grid_search.best_estimator_

# Предсказание на тестовых данных
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Метрики качества
print("\nROC-AUC на тестовых данных:", roc_auc_score(y_test, y_pred_proba))
print("\nClassification Report на тестовых данных:\n", classification_report(y_test, y_pred))




ValueError: 
All the 24 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "d:\Code\intensiv\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "d:\Code\intensiv\.venv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Code\intensiv\.venv\Lib\site-packages\imblearn\pipeline.py", line 329, in fit
    Xt, yt = self._fit(X, y, routed_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Code\intensiv\.venv\Lib\site-packages\imblearn\pipeline.py", line 265, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
                               ^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Code\intensiv\.venv\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Code\intensiv\.venv\Lib\site-packages\imblearn\pipeline.py", line 1057, in _fit_resample_one
    X_res, y_res = sampler.fit_resample(X, y, **params.get("fit_resample", {}))
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Code\intensiv\.venv\Lib\site-packages\imblearn\base.py", line 208, in fit_resample
    return super().fit_resample(X, y)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Code\intensiv\.venv\Lib\site-packages\imblearn\base.py", line 106, in fit_resample
    X, y, binarize_y = self._check_X_y(X, y)
                       ^^^^^^^^^^^^^^^^^^^^^
  File "d:\Code\intensiv\.venv\Lib\site-packages\imblearn\base.py", line 161, in _check_X_y
    X, y = self._validate_data(X, y, reset=True, accept_sparse=accept_sparse)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Code\intensiv\.venv\Lib\site-packages\sklearn\base.py", line 480, in _validate_data
    return validate_data(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Code\intensiv\.venv\Lib\site-packages\sklearn\utils\validation.py", line 2961, in validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\Code\intensiv\.venv\Lib\site-packages\sklearn\utils\validation.py", line 1370, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "d:\Code\intensiv\.venv\Lib\site-packages\sklearn\utils\validation.py", line 1107, in check_array
    _assert_all_finite(
  File "d:\Code\intensiv\.venv\Lib\site-packages\sklearn\utils\validation.py", line 120, in _assert_all_finite
    _assert_all_finite_element_wise(
  File "d:\Code\intensiv\.venv\Lib\site-packages\sklearn\utils\validation.py", line 169, in _assert_all_finite_element_wise
    raise ValueError(msg_err)
ValueError: Input X contains NaN.
SMOTE does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
