In [1]:
import pickle
import warnings
from copy import deepcopy
from pprint import pprint

import optuna
from optuna.integration import CatBoostPruningCallback
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, f1_score, make_scorer,
                             precision_score, recall_score, roc_auc_score,
                             roc_curve)
from sklearn.model_selection import (RandomizedSearchCV, StratifiedKFold,
                                     train_test_split)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Apply seaborn's default styling to all subsequent matplotlib figures.
sns.set()
# Silence every warning for the rest of the session.
# NOTE(review): this also hides convergence and deprecation warnings from the models below.
warnings.filterwarnings('ignore')

In [2]:
# Single seed for every stochastic step; RANDOM_SEED is an alias of RANDOM_STATE.
RANDOM_STATE = RANDOM_SEED = 42

In [3]:
# Show every column and render floats with three decimals in DataFrame output.
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.3f}'.format

In [4]:
# Load the training data; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/clean/train.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

Мы будем решать задачу бинарной классификации, где:  
класс 1 соответствует "плохой" заявке, которую мы хотим отклонить;  
класс 0 соответствует "хорошей" заявке, которую мы хотим одобрить.

### Feature generation and data split

In [6]:
# Generate additional features.
# For every (grouping column, numeric column) pair below, each row receives the standard
# deviation, median and mean of the numeric column computed within the row's group.
# NOTE(review): these statistics are computed on the full frame, before the train/test
# split performed later in the notebook, so held-out rows contribute to them.
group_stat_features = {
    ('checking_status', 'duration_months'): {
        'std': 'check_status_dur_std',
        'median': 'check_status_dur_median',
        'mean': 'check_status_dur_mean',
    },
    ('checking_status', 'credit_amount'): {
        'std': 'check_status_credit_am_std',
        # NOTE(review): "meadian" looks like a typo; the column name is kept unchanged.
        'median': 'check_status_credit_am_meadian',
        'mean': 'check_status_credit_am_mean',
    },
    ('credit_history', 'duration_months'): {
        'std': 'credit_history_dur_std',
        'median': 'credit_history_dur_median',
        'mean': 'credit_history_dur_mean',
    },
    ('credit_history', 'credit_amount'): {
        'std': 'credit_history_credit_am_std',
        'median': 'credit_history_credit_am_median',
        'mean': 'credit_history_credit_am_mean',
    },
}

# Dict order is insertion order, so the new columns are appended in a fixed sequence.
for (group_col, value_col), stat_to_column in group_stat_features.items():
    for stat_name, new_column in stat_to_column.items():
        df[new_column] = df.groupby(group_col)[value_col].transform(stat_name)

# Extra features that approximate the payment load per unit of time.
df['amount_to_duration'] = df['credit_amount'] / df['duration_months']
df['installment_mul_duration'] = df['installment_commitment'] * df['duration_months']

In [7]:
# `y` is the list of target column names; `X` holds the labels of all remaining
# columns (an Index of column names, not the data itself).
y = ['target']
X = df.columns.drop('target')

In [8]:
# Columns excluded from modelling (none at the moment) and the target column.
features2drop = []
target = ['target']

# Every column that is neither the target nor explicitly dropped.
filtered_features = [c for c in df.columns if c not in target and c not in features2drop]

# Split the kept columns by dtype. Both lists are derived from `filtered_features`, so
# dropped columns never leak into them and the sanity checks below keep holding when
# `features2drop` is non-empty.
num_features = [c for c in filtered_features if pd.api.types.is_numeric_dtype(df[c])]
cat_features = [c for c in filtered_features if c not in num_features]

# Each column must belong to exactly one of: target, numeric, categorical, dropped.
assert len(target) + len(num_features) + len(cat_features) + len(features2drop) == len(df.columns)
assert len(filtered_features) == len(num_features) + len(cat_features)

Учитывая малый размер выборки, мы разобьём её только на две части  
- *train* - для обучения моделей с использованием кросс-валидации  
- *test* - для тестирования лучшей модели

In [None]:
# Stratified hold-out split: 25% of the rows are set aside as the test set.
train, test = train_test_split(
    df,
    test_size=0.25,
    stratify=df[y],
    random_state=RANDOM_STATE,
)
train_share = len(train) / len(df)
test_share = len(test) / len(df)
print(f'Размер обучающей выборки {train.shape}, это составляет {train_share:.0%} данных')
print(f'Размер тестовой выборки {test.shape}, это составляет {test_share:.0%} данных')

In [10]:
# Shuffled, stratified 5-fold splitter used for the cross-validated evaluations below.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

### Declaring functions

Создадим кастомный скоринг weighted_accuracy.

Описание данных сообщает нам валюту счёта - `Deutsche Mark`.  
Учитывая низкий уровень инфляции (как следствие - низкий уровень процентных ставок по кредитным продуктам), одобрение "плохой" заявки может нести существенные финансовые потери.  
В этой связи мы будем сильнее штрафовать модель за FN (то есть за одобрение "плохой" заявки): для этого случая будем использовать вес -10.  
Вес верного предсказания отрицательного класса (TN) будет 2, мы хотим одобрять "хорошие" заявки и зарабатывать на этом.  
Вес неверного предсказания положительного класса (FP) будет -1, мы не хотим терять потенциал и отклонять "хорошие" заявки.  
Вес верно отклонённой заявки (TP) будет равен 0.  

Таким образом мы будем стараться максимизировать показатель `weighted_accuracy`.

In [42]:
def weighted_accuracy(y_true, y_pred):
    """
    Average per-sample payoff of binary predictions under a fixed payoff matrix.

    Args:
        y_true (array-like): Ground-truth labels (0 or 1), one per sample. Any layout is
            accepted: list, 1-D array, Series, single-column DataFrame or (n, 1) array.
        y_pred (array-like): Predicted labels (0 or 1), one per sample.

    Returns:
        float: Sum of the per-sample weights divided by the number of samples,
        rounded to 3 decimals. Higher is better.

    Raises:
        ValueError: If `y_true` and `y_pred` hold a different number of labels.

    Notes:
        Weight applied to each (true label, predicted label) pair:
            - true 0, predicted 0: 2
            - true 0, predicted 1: -1
            - true 1, predicted 0: -10
            - true 1, predicted 1: 0
    """

    cost_matrix = np.array([
        [2, -1],  # weights for actual class 0
        [-10, 0]  # weights for actual class 1
    ])

    # Flatten both inputs to 1-D integer index arrays. Without this, a column-shaped
    # y_true (n, 1) broadcasts against a flat y_pred (n,) during fancy indexing and
    # yields an (n, n) matrix of weights instead of one weight per sample.
    true_idx = np.asarray(y_true).ravel().astype(int)
    pred_idx = np.asarray(y_pred).ravel().astype(int)

    if true_idx.shape != pred_idx.shape:
        raise ValueError(
            f'y_true and y_pred must have the same number of labels, '
            f'got {true_idx.size} and {pred_idx.size}'
        )

    costs = cost_matrix[true_idx, pred_idx]

    return round(np.sum(costs) / len(costs), 3)

In [43]:
# Wrap the custom metric as a scikit-learn scorer; larger values are treated as better.
weighted_accuracy_scorer = make_scorer(weighted_accuracy, greater_is_better=True)

In [44]:
def rs(model, grid_params, n_iter, x_train, y_train, verbose=0, random_state=RANDOM_STATE, cv=5):
    """
    Tune hyperparameters with scikit-learn's RandomizedSearchCV.

    Args:
        model (object): The base estimator to be tuned.
        grid_params (dict or list of dict): Parameter space handed to RandomizedSearchCV.
        n_iter (int): Number of parameter settings that are sampled.
        x_train (array-like): Training features.
        y_train (array-like): Training target.
        verbose (int, optional): Verbosity forwarded to RandomizedSearchCV. Defaults to 0.
        random_state (int, optional): Seed for the parameter sampling. Defaults to RANDOM_STATE.
        cv (int or CV splitter, optional): Cross-validation strategy forwarded to
            RandomizedSearchCV. Defaults to 5, the value that was previously hard-coded.

    Returns:
        tuple: `(best_estimator, search)` - the refitted best estimator and the fitted
        RandomizedSearchCV object.

    Notes:
        Candidates are scored with `weighted_accuracy_scorer` and evaluated in parallel
        (`n_jobs=-1`). The parameter space is printed before the search and the best
        parameters are printed after it.
    """

    print('Performing grid search...')
    print('Hyperparameters to be evaluated:')
    pprint(grid_params)
    print()

    clf_rnd_gs = RandomizedSearchCV(
        model,
        grid_params,
        random_state=random_state,
        n_iter=n_iter,
        cv=cv,
        verbose=verbose,
        n_jobs=-1,
        scoring=weighted_accuracy_scorer,
    )

    clf_rnd_gs.fit(x_train, y_train)

    print('Best parameters:')
    for elem in sorted(clf_rnd_gs.best_params_):
        print(f'{elem}: {clf_rnd_gs.best_params_.get(elem)}')

    return (clf_rnd_gs.best_estimator_, clf_rnd_gs)

In [45]:
def fit_catboost(trial, train, valid, cf=cat_features):
    """
    Train one CatBoostClassifier with trial-suggested hyperparameters and predict on `valid`.

    Parameters:
    -----------
    trial : optuna.trial.Trial
        Trial used to suggest the hyperparameters.

    train : tuple (X_train, y_train)
        Features and target the classifier is fitted on.

    valid : tuple (X_valid, y_valid)
        Features and target used as the early-stopping evaluation set; predictions are
        made for these features.

    cf : list, optional
        Categorical features passed to CatBoost, by default the module-level `cat_features`.

    Returns:
    --------
    y_pred : array-like
        Predicted labels for the validation features.

    Notes:
    ------
    - Always suggested: learning rate, L2 regularization, `colsample_bylevel`,
      `auto_class_weights`, depth, boosting type and bootstrap type.
    - `bagging_temperature` is suggested only for the 'Bayesian' bootstrap and `subsample`
      only for the 'Bernoulli' bootstrap; 'MVS' gets neither.
    - The fit uses `eval_metric='AUC'` and `early_stopping_rounds=50` on the validation set.
    """

    features_fit, labels_fit = train
    features_val, labels_val = valid

    # Keyword arguments are evaluated left to right, so the suggestions are requested
    # from the trial in the order written here.
    search_space = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.001, 0.3),
        l2_leaf_reg=trial.suggest_int('l2_leaf_reg', 2, 100),
        colsample_bylevel=trial.suggest_float('colsample_bylevel', 0.01, 0.75),
        auto_class_weights=trial.suggest_categorical(
            'auto_class_weights', ['SqrtBalanced', 'Balanced', 'None']
        ),
        depth=trial.suggest_int('depth', 3, 12),
        boosting_type=trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
        bootstrap_type=trial.suggest_categorical(
            'bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']
        ),
        used_ram_limit='14gb',
    )

    # Bootstrap-specific parameter, suggested only for the bootstrap type it applies to.
    bootstrap = search_space['bootstrap_type']
    if bootstrap == 'Bayesian':
        search_space['bagging_temperature'] = trial.suggest_float('bagging_temperature', 10, 20)
    elif bootstrap == 'Bernoulli':
        search_space['subsample'] = trial.suggest_float('subsample', 0.1, 0.75)

    classifier = CatBoostClassifier(
        **search_space,
        random_state=RANDOM_STATE,
        cat_features=cf,
        verbose=0,
        eval_metric='AUC',
    )
    classifier.fit(
        features_fit,
        labels_fit,
        eval_set=(features_val, labels_val),
        early_stopping_rounds=50,
    )

    return classifier.predict(features_val)

In [46]:
def objective(trial):
    """
    Optuna objective: mean cross-validated weighted accuracy of a CatBoost model.

    Parameters:
    -----------
    trial : optuna.trial.Trial
        Trial used to suggest hyperparameters for the CatBoost model.

    Returns:
    --------
    result : float
        Mean of the per-fold `weighted_accuracy` scores (higher is better).

    Notes:
    ------
    - Uses a shuffled Stratified K-Fold with `n_splits=3` seeded by `RANDOM_STATE`.
    - Features come from the module-level `train` frame restricted to `filtered_features`;
      target columns are dropped if present.
    - Each fold is fitted through `fit_catboost`, which returns validation predictions.
    - True and predicted labels are flattened to 1-D before scoring: `train[target]` is a
      single-column DataFrame, and an (n, 1) array indexed together with an (n,) array
      would broadcast to an (n, n) weight matrix and give a wrong score.
    """

    n_splits = 3
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    X_train = train[filtered_features].drop(target, axis=1, errors="ignore")
    y_train = train[target]
    scores = []

    for train_idx, valid_idx in kf.split(X_train, y_train):
        train_data = X_train.iloc[train_idx, :], y_train.iloc[train_idx]
        valid_data = X_train.iloc[valid_idx, :], y_train.iloc[valid_idx]

        y_pred = fit_catboost(trial, train_data, valid_data)

        # One label per sample on both sides, regardless of the container shape.
        y_valid_flat = np.asarray(valid_data[1]).ravel()
        y_pred_flat = np.asarray(y_pred).ravel()
        scores.append(weighted_accuracy(y_valid_flat, y_pred_flat))

    result = np.mean(scores)

    return result

In [47]:
def calc_metrics(actual_classes, predicted_classes, predicted_proba):
    """
    Calculate classification metrics for a binary model.

    Args:
        actual_classes (array-like): The true class labels; any layout holding one label
            per sample (1-D array, Series, single-column DataFrame).
        predicted_classes (array-like): The predicted class labels, one per sample.
        predicted_proba (array-like): Predicted probabilities with one column per class;
            column 1 is used as the positive-class score.

    Returns:
        tuple: `(rocauc, accuracy, precision, recall, f1, weighted_acc)` where
            `rocauc`: ROC-AUC computed from the positive-class probabilities.
            `accuracy`: Proportion of correctly classified instances.
            `precision`: True positives / (true positives + false positives).
            `recall`: True positives / (true positives + false negatives).
            `f1`: Harmonic mean of precision and recall.
            `weighted_acc`: The custom `weighted_accuracy` score.
    """
    # Normalise the label containers to flat arrays. Callers pass single-column
    # DataFrames, and `weighted_accuracy` indexes a matrix with both label arrays, so a
    # column-shaped input would broadcast into an (n, n) result there.
    actual_classes = np.asarray(actual_classes).ravel()
    predicted_classes = np.asarray(predicted_classes).ravel()
    predicted_proba = np.asarray(predicted_proba)

    rocauc = roc_auc_score(actual_classes, predicted_proba[:, 1])
    accuracy = accuracy_score(actual_classes, predicted_classes)
    precision = precision_score(actual_classes, predicted_classes)
    recall = recall_score(actual_classes, predicted_classes)
    f1 = f1_score(actual_classes, predicted_classes)
    weighted_acc = weighted_accuracy(actual_classes, predicted_classes)

    return (rocauc, accuracy, precision, recall, f1, weighted_acc)


In [48]:
def plot_cm_rocauc(actual_classes, predicted_classes, predicted_proba, sorted_labels=['Good', 'Bad']):
    """
    Show a confusion-matrix heatmap and a ROC curve, then print a classification report.

    Args:
        actual_classes (numpy.ndarray): The true class labels.
        predicted_classes (numpy.ndarray): The predicted class labels.
        predicted_proba (numpy.ndarray): Predicted probabilities; column 1 is used for the ROC curve.
        sorted_labels (list[str], optional): Tick labels for both axes of the confusion
            matrix. Defaults to ["Good", "Bad"].

    Returns:
        None

    Notes:
        Calls `sns.set(rc={'figure.figsize': (9, 6)})`, which changes the global seaborn
        settings for figures created afterwards.
    """

    # Confusion matrix heatmap.
    cm = confusion_matrix(actual_classes, predicted_classes)
    plt.figure(figsize=(10, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='g',
        cmap='gnuplot',
        xticklabels=sorted_labels,
        yticklabels=sorted_labels,
    )
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    # ROC curve built from the positive-class probabilities.
    positive_scores = predicted_proba[:, 1]
    sns.set(rc={'figure.figsize': (9, 6)})
    false_pos_rate, true_pos_rate, _ = roc_curve(actual_classes, positive_scores)

    plt.figure()
    plt.plot(false_pos_rate, true_pos_rate)
    # Diagonal reference line.
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.xlim([-0.05, 1.0])
    plt.ylim([0.0, 1.05])
    plt.title('ROC')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()

    print(classification_report(actual_classes, predicted_classes))


In [49]:

def cross_val_predict(model, kfold, X, y):
    """
    Collect out-of-fold predictions for `model`, plot them and return the metrics.

    Args:
        model (BaseEstimator): Classification model; a deep copy is fitted, the passed
            object itself is not trained.
        kfold (StratifiedKFold): Splitter whose `split(X, y)` yields train/test row positions.
        X (pd.DataFrame): The feature matrix.
        y (pd.Series or pd.DataFrame): The target variable.

    Returns:
        tuple: The tuple returned by `calc_metrics` for the pooled out-of-fold
        predictions: (rocauc, accuracy, precision, recall, f1, weighted_acc).

    Notes:
        If obtaining or appending the probabilities of a fold raises, that fold's
        probabilities are filled with zeros, which makes the ROC-AUC uninformative.
    """

    estimator = deepcopy(model)
    n_classes = len(np.unique(y))

    # Accumulators for the pooled out-of-fold results.
    actual_classes = np.empty([0], dtype=int)
    predicted_classes = np.empty([0], dtype=int)
    predicted_proba = np.empty([0, n_classes])

    for fit_rows, eval_rows in kfold.split(X, y):
        X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        X_eval, y_eval = X.iloc[eval_rows], y.iloc[eval_rows]

        # np.append without an axis flattens, so the labels end up 1-D.
        actual_classes = np.append(actual_classes, y_eval)

        estimator.fit(X_fit, y_fit)
        fold_labels = estimator.predict(X_eval)
        predicted_classes = np.append(predicted_classes, fold_labels)

        try:
            predicted_proba = np.append(predicted_proba, estimator.predict_proba(X_eval), axis=0)
        except Exception:
            # Best-effort fallback: keep the row count consistent with zero probabilities.
            zero_proba = np.zeros((len(X_eval), n_classes), dtype=float)
            predicted_proba = np.append(predicted_proba, zero_proba, axis=0)

    plot_cm_rocauc(actual_classes, predicted_classes, predicted_proba)

    cur_metrics = calc_metrics(actual_classes, predicted_classes, predicted_proba)

    print('Train data StratifiedKFold CV:')
    print(f'ROC-AUC {cur_metrics[0]:.3f}\nWeighted_accuracy {cur_metrics[-1]:.3f}')

    return cur_metrics

In [50]:
def save_and_display_results(name, metrics):
    """
    Store one run's metrics in the module-level `results` table and display the table.

    Parameters:
        name (str): Row label for the current run; an existing row with this label is overwritten.
        metrics (tuple): Metric values, in the order of the `results` columns.

    Returns:
        None
    """

    global results

    # Write the metrics as the row labelled `name`, then show the whole table.
    results.loc[name] = list(metrics)
    display(results)

In [51]:
# Empty table that accumulates one row of metrics per evaluated model.
metric_columns = ['ROC-AUC', 'Accuracy', 'Precision', 'Recall', 'F1_score', 'Weighted_accuracy']
results = pd.DataFrame({column: [] for column in metric_columns})

## Logistic Regression

Учитывая малый размер выборки, начнём с простой модели.

In [52]:
# Numeric columns are standardised.
num_pipeline = Pipeline(steps=[('scale', StandardScaler())])

# Categorical columns are one-hot encoded with the first level dropped; categories
# unseen during fit are ignored at transform time.
one_hot_encoder = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)
cat_pipeline = Pipeline(steps=[('one-hot', one_hot_encoder)])

pre_processor = ColumnTransformer(
    transformers=[
        ('number', num_pipeline, num_features),
        ('category', cat_pipeline, cat_features),
    ]
)

# Preprocessing followed by a logistic regression with default hyperparameters.
log_reg = LogisticRegression(random_state=RANDOM_STATE)
lr_pipeline = Pipeline(steps=[('preprocess', pre_processor), ('classify', log_reg)])

In [None]:
# Baseline: cross-validated evaluation of the untuned logistic-regression pipeline.
metrics = cross_val_predict(lr_pipeline, kfold, train[X], train[y]);

In [None]:
# Record the baseline metrics in the results table.
save_and_display_results('Baseline_LogReg', metrics)

In [None]:
# Settings shared by every candidate.
common_lr_params = {
    'classify__C': np.logspace(-4, 4, 10),
    'classify__class_weight': ['balanced', None],
    'classify__max_iter': range(500, 5001, 500),
}

# Penalties are paired with a solver that supports them. LogisticRegression's default
# 'lbfgs' solver only handles the 'l2' penalty, so sampling 'l1' / 'elasticnet' with it
# makes those fits fail and score NaN. 'saga' supports both, and 'elasticnet'
# additionally requires `l1_ratio`.
grid_params_lr = [
    {
        **common_lr_params,
        'classify__penalty': ['l2'],
        'classify__solver': ['lbfgs'],
    },
    {
        **common_lr_params,
        'classify__penalty': ['l1'],
        'classify__solver': ['saga'],
    },
    {
        **common_lr_params,
        'classify__penalty': ['elasticnet'],
        'classify__solver': ['saga'],
        'classify__l1_ratio': [0.25, 0.5, 0.75],
    },
]

n_iter = 500

best_lr_model, lr_rs = rs(lr_pipeline, grid_params_lr, n_iter, train[X], train[y].values.ravel())

In [60]:
# Persist the best estimator returned by the logistic-regression search.
with open('../models/Logistic_regression.pkl', 'wb') as file:
    pickle.dump(best_lr_model, file)

In [None]:
# Cross-validated evaluation of the tuned logistic-regression pipeline.
metrics = cross_val_predict(best_lr_model, kfold, train[X], train[y]);

In [None]:
# Record the tuned logistic-regression metrics in the results table.
save_and_display_results('Logistic_regression', metrics)

## Random Forest

In [None]:
# Tree models need no scaling, so numeric columns are passed through unchanged and
# categorical columns are one-hot encoded. The numeric columns must be listed
# explicitly: ColumnTransformer drops every column that is not assigned to a
# transformer (default `remainder='drop'`), which would otherwise train the forest on
# the categorical features only.
pre_processor_rf = ColumnTransformer(transformers=[
    ('number', 'passthrough', num_features),
    ('category', cat_pipeline, cat_features)
])

rf_pipeline = Pipeline(steps=[
    ('preprocess', pre_processor_rf),
    ('classify', RandomForestClassifier(random_state=RANDOM_STATE, class_weight='balanced'))
])

grid_params_rf = {
    'classify__n_estimators': range(50, 1000, 100),
    'classify__max_depth': range(1, 16, 2),
    'classify__min_samples_leaf': [10, 25, 50, 75, 100],
    'classify__criterion': ['entropy', 'gini'],
    'classify__max_features': [None, 0.75, 0.5, 0.25, 'sqrt', 'log2']
}

n_iter = 100

best_rf_model, rf_rs = rs(rf_pipeline, grid_params_rf, n_iter, train[X], train[y].values.ravel())

In [65]:
# Persist the best estimator returned by the random-forest search.
with open('../models/Random_forest.pkl', 'wb') as file:
    pickle.dump(best_rf_model, file)

In [None]:
# Cross-validated evaluation of the tuned random-forest pipeline.
metrics = cross_val_predict(best_rf_model, kfold, train[X], train[y]);

In [None]:
# Record the random-forest metrics in the results table.
save_and_display_results('Random_forest', metrics)

## CatBoost


In [68]:
# Fixed CatBoost constructor arguments for the base estimator of the randomized search.
params = dict(
    cat_features=cat_features,
    loss_function='Logloss',
    random_seed=RANDOM_SEED,
    verbose=0,
)

In [None]:
# Settings sampled for every candidate, independent of the bootstrap type.
common_cb_params = {
    'n_estimators': range(200, 1001, 200),
    'learning_rate': [0.001, 0.01, 0.025, 0.05, 0.1, 0.2, 0.3],
    'depth': [1, 2, 3, 4, 8, 12],
    'l2_leaf_reg': np.linspace(2, 30, 10, dtype=int),
    'colsample_bylevel': [x / 100 for x in range(10, 51, 10)] + [None],
    'auto_class_weights': ['Balanced', None]
}

# `subsample` is only valid for the Bernoulli and MVS bootstraps; CatBoost rejects it
# together with the Bayesian bootstrap, so the two cases are searched separately.
grid_params_cb = [
    {
        **common_cb_params,
        'bootstrap_type': ['Bayesian'],
    },
    {
        **common_cb_params,
        'bootstrap_type': ['Bernoulli', 'MVS'],
        'subsample': [x / 100 for x in range(50, 100, 15)] + [None],
    },
]

n_iter = 100

# The target is flattened to 1-D, like in the other searches: the scorer receives y as
# passed here, and a single-column DataFrame would broadcast to an (n, n) weight matrix
# inside `weighted_accuracy`.
best_cb_model, cb_rs = rs(
    CatBoostClassifier(**params),
    grid_params_cb,
    n_iter,
    train[X],
    train[y].values.ravel(),
)

In [34]:
# Persist the best estimator returned by the CatBoost randomized search.
with open('../models/Catboost.pkl', 'wb') as file:
    pickle.dump(best_cb_model, file)

In [None]:
# Cross-validated evaluation of the CatBoost model found by the randomized search.
metrics = cross_val_predict(best_cb_model, kfold, train[X], train[y])

In [None]:
# Record the CatBoost metrics in the results table.
save_and_display_results('Catboost', metrics)

## CatBoost optuna

In [86]:
# Limit Optuna's log output to WARNING level and above.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [None]:
# Search for the hyperparameters that maximise the objective: 300 trials, run in
# parallel (n_jobs=-1), with a progress bar.
# NOTE(review): no sampler seed is set, so the trials are not reproducible between runs.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=300, n_jobs=-1, show_progress_bar=True)

In [None]:
# Show the best objective value and the hyperparameters that produced it.
print(round(study.best_trial.value, 3))
pprint(study.best_trial.params)

In [89]:
best_score = study.best_trial.value
# Tuned hyperparameters plus the fixed settings needed to rebuild the classifier.
best_params = {
    **study.best_trial.params,
    'cat_features': cat_features,
    'verbose': 0,
    'random_state': RANDOM_STATE,
}

In [None]:
# Plot how the objective value evolved over the trials.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Hyperparameters shown in the slice plot. A dedicated name is used so that the CatBoost
# constructor arguments stored in `params` earlier in the notebook are not overwritten
# (re-running the randomized-search cell would otherwise unpack a list with `**params`).
slice_params = [
    'l2_leaf_reg',
    'colsample_bylevel',
    'bagging_temperature',
    'depth',
    'bootstrap_type',
    'subsample',
    'learning_rate',
    'auto_class_weights',
]
optuna.visualization.plot_slice(
    study,
    params=slice_params,
    target_name='weighted_accuracy',  # axis label; previously misspelled
)

In [None]:
# Hyperparameter importances
optuna.visualization.plot_param_importances(study)

In [93]:
# Build a CatBoost classifier from the best Optuna parameters (not fitted in this cell).
cb_opt = CatBoostClassifier(**best_params)

In [None]:
# Cross-validated evaluation of the Optuna-tuned CatBoost model.
metrics = cross_val_predict(cb_opt, kfold, train[X], train[y])

In [99]:
# `cross_val_predict` trains a deep copy, so `cb_opt` itself is still unfitted at this
# point. Fit it on the full training split before saving, so the pickle holds a usable
# model like the other saved estimators.
cb_opt.fit(train[X], train[y].values.ravel())

with open('../models/Catboost_opt.pkl', 'wb') as file:
    pickle.dump(cb_opt, file)

In [None]:
# Record the Optuna-tuned CatBoost metrics in the results table.
save_and_display_results('Catboost_optuna', metrics)

## DummyClassifier

In [96]:
# Seeded dummy classifier with the 'uniform' strategy, used as a reference point.
dc = DummyClassifier(strategy='uniform', random_state=RANDOM_STATE)

In [None]:
# Score the dummy classifier on the training split itself.
dc.fit(train[X], train[y])
dummy_pred = dc.predict(train[X])
dummy_score = weighted_accuracy(train[y].values.ravel(), dummy_pred)
print(f'Weighted accuracy for dummy classifier (uniform) = {dummy_score}')

## Inference

Модель логистической регрессии показывает лучший результат `weighted_accuracy` в совокупности с максимальным `f1_score`.  
Проверим модель Logistic_regression на тестовой выборке

In [None]:
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open('../models/Logistic_regression.pkl', 'rb') as file:
    best_model = pickle.load(file)

# `train[y]` / `test[y]` are single-column DataFrames. The labels are flattened to 1-D
# so that `weighted_accuracy` pairs each true label with exactly one prediction instead
# of broadcasting an (n, 1) array against an (n,) array.
y_train_flat = train[y].values.ravel()
y_test_flat = test[y].values.ravel()

best_model.fit(train[X], y_train_flat)
pred = best_model.predict(test[X])
pred_proba = best_model.predict_proba(test[X])

print(classification_report(y_test_flat, pred))

roc_auc, accuracy, precision, recall, f1, weighted_acc = calc_metrics(y_test_flat, pred, pred_proba)

print(f'{"lr"} | {roc_auc = :.3f} | {weighted_acc = :.3f}')

matrix = confusion_matrix(y_test_flat, pred)
plt.figure(figsize=(10, 6))
sns.heatmap(
    matrix,
    annot=True,
    xticklabels=['Good', 'Bad'],
    yticklabels=['Good', 'Bad'],
    cmap='gnuplot',
    fmt='g',
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

Метрики классификации на тестовой выборке не претерпели значительного уменьшения, что свидетельствует об отсутствии переобучения.

Осуществим подбор порога на обучающей выборке с максимизацией weighted_accuracy

In [None]:
best_wa = float('-inf')
best_thr = 0.5
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open('../models/Logistic_regression.pkl', 'rb') as file:
    best_model = pickle.load(file)

# Flatten the single-column target so every score below compares one true label with
# one prediction; a column-shaped target would broadcast inside `weighted_accuracy`.
y_train_flat = train[y].values.ravel()

best_model.fit(train[X], y_train_flat)
pred_proba = best_model.predict_proba(train[X])
# Candidate thresholds: every distinct positive-class probability seen on the training split.
thrs = np.unique(pred_proba[:, 1])

for thr in thrs:
    pred = (pred_proba[:, 1] >= thr).astype(int)
    wa = weighted_accuracy(y_train_flat, pred)
    # Strict comparison keeps the lowest threshold among ties (thrs is sorted ascending).
    if wa > best_wa:
        best_wa = wa
        best_thr = thr
print(f'{best_wa = }, {best_thr = }')


In [None]:
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open('../models/Logistic_regression.pkl', 'rb') as file:
    best_model = pickle.load(file)

# Flatten the single-column targets so the custom metric is computed per sample rather
# than on an (n, n) broadcast of true labels against predictions.
y_train_flat = train[y].values.ravel()
y_test_flat = test[y].values.ravel()

best_model.fit(train[X], y_train_flat)
pred_proba = best_model.predict_proba(test[X])
# Apply the threshold selected on the training split instead of the default cut-off.
pred = (pred_proba[:, 1] >= best_thr).astype(int)

print(classification_report(y_test_flat, pred))

roc_auc, accuracy, precision, recall, f1, weighted_acc = calc_metrics(y_test_flat, pred, pred_proba)

print(f'lr with optimized thr | {roc_auc = :.3f} | {weighted_acc = :.3f}')

matrix = confusion_matrix(y_test_flat, pred)
plt.figure(figsize=(10, 6))
sns.heatmap(
    matrix,
    annot=True,
    xticklabels=['Good', 'Bad'],
    yticklabels=['Good', 'Bad'],
    cmap='gnuplot',
    fmt='g',
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

## Feature Importance

In [None]:
# Permutation importance on the test split, measured with the custom scorer.
# The target is flattened to 1-D: the scorer receives y as passed here, and a
# single-column DataFrame would broadcast to an (n, n) weight matrix inside
# `weighted_accuracy`.
result = permutation_importance(
    best_model,
    test[X],
    test[y].values.ravel(),
    scoring=weighted_accuracy_scorer,
    random_state=RANDOM_STATE,
    n_repeats=50
)
importance_df = pd.DataFrame({'feature': test[X].columns, 'importance': result.importances_mean})
importance_df = importance_df.sort_values('importance', ascending=False)
# Show the 15 features with the largest mean importance.
display(importance_df[:15])


Наиболее важными являются признаки, описывающие состояние текущего и накопительного счетов, возраст, цель кредита, а также некоторые сгенерированные признаки.  

Для повышения качества рекомендуется обогатить датасет следующими данными:  

1. Кредитная история:  
    Суммарный показатель кредитоспособности, основанный на истории предыдущих кредитов, платежей, задолженностей.  
    Прямо отражает, насколько ответственно заявитель относится к финансовым обязательствам.  

2. Соотношение долга к доходу (Debt-to-Income Ratio, DTI):  
    Процент ежемесячного дохода, который идет на погашение долгов (кредиты, ипотека, алименты и др.).  
    Показывает финансовую нагрузку на заявителя и его способность погашать новый кредит.  
    Наш датасет обладает аналогичным признаком по запрашиваемому займу,  
    может быть полезным показатель DTI по всем имеющимся обязательствам.

3.  Стаж работы и доход:  
    Текущее место работы, должность, уровень  дохода.  
    Стабильный доход и работа - ключевые факторы, влияющие на способность погашать кредит.