<a href="https://colab.research.google.com/github/V-L-A-P-P/BankruptcyPrediction/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Library imports

In [None]:
!pip install catboost
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score




# Data loading

In [None]:
# Labeled dataset of financial indicators, read from train.csv in the working directory
data = pd.read_csv('train.csv')

# Data review

In [None]:
# Dimensions of the loaded table as (rows, columns)
data.shape

(733271, 125)

In [None]:
# Preview the first rows of the loaded table (head() defaults to five rows)
data.head()

Unnamed: 0,ID_FIRM,P1110_B,P1110_E,P1120_B,P1120_E,P1130_B,P1130_E,P1140_B,P1140_E,P1150_B,...,P2421_E,P2430_B,P2430_E,P2450_B,P2450_E,P2460_B,P2460_E,P2400_B,P2400_E,BANKR
0,1,8,7,0,0,0,0,0,0,402,...,161,0,0,0,0,0,0,1561,621,0
1,2,0,0,3318,3318,0,0,0,0,208809,...,0,0,0,0,0,0,8551,-15296,-16123,0
2,3,0,0,0,0,0,0,0,0,237,...,0,0,0,0,0,6,0,-2166,-1375,0
3,4,0,0,0,0,0,0,0,0,15428,...,-55,0,22,0,4,0,0,-3390,502,0
4,5,0,0,0,0,0,0,0,0,340249,...,-895,0,11,0,0,105,0,26131,6100,0


# Feature Engineering

In [None]:
def add_financial_features(df):
    """
    Return a copy of ``df`` extended with 13 derived financial features.

    The input frame is left untouched. Every ratio feature is NaN on rows
    where its denominator equals zero. ``revenue_growth_pct`` divides by the
    absolute value of ``P2110_B``; ``net_debt_change`` and ``cash_flow_change``
    are plain differences between the ``_E`` and ``_B`` variants of a column.

    NOTE(review): ``_B`` / ``_E`` presumably mark beginning- and end-of-period
    values, and the section names below follow the original author's grouping;
    confirm against the dataset description. ``altman_z_score`` applies the
    coefficients 1.2 / 1.4 / 3.3 / 0.6 / 0.999 to the column ratios written in
    the code, which may not match the textbook definition term for term.

    Columns added, in this order: equity_to_assets_ratio, debt_to_capital,
    current_ratio, quick_ratio, cash_ratio, roa, ros, roe, revenue_growth_pct,
    net_debt_change, cash_flow_change, altman_z_score, receivables_turnover.
    """
    out = df.copy()

    def ratio(top, bottom):
        # Element-wise top / bottom, replaced by NaN wherever bottom equals zero.
        return np.where(bottom != 0, top / bottom, np.nan)

    # Columns that appear in several formulas.
    p1700_e = out['P1700_E']
    p1500_e = out['P1500_E']
    p1250_e = out['P1250_E']
    p2110_e = out['P2110_E']
    p2400_e = out['P2400_E']

    # Averages of the _B and _E values, used as denominators below.
    mean_p1600 = (out['P1600_B'] + out['P1600_E']) / 2
    mean_p1300 = (out['P1300_B'] + out['P1300_E']) / 2
    mean_p1230 = (out['P1230_B'] + out['P1230_E']) / 2

    # Denominators for the composite score: zeros become NaN so the division
    # yields NaN instead of +/-inf.
    p1700_e_nonzero = p1700_e.replace(0, np.nan)
    p1500_e_nonzero = p1500_e.replace(0, np.nan)

    # Every feature is computed from the input columns only (none depends on
    # another derived feature), so they can all be built before assignment.
    derived = {
        # 1. Financial stability
        'equity_to_assets_ratio': ratio(out['P1300_E'], p1700_e),
        'debt_to_capital': ratio(out['P1400_E'] + p1500_e, p1700_e),
        # 2. Liquidity
        'current_ratio': ratio(out['P1200_E'], p1500_e),
        'quick_ratio': ratio(p1250_e + out['P1230_E'], p1500_e),
        'cash_ratio': ratio(p1250_e, p1500_e),
        # 3. Profitability
        'roa': ratio(p2400_e, mean_p1600),
        'ros': ratio(p2400_e, p2110_e),
        'roe': ratio(p2400_e, mean_p1300),
        # 4. Dynamics
        'revenue_growth_pct': ratio(p2110_e - out['P2110_B'], out['P2110_B'].abs()),
        'net_debt_change': (out['P1410_E'] + out['P1510_E']) - (out['P1410_B'] + out['P1510_B']),
        'cash_flow_change': p1250_e - out['P1250_B'],
        # 5. Structural indicators
        'altman_z_score': (
            1.2 * (out['P1200_E'] / p1700_e_nonzero) +
            1.4 * (out['P1370_E'] / p1700_e_nonzero) +
            3.3 * (out['P2200_E'] / p1700_e_nonzero) +
            0.6 * (out['P1300_E'] / p1500_e_nonzero) +
            0.999 * (p2110_e / p1700_e_nonzero)),
        'receivables_turnover': ratio(p2110_e, mean_p1230),
    }

    # Dict insertion order fixes the order of the appended columns.
    for column, values in derived.items():
        out[column] = values

    return out

# Rebind `data` to a copy that also carries the derived feature columns.
data = add_financial_features(data)
# Disabled experiments, kept for reference.
# Restrict to the first 50 entries of importance_df.feature plus the id/target columns.
# NOTE(review): `importance_df` is not defined in any cell shown above this one.
#data = data[list(importance_df.feature)[:50] + ['ID_FIRM', 'BANKR']]
# Drop the P1100..P1500 _B/_E columns.
#data = data.drop(columns=['P1100_B', 'P1100_E', 'P1200_B', 'P1200_E', 'P1300_B', 'P1300_E',
#           'P1400_B', 'P1400_E', 'P1500_B', 'P1500_E'])


# Selection of best parameters

In [None]:


def weighted_accuracy(y_true, y_pred):
    """
    Class-weighted accuracy for binary 0/1 labels.

    Samples of class 1 are weighted by n_neg / n_pos (the ratio of class-0 to
    class-1 counts in ``y_true``); every other sample has weight 1. With both
    classes present this equals the mean of the two per-class recalls.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth labels. Lists, NumPy arrays and pandas Series are accepted.
    y_pred : array-like of shape (n_samples,)
        Predicted labels.

    Returns
    -------
    float
        Weighted fraction of correct predictions. If ``y_true`` lacks either
        class, the class ratio is undefined and plain (unweighted) accuracy is
        returned instead of raising ZeroDivisionError.
    """
    labels = np.asarray(y_true)
    n_neg = int(np.count_nonzero(labels == 0))
    n_pos = int(np.count_nonzero(labels == 1))

    if n_neg == 0 or n_pos == 0:
        # A single-class slice (e.g. a CV fold without positives) has no
        # meaningful class ratio; fall back to unweighted accuracy.
        return accuracy_score(y_true, y_pred)

    # Vectorised weights: n_neg / n_pos for positives, 1.0 otherwise.
    sample_weights = np.where(labels == 1, n_neg / n_pos, 1.0)
    return accuracy_score(y_true, y_pred, sample_weight=sample_weights)

# Feature matrix: every column except the firm identifier and the target.
X = data.drop(['ID_FIRM', 'BANKR'], axis=1)
y = data.BANKR

# Hold out 20% of the rows as a test set. The split is stratified on the target
# so the rare positive class keeps the same share in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Base estimator; the grid below varies `iterations` and `scale_pos_weight`.
# NOTE(review): CatBoost documents `min_data_in_leaf` as applying to the
# Lossguide and Depthwise grow policies only, and no grow_policy is set here;
# confirm it has the intended effect.
model = CatBoostClassifier(verbose=100,
                           eval_metric='BalancedAccuracy',
                           depth=5,
                           learning_rate=0.01,
                           l2_leaf_reg=5,
                           min_data_in_leaf=10)

# Commented-out entries are candidate dimensions that are currently disabled.
param_grid = {
    #'depth': [4, 5, 6],
    #'learning_rate': [0.01, 0.05],
    'iterations': [1000, 2000],
    'scale_pos_weight': [238, 300],
    #'l2_leaf_reg': [3, 5],
    #'min_data_in_leaf': [5, 10]
}

# 3-fold cross-validated search scored with the class-weighted accuracy defined
# above; refit=True retrains the best configuration on all of X_train.
# NOTE(review): n_jobs=-1 runs several CatBoost fits at once, each of which is
# itself multi-threaded by default; consider capping thread_count or n_jobs
# if the search is slow or runs out of memory.
grid_search = GridSearchCV(estimator=model,
                         param_grid=param_grid,
                         cv=3,
                         scoring=make_scorer(weighted_accuracy),
                         n_jobs=-1,
                         refit=True)

# No early_stopping_rounds here: GridSearchCV passes no eval_set to the
# estimator, so CatBoost would have no validation data to monitor, and
# stopping early would also blur the comparison between `iterations` values.
grid_search.fit(X_train, y_train,
               verbose=100)

print("Лучшие параметры:", grid_search.best_params_)
print("Лучший score:", grid_search.best_score_)


KeyboardInterrupt: 

# Model fitting

In [None]:
# Feature matrix: every column except the firm identifier and the target.
X = data.drop(['ID_FIRM', 'BANKR'], axis=1)
y = data.BANKR

# Hold out 20% of the rows as a test set. The split is stratified on the target
# so the rare positive class keeps the same share in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Early stopping needs data the model is not trained on. Take a validation
# subset out of the training part so the test set stays untouched until scoring.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

from sklearn.utils.class_weight import compute_class_weight

# "Balanced" class weights computed over the full labeled table.
# NOTE(review): `weights` is not referenced again in this cell (the model uses a
# fixed scale_pos_weight); it is kept because later cells may read it.
weights = compute_class_weight(class_weight="balanced", classes=np.unique(data.BANKR), y=data.BANKR)

model = CatBoostClassifier(
    iterations=2000,       # Upper bound on the number of boosting iterations (trees)
    learning_rate=0.01,    # Learning rate
    depth=5,               # Tree depth
    eval_metric='BalancedAccuracy',     # Metric reported and monitored on the validation set
    verbose=100,           # Log every 100 iterations
    scale_pos_weight=300,  # Weight multiplier for the positive class
    min_data_in_leaf=5,
    grow_policy='Lossguide',
    l2_leaf_reg=10,        # An earlier variant of this cell used 3
    random_strength=0.5,   # Randomisation for robustness to noise
    bootstrap_type='Bayesian',  # Bayesian bootstrap
)

# Stop once the validation metric has not improved for 50 consecutive
# iterations. The stop is triggered by eval_set; without one, the
# early_stopping_rounds argument has nothing to monitor.
model.fit(
    X_fit, y_fit,
    eval_set=(X_val, y_val),
    early_stopping_rounds=50,
)

0:	learn: 0.8301085	total: 2.26s	remaining: 37m 39s


KeyboardInterrupt: 

# Score estimation