In [228]:
from hyperopt import hp, Trials, fmin, tpe
from sklearn import metrics
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
import xgboost as xgb
import numpy as np
from functools import partial
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report

In [229]:
with open('../data/train_df.pkl', 'rb') as handle:
    train_df = pickle.load(handle)

with open('../data/test_df.pkl', 'rb') as handle:
    test_df = pickle.load(handle)

In [230]:
print(test_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4500 entries, 0 to 4499
Data columns (total 33 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   inn                                4500 non-null   int64  
 1   data_ОКВЭД_Код                     4500 non-null   object 
 2   data_Налоги_СумУпл                 4500 non-null   float64
 3   data_Налоги_СумНедоим              4500 non-null   float64
 4   data_УстКап_Сумма                  4500 non-null   float64
 5   data_Санкции                       4500 non-null   int64  
 6   data_НелегалФин                    4500 non-null   int64  
 7   Возраст_компании                   4500 non-null   int64  
 8   Недостоверный_адрес                4500 non-null   int64  
 9   Особый_налоговый_режим             4500 non-null   int64  
 10  Массовый_учредитель                4500 non-null   int64  
 11  Есть_банкротство                   4500 non-null   int64

In [231]:
X, y = train_df.drop(columns=['target']), train_df['target']
X['data_ОКВЭД_Код'] = X['data_ОКВЭД_Код'].apply(lambda x: int(x.split('.')[0]))

In [232]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)


In [233]:
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)

In [234]:
space = {'max_depth': hp.quniform("max_depth", 3, 18, 1), 
        'gamma': hp.uniform('gamma', 1, 9), 
        'reg_alpha': hp.quniform('reg_alpha', 40, 180, 1),
        'reg_lambda': hp.uniform('reg_lambda', 0, 1), 
        'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
        'min_child_weight': hp.quniform('min_child_weight', 0, 50, 1),
        'n_estimators': 180,
        'seed': 0
    }

In [235]:
def objective(space, X_train, y_train, X_test, y_test): 
    clf = xgb.XGBClassifier(
        n_estimators = int(space['n_estimators']),
        max_depth = int(space['max_depth']),
        gamma = space['gamma'],
        reg_alpha = int(space['reg_alpha']),
        reg_lambda = space['reg_lambda'],
        colsample_bytree = float(space['colsample_bytree']),
        min_child_weight = int(space['min_child_weight']),
        scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1]),
        early_stopping_rounds=10
    )

    evalution = [(X_train, y_train), (X_test, y_test)]

    clf.fit(X_train, y_train, 
            eval_set=evalution,
            verbose=False,
           )
    
    pred_proba = clf.predict_proba(X_test)[:,1]
    roc_auc = roc_auc_score(y_test, pred_proba)
    

    return -roc_auc

In [236]:
trials = Trials()

objective_with_data = partial(objective, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

best_hyperparametrs = fmin(fn = objective_with_data,
                           space = space,
                           algo= tpe.suggest,
                           max_evals=100,
                           trials=trials,
                        )

best_hyperparametrs

100%|██████████| 100/100 [00:09<00:00, 10.90trial/s, best loss: -0.8780663967611335]


{'colsample_bytree': np.float64(0.5576412103528469),
 'gamma': np.float64(2.6280355100681327),
 'max_depth': np.float64(17.0),
 'min_child_weight': np.float64(30.0),
 'reg_alpha': np.float64(45.0),
 'reg_lambda': np.float64(0.9787834967133361)}

In [237]:
scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

final_params = {
    'n_estimators': 180, 
    'max_depth': int(best_hyperparametrs['max_depth']),
    'gamma': float(best_hyperparametrs['gamma']),
    'reg_alpha': int(best_hyperparametrs['reg_alpha']),
    'reg_lambda': float(best_hyperparametrs['reg_lambda']),
    'colsample_bytree': float(best_hyperparametrs['colsample_bytree']),
    'min_child_weight': int(best_hyperparametrs['min_child_weight']),
    'scale_pos_weight': scale_pos_weight 
}

In [238]:
final_model = xgb.XGBClassifier(**final_params)

final_model.fit(X_train, y_train)

predictions = final_model.predict(X_test)
accuracy = metrics.accuracy_score(y_test, predictions > 0.5)

print(f"Точность на тестовых данных: {accuracy}")

Точность на тестовых данных: 0.8614285714285714


In [239]:
print("Confusion Matrix:")
conf_matrix = confusion_matrix(y_test, predictions)
print(conf_matrix)

print("ROC-AUC Score:")
roc_auc = roc_auc_score(y_test, predictions)
print(roc_auc)

print("Gini Coefficient:")
gini = 2 * roc_auc - 1
print(gini)

print("Отчет:")
print(classification_report(y_test, predictions))

Confusion Matrix:
[[1468  157]
 [ 134  341]]
ROC-AUC Score:
0.8106396761133603
Gini Coefficient:
0.6212793522267206
Отчет:
              precision    recall  f1-score   support

           0       0.92      0.90      0.91      1625
           1       0.68      0.72      0.70       475

    accuracy                           0.86      2100
   macro avg       0.80      0.81      0.81      2100
weighted avg       0.86      0.86      0.86      2100



In [None]:
test_df['data_ОКВЭД_Код'] = test_df['data_ОКВЭД_Код'].apply(lambda x: int(x.split('.')[0]))

In [253]:
X_test_final = test_df
real_predictions = final_model.predict(X_test_final)
real_predictions

array([0, 0, 1, ..., 0, 0, 1])

In [250]:
real_predictions.size
test_df.shape

(4500, 33)

In [255]:
result = pd.DataFrame({'inn': test_df['inn'],
                       'target': real_predictions})

In [256]:
print("\nРаспределение предсказанных классов в тестовом наборе:")
#print(results_df['predicted_class'].value_counts(normalize=True) * 100)

print("\nСтатистика вероятностей:")
#print(results_df['probability_class_1'].describe())

result.to_csv('../results/test_predictions.csv', index=False)


Распределение предсказанных классов в тестовом наборе:

Статистика вероятностей:


In [242]:
with open('../results/checkpoints_V0.pkl', 'wb') as handle:
    pickle.dump(final_model, handle)