Библиотека PyOD

In [1]:
from pyod.models.abod import ABOD
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.mcd import MCD
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
# Notebook-level seed, presumably meant for the stochastic steps below —
# TODO confirm which cells actually consume it.
random_state = 42

In [2]:
from pyod.models.xgbod import XGBOD

In [3]:
import os
import pickle
from tqdm import trange
from base import data, metrics

import warnings
warnings.filterwarnings("ignore")

# Dataset location.
# Raw string literal: the Windows path contains backslash pairs (\W, \M, \D, \B)
# that are not valid escape sequences. A plain literal only works by accident
# and triggers an invalid-escape warning; the r-prefix yields the same value
# without relying on that behaviour.
path = r'D:\Work_dir\Master_work\Data\Bank_Account_Fraud_Dataset_Suite'
# Folder where fitted models are stored.
baf_path = 'BAF_model'

In [4]:
import pandas as pd

In [5]:
# Load the dataset through the project helper; it hands back six objects that
# are unpacked as feature/label pairs for the train, validation and test parts.
(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
) = data.prep_data_baf(path)

In [6]:
# Sanity check: (rows, columns) of the training features.
X_train.shape

(675666, 61)

In [7]:
from imblearn.under_sampling import RandomUnderSampler

In [8]:
# Random undersampler used to balance the training classes. Seeded from the
# notebook-level `random_state` constant instead of repeating the literal 42,
# so the seed is configured in exactly one place.
US = RandomUnderSampler(random_state=random_state)

In [9]:
# Resample the training split only; the validation and test splits are not passed in.
# NOTE(review): the `_sm` suffix is kept because later cells reference these names.
X_sm, y_sm = US.fit_resample(X_train, y_train)

In [10]:
# XGBOD configuration, grouped by purpose and passed as keyword arguments.
# `silent` is forwarded as in the original call; the recorded fit output shows
# XGBoost reporting that this parameter is not used.
xgbod_params = dict(
    # Detectors used to build extra features; None leaves the choice to XGBOD
    # (the fitted repr in the cell output lists the ones it picked).
    estimator_list=None,
    standardization_flag_list=None,
    # Boosting setup.
    objective='binary:logistic',
    booster='gbtree',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    base_score=0.5,
    # Tree growth constraints and regularisation.
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    # Row / column sampling.
    subsample=1,
    colsample_bytree=1,
    colsample_bylevel=1,
    # Execution settings.
    n_jobs=1,
    nthread=1,
    silent=False,
    random_state=0,
)
clf = XGBOD(**xgbod_params)

In [11]:
%%time
# Train on the resampled training data; as the last expression of the cell,
# the fitted estimator's repr is shown in the output.
clf.fit(X_sm, y_sm)

Parameters: { "silent" } are not used.

Wall time: 4min 9s


XGBOD(base_score=0.5, booster='gbtree', colsample_bylevel=1,
   colsample_bytree=1,
   estimator_list=[KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='largest',
  metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=1, p=2,
  radius=1.0), LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
  metric_params=None, n_jobs=1, n_neighbors=1, no...ax_features=1.0,
    max_samples='auto', n_estimators=200, n_jobs=1, random_state=0,
    verbose=0)],
   gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
   min_child_weight=1, n_estimators=100, n_jobs=1, nthread=1,
   objective='binary:logistic', random_state=0, reg_alpha=0, reg_lambda=1,
   scale_pos_weight=1, silent=False,
   standardization_flag_list=[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, True, True, True, True, True, True, True, True, True, True, True, False, False, False, False, Fal

In [12]:
# Model predictions for the test split.
# NOTE(review): despite the `x_` prefix this holds predictions, not features;
# the name is kept because later cells reference it.
x_pred = clf.predict(X_test)

In [13]:
# Print the evaluation metrics for the test predictions using the helper
# imported from the local `base` package.
metrics.show_metrics(y_test.values, x_pred)

True positive rate (recall):  0.7810979847116053
False positive rate:  0.19115631786991732
Precision:  0.05498080074351261
Recall:  0.7810979847116053
True negative rate:  0.8088436821300826
ROC-AUC:  0.794970833420844
F1:  0.10273049240260482
MCC:  0.17369783564689292
G-mean:  0.7948497782968312


In [14]:
# Save the fitted model.
# Create the target folder first so that a fresh checkout (where `baf_path`
# does not exist yet) does not fail with FileNotFoundError on open().
os.makedirs(baf_path, exist_ok=True)
with open(f'{baf_path}/XGBOD', 'wb') as fp:
    pickle.dump(clf, fp)

In [17]:
# Initialise the results table: one row per model, one column per metric.
# The placeholder is a float (0.0) so the columns are float64 from the start;
# an integer placeholder would create int64 columns that later receive float
# metrics, which recent pandas versions flag as an incompatible-dtype assignment.
results = pd.DataFrame(
    data=0.0,
    index=['XGBOD_BAF'],
    columns=['Recall_not_fr', 'Recall_fr', 'AUC', 'F1', 'MCC', 'G_mean'],
)
results

Unnamed: 0,Recall_not_fr,Recall_fr,AUC,F1,MCC,G_mean
XGBOD_BAF,0,0,0,0,0,0


In [18]:
# Compute the six summary metrics with the project helper and store them in
# the 'XGBOD_BAF' row of the results table, keyed by column name.
recall_nf, recall_f, auc, f1, mcc, g_mean = metrics.write_metrics(y_test, x_pred)
tmp = dict(
    Recall_not_fr=recall_nf,
    Recall_fr=recall_f,
    AUC=auc,
    F1=f1,
    MCC=mcc,
    G_mean=g_mean,
)
results.loc['XGBOD_BAF'] = tmp

In [19]:
# Display the results table after the metrics have been written into it.
results

Unnamed: 0,Recall_not_fr,Recall_fr,AUC,F1,MCC,G_mean
XGBOD_BAF,0.808844,0.781098,0.794971,0.10273,0.173698,0.79485


In [20]:
# Move the model name from the index into a regular 'index' column so it is
# kept when the table is exported without its index.
# Guarded and non-inplace: re-running the cell is a no-op instead of adding a
# stray 'level_0' column on every extra execution.
if 'index' not in results.columns:
    results = results.reset_index()
results

Unnamed: 0,index,Recall_not_fr,Recall_fr,AUC,F1,MCC,G_mean
0,XGBOD_BAF,0.808844,0.781098,0.794971,0.10273,0.173698,0.79485


In [21]:
# Export the results table to 'baf_results.xlsx' (relative path) without
# writing the DataFrame index.
results.to_excel('baf_results.xlsx', index=False)