In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool, cv, CatBoostRegressor
from sklearn.model_selection import train_test_split
import datetime
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, RobustScaler, MinMaxScaler, OrdinalEncoder, MaxAbsScaler, Normalizer
from boruta import BorutaPy
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedShuffleSplit, KFold
from tqdm.notebook import tqdm
import optuna
from optuna.samplers import TPESampler
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate
from sksurv.ensemble import GradientBoostingSurvivalAnalysis

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [5]:
import time

In [6]:
from lazypredict.Supervised import LazyClassifier
from sklearn.datasets import load_breast_cancer

In [7]:
import warnings
warnings.filterwarnings('ignore')

In [8]:
from lifelines.utils import concordance_index

# Модель

In [9]:
# Load the raw tables (paths are relative to the notebook's working directory).
df_client = pd.read_csv('clients.csv')
df_currency = pd.read_csv('currency_rk.csv')
df_mcc_codes = pd.read_csv('mcc_codes.csv')
df_report_dates = pd.read_csv('report_dates.csv')
df_transactions = pd.read_csv('transactions.csv')
df_train = pd.read_csv('train.csv')

# Aggregation-function lists; defined here but not used inside this cell.
fun_agg = ['count',  'sum','median','std','nunique']
fun_agg_min = ['sum','median','count']

# Direction label: amounts >= 0 are 'positive', everything else (NaN included,
# because the comparison is False for NaN) is 'negative'.
# Vectorized replacement for a per-row lambda; the resulting values are the same.
df_transactions['sign'] = np.where(df_transactions['transaction_amt'] >= 0, 'positive', 'negative')

# Attach the report date to every client, then the client attributes to every
# transaction (both are default inner joins).
df_client = df_client.merge(df_report_dates, on='report')
df_transactions = df_transactions.merge(df_client, on='user_id')

# Binary currency flag: 1 where currency_rk == 1, otherwise 0.
# A plain comparison avoids element-wise DataFrame.map, which is only
# available in newer pandas releases.
df_transactions['currency_rk'] = (df_transactions['currency_rk'] == 1).astype('int64')

# Parse both timestamp columns exactly once.
df_transactions['transaction_dttm'] = pd.to_datetime(df_transactions['transaction_dttm'])
report_ts = pd.to_datetime(df_transactions['report_dt'])

# Calendar features. 'day' is the day of the year, so it does not distinguish years.
df_transactions['day'] = df_transactions['transaction_dttm'].dt.day_of_year.astype(int)
df_transactions['date'] = df_transactions['transaction_dttm'].dt.date
df_transactions['report_dt'] = report_ts.dt.date

# Whole days between the transaction date and the report date. Both sides are
# truncated to midnight first, which matches subtracting the two date columns.
# NOTE(review): assumes both timestamp columns are timezone-naive - confirm.
df_transactions['delta_date'] = (
    report_ts.dt.normalize() - df_transactions['transaction_dttm'].dt.normalize()
).dt.days

# Missing employee-count bucket gets an explicit placeholder category.
df_transactions['employee_count_nm'] = df_transactions['employee_count_nm'].fillna('НЕИЗВЕСТНО')
df_transactions

Unnamed: 0,user_id,mcc_code,currency_rk,transaction_amt,transaction_dttm,sign,report,employee_count_nm,bankemplstatus,customer_age,report_dt,day,date,delta_date
0,3,3,1,-183.88,2022-01-28 12:05:33,negative,2,ОТ 101 ДО 500,0,3,2022-08-31,28,2022-01-28,215
1,3,3,1,-3206.44,2022-01-28 12:52:30,negative,2,ОТ 101 ДО 500,0,3,2022-08-31,28,2022-01-28,215
2,3,16,1,-153866.89,2022-02-16 14:45:56,negative,2,ОТ 101 ДО 500,0,3,2022-08-31,47,2022-02-16,196
3,3,56,1,-15144.60,2022-03-09 19:58:29,negative,2,ОТ 101 ДО 500,0,3,2022-08-31,68,2022-03-09,175
4,3,0,1,5297.91,2022-03-12 18:11:31,positive,2,ОТ 101 ДО 500,0,3,2022-08-31,71,2022-03-12,172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13075018,562740,155,1,-2484.37,2023-03-20 11:52:09,negative,12,НЕИЗВЕСТНО,0,0,2023-06-30,79,2023-03-20,102
13075019,562740,9,1,-187.66,2023-03-20 12:10:22,negative,12,НЕИЗВЕСТНО,0,0,2023-06-30,79,2023-03-20,102
13075020,562740,1,1,-891.93,2023-03-20 15:53:37,negative,12,НЕИЗВЕСТНО,0,0,2023-06-30,79,2023-03-20,102
13075021,562740,13,1,-464.47,2023-03-20 15:54:49,negative,12,НЕИЗВЕСТНО,0,0,2023-06-30,79,2023-03-20,102


In [10]:
# Combined feature table; it also carries the 'target' column.
t = pd.read_csv('features_X_train__v3.csv')

y = t['target']
X = t.drop(columns='target')

# Object-dtype columns are treated as the categorical features.
cat_col = list(X.select_dtypes(include='object').columns)

(y.shape, X.shape)

((64000,), (64000, 1154))

In [11]:
# Stratified 80/20 hold-out split over the non-categorical feature columns.
x_train, x_test, y_train, y_test = train_test_split(
    X.drop(columns=cat_col),
    y,
    test_size=0.20,
    random_state=142,
    stratify=y,
)

In [12]:
# Number of distinct MCC codes per (user, day) pair.
# (An earlier variant counted transactions per day instead; it is no longer used.)
t_d = df_transactions.groupby(['user_id', 'day'])['mcc_code'].nunique()
# Keep only the pairs with more than 11 distinct codes, pivoted to one row per user.
t1 = t_d.loc[t_d > 11].unstack()
# Users from df_train that have at least one such day.
t2 = df_train[['user_id']].merge(t1, on='user_id')

# Reload the combined feature table, then drop the users flagged above together
# with every user that already belongs to the hold-out set.
t = pd.read_csv('features_X_train__v3.csv')
is_excluded = t['user_id'].isin(t2['user_id']) | t['user_id'].isin(x_test['user_id'])
t = t[~is_excluded]
y = t['target']
X = t.drop(columns='target')

# Object-dtype columns are treated as the categorical features.
cat_col = list(X.select_dtypes(include='object').columns)

# Every remaining row is used for training (non-categorical columns only).
x_train, y_train = X.drop(columns=cat_col), y
(y.shape, X.shape)

((51123,), (51123, 1154))

In [13]:
# Hard-coded XGBoost configuration: nine hyperparameter values (presumably taken
# from an earlier tuning run) followed by the static booster / objective /
# device / verbosity settings.
best_xgb_params_1 = dict(
    n_estimators=1023,
    learning_rate=0.0114825040369148,
    gamma=0.031623548362161816,
    subsample=0.9285796556489303,
    colsample_bytree=0.15689596063977834,
    max_depth=5,
    min_child_weight=1,
    reg_lambda=0.4990288418087669,
    reg_alpha=0.005279274212785588,
    booster='gbtree',
    objective='binary:logistic',
    device='cuda',
    verbosity=0,
)
# 'enable_categorical' is deliberately left unset here.




In [15]:
def objective(trial):
    """Optuna objective: train one XGBClassifier and score it on the hold-out set.

    Parameters
    ----------
    trial
        Trial object supplied by Optuna; its ``suggest_int`` / ``suggest_float``
        methods are used to sample the hyperparameters.

    Returns
    -------
    The concordance index between ``y_test`` and the predicted positive-class
    probability for ``x_test``.

    Notes
    -----
    Reads the module-level ``x_train``, ``y_train``, ``x_test`` and ``y_test``.
    NOTE(review): hyperparameters are selected against the same hold-out set that
    is later used for reporting, so the reported score is likely optimistic.
    An earlier 5-fold cross-validated ``roc_auc`` variant has been removed.
    """
    # Search space; the order of the suggest calls is kept unchanged.
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 600, 5000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'gamma': trial.suggest_float('gamma', 1e-9, 0.5),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'max_depth': trial.suggest_int('max_depth', 1, 30),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 100),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 100.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, 100.0, log=True),
    }

    # Settings that are the same for every trial.
    fixed = {
        'booster': 'gbtree',
        'grow_policy': 'depthwise',
        'objective': 'binary:logistic',
        'tree_method': 'hist',
        'device': 'cuda',
        'verbosity': 0,
    }

    model = XGBClassifier(**sampled, **fixed)
    model.fit(x_train, y_train)

    positive_proba = model.predict_proba(x_test)[:, 1]
    return concordance_index(y_test, positive_proba)

# The study is stored in a local SQLite file and reloaded by name when it
# already exists, so an interrupted search can be continued.
sqlite_db = "sqlite:///xgb_sqlite.db"
study_name = "binary_classification_XGB_3.5"
study = optuna.create_study(storage=sqlite_db, study_name=study_name,
                            sampler=TPESampler(n_startup_trials=50, multivariate=True, seed=142),
                            direction="maximize", load_if_exists=True)

# Maximise the value returned by objective() (concordance index on the hold-out set).
study.optimize(objective, n_trials=5000)
print(f"best optimized roc_auc: {study.best_value:0.5f}")
print(f"best hyperparameters: {study.best_params}")

# Best sampled hyperparameters plus the fixed settings used during the search.
# The fixed settings must go into best_cls_params: the name 'params' exists only
# inside objective(), so assigning to it here raised NameError.
best_cls_params = study.best_params
best_cls_params['objective'] = 'binary:logistic'
best_cls_params['grow_policy'] = 'depthwise'
best_cls_params["device"] = "cuda"
best_cls_params["tree_method"] = "hist"
best_cls_params["verbosity"] = 0

[I 2024-04-05 19:49:22,637] Using an existing study with name 'binary_classification_XGB_3.5' instead of creating a new one.


KeyboardInterrupt: 

In [16]:
# Summarise the best trial found so far, then show its parameters as the cell output.
print(
    f"best optimized roc_auc: {study.best_value:0.5f}",
    f"best hyperparameters: {study.best_params}",
    sep="\n",
)
study.best_params

best optimized roc_auc: 0.77697
best hyperparameters: {'n_estimators': 853, 'learning_rate': 0.013661226489861992, 'gamma': 0.11696969571136726, 'subsample': 0.8049967773710163, 'colsample_bytree': 0.3339055630223215, 'max_depth': 7, 'min_child_weight': 20, 'reg_lambda': 3.889298224761356e-08, 'reg_alpha': 3.2391819292617063e-06}


{'n_estimators': 853,
 'learning_rate': 0.013661226489861992,
 'gamma': 0.11696969571136726,
 'subsample': 0.8049967773710163,
 'colsample_bytree': 0.3339055630223215,
 'max_depth': 7,
 'min_child_weight': 20,
 'reg_lambda': 3.889298224761356e-08,
 'reg_alpha': 3.2391819292617063e-06}

In [None]:
# Final model configuration: the study's best hyperparameters followed by the
# same fixed settings that objective() applies to every trial.
best_xgb_params_1 = {
    **study.best_params,
    'booster': 'gbtree',
    'grow_policy': 'depthwise',
    'objective': 'binary:logistic',
    'tree_method': 'hist',
    'device': 'cuda',
    'verbosity': 0,
}
    

In [None]:
# Train the classifier on the training rows; fit() returns the estimator itself,
# so the trailing expression shows the same fitted model as the cell output.
model_xgb = XGBClassifier(**best_xgb_params_1).fit(x_train, y_train)
model_xgb

In [None]:
# Concordance index of the predicted positive-class probability on the hold-out set.
holdout_proba = model_xgb.predict_proba(x_test)[:, 1]
print(f'Concordance Index: {concordance_index(y_test, holdout_proba)}')

In [None]:
# Text classification report for the predicted hold-out labels.
holdout_pred = model_xgb.predict(x_test)
print(classification_report(y_test, holdout_pred))

In [None]:
# Refit a fresh model on the full filtered frame.
# x_test was built with the object-dtype columns in cat_col removed, so the model
# must be trained on the same column set; fitting on X directly would keep those
# columns whenever cat_col is non-empty. With an empty cat_col this is unchanged.
model_xgb = XGBClassifier(**best_xgb_params_1)
model_xgb.fit(X.drop(columns=cat_col), y)

In [None]:
# Same hold-out concordance check, now for the refitted model.
refit_proba = model_xgb.predict_proba(x_test)[:, 1]
refit_c_index = concordance_index(y_test, refit_proba)
print(f'Concordance Index: {refit_c_index}')

In [None]:
# Same hold-out classification report, now for the refitted model.
refit_pred = model_xgb.predict(x_test)
refit_report = classification_report(y_test, refit_pred)
print(refit_report)