In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool, cv, CatBoostRegressor
from sklearn.model_selection import train_test_split
import datetime
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, RobustScaler, MinMaxScaler, OrdinalEncoder, MaxAbsScaler, Normalizer
from boruta import BorutaPy
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedShuffleSplit, KFold
from tqdm.notebook import tqdm
import optuna
from optuna.samplers import TPESampler
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate
from sksurv.ensemble import GradientBoostingSurvivalAnalysis

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [4]:
import time

In [5]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
from lifelines.utils import concordance_index
def metric_ottoc(event_times, predicted_scores, event_observed=None):
    """Return an orientation-independent concordance index.

    Thin wrapper around ``lifelines.utils.concordance_index``. The raw index
    ``c`` is folded to ``max(c, 1 - c)``, so a perfectly inverted ranking
    scores the same as a perfectly aligned one.

    Parameters
    ----------
    event_times : array-like
        Actual event times.
    predicted_scores : array-like
        Predicted event times / scores to rank against ``event_times``.
    event_observed : array-like, optional
        Censoring status flags, forwarded unchanged to lifelines. ``None``
        (the default) is passed through as-is.

    Returns
    -------
    float
        ``max(c, 1 - c)`` where ``c`` is the lifelines concordance index.
    """
    # Forward the caller's censoring flags instead of a hard-coded None, so
    # the `event_observed` argument actually takes effect.
    c_index = concordance_index(
        event_times, predicted_scores, event_observed=event_observed
    )
    return max(c_index, 1 - c_index)


def corr(data):
    """Draw an annotated heatmap of ``data`` on a new 15x15 figure.

    ``data`` is handed straight to ``seaborn.heatmap``; cell values are
    annotated with two decimals. Nothing is returned.
    """
    heatmap_style = {
        'annot': True,
        'cmap': 'coolwarm',
        'fmt': '.2f',
        'linewidths': 0.5,
    }
    plt.figure(figsize=(15, 15))
    sns.heatmap(data, **heatmap_style)
    plt.title('Correlation Matrix of Features')


# Модель

In [7]:
# --- Load the raw tables -----------------------------------------------------
df_client = pd.read_csv('clients.csv')
df_currency = pd.read_csv('currency_rk.csv')
df_mcc_codes = pd.read_csv('mcc_codes.csv')
df_report_dates = pd.read_csv('report_dates.csv')
df_transactions = pd.read_csv('transactions.csv')
df_train = pd.read_csv('train.csv')

# Aggregation presets (not used inside this cell).
fun_agg = ['count',  'sum','median','std','nunique']
fun_agg_min = ['sum','median','count']

# --- Enrich the transaction table --------------------------------------------
# Label each transaction by the sign of its amount. Vectorised comparison;
# a NaN amount compares False and is therefore labelled 'negative'.
df_transactions['sign'] = np.where(
    df_transactions['transaction_amt'] >= 0, 'positive', 'negative'
)

# Attach the report date to every client, then the client attributes to
# every transaction.
df_client = df_client.merge(df_report_dates, on='report')
df_transactions = df_transactions.merge(df_client, on='user_id')

# Collapse the currency code to a binary flag: 1 for code 1, 0 for anything else.
df_transactions['currency_rk'] = df_transactions['currency_rk'].eq(1).astype(int)

# Parse the timestamp once and derive the calendar features from it.
df_transactions['transaction_dttm'] = pd.to_datetime(df_transactions['transaction_dttm'])
# NOTE(review): day-of-year repeats across calendar years and the displayed
# rows span 2022-2023, so transactions from different years can share a
# `day` value - confirm this is intended for the per-day grouping below.
df_transactions['day'] = df_transactions['transaction_dttm'].dt.day_of_year.astype(int)
df_transactions['date'] = df_transactions['transaction_dttm'].dt.date

# Whole days between the transaction date and the client's report date,
# computed on midnight-normalised timestamps instead of per-row Python
# date objects. `report_dt` itself is still stored as plain dates.
report_ts = pd.to_datetime(df_transactions['report_dt'])
df_transactions['report_dt'] = report_ts.dt.date
df_transactions['delta_date'] = (
    report_ts.dt.normalize() - df_transactions['transaction_dttm'].dt.normalize()
).dt.days

# Missing employer-size category gets an explicit placeholder label.
df_transactions['employee_count_nm'] = df_transactions['employee_count_nm'].fillna('НЕИЗВЕСТНО')
df_transactions

Unnamed: 0,user_id,mcc_code,currency_rk,transaction_amt,transaction_dttm,sign,report,employee_count_nm,bankemplstatus,customer_age,report_dt,day,date,delta_date
0,3,3,1,-183.883957,2022-01-28 12:05:33,negative,2,ОТ 101 ДО 500,0,3,2022-08-31,28,2022-01-28,215
1,3,3,1,-3206.437012,2022-01-28 12:52:30,negative,2,ОТ 101 ДО 500,0,3,2022-08-31,28,2022-01-28,215
2,3,16,1,-153866.890625,2022-02-16 14:45:56,negative,2,ОТ 101 ДО 500,0,3,2022-08-31,47,2022-02-16,196
3,3,56,1,-15144.601562,2022-03-09 19:58:29,negative,2,ОТ 101 ДО 500,0,3,2022-08-31,68,2022-03-09,175
4,3,0,1,5297.908691,2022-03-12 18:11:31,positive,2,ОТ 101 ДО 500,0,3,2022-08-31,71,2022-03-12,172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13075018,562740,155,1,-2484.366211,2023-03-20 11:52:09,negative,12,НЕИЗВЕСТНО,0,0,2023-06-30,79,2023-03-20,102
13075019,562740,9,1,-187.658463,2023-03-20 12:10:22,negative,12,НЕИЗВЕСТНО,0,0,2023-06-30,79,2023-03-20,102
13075020,562740,1,1,-891.933350,2023-03-20 15:53:37,negative,12,НЕИЗВЕСТНО,0,0,2023-06-30,79,2023-03-20,102
13075021,562740,13,1,-464.467316,2023-03-20 15:54:49,negative,12,НЕИЗВЕСТНО,0,0,2023-06-30,79,2023-03-20,102


In [8]:
# Combined feature table
t = pd.read_csv('features_X_train__v3.csv')

# Separate the label column from the predictors.
y = t['target']
X = t.drop(columns=['target'])

# Object-dtype columns are the ones treated as categorical.
cat_col = list(X.select_dtypes(include=['object']).columns)
y.shape, X.shape

((64000,), (64000, 1154))

In [9]:
# Stratified hold-out split: 20% of the rows go to the test part, fixed seed.
split_options = dict(test_size=0.20, random_state=142, stratify=y)
x_train, x_test, y_train, y_test = train_test_split(X, y, **split_options)

In [10]:
# Number of distinct MCC codes per user per `day` value.
t_d = df_transactions.groupby(['user_id', 'day'])['mcc_code'].nunique()
# Alternative (disabled): number of transactions per user per day.
# t_d = df_transactions.groupby(['user_id','day'])['day'].count()

# Keep only user-days with more than 11 distinct MCC codes, then restrict
# to users that appear in the training list.
over_threshold = t_d > 11
t1 = t_d[over_threshold].unstack()
t2 = df_train[['user_id']].merge(t1, on='user_id')

# Combined feature table, re-read from disk so this cell starts from the
# full set of rows.
t = pd.read_csv('features_X_train__v3.csv')

# Drop the users flagged above and every user already placed in the
# hold-out split.
is_flagged = t['user_id'].isin(t2['user_id'])
is_holdout = t['user_id'].isin(x_test['user_id'])
t = t[~(is_flagged | is_holdout)]

y = t['target']
X = t.drop(columns=['target'])

# Object-dtype columns are the ones treated as categorical.
cat_col = list(X.select_dtypes(include=['object']).columns)

# The filtered table becomes the training set; x_test / y_test are untouched.
x_train = X
y_train = y
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((51123, 1154), (51123,), (12800, 1154), (12800,))

In [11]:

                       
# Disabled experiment, kept for reference: append 35 extra copies of the
# positive-class rows to the training data, then shuffle and reindex.
# NOTE(review): `x_train.loc[y_train[y_train == 1]]` indexes by the label
# values rather than by the index of the positive rows - confirm before
# re-enabling.
# print(x_train.shape)
# x_concat = []
# y_concat = []
# for i in tqdm(range(35)):
#     x_x = x_train.loc[y_train[y_train == 1]]
#     y_x = y_train[y_train == 1]
#     x_concat.append(x_x)
#     y_concat.append(y_x)


# x_train = pd.concat([x_train] + x_concat)
# y_train = pd.concat([y_train] + y_concat)
# x_train.index = range(x_train.shape[0])
# y_train.index = range(x_train.shape[0])
# x_train = x_train.sample(frac=1)
# y_train = y_train.loc[x_train.index]
# x_train.index = range(x_train.shape[0])
# y_train.index = range(x_train.shape[0])
                    


# print(x_train.shape[0], y_train.shape[0])

In [None]:
def objective(trial):
    """Optuna objective: fit one CatBoost classifier and score it on the hold-out.

    Hyperparameters are drawn from ``trial``. The model is fitted on the
    notebook-level ``x_train`` / ``y_train``, and the returned value is the
    lifelines concordance index between ``y_test`` and the predicted
    probability of class 1 on ``x_test``.

    NOTE(review): the score is computed on the same ``x_test`` / ``y_test``
    split created earlier in the notebook, so hyperparameters are selected
    against that split.
    """
    # Fixed settings shared by every trial.
    settings = dict(verbose=False, random_seed=142, eval_metric="AUC")

    # Sampled settings. The suggest_* calls are kept in this exact order;
    # with a seeded sampler the call order may affect which values are drawn.
    settings['iterations'] = trial.suggest_int("iterations", 1000, 2000)
    settings['colsample_bylevel'] = trial.suggest_float("colsample_bylevel", 0.01, 0.2)
    settings['depth'] = trial.suggest_int("depth", 5, 8)
    settings['bootstrap_type'] = trial.suggest_categorical("bootstrap_type", ["Bernoulli", "MVS"])
    settings['l2_leaf_reg'] = trial.suggest_float("l2_leaf_reg", 1, 10)
    settings['subsample'] = trial.suggest_float("subsample", 0.1, 1)
    settings['scale_pos_weight'] = trial.suggest_float('scale_pos_weight', 0.1, 30, log=True)
    settings['random_strength'] = trial.suggest_float("random_strength", 0.1, 1.7)
    settings['learning_rate'] = trial.suggest_float("learning_rate", 0.01, 0.1)
    settings['min_data_in_leaf'] = trial.suggest_int("min_data_in_leaf", 1, 12)

    # Settings added after sampling (not part of the search space).
    settings['cat_features'] = cat_col
    settings['boosting_type'] = "Plain"
    # Disabled alternatives: task_type="GPU"; a "Bayesian" bootstrap_type
    # with a sampled bagging_temperature; sampling scale_pos_weight only for
    # a Logloss objective.

    model = CatBoostClassifier(**settings)
    model.fit(x_train, y_train)

    positive_proba = model.predict_proba(x_test)[:, 1]
    return concordance_index(y_test, positive_proba)
    # Disabled alternative: score with 5-fold cross_validate on
    # x_train / y_train (scoring='roc_auc') and return the mean test score.


# Trials are stored in a local SQLite file; with load_if_exists=True an
# existing study of the same name is reused instead of being recreated.
sqlite_db = "sqlite:///sqlite_cat.db"
study_name = "binary_classification_Cat_4.9"

# Seeded multivariate TPE sampler with 30 start-up trials.
tpe_sampler = TPESampler(n_startup_trials=30, multivariate=True, seed=142)
study = optuna.create_study(
    storage=sqlite_db,
    study_name=study_name,
    sampler=tpe_sampler,
    direction="maximize",
    load_if_exists=True,
)

# Run the search, then report the best score and hyperparameters found.
study.optimize(objective, n_trials=1000)
print(f"best optimized roc_auc: {study.best_value:0.5f}")
print(f"best hyperparameters: {study.best_params}")
study.best_params

[I 2024-04-14 06:21:10,240] Using an existing study with name 'binary_classification_Cat_4.9' instead of creating a new one.


In [None]:
print(f"best optimized roc_auc: {study.best_value:0.5f}")

# Best trial's sampled values plus the two settings objective() adds after
# sampling.
# NOTE(review): the constants objective() sets before sampling (verbose,
# random_seed, eval_metric) are presumably not part of study.best_params and
# so are absent here - confirm whether a refit should set them explicitly.
params = {**study.best_params, 'cat_features': cat_col, 'boosting_type': "Plain"}
print(f"best hyperparameters: {params}")

params
