In [1]:
!pip install optuna
!pip install lifelines

Collecting lifelines
  Downloading lifelines-0.28.0-py3-none-any.whl.metadata (3.2 kB)
Collecting autograd>=1.5 (from lifelines)
  Downloading autograd-1.6.2-py3-none-any.whl.metadata (706 bytes)
Collecting autograd-gamma>=0.3 (from lifelines)
  Downloading autograd-gamma-0.5.0.tar.gz (4.0 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting formulaic>=0.2.2 (from lifelines)
  Downloading formulaic-1.0.1-py3-none-any.whl.metadata (6.1 kB)
Collecting interface-meta>=1.2.0 (from formulaic>=0.2.2->lifelines)
  Downloading interface_meta-1.3.0-py3-none-any.whl.metadata (6.7 kB)
Downloading lifelines-0.28.0-py3-none-any.whl (349 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m349.2/349.2 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading autograd-1.6.2-py3-none-any.whl (49 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.3/49.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading formulaic-1.0.1-py3-none-

In [2]:
import gc
import pickle
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from lifelines.utils import concordance_index
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
import optuna

In [3]:
clients = pd.read_csv('/kaggle/input/data-fusion-contest-2024-2/clients.csv')
report_dates = pd.read_csv('/kaggle/input/data-fusion-contest-2024-2/report_dates.csv', parse_dates=['report_dt'])
train = pd.read_csv('/kaggle/input/data-fusion-contest-2024-2/train.csv')
transactions = pd.read_csv('/kaggle/input/data-fusion-contest-2024-2/transactions.csv.zip/transactions.csv', 
                           parse_dates=['transaction_dttm'])

In [4]:
clients = clients.merge(report_dates, how='left', on='report')
transactions = transactions.sort_values('transaction_dttm').reset_index(drop=True)
transactions = transactions.merge(clients, how='left', on='user_id')

# 1.

In [5]:
transactions['trans_positive'] = np.where( transactions['transaction_amt']>0,transactions['transaction_amt'],np.nan)
transactions['trans_negative'] = np.where( transactions['transaction_amt']<0,
                                          np.abs(transactions['transaction_amt']),np.nan)
transactions['days_to_report'] = (transactions['report_dt'] - transactions['transaction_dttm']).dt.days

In [6]:
min_day_trans = transactions.groupby(['user_id'])[['days_to_report']].min().\
    rename(columns={"days_to_report":"min_day_trans"}).reset_index()
max_day_trans = transactions.groupby(['user_id'])[['days_to_report']].max().\
    rename(columns={"days_to_report":"max_day_trans"}).reset_index()
count_trans = transactions.groupby(['user_id'])[['days_to_report']].count().\
    rename(columns={"days_to_report":"count_trans"}).reset_index()
nunique_days = transactions.groupby('user_id')[['days_to_report']].nunique().\
    rename({'days_to_report': 'nunique_days'}, axis=1).reset_index()

# 2.

In [7]:
transactions = transactions.merge(min_day_trans, how='left', on='user_id')

In [8]:
transactions['days_groups'] = 0
transactions.loc[transactions['days_to_report']<=transactions['min_day_trans']+10, 'days_groups'] = 10
transactions.loc[transactions['days_to_report']<=transactions['min_day_trans']+5, 'days_groups'] = 5
transactions.loc[transactions['days_to_report']==transactions['min_day_trans'], 'days_groups'] = 1

In [9]:
trans_days_groups = transactions[~transactions['days_groups'].isin([0])].pivot_table(
    index = 'user_id',
    values=['trans_positive', 'trans_negative'],
    columns=['days_groups'],
    aggfunc=['count','sum', 'max', 'min']
)
trans_days_groups.columns = [f'days_groups_{x[0]}_{x[1]}_{x[2]}' for x in trans_days_groups.columns]
trans_days_groups.reset_index(inplace=True)

In [10]:
trans_cur_groups = transactions.pivot_table(
    index = 'user_id',
    values=['trans_positive', 'trans_negative'],
    columns=['currency_rk'],
    aggfunc=['count','sum', 'max', 'min']
)
trans_cur_groups.columns = [f'cur_groups_{x[0]}_{x[1]}_{x[2]}' for x in trans_cur_groups.columns]
trans_cur_groups.reset_index(inplace=True)

# 3.

In [11]:
mcc_days_groups = transactions[~transactions['days_groups'].isin([0])].pivot_table(
    index = 'user_id',
    values=['mcc_code'],
    columns=['days_groups'],
    aggfunc=["count",'nunique']
)
mcc_days_groups.columns = [f'days_groups_mcc_{x[0]}_{x[1]}_{x[2]}' for x in mcc_days_groups.columns]
mcc_days_groups.reset_index(inplace=True)

In [12]:
count_mcc_code = transactions.mcc_code.value_counts().to_frame().reset_index()

count_mcc_code20000 = np.array(count_mcc_code[(count_mcc_code['count']>20000)&
                                             (count_mcc_code['count']<1000000)].mcc_code)

mcc_code_dumm20000 = pd.get_dummies(transactions[transactions['mcc_code'].isin(count_mcc_code20000)].\
                               set_index('user_id')['mcc_code'])
mcc_code_dumm20000.columns = [f'mcc_count_{x}' for x in mcc_code_dumm20000.columns]
mcc_code_dumm20000 = mcc_code_dumm20000.groupby(['user_id']).agg('sum').reset_index()

In [13]:
count_mcc_code = transactions.mcc_code.value_counts().to_frame().reset_index()

count_mcc_code20000 = np.array(count_mcc_code[(count_mcc_code['count']>20000)&
                                             (count_mcc_code['count']<1000000)].mcc_code)

mcc_code_dumm20000 = pd.get_dummies(transactions[transactions['mcc_code'].isin(count_mcc_code20000)].\
                               set_index('user_id')['mcc_code'])
mcc_code_dumm20000.columns = [f'mcc_count_{x}' for x in mcc_code_dumm20000.columns]
mcc_code_dumm20000 = mcc_code_dumm20000.groupby(['user_id']).agg('sum').reset_index()

In [14]:
negative_mcc_code_dumn_20000 = transactions[transactions['mcc_code'].isin(count_mcc_code20000)][['user_id', 'trans_negative', 'mcc_code']]
negative_mcc_code_dumn_20000 = negative_mcc_code_dumn_20000.pivot_table(values="trans_negative",
    index="user_id",
    columns="mcc_code",
    aggfunc='sum',
    fill_value=0,)
negative_mcc_code_dumn_20000.columns = list(
    map(lambda x: "negative_mcc_code_sum_"+str(x), negative_mcc_code_dumn_20000.columns))

In [15]:
positive_mcc_code_dumn_20000 = transactions[transactions['mcc_code'].isin(count_mcc_code20000)][['user_id', 'trans_positive', 'mcc_code']]
positive_mcc_code_dumn_20000 = positive_mcc_code_dumn_20000.pivot_table(values="trans_positive",
    index="user_id",
    columns="mcc_code",
    aggfunc='sum',
    fill_value=0,)
positive_mcc_code_dumn_20000.columns = list(
    map(lambda x: "positive_mcc_code_sum_"+str(x), positive_mcc_code_dumn_20000.columns))

In [16]:
count_mcc_code10000 = np.array(count_mcc_code[count_mcc_code['count']>=1000000].mcc_code)
mcc_code_dumm10000 = transactions[transactions['mcc_code'].isin(count_mcc_code10000)][['user_id','mcc_code','trans_positive','trans_negative']]
mcc_code_dumm10000 = mcc_code_dumm10000.groupby(['user_id']).agg(
    {
        'mcc_code': 'count',
        'trans_positive' : 'sum',
        'trans_negative' : 'sum'
    }).reset_index().rename(columns={'mcc_code':'mcc_count_big', 'trans_positive':'trans_positive_big', 'trans_negative':'trans_negative_big'})

In [17]:
count_mcc_code10000 = np.array(count_mcc_code[count_mcc_code['count']<=20000].mcc_code)
mcc_code_dumm00001 = transactions[transactions['mcc_code'].isin(count_mcc_code10000)][['user_id','mcc_code','trans_positive','trans_negative']]
mcc_code_dumm00001 = mcc_code_dumm00001.groupby(['user_id']).agg(
    {
        'mcc_code': 'count',
        'trans_positive' : 'sum',
        'trans_negative' : 'sum'        
    }).reset_index().rename(columns={'mcc_code':'mcc_count_small', 'trans_positive':'trans_positive_small', 'trans_negative':'trans_negative_small'})

# 4.

In [18]:
percent_last_negative = clients[['user_id']].copy()
for x in [3, 30, 60, 90]:
    prev = transactions[transactions['days_to_report'] > x + 100].groupby('user_id')['trans_negative'].agg(['sum']).\
    reset_index().rename({'sum': f'sum_transaction_before_{x}_days'}, axis=1)
    last = transactions[transactions['days_to_report'] <= x + 100].groupby('user_id')['trans_negative'].agg(['sum']).\
    reset_index().rename({'sum': f'sum_transaction_last_{x}_days'}, axis=1)

    percent_last_negative = percent_last_negative.merge(prev, how='left', on='user_id')
    percent_last_negative = percent_last_negative.merge(last, how='left', on='user_id')
    percent_last_negative[f'sum_transaction_last_{x}_days'].fillna(.000001, inplace=True)
    percent_last_negative[f'sum_transaction_before_{x}_days'].fillna(0.000001, inplace=True)
    
    percent_last_negative[f'negative_sum_percent_last_{x}'] = np.log((percent_last_negative[f'sum_transaction_last_{x}_days'] / \
    percent_last_negative[f'sum_transaction_before_{x}_days'])*100)
    percent_last_negative[f'negative_sum_percent_last_{x}'] = percent_last_negative[f'negative_sum_percent_last_{x}'].replace(np.inf, 100)
    percent_last_negative[f'negative_sum_percent_last_{x}'] = percent_last_negative[f'negative_sum_percent_last_{x}'].replace(-np.inf, -100)
    percent_last_negative.drop(f'sum_transaction_last_{x}_days', inplace=True, axis=1)
    percent_last_negative.drop(f'sum_transaction_before_{x}_days', inplace=True, axis=1)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  percent_last_negative[f'sum_transaction_last_{x}_days'].fillna(.000001, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  percent_last_negative[f'sum_transaction_before_{x}_days'].fillna(0.000001, inplace=True)
  result = getattr(ufunc, method)(*inputs, **kwargs)
The 

In [19]:
percent_last_positive = clients[['user_id']].copy()
for x in [3, 30, 60, 90]:
    prev = transactions[transactions['days_to_report'] > x + 100].groupby('user_id')['trans_positive'].agg(['sum']).\
    reset_index().rename({'sum': f'sum_transaction_before_{x}_days'}, axis=1)
    last = transactions[transactions['days_to_report'] <= x + 100].groupby('user_id')['trans_positive'].agg(['sum']).\
    reset_index().rename({'sum': f'sum_transaction_last_{x}_days'}, axis=1)

    percent_last_positive = percent_last_positive.merge(prev, how='left', on='user_id')
    percent_last_positive = percent_last_positive.merge(last, how='left', on='user_id')
    percent_last_positive[f'sum_transaction_last_{x}_days'].fillna(.000001, inplace=True)
    percent_last_positive[f'sum_transaction_before_{x}_days'].fillna(0.000001, inplace=True)
    
    percent_last_positive[f'positive_sum_percent_last_{x}'] = np.log((percent_last_positive[f'sum_transaction_last_{x}_days'] / \
    percent_last_positive[f'sum_transaction_before_{x}_days'])*100)
    percent_last_positive[f'positive_sum_percent_last_{x}'] = percent_last_positive[f'positive_sum_percent_last_{x}'].replace(np.inf, 100)
    percent_last_positive[f'positive_sum_percent_last_{x}'] = percent_last_positive[f'positive_sum_percent_last_{x}'].replace(-np.inf, -100)
    percent_last_positive.drop(f'sum_transaction_last_{x}_days', inplace=True, axis=1)
    percent_last_positive.drop(f'sum_transaction_before_{x}_days', inplace=True, axis=1)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  percent_last_positive[f'sum_transaction_last_{x}_days'].fillna(.000001, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  percent_last_positive[f'sum_transaction_before_{x}_days'].fillna(0.000001, inplace=True)
  result = getattr(ufunc, method)(*inputs, **kwargs)
The 

In [20]:
percent_last = clients[['user_id']].copy()
for x in [3, 30, 60, 90]:
    prev = transactions[transactions['days_to_report'] > x + 100].groupby('user_id')['report_dt'].agg(['count']).\
    reset_index().rename({'count': f'num_transaction_before_{x}_days'}, axis=1)
    last = transactions[transactions['days_to_report'] <= x + 100].groupby('user_id')['report_dt'].agg(['count']).\
    reset_index().rename({'count': f'num_transaction_last_{x}_days'}, axis=1)

    percent_last = percent_last.merge(prev, how='left', on='user_id')
    percent_last = percent_last.merge(last, how='left', on='user_id')
    percent_last[f'num_transaction_last_{x}_days'].fillna(.000001, inplace=True)
    percent_last[f'num_transaction_before_{x}_days'].fillna(0.000001, inplace=True)
    
    percent_last[f'percent_last_{x}'] = (percent_last[f'num_transaction_last_{x}_days'] / \
    percent_last[f'num_transaction_before_{x}_days'])*100
    percent_last.drop(f'num_transaction_last_{x}_days', inplace=True, axis=1)
    percent_last.drop(f'num_transaction_before_{x}_days', inplace=True, axis=1)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  percent_last[f'num_transaction_last_{x}_days'].fillna(.000001, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  percent_last[f'num_transaction_before_{x}_days'].fillna(0.000001, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work

# 5.

In [21]:
transactions['m'] = (transactions['report_dt'].dt.year-transactions['transaction_dttm'].dt.year)*12+(transactions['report_dt'].dt.month-transactions['transaction_dttm'].dt.month)

In [22]:
msumm = transactions.pivot_table(values="trans_negative",
    index="user_id",
    columns="m",
    aggfunc='sum',
    fill_value=0,)
msumm.columns = list( map(lambda x: "msumm"+str(x), msumm.columns ) )
msumm

Unnamed: 0_level_0,msumm3,msumm4,msumm5,msumm6,msumm7,msumm8,msumm9
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,0.000000,0.000000,15144.601562,153866.890625,3390.320969,0.000000,0.000000
9,13289.485764,104223.187939,109800.056656,54146.384140,18500.321304,12387.577271,11087.653740
13,16394.193359,27095.248047,27650.769531,76186.732422,33908.388672,53024.051758,19224.679688
37,32719.820328,41364.700729,67081.604048,53322.076138,69769.034760,50056.823341,28284.328693
41,8045.445801,17708.008820,30354.633301,17462.199585,18175.117676,16841.208984,0.000000
...,...,...,...,...,...,...,...
562043,0.000000,142.056610,0.000000,2984.231926,11080.026783,15374.940796,0.000000
562205,10452.779190,7546.495855,6601.707232,4821.073084,6082.155629,5607.360139,1349.243835
562312,0.000000,2644.634823,1362.274536,3165.544842,3938.301605,4495.630554,2931.434910
562721,20904.990967,12082.776123,27648.169678,64254.888449,30059.568512,42680.736959,764.794891


In [23]:
clients = pd.read_csv('/kaggle/input/data-fusion-contest-2024-2/clients.csv')
df = clients.merge(
    train, on="user_id", how="left").merge(
    min_day_trans, on="user_id", how="left").merge(
    max_day_trans, on="user_id", how="left").merge(
    count_trans, on="user_id", how="left").merge(
    nunique_days, on="user_id", how="left").merge(
    trans_days_groups, on="user_id", how="left").merge(
    trans_cur_groups, on="user_id", how="left").merge(
    mcc_days_groups, on="user_id", how="left").merge(
    mcc_code_dumm20000, on="user_id", how="left").merge(
    negative_mcc_code_dumn_20000, on="user_id", how="left").merge(
    positive_mcc_code_dumn_20000, on="user_id", how="left").merge(
    mcc_code_dumm10000, on="user_id", how="left").merge(
    mcc_code_dumm00001, on="user_id", how="left").merge(
    percent_last, on="user_id", how="left").merge(
    percent_last_negative, on="user_id", how="left").merge(
    percent_last_positive, on="user_id", how="left").merge(
    msumm, on="user_id", how="left")
df['pl_days_trans']=(df['max_day_trans']-df['min_day_trans'])/df['nunique_days']
df['pl_count_trans']=df['count_trans']/df['nunique_days']

In [24]:
df.replace({'employee_count_nm':{'ОТ 101 ДО 500':1,'БОЛЕЕ 1001':2,'ОТ 501 ДО 1000':3,'ДО 10':4,
                                      'ОТ 11 ДО 50':5,'ОТ 51 ДО 100':6,'БОЛЕЕ 500':7,'ОТ 11 ДО 30':8,
                                      'ОТ 31 ДО 50':9}}, inplace=True)

  df.replace({'employee_count_nm':{'ОТ 101 ДО 500':1,'БОЛЕЕ 1001':2,'ОТ 501 ДО 1000':3,'ДО 10':4,


In [25]:
df['label'] = np.where(df['target']==0, -df['time'], df['time'])
df['time'] = df['time'].fillna(-1)
df['time'] = df['time'].astype(np.int32)
df['target'] = df['target'].fillna(-1)
df['target'] = df['target'].astype(np.int8)

In [26]:
df.head()

Unnamed: 0,user_id,report,employee_count_nm,bankemplstatus,customer_age,target,time,min_day_trans,max_day_trans,count_trans,...,msumm3,msumm4,msumm5,msumm6,msumm7,msumm8,msumm9,pl_days_trans,pl_count_trans,label
0,3,2,1.0,0,3,0,77,108,214,11,...,0.0,0.0,15144.601562,153866.890625,3390.320969,0.0,0.0,13.25,1.375,-77.0
1,9,1,2.0,0,3,-1,-1,102,283,90,...,13289.485764,104223.187939,109800.056656,54146.38414,18500.321304,12387.577271,11087.65374,3.351852,1.666667,
2,13,6,3.0,0,2,0,86,114,282,22,...,16394.193359,27095.248047,27650.769531,76186.732422,33908.388672,53024.051758,19224.679688,9.333333,1.222222,-86.0
3,37,5,2.0,0,2,0,89,104,283,315,...,32719.820328,41364.700729,67081.604048,53322.076138,69769.03476,50056.823341,28284.328693,1.376923,2.423077,-89.0
4,41,1,1.0,0,2,0,57,103,256,16,...,8045.445801,17708.00882,30354.633301,17462.199585,18175.117676,16841.208984,0.0,12.75,1.333333,-57.0


In [27]:
train_columns = [col for col in df.columns if col not in ['user_id', 'report_dt', 'label', 'target', 
                                                          'time', 'count_trans']]

In [28]:
df_train = df[(df['time']!=-1)].copy()
X = df_train[train_columns]
y = df_train[['label','time','target']]
y_label = df_train[['label']]

In [100]:
def fit_xgboost(trial, train, val):
    X_train_cur, y_train_cur = train
    X_val_cur, y_val_cur = val

    param = {
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.01),
        "max_depth": trial.suggest_int("max_depth", 4, 16),
        "min_child_weight": trial.suggest_int("min_child_weight", 6, 20),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.8),
        "gamma": trial.suggest_int("gamma", 1, 5),
        "reg_lambda": trial.suggest_float("reg_lambda",1.0, 3.0),
        "reg_alpha": trial.suggest_float("reg_alpha",1.0, 3.0),
        "subsample": trial.suggest_float("subsample", 0.75, 1.0),
        "max_bin": trial.suggest_categorical("max_bin", [128, 256, 512])
    }

    model = xgb.XGBRegressor(objective="survival:cox",
                             random_state=458,
                             tree_method = "hist",
                             n_estimators=10000,
                             device="cuda",
                             **param)
    
    model.fit(X_train_cur,
              y_train_cur[['label']],
              early_stopping_rounds=500, 
              eval_set=[(X_val_cur, y_val_cur[['label']])],
              verbose=1000)

    y_pred = model.predict(X_val_cur)
    
    return model, y_pred

In [98]:
def objective(trial, return_models=False):
    
    n_splits=5
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    scores, models = [], []
    
    for train_index, val_index in skf.split(X, y_label):
        train_data = X.iloc[train_index], y.iloc[train_index]
        valid_data = X.iloc[val_index], y.iloc[val_index]
        
        model, y_pred = fit_xgboost(trial, train_data, valid_data)
        
        scores.append(concordance_index(valid_data[1].time, -y_pred, valid_data[1].target))
        models.append(model)

    result = np.mean(scores)

    if return_models:
        return result, models
    else:
        return result

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective,
               n_trials=300,
               n_jobs = -1,
               show_progress_bar=True,)

In [103]:
study.best_params

{'learning_rate': 0.0010119044289866191,
 'max_depth': 11,
 'min_child_weight': 6,
 'colsample_bylevel': 0.0964029721202083,
 'gamma': 1,
 'reg_lambda': 1.05415902329896,
 'reg_alpha': 1.2690885998724977,
 'subsample': 0.7696892304354142,
 'max_bin': 512}

[5000]	validation_0-cox-nloglik:8.74261
[6000]	validation_0-cox-nloglik:8.74087
[5000]	validation_0-cox-nloglik:8.65179
[7000]	validation_0-cox-nloglik:8.74006
[6000]	validation_0-cox-nloglik:8.65034


In [None]:
learning_rates = np.linspace(0.005, 0.001, iters).tolist()

In [47]:
n_splits=5
scores = []
models = []

learning_rates = np.linspace(0.0035, 0.0015, 8000).tolist()
scheduler = xgb.callback.LearningRateScheduler(learning_rates)

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=33)

for train_index, val_index in skf.split(X, y_label):
    
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    dtrain = xgb.DMatrix(X_train,
                         label=y_train[['label']],
                         nthread=-1,
                         enable_categorical=True,)

    dtest = xgb.DMatrix(X_val,
                        y_val[['label']],
                        nthread=-1,
                        enable_categorical=True)

    params = {
        "objective": "survival:cox",
        "random_state": 458,
        "reg_lambda": 1.5,
        "reg_alpha":1.4,
        "subsample": .8,
        "colsample_bytree": .3,
        "gamma": 3,
        "min_child_weight": 16,
        "max_depth": 6, #max_depth=8
        "learning_rate": 0.003,#learning_rate=0.003,
        "tree_method": "hist",
        "max_bin": 512,
        #"n_estimators": 8000,#n_estimators=5000
        "device": "cuda"
    }
    
    model = xgb.train(params,
                    dtrain=dtrain,
                    early_stopping_rounds=500,
                    evals=[(dtrain, 'dtrain'), (dtest, 'dtest')],
                    #callbacks=[scheduler],
                    num_boost_round=8000,
                    verbose_eval=1000,
                    )
    
    models.append(model)

    predictions_xgb = model.predict(dtest)
    scores.append(concordance_index(y_val.time, -predictions_xgb, y_val.target))

np.mean(scores)

[0]	dtrain-cox-nloglik:10.65341	dtest-cox-nloglik:9.26792
[1000]	dtrain-cox-nloglik:9.93765	dtest-cox-nloglik:8.73830
[2000]	dtrain-cox-nloglik:9.79039	dtest-cox-nloglik:8.69916
[3000]	dtrain-cox-nloglik:9.69219	dtest-cox-nloglik:8.68811
[4000]	dtrain-cox-nloglik:9.60631	dtest-cox-nloglik:8.68443
[5000]	dtrain-cox-nloglik:9.53557	dtest-cox-nloglik:8.68359
[5936]	dtrain-cox-nloglik:9.47784	dtest-cox-nloglik:8.68382
[0]	dtrain-cox-nloglik:10.65400	dtest-cox-nloglik:9.26703
[1000]	dtrain-cox-nloglik:9.94280	dtest-cox-nloglik:8.72852
[2000]	dtrain-cox-nloglik:9.79211	dtest-cox-nloglik:8.68843
[3000]	dtrain-cox-nloglik:9.69203	dtest-cox-nloglik:8.67985
[4000]	dtrain-cox-nloglik:9.60742	dtest-cox-nloglik:8.67779
[5000]	dtrain-cox-nloglik:9.53439	dtest-cox-nloglik:8.67734
[5836]	dtrain-cox-nloglik:9.48269	dtest-cox-nloglik:8.67805
[0]	dtrain-cox-nloglik:10.65326	dtest-cox-nloglik:9.26765
[1000]	dtrain-cox-nloglik:9.94722	dtest-cox-nloglik:8.72233
[2000]	dtrain-cox-nloglik:9.80046	dtest-cox-nl

0.7763006319524732

In [29]:
n_splits=5
scores = []
models = []

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=33)

for train_index, val_index in skf.split(X, y_label):
    
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    model = xgb.XGBRegressor(objective="survival:cox",
                             random_state=458,
                             reg_lambda=1.5,
                             reg_alpha=1.4,
                             subsample=.8,
                             colsample_bytree=.3,
                             gamma=3,
                             min_child_weight=16,
                             max_depth=8, #max_depth=6
                             learning_rate=0.003,#learning_rate=0.005,
                             tree_method = "hist",
                             n_estimators=8000,#n_estimators=5000
                             device="cuda")
    model.fit(X_train,
              y_train[['label']],
              early_stopping_rounds=500, #early_stopping_rounds=400
              eval_set=[(X_val, y_val[['label']])],
              verbose=500,
              )
    
    models.append(model)

    predictions_xgb = model.predict(X_val)
    scores.append(concordance_index(y_val.time, -predictions_xgb, y_val.target))

np.mean(scores)



[0]	validation_0-cox-nloglik:9.26806
[500]	validation_0-cox-nloglik:8.80042
[1000]	validation_0-cox-nloglik:8.72356
[1500]	validation_0-cox-nloglik:8.69810
[2000]	validation_0-cox-nloglik:8.68763
[2500]	validation_0-cox-nloglik:8.68259
[3000]	validation_0-cox-nloglik:8.68074
[3500]	validation_0-cox-nloglik:8.67973
[4000]	validation_0-cox-nloglik:8.67937
[4500]	validation_0-cox-nloglik:8.67890
[4879]	validation_0-cox-nloglik:8.67930


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[0]	validation_0-cox-nloglik:9.26695




[500]	validation_0-cox-nloglik:8.78936
[1000]	validation_0-cox-nloglik:8.71470
[1500]	validation_0-cox-nloglik:8.69071
[2000]	validation_0-cox-nloglik:8.68026
[2500]	validation_0-cox-nloglik:8.67600
[3000]	validation_0-cox-nloglik:8.67423
[3500]	validation_0-cox-nloglik:8.67364
[4000]	validation_0-cox-nloglik:8.67389
[4226]	validation_0-cox-nloglik:8.67407
[0]	validation_0-cox-nloglik:9.26754




[500]	validation_0-cox-nloglik:8.78996
[1000]	validation_0-cox-nloglik:8.71122
[1500]	validation_0-cox-nloglik:8.68604
[2000]	validation_0-cox-nloglik:8.67631
[2500]	validation_0-cox-nloglik:8.67308
[3000]	validation_0-cox-nloglik:8.67225
[3461]	validation_0-cox-nloglik:8.67238
[0]	validation_0-cox-nloglik:9.26691




[500]	validation_0-cox-nloglik:8.78157
[1000]	validation_0-cox-nloglik:8.70213
[1500]	validation_0-cox-nloglik:8.67745
[2000]	validation_0-cox-nloglik:8.66759
[2500]	validation_0-cox-nloglik:8.66293
[3000]	validation_0-cox-nloglik:8.66045
[3500]	validation_0-cox-nloglik:8.65943
[4000]	validation_0-cox-nloglik:8.65899
[4500]	validation_0-cox-nloglik:8.65898
[4806]	validation_0-cox-nloglik:8.65934
[0]	validation_0-cox-nloglik:9.26762




[500]	validation_0-cox-nloglik:8.81788
[1000]	validation_0-cox-nloglik:8.74208
[1500]	validation_0-cox-nloglik:8.71776
[2000]	validation_0-cox-nloglik:8.70889
[2500]	validation_0-cox-nloglik:8.70391
[3000]	validation_0-cox-nloglik:8.70274
[3500]	validation_0-cox-nloglik:8.70090
[3998]	validation_0-cox-nloglik:8.70137


0.7768874993890824

In [40]:
for i, model in enumerate(models):
    file_name = f"xgb_{i}.pkl"
    pickle.dump(model, open(file_name, "wb"))

In [95]:
df_imp = pd.DataFrame(list(X_train), models[0].feature_importances_)
df_imp.columns = ["Feature_Names"]
df_imp["Importances"] = df_imp.index
df_imp = df_imp.sort_values(by = "Importances", ascending = False)
df_imp.index = np.arange(0,len(df_imp))
df_imp.head(50)

Unnamed: 0,Feature_Names,Importances
0,mcc_count_51,0.062791
1,positive_mcc_code_sum_51,0.06164
2,customer_age,0.033481
3,employee_count_nm,0.031282
4,cur_groups_count_trans_negative_0,0.015299
5,msumm3,0.01087
6,min_day_trans,0.010809
7,cur_groups_min_trans_negative_0,0.010523
8,days_groups_min_trans_positive_10,0.008301
9,trans_negative_small,0.008252


In [43]:
d_test = xgb.DMatrix(df[df['time']==-1][train_columns].copy(),
                    nthread=-1,
                    enable_categorical=True)

In [48]:
predictions = models[0].predict(d_test)
for i in range(1, len(models)):
    predictions += models[i].predict(d_test)
predictions /= 5
submit = df[df['time']==-1][['user_id']].copy()
submit['predict'] = predictions
submit.to_csv(f'submission_xgboost_6.csv',index=False)
submit

Unnamed: 0,user_id,predict
1,9,0.303870
9,61,0.259679
10,62,0.920720
17,80,0.114155
20,88,3.522721
...,...,...
95988,561362,0.988217
95990,561419,1.465799
95993,561895,1.022656
95994,561908,2.505330
