In [1]:
import pandas as pd
import xgboost as xgb
from myfunctions import amex_metric
import myfunctions

In [2]:
# Relative directory that the train/validation CSV files are read from.
path = '../../data/prepared/'

In [None]:
# Load the training split (features, then labels).
train_df = pd.read_csv(f'{path}train_data.csv')
train_labels = pd.read_csv(f'{path}train_labels.csv')

# Load the validation split (features, then labels).
valid_df = pd.read_csv(f'{path}val_data.csv')
valid_labels = pd.read_csv(f'{path}val_labels.csv')

In [None]:
# Every column name of the training frame, as a plain Python list.
column_list = train_df.columns.tolist()

In [None]:
# Columns treated as categorical; they are excluded from `numerical_columns`.
categorical_columns = [
    'B_30', 'B_38',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
    'D_63', 'D_64', 'D_66', 'D_68',
]

In [None]:
# Start from every non-categorical column ...
numerical_columns = [name for name in column_list
                     if name not in categorical_columns]

# ... then take out the customer key and the 'S_2' column, which are not
# aggregated as features. `.remove` raises ValueError if either is absent.
for excluded in ('customer_ID', 'S_2'):
    numerical_columns.remove(excluded)

In [7]:
def data_preparer(df, df_labels, feature_columns=None):
    """Aggregate per-row data into per-customer features for XGBoost.

    Missing values are replaced with -2 *before* aggregating. Each feature
    column is then summarised per ``customer_ID`` by its min, max, median and
    standard deviation, producing columns named ``<col>_min``, ``<col>_max``,
    ``<col>_median`` and ``<col>_std`` (in that order, column by column).
    The result is inner-joined with ``df_labels`` on ``customer_ID``.

    ``df`` is left unmodified, so the function can be called more than once
    on the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Row-level data containing ``customer_ID`` and every feature column.
    df_labels : pandas.DataFrame
        Frame with ``customer_ID`` and ``target`` columns.
    feature_columns : list of str, optional
        Columns to aggregate. Defaults to the notebook-level
        ``numerical_columns`` list.

    Returns
    -------
    dmatrix : xgb.DMatrix
        Aggregated features with ``target`` attached as the label.
    y : pandas.Series
        The ``target`` column, row-aligned with ``dmatrix``.
    """
    if feature_columns is None:
        feature_columns = numerical_columns
    feature_columns = list(feature_columns)

    # Work on a narrowed copy: only the key and the columns actually
    # aggregated are needed, and the caller's frame stays untouched.
    filled = df[['customer_ID', *feature_columns]].fillna(-2)

    # One grouped pass computes all four statistics for every column.
    stats = (filled
             .groupby('customer_ID')[feature_columns]
             .agg(['min', 'max', 'median', 'std']))
    stats.columns = [f'{col}_{stat}' for col, stat in stats.columns]

    # reset_index() keeps each customer_ID attached to its own statistics;
    # pairing groupby output with df['customer_ID'].unique() by position is
    # only correct when the input is already sorted by customer_ID.
    new_df = stats.reset_index()

    new_df = new_df.merge(df_labels, on='customer_ID')

    X = new_df.drop(columns=['customer_ID', 'target'])
    y = new_df['target']

    dmatrix = xgb.DMatrix(X, label=y)

    return dmatrix, y

In [8]:
# Per-customer training features as a DMatrix, plus the matching target labels.
D_train, y_train = data_preparer(train_df, train_labels)

In [9]:
# Per-customer validation features as a DMatrix, plus the matching target labels.
D_valid, y_valid = data_preparer(valid_df, valid_labels)

In [10]:
# Baseline run: depth-4 trees with learning rate 0.15.
seed = 42
steps = 1000  # passed to xgb.train as the number of boosting rounds

params = dict(
    verbosity=1,
    max_depth=4,
    objective='binary:logistic',
    eta=0.15,
    random_state=seed,
)

In [11]:
# Train the depth-4 model, monitoring the validation set and stopping early
# after 5 rounds without improvement.
model_d4 = xgb.train(
    params,
    D_train,
    num_boost_round=steps,
    evals=[(D_valid, 'Validation')],
    early_stopping_rounds=5,
)

[0]	Validation-logloss:0.61009
[1]	Validation-logloss:0.54720
[2]	Validation-logloss:0.49891
[3]	Validation-logloss:0.46085
[4]	Validation-logloss:0.42978
[5]	Validation-logloss:0.40450
[6]	Validation-logloss:0.38371
[7]	Validation-logloss:0.36596
[8]	Validation-logloss:0.35156
[9]	Validation-logloss:0.33939
[10]	Validation-logloss:0.32885
[11]	Validation-logloss:0.31939
[12]	Validation-logloss:0.31186
[13]	Validation-logloss:0.30537
[14]	Validation-logloss:0.29981
[15]	Validation-logloss:0.29472
[16]	Validation-logloss:0.29047
[17]	Validation-logloss:0.28658
[18]	Validation-logloss:0.28334
[19]	Validation-logloss:0.28028
[20]	Validation-logloss:0.27796
[21]	Validation-logloss:0.27549
[22]	Validation-logloss:0.27351
[23]	Validation-logloss:0.27177
[24]	Validation-logloss:0.27009
[25]	Validation-logloss:0.26842
[26]	Validation-logloss:0.26714
[27]	Validation-logloss:0.26604
[28]	Validation-logloss:0.26484
[29]	Validation-logloss:0.26386
[30]	Validation-logloss:0.26293
[31]	Validation-lo

In [12]:
def model_evaluator(model, data, y_true):
    """Score ``model``'s predictions on ``data`` with ``amex_metric``.

    Parameters
    ----------
    model : object exposing ``predict(data)``
        Trained model whose predictions are evaluated.
    data : object accepted by ``model.predict``
        Inputs to predict on.
    y_true : array-like or pandas.Series
        Ground-truth labels; wrapped in a DataFrame before scoring.

    Returns
    -------
    Whatever ``amex_metric`` returns for the (truth, prediction) frames.
    """
    # Predictions go into a single-column frame named 'prediction'.
    predicted = pd.DataFrame(model.predict(data), columns=['prediction'])
    actual = pd.DataFrame(y_true)

    return amex_metric(actual, predicted)

In [13]:
# Score the max_depth=4 model on the validation set and display the result.
result_d4 = model_evaluator(model_d4, D_valid, y_valid)

result_d4

0.7365156488976947

In [14]:
# Deeper-tree run: max_depth raised to 5, learning rate 0.15.
seed = 42
steps = 1000  # passed to xgb.train as the number of boosting rounds

params = dict(
    verbosity=1,
    max_depth=5,
    objective='binary:logistic',
    eta=0.15,
    random_state=seed,
)

In [15]:
# Train the depth-5 model, monitoring the validation set and stopping early
# after 5 rounds without improvement.
model_d5 = xgb.train(
    params,
    D_train,
    num_boost_round=steps,
    evals=[(D_valid, 'Validation')],
    early_stopping_rounds=5,
)

[0]	Validation-logloss:0.60804
[1]	Validation-logloss:0.54412
[2]	Validation-logloss:0.49479
[3]	Validation-logloss:0.45604
[4]	Validation-logloss:0.42408
[5]	Validation-logloss:0.39822
[6]	Validation-logloss:0.37691
[7]	Validation-logloss:0.35922
[8]	Validation-logloss:0.34456
[9]	Validation-logloss:0.33218
[10]	Validation-logloss:0.32183
[11]	Validation-logloss:0.31290
[12]	Validation-logloss:0.30508
[13]	Validation-logloss:0.29862
[14]	Validation-logloss:0.29305
[15]	Validation-logloss:0.28822
[16]	Validation-logloss:0.28414
[17]	Validation-logloss:0.28062
[18]	Validation-logloss:0.27758
[19]	Validation-logloss:0.27488
[20]	Validation-logloss:0.27253
[21]	Validation-logloss:0.27049
[22]	Validation-logloss:0.26880
[23]	Validation-logloss:0.26721
[24]	Validation-logloss:0.26595
[25]	Validation-logloss:0.26482
[26]	Validation-logloss:0.26364
[27]	Validation-logloss:0.26265
[28]	Validation-logloss:0.26168
[29]	Validation-logloss:0.26103
[30]	Validation-logloss:0.26016
[31]	Validation-lo

In [16]:
# Score the max_depth=5 model on the validation set and display the result.
result_d5 = model_evaluator(model_d5, D_valid, y_valid)

result_d5

0.7355595941949633

In [17]:
# Depth-4 trees with per-tree column subsampling (colsample_bytree=0.8).
seed = 42
steps = 1000  # passed to xgb.train as the number of boosting rounds

params = dict(
    verbosity=1,
    max_depth=4,
    objective='binary:logistic',
    eta=0.15,
    random_state=seed,
    colsample_bytree=0.8,
)

In [18]:
# Train with colsample_bytree, monitoring the validation set and stopping
# early after 3 rounds without improvement.
model_bytree = xgb.train(
    params,
    D_train,
    num_boost_round=steps,
    evals=[(D_valid, 'Validation')],
    early_stopping_rounds=3,
)

[0]	Validation-logloss:0.60991
[1]	Validation-logloss:0.54771
[2]	Validation-logloss:0.49951
[3]	Validation-logloss:0.46063
[4]	Validation-logloss:0.42946
[5]	Validation-logloss:0.40432
[6]	Validation-logloss:0.38347
[7]	Validation-logloss:0.36621
[8]	Validation-logloss:0.35171
[9]	Validation-logloss:0.33937
[10]	Validation-logloss:0.32883
[11]	Validation-logloss:0.32003
[12]	Validation-logloss:0.31251
[13]	Validation-logloss:0.30604
[14]	Validation-logloss:0.30050
[15]	Validation-logloss:0.29551
[16]	Validation-logloss:0.29128
[17]	Validation-logloss:0.28745
[18]	Validation-logloss:0.28406
[19]	Validation-logloss:0.28105
[20]	Validation-logloss:0.27838
[21]	Validation-logloss:0.27635
[22]	Validation-logloss:0.27429
[23]	Validation-logloss:0.27243
[24]	Validation-logloss:0.27068
[25]	Validation-logloss:0.26923
[26]	Validation-logloss:0.26795
[27]	Validation-logloss:0.26652
[28]	Validation-logloss:0.26547
[29]	Validation-logloss:0.26453
[30]	Validation-logloss:0.26351
[31]	Validation-lo

In [19]:
# Score the colsample_bytree model on the validation set and display the result.
result_bytree = model_evaluator(model_bytree, D_valid, y_valid)

result_bytree

0.7377835985773353

In [20]:
# Depth-4 trees with column subsampling per tree and per level (both 0.8).
seed = 42
steps = 1000  # passed to xgb.train as the number of boosting rounds

params = dict(
    verbosity=1,
    max_depth=4,
    objective='binary:logistic',
    eta=0.15,
    random_state=seed,
    colsample_bytree=0.8,
    colsample_bylevel=0.8,
)

In [21]:
# Train with per-tree and per-level column subsampling, monitoring the
# validation set and stopping early after 3 rounds without improvement.
model_bytl = xgb.train(
    params,
    D_train,
    num_boost_round=steps,
    evals=[(D_valid, 'Validation')],
    early_stopping_rounds=3,
)

[0]	Validation-logloss:0.61014
[1]	Validation-logloss:0.54739
[2]	Validation-logloss:0.49884
[3]	Validation-logloss:0.46047
[4]	Validation-logloss:0.42973
[5]	Validation-logloss:0.40442
[6]	Validation-logloss:0.38327
[7]	Validation-logloss:0.36564
[8]	Validation-logloss:0.35097
[9]	Validation-logloss:0.33879
[10]	Validation-logloss:0.32843
[11]	Validation-logloss:0.31965
[12]	Validation-logloss:0.31197
[13]	Validation-logloss:0.30543
[14]	Validation-logloss:0.29988
[15]	Validation-logloss:0.29473
[16]	Validation-logloss:0.29042
[17]	Validation-logloss:0.28683
[18]	Validation-logloss:0.28336
[19]	Validation-logloss:0.28043
[20]	Validation-logloss:0.27780
[21]	Validation-logloss:0.27550
[22]	Validation-logloss:0.27352
[23]	Validation-logloss:0.27188
[24]	Validation-logloss:0.27019
[25]	Validation-logloss:0.26869
[26]	Validation-logloss:0.26755
[27]	Validation-logloss:0.26641
[28]	Validation-logloss:0.26526
[29]	Validation-logloss:0.26424
[30]	Validation-logloss:0.26323
[31]	Validation-lo

In [22]:
# Score the colsample_bytree + colsample_bylevel model on the validation set
# and display the result.
result_bytl = model_evaluator(model_bytl, D_valid, y_valid)

result_bytl

0.7402951614826507

In [23]:
# Depth-4 trees with column subsampling per tree, per level and per node (all 0.8).
seed = 42
steps = 1000  # passed to xgb.train as the number of boosting rounds

params = dict(
    verbosity=1,
    max_depth=4,
    objective='binary:logistic',
    eta=0.15,
    random_state=seed,
    colsample_bytree=0.8,
    colsample_bylevel=0.8,
    colsample_bynode=0.8,
)

In [24]:
# Train with per-tree, per-level and per-node column subsampling, monitoring
# the validation set and stopping early after 3 rounds without improvement.
model_byall = xgb.train(
    params,
    D_train,
    num_boost_round=steps,
    evals=[(D_valid, 'Validation')],
    early_stopping_rounds=3,
)

[0]	Validation-logloss:0.61214
[1]	Validation-logloss:0.54934
[2]	Validation-logloss:0.50063
[3]	Validation-logloss:0.46194
[4]	Validation-logloss:0.43089
[5]	Validation-logloss:0.40525
[6]	Validation-logloss:0.38419
[7]	Validation-logloss:0.36653
[8]	Validation-logloss:0.35197
[9]	Validation-logloss:0.33946
[10]	Validation-logloss:0.32898
[11]	Validation-logloss:0.32018
[12]	Validation-logloss:0.31238
[13]	Validation-logloss:0.30580
[14]	Validation-logloss:0.29997
[15]	Validation-logloss:0.29507
[16]	Validation-logloss:0.29069
[17]	Validation-logloss:0.28696
[18]	Validation-logloss:0.28371
[19]	Validation-logloss:0.28075
[20]	Validation-logloss:0.27823
[21]	Validation-logloss:0.27593
[22]	Validation-logloss:0.27382
[23]	Validation-logloss:0.27210
[24]	Validation-logloss:0.27046
[25]	Validation-logloss:0.26902
[26]	Validation-logloss:0.26775
[27]	Validation-logloss:0.26650
[28]	Validation-logloss:0.26544
[29]	Validation-logloss:0.26456
[30]	Validation-logloss:0.26355
[31]	Validation-lo

In [25]:
# Score the model using all three colsample_* settings on the validation set
# and display the result.
result_byall = model_evaluator(model_byall, D_valid, y_valid)

result_byall

0.7380946660295717