In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import xgboost as xgb

  from pandas import MultiIndex, Int64Index


# Import the data

In [2]:
df = pd.read_csv('Base.csv')

# Preparation and EDA

In [3]:
df.shape

(1000000, 32)

In [4]:
df.isna().sum()

fraud_bool                          0
income                              0
name_email_similarity               0
prev_address_months_count           0
current_address_months_count        0
customer_age                        0
days_since_request                  0
intended_balcon_amount              0
payment_type                        0
zip_count_4w                        0
velocity_6h                         0
velocity_24h                        0
velocity_4w                         0
bank_branch_count_8w                0
date_of_birth_distinct_emails_4w    0
employment_status                   0
credit_risk_score                   0
email_is_free                       0
housing_status                      0
phone_home_valid                    0
phone_mobile_valid                  0
bank_months_count                   0
has_other_cards                     0
proposed_credit_limit               0
foreign_request                     0
source                              0
session_leng

In [5]:
df.head()

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,...,has_other_cards,proposed_credit_limit,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
0,1,0.9,0.166828,-1,88,50,0.020925,-1.331345,AA,769,...,0,500.0,0,INTERNET,3.888115,windows,0,1,0,7
1,1,0.9,0.296286,-1,144,50,0.005418,-0.816224,AB,366,...,0,1500.0,0,INTERNET,31.798819,windows,0,1,0,7
2,1,0.9,0.044985,-1,132,40,3.108549,-0.755728,AC,870,...,0,200.0,0,INTERNET,4.728705,other,0,1,0,7
3,1,0.9,0.159511,-1,22,50,0.019079,-1.205124,AB,810,...,1,200.0,0,INTERNET,2.047904,linux,0,1,0,7
4,1,0.9,0.596414,-1,218,50,0.004441,-0.773276,AB,890,...,0,1500.0,0,INTERNET,3.775225,macintosh,1,1,0,7


In [6]:
df[df.eq(-1).any(1)]

  df[df.eq(-1).any(1)]


Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,...,has_other_cards,proposed_credit_limit,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
0,1,0.9,0.166828,-1,88,50,0.020925,-1.331345,AA,769,...,0,500.0,0,INTERNET,3.888115,windows,0,1,0,7
1,1,0.9,0.296286,-1,144,50,0.005418,-0.816224,AB,366,...,0,1500.0,0,INTERNET,31.798819,windows,0,1,0,7
2,1,0.9,0.044985,-1,132,40,3.108549,-0.755728,AC,870,...,0,200.0,0,INTERNET,4.728705,other,0,1,0,7
3,1,0.9,0.159511,-1,22,50,0.019079,-1.205124,AB,810,...,1,200.0,0,INTERNET,2.047904,linux,0,1,0,7
4,1,0.9,0.596414,-1,218,50,0.004441,-0.773276,AB,890,...,0,1500.0,0,INTERNET,3.775225,macintosh,1,1,0,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,0,0.6,0.192631,-1,104,40,0.030592,-1.044454,AB,804,...,0,200.0,0,INTERNET,8.511502,linux,1,1,0,4
999996,0,0.8,0.322989,148,9,50,1.628119,-1.409803,AC,3306,...,0,200.0,0,INTERNET,8.967865,windows,0,1,0,4
999997,0,0.8,0.879403,-1,30,20,0.018563,34.692760,AA,1522,...,0,200.0,0,INTERNET,8.195531,other,0,1,0,4
999998,0,0.9,0.762112,-1,189,20,0.015352,94.661055,AA,1418,...,0,500.0,0,INTERNET,4.336064,windows,1,1,0,4


In [7]:
df['prev_address_months_count'][df['prev_address_months_count'] == -1].count()

712920

In [8]:
df['current_address_months_count'][df['current_address_months_count'] == -1].count()

4254

In [9]:
df = df.drop(columns=['prev_address_months_count'])

In [10]:
df['current_address_months_count'] = df['current_address_months_count'].replace(-1, df['current_address_months_count'].median())

In [11]:
df['current_address_months_count'][df['current_address_months_count'] == -1].count()

0

In [12]:
df.dtypes

fraud_bool                            int64
income                              float64
name_email_similarity               float64
current_address_months_count          int64
customer_age                          int64
days_since_request                  float64
intended_balcon_amount              float64
payment_type                         object
zip_count_4w                          int64
velocity_6h                         float64
velocity_24h                        float64
velocity_4w                         float64
bank_branch_count_8w                  int64
date_of_birth_distinct_emails_4w      int64
employment_status                    object
credit_risk_score                     int64
email_is_free                         int64
housing_status                       object
phone_home_valid                      int64
phone_mobile_valid                    int64
bank_months_count                     int64
has_other_cards                       int64
proposed_credit_limit           

In [13]:
df.nunique()

fraud_bool                               2
income                                   9
name_email_similarity               998861
current_address_months_count           422
customer_age                             9
days_since_request                  989330
intended_balcon_amount              994971
payment_type                             5
zip_count_4w                          6306
velocity_6h                         998687
velocity_24h                        998940
velocity_4w                         998318
bank_branch_count_8w                  2326
date_of_birth_distinct_emails_4w        40
employment_status                        7
credit_risk_score                      551
email_is_free                            2
housing_status                           7
phone_home_valid                         2
phone_mobile_valid                       2
bank_months_count                       33
has_other_cards                          2
proposed_credit_limit                   12
foreign_req

In [14]:
df['employment_status'].head()

0    CA
1    CA
2    CB
3    CA
4    CA
Name: employment_status, dtype: object

Balancing the data by undersampling

In [15]:
df['fraud_bool'].value_counts()

0    988971
1     11029
Name: fraud_bool, dtype: int64

In [16]:
neg_samp = df[df['fraud_bool']==0].sample(n=11029, random_state=42)

In [17]:
pos_samp = df[df['fraud_bool']==1]
df_sample = pd.concat([neg_samp, pos_samp], ignore_index=True, sort=False)
df_sample['fraud_bool'].value_counts()

0    11029
1    11029
Name: fraud_bool, dtype: int64

In [18]:
X = df_sample.drop(columns=['fraud_bool'])
y = df_sample['fraud_bool']
dicts = X.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X = dv.fit_transform(dicts)

In [19]:
X_full_train, X_test, y_full_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_full_train, y_full_train, test_size=0.25, random_state=42)

# Logistic Regression and Feature Importance

In [20]:
model_1 = LogisticRegression(max_iter=100, solver='liblinear')
model_1.fit(X_train, y_train)
y_pred = model_1.predict(X_val)
roc_auc_score(y_val, y_pred)

0.7801707272966396

In [21]:
pd.Series(model_1.coef_[0], index=dv.get_feature_names()).sort_values()



name_email_similarity              -1.306754
has_other_cards                    -1.191042
phone_home_valid                   -1.096967
device_os=linux                    -0.750335
keep_alive_session                 -0.739716
source=INTERNET                    -0.631450
employment_status=CF               -0.574961
housing_status=BE                  -0.572538
device_os=other                    -0.500189
payment_type=AA                    -0.457368
employment_status=CE               -0.452037
phone_mobile_valid                 -0.445358
housing_status=BC                  -0.379146
housing_status=BB                  -0.321490
payment_type=AB                    -0.248894
employment_status=CD               -0.231670
payment_type=AD                    -0.177139
housing_status=BF                  -0.088703
employment_status=CB               -0.072694
date_of_birth_distinct_emails_4w   -0.026994
housing_status=BG                  -0.021205
month                              -0.005817
intended_b

# Random Forest

In [22]:
%%capture --no-display
estimators = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250]
depth = [2, 3, 4, 5, 10, 15, 20, 25]
best = pd.DataFrame(columns=["depth", "estimators", "auc_train", "auc_val"])
for d in depth:
    for n in estimators:
        rf = RandomForestClassifier(n_estimators=n, max_depth=d, random_state=42)
        rf.fit(X_train, y_train)
        y_pred_train = rf.predict(X_train)
        y_pred_val = rf.predict(X_val)
        roc_auc_score_train = roc_auc_score(y_train, y_pred_train)
        roc_auc_score_val = roc_auc_score(y_val, y_pred_val)
        best = best.append({'depth':d, 'estimators':n, 'auc_train':(round(roc_auc_score_train, 5)), 'auc_val':(round(roc_auc_score_val, 5))}, ignore_index=True)

In [23]:
best.sort_values(by='auc_val')

Unnamed: 0,depth,estimators,auc_train,auc_val
0,2.0,10.0,0.74703,0.73291
2,2.0,30.0,0.75420,0.74241
3,2.0,40.0,0.75624,0.74487
1,2.0,20.0,0.75290,0.74507
4,2.0,50.0,0.75684,0.74553
...,...,...,...,...
162,20.0,130.0,0.99992,0.79130
172,20.0,230.0,0.99992,0.79174
166,20.0,170.0,0.99992,0.79176
160,20.0,110.0,0.99992,0.79244


In [24]:
rf = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=42)
rf.fit(X_train, y_train)
y_pred_train = rf.predict(X_train)
y_pred_val = rf.predict(X_val)
roc_auc_score_train = roc_auc_score(y_train, y_pred_train)
roc_auc_score_val = roc_auc_score(y_val, y_pred_val)
roc_auc_score_val

0.7926703163017031

In [25]:
pd.Series(rf.feature_importances_, index=dv.get_feature_names()).sort_values()



device_fraud_count                  0.000000
payment_type=AE                     0.000000
housing_status=BG                   0.000006
employment_status=CG                0.000061
housing_status=BF                   0.000100
source=TELEAPP                      0.000624
source=INTERNET                     0.000714
device_os=x11                       0.000827
employment_status=CD                0.001122
employment_status=CE                0.001191
housing_status=BD                   0.001830
employment_status=CF                0.002191
foreign_request                     0.002602
employment_status=CC                0.002960
device_os=macintosh                 0.003756
payment_type=AD                     0.003898
employment_status=CB                0.004664
phone_mobile_valid                  0.004692
payment_type=AB                     0.005197
payment_type=AC                     0.006050
employment_status=CA                0.006408
housing_status=BB                   0.006933
housing_st

Tuning

# XGBoost

In [26]:
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=dv.get_feature_names())
dval = xgb.DMatrix(X_val, label=y_val, feature_names=dv.get_feature_names())
watchlist = [(dtrain, 'train'), (dval, 'val')]
columns = ['eta', 'depth', 'iter_num', 'train_auc', 'val_auc']
total = pd.DataFrame(columns=columns)
def parse_xgb_output(output, e, d):
    results = []
    
    for line in output.stdout.strip().split('\n'):
        it_line, train_line, val_line = line.split('\t')
        
        it = int(it_line.strip('[]'))
        train = float(train_line.split(':')[1])
        val = float(val_line.split(':')[1])
        
        results.append((e, d, it, train, val))
    df_results = pd.DataFrame(results, columns=columns)
    return df_results



Tuning depth

In [27]:
%%capture output

eta = [0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3]
depth = 1
xgb_params = {
    'eta': 0.3,
    'max_depth': depth,
    'min_child_weight': 1,

    'objective': 'reg:squarederror',
    'nthread': 8,
    'eval_metric': 'auc',
    'seed': 42,
    'verbosity': 1,
}
xgb.train(xgb_params, dtrain, evals=watchlist, num_boost_round=100)

In [28]:
parse_xgb_output(output, 0.3, depth).sort_values(by='val_auc').head()

Unnamed: 0,eta,depth,iter_num,train_auc,val_auc
0,0.3,1,0,0.70455,0.69669
1,0.3,1,1,0.7709,0.76485
2,0.3,1,2,0.7709,0.76485
3,0.3,1,3,0.7941,0.78713
5,0.3,1,5,0.8037,0.79866


In [29]:
%%capture output

depth = 2
xgb_params = {
    'eta': 0.3,
    'max_depth': depth,
    'min_child_weight': 1,

    'objective': 'reg:squarederror',
    'nthread': 8,
    'eval_metric': 'auc',
    'seed': 42,
    'verbosity': 1,
}
xgb.train(xgb_params, dtrain, evals=watchlist, num_boost_round=100)

In [30]:
parse_xgb_output(output, 0.3, depth).sort_values(by='val_auc').head()

Unnamed: 0,eta,depth,iter_num,train_auc,val_auc
0,0.3,2,0,0.7709,0.76485
1,0.3,2,1,0.79409,0.78593
2,0.3,2,2,0.79876,0.79162
3,0.3,2,3,0.81986,0.81196
4,0.3,2,4,0.83324,0.82643


In [31]:
%%capture output

depth = 3
xgb_params = {
    'eta': 0.3,
    'max_depth': depth,
    'min_child_weight': 1,

    'objective': 'reg:squarederror',
    'nthread': 8,
    'eval_metric': 'auc',
    'seed': 42,
    'verbosity': 1,
}
xgb.train(xgb_params, dtrain, evals=watchlist, num_boost_round=100)

In [32]:
parse_xgb_output(output, 0.3, depth).sort_values(by='val_auc').head()

Unnamed: 0,eta,depth,iter_num,train_auc,val_auc
0,0.3,3,0,0.79893,0.78812
1,0.3,3,1,0.81212,0.8026
2,0.3,3,2,0.82427,0.81473
3,0.3,3,3,0.84297,0.83343
4,0.3,3,4,0.85342,0.84457


In [33]:
%%capture output

depth = 4
xgb_params = {
    'eta': 0.3,
    'max_depth': depth,
    'min_child_weight': 1,

    'objective': 'reg:squarederror',
    'nthread': 8,
    'eval_metric': 'auc',
    'seed': 42,
    'verbosity': 1,
}
xgb.train(xgb_params, dtrain, evals=watchlist, num_boost_round=100)

In [34]:
parse_xgb_output(output, 0.3, depth).sort_values(by='val_auc').head()

Unnamed: 0,eta,depth,iter_num,train_auc,val_auc
0,0.3,4,0,0.817,0.79999
1,0.3,4,1,0.83235,0.81654
2,0.3,4,2,0.84586,0.82872
3,0.3,4,3,0.85375,0.83602
4,0.3,4,4,0.86313,0.84546


In [35]:
%%capture output

depth = 5
xgb_params = {
    'eta': 0.3,
    'max_depth': depth,
    'min_child_weight': 1,

    'objective': 'reg:squarederror',
    'nthread': 8,
    'eval_metric': 'auc',
    'seed': 42,
    'verbosity': 1,
}
xgb.train(xgb_params, dtrain, evals=watchlist, num_boost_round=100)

In [36]:
parse_xgb_output(output, 0.3, depth).sort_values(by='val_auc').head()

Unnamed: 0,eta,depth,iter_num,train_auc,val_auc
0,0.3,5,0,0.83282,0.81117
1,0.3,5,1,0.84978,0.82784
2,0.3,5,2,0.8613,0.8373
3,0.3,5,3,0.8704,0.84377
4,0.3,5,4,0.87841,0.85184


In [37]:
%%capture output

depth = 6
xgb_params = {
    'eta': 0.3,
    'max_depth': depth,
    'min_child_weight': 1,

    'objective': 'reg:squarederror',
    'nthread': 8,
    'eval_metric': 'auc',
    'seed': 42,
    'verbosity': 1,
}
xgb.train(xgb_params, dtrain, evals=watchlist, num_boost_round=100)

In [38]:
parse_xgb_output(output, 0.3, depth).sort_values(by='val_auc').head()

Unnamed: 0,eta,depth,iter_num,train_auc,val_auc
0,0.3,6,0,0.84893,0.81942
1,0.3,6,1,0.86721,0.83029
2,0.3,6,2,0.87797,0.84311
3,0.3,6,3,0.88824,0.84878
4,0.3,6,4,0.89516,0.853


In [39]:
%%capture output

depth = 7
xgb_params = {
    'eta': 0.3,
    'max_depth': depth,
    'min_child_weight': 1,

    'objective': 'reg:squarederror',
    'nthread': 8,
    'eval_metric': 'auc',
    'seed': 42,
    'verbosity': 1,
}
xgb.train(xgb_params, dtrain, evals=watchlist, num_boost_round=100)

In [40]:
parse_xgb_output(output, 0.3, depth).sort_values(by='val_auc').head()

Unnamed: 0,eta,depth,iter_num,train_auc,val_auc
0,0.3,7,0,0.86862,0.81912
1,0.3,7,1,0.88874,0.8321
2,0.3,7,2,0.90043,0.84227
3,0.3,7,3,0.9109,0.84666
4,0.3,7,4,0.91883,0.85006


In [41]:
%%capture output

depth = 8
xgb_params = {
    'eta': 0.3,
    'max_depth': depth,
    'min_child_weight': 1,

    'objective': 'reg:squarederror',
    'nthread': 8,
    'eval_metric': 'auc',
    'seed': 42,
    'verbosity': 1,
}
xgb.train(xgb_params, dtrain, evals=watchlist, num_boost_round=100)

In [42]:
parse_xgb_output(output, 0.3, depth).sort_values(by='val_auc').head()

Unnamed: 0,eta,depth,iter_num,train_auc,val_auc
0,0.3,8,0,0.89033,0.80711
1,0.3,8,1,0.91417,0.82601
2,0.3,8,2,0.92661,0.83559
3,0.3,8,3,0.93536,0.84439
4,0.3,8,4,0.9421,0.84815


In [43]:
%%capture output

depth = 9
xgb_params = {
    'eta': 0.3,
    'max_depth': depth,
    'min_child_weight': 1,

    'objective': 'reg:squarederror',
    'nthread': 8,
    'eval_metric': 'auc',
    'seed': 42,
    'verbosity': 1,
}
xgb.train(xgb_params, dtrain, evals=watchlist, num_boost_round=100)

In [44]:
parse_xgb_output(output, 0.3, depth).sort_values(by='val_auc').head()

Unnamed: 0,eta,depth,iter_num,train_auc,val_auc
0,0.3,9,0,0.91393,0.79673
1,0.3,9,1,0.93845,0.81944
2,0.3,9,2,0.94981,0.83153
3,0.3,9,3,0.95813,0.83728
4,0.3,9,4,0.96508,0.8432


In [45]:
%%capture output

depth = 10
xgb_params = {
    'eta': 0.3,
    'max_depth': depth,
    'min_child_weight': 1,

    'objective': 'reg:squarederror',
    'nthread': 8,
    'eval_metric': 'auc',
    'seed': 42,
    'verbosity': 1,
}
xgb.train(xgb_params, dtrain, evals=watchlist, num_boost_round=100)

In [46]:
parse_xgb_output(output, 0.3, depth).sort_values(by='val_auc').head()

Unnamed: 0,eta,depth,iter_num,train_auc,val_auc
0,0.3,10,0,0.93407,0.78812
1,0.3,10,1,0.95636,0.81316
2,0.3,10,2,0.96852,0.82496
3,0.3,10,3,0.97507,0.83174
4,0.3,10,4,0.98078,0.83638


Using best depth (6) tuning eta

In [47]:
%%capture output

xgb_params = {
    'eta': 0.01,
    'max_depth': 6,
    'min_child_weight': 1,

    'objective': 'reg:squarederror',
    'nthread': 8,
    'eval_metric': 'auc',
    'seed': 42,
    'verbosity': 1,
}
xgb.train(xgb_params, dtrain, evals=watchlist, num_boost_round=100)

In [48]:
parse_xgb_output(output, 0.01, depth).sort_values(by='val_auc').head()

Unnamed: 0,eta,depth,iter_num,train_auc,val_auc
0,0.01,10,0,0.84893,0.81942
1,0.01,10,1,0.84923,0.81976
2,0.01,10,2,0.85044,0.82058
3,0.01,10,3,0.85049,0.82107
4,0.01,10,4,0.85124,0.82183


In [49]:
%%capture output

xgb_params = {
    'eta': 0.05,
    'max_depth': 6,
    'min_child_weight': 1,

    'objective': 'reg:squarederror',
    'nthread': 8,
    'eval_metric': 'auc',
    'seed': 42,
    'verbosity': 1,
}
xgb.train(xgb_params, dtrain, evals=watchlist, num_boost_round=100)

In [50]:
parse_xgb_output(output, 0.05, depth).sort_values(by='val_auc').head()

Unnamed: 0,eta,depth,iter_num,train_auc,val_auc
0,0.05,10,0,0.84893,0.81942
1,0.05,10,1,0.85496,0.82475
2,0.05,10,2,0.85774,0.82716
3,0.05,10,3,0.86335,0.83153
4,0.05,10,4,0.86657,0.83535


In [51]:
%%capture output

xgb_params = {
    'eta': 0.1,
    'max_depth': 6,
    'min_child_weight': 1,

    'objective': 'reg:squarederror',
    'nthread': 8,
    'eval_metric': 'auc',
    'seed': 42,
    'verbosity': 1,
}
xgb.train(xgb_params, dtrain, evals=watchlist, num_boost_round=100)

In [52]:
parse_xgb_output(output, 0.1, depth).sort_values(by='val_auc').head()

Unnamed: 0,eta,depth,iter_num,train_auc,val_auc
0,0.1,10,0,0.84893,0.81942
1,0.1,10,1,0.85729,0.82601
2,0.1,10,2,0.86599,0.83281
3,0.1,10,3,0.87139,0.83774
4,0.1,10,4,0.87655,0.84293


In [53]:
%%capture output

xgb_params = {
    'eta': 0.15,
    'max_depth': 6,
    'min_child_weight': 1,

    'objective': 'reg:squarederror',
    'nthread': 8,
    'eval_metric': 'auc',
    'seed': 42,
    'verbosity': 1,
}
xgb.train(xgb_params, dtrain, evals=watchlist, num_boost_round=100)

In [54]:
parse_xgb_output(output, 0.15, depth).sort_values(by='val_auc').head()

Unnamed: 0,eta,depth,iter_num,train_auc,val_auc
0,0.15,10,0,0.84893,0.81942
1,0.15,10,1,0.86313,0.8302
2,0.15,10,2,0.87089,0.83859
3,0.15,10,3,0.8775,0.84393
4,0.15,10,4,0.88179,0.84868


In [55]:
%%capture output

xgb_params = {
    'eta': 0.2,
    'max_depth': 6,
    'min_child_weight': 1,

    'objective': 'reg:squarederror',
    'nthread': 8,
    'eval_metric': 'auc',
    'seed': 42,
    'verbosity': 1,
}
xgb.train(xgb_params, dtrain, evals=watchlist, num_boost_round=100)

In [56]:
parse_xgb_output(output, 0.2, depth).sort_values(by='val_auc').head()

Unnamed: 0,eta,depth,iter_num,train_auc,val_auc
0,0.2,10,0,0.84893,0.81942
1,0.2,10,1,0.86473,0.83201
2,0.2,10,2,0.87418,0.84226
3,0.2,10,3,0.8791,0.84595
4,0.2,10,4,0.88623,0.84981


In [57]:
%%capture output

xgb_params = {
    'eta': 0.25,
    'max_depth': 6,
    'min_child_weight': 1,

    'objective': 'reg:squarederror',
    'nthread': 8,
    'eval_metric': 'auc',
    'seed': 42,
    'verbosity': 1,
}
xgb.train(xgb_params, dtrain, evals=watchlist, num_boost_round=100)

In [58]:
parse_xgb_output(output, 0.25, depth).sort_values(by='val_auc').head()

Unnamed: 0,eta,depth,iter_num,train_auc,val_auc
0,0.25,10,0,0.84893,0.81942
1,0.25,10,1,0.86678,0.83164
2,0.25,10,2,0.87563,0.84195
3,0.25,10,3,0.88646,0.84843
4,0.25,10,4,0.89086,0.85245


In [59]:
%%capture output

xgb_params = {
    'eta': 0.35,
    'max_depth': 6,
    'min_child_weight': 1,

    'objective': 'reg:squarederror',
    'nthread': 8,
    'eval_metric': 'auc',
    'seed': 42,
    'verbosity': 1,
}
xgb.train(xgb_params, dtrain, evals=watchlist, num_boost_round=100)

In [60]:
parse_xgb_output(output, 0.35, depth).sort_values(by='val_auc').head()

Unnamed: 0,eta,depth,iter_num,train_auc,val_auc
0,0.35,10,0,0.84893,0.81942
1,0.35,10,1,0.86931,0.82825
2,0.35,10,2,0.88245,0.84058
3,0.35,10,3,0.89045,0.84541
4,0.35,10,4,0.89893,0.85273


In [61]:
%%capture output

xgb_params = {
    'eta': 0.4,
    'max_depth': 6,
    'min_child_weight': 1,

    'objective': 'reg:squarederror',
    'nthread': 8,
    'eval_metric': 'auc',
    'seed': 42,
    'verbosity': 1,
}
xgb.train(xgb_params, dtrain, evals=watchlist, num_boost_round=100)

In [62]:
parse_xgb_output(output, 0.4, depth).sort_values(by='val_auc').head()

Unnamed: 0,eta,depth,iter_num,train_auc,val_auc
0,0.4,10,0,0.84893,0.81942
1,0.4,10,1,0.86997,0.83345
2,0.4,10,2,0.8836,0.84477
3,0.4,10,3,0.89381,0.85086
4,0.4,10,4,0.90148,0.85433


In [124]:
%%capture output

xgb_params = {
    'eta': 0.45,
    'max_depth': 6,
    'min_child_weight': 1,

    'objective': 'reg:squarederror',
    'nthread': 8,
    'eval_metric': 'auc',
    'seed': 42,
    'verbosity': 1,
}
xgb.train(xgb_params, dtrain, evals=watchlist, num_boost_round=100)

In [125]:
parse_xgb_output(output, 0.45, depth).sort_values(by='val_auc').head()

Unnamed: 0,eta,depth,iter_num,train_auc,val_auc
0,0.45,10,0,0.84893,0.81942
1,0.45,10,1,0.87043,0.83271
2,0.45,10,2,0.88568,0.84242
98,0.45,10,98,0.99673,0.84595
95,0.45,10,95,0.99648,0.846


Best eta: 0.4 - other parameters

In [250]:
%%capture output

xgb_params = {
    'eta': 0.4,
    'max_depth': 6,
    'colsample_bytree': 0.6,
    'subsample': 1,
    'min_child_weight': 1,
    'gamma': 0.4,
    'lambda': 10,

    'objective': 'binary:logistic',
    'nthread': 8,
    'eval_metric': 'auc',
    'seed': 42,
    'verbosity': 1,
}
xgb.train(xgb_params, dtrain, evals=watchlist, num_boost_round=100)

In [251]:
parse_xgb_output(output, 0.45, depth).sort_values(by='val_auc').head()

Unnamed: 0,eta,depth,iter_num,train_auc,val_auc
0,0.45,10,0,0.83303,0.81266
1,0.45,10,1,0.85433,0.82676
2,0.45,10,2,0.8787,0.85108
3,0.45,10,3,0.88709,0.85735
4,0.45,10,4,0.89287,0.86134


{'subsample': 1, 'max_depth': 2, 'eta': 0.45, 'colsample_bytree': 0.5}