In [1]:
import optuna
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [2]:
train = pd.read_csv('../data/real_final_train.csv')
test = pd.read_csv('../data/real_final_test.csv')

In [3]:
display(train)

Unnamed: 0,VCL_0,VCL_38,education,urban,gender,engnat,hand,religion,hand.1,religion.1,age_cat,married,orientation,familysize,ASD,nerdiness,Qs,TIPI_left,TIPI_right,VCL_1
0,2,0,2,1,3,1,2,12,2,12,2,1,4,4,2,1,2.346154,2.75,2.333333,1.0
1,1,1,4,2,2,1,1,2,1,2,4,2,1,4,2,1,2.269231,3.50,2.000000,1.0
2,1,1,2,1,1,2,1,2,1,2,4,3,2,4,2,1,2.346154,5.00,2.000000,1.0
3,2,0,1,3,1,1,2,1,2,1,1,1,1,2,2,1,2.384615,3.50,2.500000,1.0
4,2,1,1,2,2,2,2,12,2,12,1,1,1,1,2,0,2.423077,3.75,2.666667,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,2,1,2,2,2,1,1,1,1,1,1,1,3,3,2,0,2.307692,3.75,2.166667,1.0
14996,2,0,4,1,2,2,1,3,1,3,4,2,1,3,2,1,2.730769,4.25,2.500000,1.0
14997,2,0,2,2,2,1,1,1,1,1,2,1,2,3,1,1,2.884615,5.00,2.000000,1.0
14998,2,2,3,2,2,1,1,12,1,12,2,2,4,2,1,0,2.615385,4.50,2.500000,1.0


In [4]:
columns = [col for col in train.columns.to_list() if col not in ['nerdiness']]

In [5]:
data = train[columns]
target = train['nerdiness']

In [8]:
def objective(trial, data=data, target=target):
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.15, random_state=777)
    param = {
        'tree_method': 'gpu_hist',
        'lambda' : trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha' : trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'colsample_bytree' : trial.suggest_categorical('colsample_bytree', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        'subsample' : trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
        'learning_rate' : trial.suggest_categorical('learning_rate', [0.008, 0.01, 0.012, 0.014, 0.016, 0.018, 0.02]),
        'n_estimators' : 10000,
        'max_depth' : trial.suggest_categorical('max_depth', [5, 7, 9, 11, 13, 15, 17]),
        'random_state' : trial.suggest_categorical('random_state', [777]),
        'min_child_weight' : trial.suggest_int('min_child_weight', 1, 300),
    }
    model = xgb.XGBClassifier(**param)
    
    model.fit(train_x, train_y, eval_set=[(test_x, test_y)], early_stopping_rounds=100, verbose=False)
    
    preds = model.predict(test_x)
    
    auc = roc_auc_score(test_y, preds)
    
    return auc

In [9]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2022-08-02 16:07:45,275][0m A new study created in memory with name: no-name-66a2ac24-916c-438e-82f9-989cbc0d48df[0m




[32m[I 2022-08-02 16:07:56,931][0m Trial 0 finished with value: 0.9934959349593495 and parameters: {'lambda': 0.006120864205992862, 'alpha': 3.9890223460013146, 'colsample_bytree': 0.4, 'subsample': 0.4, 'learning_rate': 0.008, 'max_depth': 17, 'random_state': 777, 'min_child_weight': 27}. Best is trial 0 with value: 0.9934959349593495.[0m




[32m[I 2022-08-02 16:08:04,701][0m Trial 1 finished with value: 0.9925992348158776 and parameters: {'lambda': 0.07713588765281236, 'alpha': 2.364118870880586, 'colsample_bytree': 0.4, 'subsample': 0.6, 'learning_rate': 0.01, 'max_depth': 15, 'random_state': 777, 'min_child_weight': 48}. Best is trial 1 with value: 0.9925992348158776.[0m




[32m[I 2022-08-02 16:08:06,600][0m Trial 2 finished with value: 0.9673362027737925 and parameters: {'lambda': 5.675832519777149, 'alpha': 0.10431344999851598, 'colsample_bytree': 0.6, 'subsample': 1.0, 'learning_rate': 0.018, 'max_depth': 11, 'random_state': 777, 'min_child_weight': 208}. Best is trial 2 with value: 0.9673362027737925.[0m




[32m[I 2022-08-02 16:08:11,208][0m Trial 3 finished with value: 0.95844093735055 and parameters: {'lambda': 0.3160024158776561, 'alpha': 0.08776819095660347, 'colsample_bytree': 0.9, 'subsample': 0.7, 'learning_rate': 0.008, 'max_depth': 7, 'random_state': 777, 'min_child_weight': 161}. Best is trial 3 with value: 0.95844093735055.[0m




[32m[I 2022-08-02 16:08:14,086][0m Trial 4 finished with value: 0.9785987565758011 and parameters: {'lambda': 0.0048658941181418175, 'alpha': 0.0021303778534601135, 'colsample_bytree': 0.8, 'subsample': 0.6, 'learning_rate': 0.018, 'max_depth': 11, 'random_state': 777, 'min_child_weight': 78}. Best is trial 3 with value: 0.95844093735055.[0m




[32m[I 2022-08-02 16:08:21,167][0m Trial 5 finished with value: 0.9434122429459589 and parameters: {'lambda': 0.029392475967507596, 'alpha': 0.011005770411081032, 'colsample_bytree': 0.5, 'subsample': 0.6, 'learning_rate': 0.008, 'max_depth': 13, 'random_state': 777, 'min_child_weight': 151}. Best is trial 5 with value: 0.9434122429459589.[0m




[32m[I 2022-08-02 16:08:24,571][0m Trial 6 finished with value: 0.9934959349593495 and parameters: {'lambda': 0.001335750698803229, 'alpha': 0.0018834549420799547, 'colsample_bytree': 0.8, 'subsample': 0.5, 'learning_rate': 0.018, 'max_depth': 13, 'random_state': 777, 'min_child_weight': 30}. Best is trial 5 with value: 0.9434122429459589.[0m




[32m[I 2022-08-02 16:08:28,194][0m Trial 7 finished with value: 0.994296987087518 and parameters: {'lambda': 0.02088285022619671, 'alpha': 1.0086712625766807, 'colsample_bytree': 0.8, 'subsample': 0.8, 'learning_rate': 0.01, 'max_depth': 15, 'random_state': 777, 'min_child_weight': 36}. Best is trial 5 with value: 0.9434122429459589.[0m




[32m[I 2022-08-02 16:08:34,729][0m Trial 8 finished with value: 0.8691415590626494 and parameters: {'lambda': 0.07465901737378901, 'alpha': 0.013674812669042923, 'colsample_bytree': 0.5, 'subsample': 0.5, 'learning_rate': 0.014, 'max_depth': 15, 'random_state': 777, 'min_child_weight': 203}. Best is trial 8 with value: 0.8691415590626494.[0m




[32m[I 2022-08-02 16:08:41,434][0m Trial 9 finished with value: 0.91142993782879 and parameters: {'lambda': 0.0018791375323414946, 'alpha': 0.19149503280070632, 'colsample_bytree': 1.0, 'subsample': 0.5, 'learning_rate': 0.014, 'max_depth': 7, 'random_state': 777, 'min_child_weight': 144}. Best is trial 8 with value: 0.8691415590626494.[0m




[32m[I 2022-08-02 16:08:47,870][0m Trial 10 finished with value: 0.7281802965088475 and parameters: {'lambda': 0.7192874427022057, 'alpha': 0.017305317538498354, 'colsample_bytree': 0.3, 'subsample': 0.5, 'learning_rate': 0.012, 'max_depth': 5, 'random_state': 777, 'min_child_weight': 271}. Best is trial 10 with value: 0.7281802965088475.[0m




[32m[I 2022-08-02 16:08:51,747][0m Trial 11 finished with value: 0.7241750358680057 and parameters: {'lambda': 0.7362706705948139, 'alpha': 0.014608637025859234, 'colsample_bytree': 0.3, 'subsample': 0.5, 'learning_rate': 0.012, 'max_depth': 5, 'random_state': 777, 'min_child_weight': 288}. Best is trial 11 with value: 0.7241750358680057.[0m




[32m[I 2022-08-02 16:08:54,319][0m Trial 12 finished with value: 0.7255619320899092 and parameters: {'lambda': 0.9694610575464075, 'alpha': 0.015152726434889258, 'colsample_bytree': 0.3, 'subsample': 0.5, 'learning_rate': 0.012, 'max_depth': 5, 'random_state': 777, 'min_child_weight': 294}. Best is trial 11 with value: 0.7241750358680057.[0m




[32m[I 2022-08-02 16:08:58,281][0m Trial 13 finished with value: 0.7226327116212339 and parameters: {'lambda': 2.6015349393290106, 'alpha': 0.029573418216154546, 'colsample_bytree': 0.3, 'subsample': 0.5, 'learning_rate': 0.012, 'max_depth': 5, 'random_state': 777, 'min_child_weight': 294}. Best is trial 13 with value: 0.7226327116212339.[0m




[32m[I 2022-08-02 16:09:02,729][0m Trial 14 finished with value: 0.8927427068388332 and parameters: {'lambda': 9.576028645044468, 'alpha': 0.049313169092160096, 'colsample_bytree': 0.3, 'subsample': 0.8, 'learning_rate': 0.02, 'max_depth': 9, 'random_state': 777, 'min_child_weight': 250}. Best is trial 13 with value: 0.7226327116212339.[0m




[32m[I 2022-08-02 16:09:08,483][0m Trial 15 finished with value: 0.894846963175514 and parameters: {'lambda': 3.065047913877309, 'alpha': 0.44084713321721264, 'colsample_bytree': 0.7, 'subsample': 0.7, 'learning_rate': 0.016, 'max_depth': 5, 'random_state': 777, 'min_child_weight': 240}. Best is trial 13 with value: 0.7226327116212339.[0m




[32m[I 2022-08-02 16:09:10,839][0m Trial 16 finished with value: 0.7271640363462459 and parameters: {'lambda': 1.7433871406065533, 'alpha': 0.0049992386620793415, 'colsample_bytree': 0.3, 'subsample': 0.4, 'learning_rate': 0.012, 'max_depth': 5, 'random_state': 777, 'min_child_weight': 295}. Best is trial 13 with value: 0.7226327116212339.[0m




[32m[I 2022-08-02 16:09:14,971][0m Trial 17 finished with value: 0.9658656145384983 and parameters: {'lambda': 0.4317697862768644, 'alpha': 0.036785457271044236, 'colsample_bytree': 0.3, 'subsample': 1.0, 'learning_rate': 0.012, 'max_depth': 5, 'random_state': 777, 'min_child_weight': 214}. Best is trial 13 with value: 0.7226327116212339.[0m




[32m[I 2022-08-02 16:09:17,655][0m Trial 18 finished with value: 0.9666068866571019 and parameters: {'lambda': 0.22737527750429165, 'alpha': 0.005156956698941073, 'colsample_bytree': 1.0, 'subsample': 0.5, 'learning_rate': 0.012, 'max_depth': 9, 'random_state': 777, 'min_child_weight': 100}. Best is trial 13 with value: 0.7226327116212339.[0m




[32m[I 2022-08-02 16:09:21,102][0m Trial 19 finished with value: 0.7332496413199425 and parameters: {'lambda': 2.5559095221850097, 'alpha': 0.0010978081850679634, 'colsample_bytree': 0.7, 'subsample': 0.5, 'learning_rate': 0.016, 'max_depth': 17, 'random_state': 777, 'min_child_weight': 264}. Best is trial 13 with value: 0.7226327116212339.[0m


Number of finished trials: 20
Best trial: {'lambda': 2.6015349393290106, 'alpha': 0.029573418216154546, 'colsample_bytree': 0.3, 'subsample': 0.5, 'learning_rate': 0.012, 'max_depth': 5, 'random_state': 777, 'min_child_weight': 294}


In [10]:
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_alpha,params_colsample_bytree,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_random_state,params_subsample,state
0,0,0.993496,2022-08-02 16:07:45.276459,2022-08-02 16:07:56.931427,0 days 00:00:11.654968,3.989022,0.4,0.006121,0.008,17,27,777,0.4,COMPLETE
1,1,0.992599,2022-08-02 16:07:56.932427,2022-08-02 16:08:04.701350,0 days 00:00:07.768923,2.364119,0.4,0.077136,0.01,15,48,777,0.6,COMPLETE
2,2,0.967336,2022-08-02 16:08:04.702350,2022-08-02 16:08:06.600780,0 days 00:00:01.898430,0.104313,0.6,5.675833,0.018,11,208,777,1.0,COMPLETE
3,3,0.958441,2022-08-02 16:08:06.601780,2022-08-02 16:08:11.207814,0 days 00:00:04.606034,0.087768,0.9,0.316002,0.008,7,161,777,0.7,COMPLETE
4,4,0.978599,2022-08-02 16:08:11.208814,2022-08-02 16:08:14.085022,0 days 00:00:02.876208,0.00213,0.8,0.004866,0.018,11,78,777,0.6,COMPLETE
5,5,0.943412,2022-08-02 16:08:14.086022,2022-08-02 16:08:21.166612,0 days 00:00:07.080590,0.011006,0.5,0.029392,0.008,13,151,777,0.6,COMPLETE
6,6,0.993496,2022-08-02 16:08:21.167612,2022-08-02 16:08:24.571376,0 days 00:00:03.403764,0.001883,0.8,0.001336,0.018,13,30,777,0.5,COMPLETE
7,7,0.994297,2022-08-02 16:08:24.572376,2022-08-02 16:08:28.194189,0 days 00:00:03.621813,1.008671,0.8,0.020883,0.01,15,36,777,0.8,COMPLETE
8,8,0.869142,2022-08-02 16:08:28.195189,2022-08-02 16:08:34.729658,0 days 00:00:06.534469,0.013675,0.5,0.074659,0.014,15,203,777,0.5,COMPLETE
9,9,0.91143,2022-08-02 16:08:34.730658,2022-08-02 16:08:41.433164,0 days 00:00:06.702506,0.191495,1.0,0.001879,0.014,7,144,777,0.5,COMPLETE


In [12]:
Best_trial = study.best_trial.params
Best_trial["n_estimators"], Best_trial["tree_method"] = 10000, 'gpu_hist'
Best_trial

{'lambda': 2.6015349393290106,
 'alpha': 0.029573418216154546,
 'colsample_bytree': 0.3,
 'subsample': 0.5,
 'learning_rate': 0.012,
 'max_depth': 5,
 'random_state': 777,
 'min_child_weight': 294,
 'n_estimators': 10000,
 'tree_method': 'gpu_hist'}

In [15]:
preds = np.zeros(test.shape[0])
kf = KFold(n_splits=5, random_state=777, shuffle=True)
auc=[]
n=0
for trn_idx, test_idx in kf.split(train[columns], train['nerdiness']):
    X_tr, X_val = train[columns].iloc[trn_idx], train[columns].iloc[test_idx]
    y_tr, y_val = train['nerdiness'].iloc[trn_idx], train['nerdiness'].iloc[test_idx]
    model = xgb.XGBClassifier(**Best_trial)
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], early_stopping_rounds=100, verbose=False)
    preds += model.predict(test[columns])/kf.n_splits
    auc.append(roc_auc_score(y_val, model.predict(X_val)))
    print(f'fold: {n+1} ==> auc: {auc[n]}')
    n += 1



fold: 1 ==> auc: 0.7391350072538153




fold: 2 ==> auc: 0.7428133809003612




fold: 3 ==> auc: 0.7244283908772333




fold: 4 ==> auc: 0.7360031098914392




fold: 5 ==> auc: 0.743615248950527


In [16]:
np.mean(auc)

0.7371990275746751

In [17]:
preds

array([1., 1., 1., ..., 1., 1., 1.])

In [18]:
len(preds)

35452

In [19]:
sub = pd.read_csv('../data/sample_submission.csv')

In [20]:
sub.head()

Unnamed: 0,index,nerdiness
0,0,-1
1,1,-1
2,2,-1
3,3,-1
4,4,-1


In [23]:
sub['nerdiness']=preds
sub.to_csv('../data/submission.csv', index=False)

In [22]:
sub.shape

(35452, 2)