In [1]:
import optuna
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [2]:
train = pd.read_csv('../data/minseok_EDA2_train.csv')
test = pd.read_csv('../data/minseok_EDA2_test.csv')

In [3]:
display(train)

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,...,hand,religion,orientation,voted,married,familysize,ASD,nerdiness,Qs_Mach,age_cat
0,0.219430,0.684909,0.645424,0.689224,0.270300,0.580272,0.649525,0.664648,0.287443,0.462615,...,0.550968,0.567721,0.617225,0.549084,0.560104,0.547405,0.546760,1,3.653846,0.559000
1,0.549147,0.490676,0.505430,0.576165,0.538382,0.754382,0.494185,0.537294,0.460687,0.462615,...,0.552290,0.597924,0.521066,0.562683,0.515796,0.547405,0.546760,1,3.269231,0.523114
2,0.549147,0.684909,0.645424,0.576165,0.490014,0.754382,0.649525,0.664648,0.541905,0.523798,...,0.552290,0.597924,0.600253,0.549084,0.529865,0.547405,0.546760,1,3.692308,0.523114
3,0.549147,0.490676,0.505430,0.382335,0.538382,0.427424,0.435168,0.664648,0.460687,0.523798,...,0.550968,0.565894,0.521066,0.549084,0.560104,0.565604,0.546760,1,3.500000,0.564478
4,0.549147,0.490676,0.505430,0.576165,0.490014,0.427424,0.494185,0.413060,0.460687,0.523798,...,0.550968,0.567721,0.521066,0.549084,0.560104,0.573536,0.546760,0,3.461538,0.564478
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,0.302370,0.684909,0.505430,0.440718,0.490014,0.580272,0.494185,0.537294,0.460687,0.523798,...,0.552290,0.565894,0.578521,0.549084,0.560104,0.544640,0.546760,0,3.346154,0.564478
14996,0.696582,0.490676,0.645424,0.576165,0.538382,0.754382,0.649525,0.537294,0.541905,0.641272,...,0.552290,0.495370,0.521066,0.562683,0.515796,0.544640,0.546760,1,4.038462,0.523114
14997,0.549147,0.684909,0.645424,0.689224,0.679836,0.754382,0.649525,0.664648,0.541905,0.641272,...,0.552290,0.565894,0.600253,0.562683,0.560104,0.544640,0.664114,1,4.615385,0.559000
14998,0.696582,0.684909,0.505430,0.689224,0.679836,0.754382,0.649525,0.313116,0.675138,0.641272,...,0.552290,0.567721,0.617225,0.549084,0.515796,0.565604,0.664114,0,4.115385,0.559000


In [4]:
columns = [col for col in train.columns.to_list() if col not in ['nerdiness']]

In [5]:
data = train[columns]
target = train['nerdiness']

In [23]:
def objective(trial, data=data, target=target):
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2, random_state=20171184)
    param = {
        'tree_method': 'gpu_hist',
        'lambda' : trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha' : trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'colsample_bytree' : trial.suggest_categorical('colsample_bytree', [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        'subsample' : trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        'learning_rate' : trial.suggest_loguniform('learning_rate', 1e-2, 0.2),
        'n_estimators' : trial.suggest_categorical('colsample_bytree', [1000, 3000, 5000, 7000, 10000]),
        'max_depth' : trial.suggest_categorical('max_depth', [3, 5, 6, 7, 8, 9, 10, 12]),
        'random_state' : trial.suggest_categorical('random_state', [42, 777, 20171184]),
        'min_child_weight' : trial.suggest_int('min_child_weight', 1, 300),
        'use_label_encoder' : False,
        'eval_metric': 'auc'
    }
    model = xgb.XGBClassifier(**param)
    
    model.fit(train_x, train_y, eval_set=[(test_x, test_y)], early_stopping_rounds=100, verbose=False)
    
    preds = model.predict(test_x)
    
    auc = roc_auc_score(test_y, preds)
    
    return auc

In [28]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=300)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2022-08-07 20:27:23,481][0m A new study created in memory with name: no-name-3ef0cc18-90c9-423a-beca-f3ff11c42c4f[0m
[32m[I 2022-08-07 20:27:25,787][0m Trial 0 finished with value: 0.7059696533995599 and parameters: {'lambda': 0.006799621068806167, 'alpha': 0.06308880026837554, 'colsample_bytree': 0.7, 'subsample': 0.8, 'learning_rate': 0.1295283977001047, 'n_estimators': 7119, 'max_depth': 8, 'random_state': 777, 'min_child_weight': 195}. Best is trial 0 with value: 0.7059696533995599.[0m
[32m[I 2022-08-07 20:27:38,225][0m Trial 1 finished with value: 0.7684138291614926 and parameters: {'lambda': 0.9369654913096985, 'alpha': 0.061509452376463165, 'colsample_bytree': 0.7, 'subsample': 0.8, 'learning_rate': 0.036876683150603315, 'n_estimators': 5723, 'max_depth': 12, 'random_state': 42, 'min_child_weight': 17}. Best is trial 1 with value: 0.7684138291614926.[0m
[32m[I 2022-08-07 20:27:40,166][0m Trial 2 finished with value: 0.7000468378973053 and parameters: {'lambda':

KeyboardInterrupt: 

In [29]:
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_alpha,params_colsample_bytree,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_n_estimators,params_random_state,params_subsample,state
0,0,0.70597,2022-08-07 20:27:23.482525,2022-08-07 20:27:25.786042,0 days 00:00:02.303517,0.063089,0.7,0.0068,0.129528,8,195,7119,777,0.8,COMPLETE
1,1,0.768414,2022-08-07 20:27:25.787042,2022-08-07 20:27:38.225225,0 days 00:00:12.438183,0.061509,0.7,0.936965,0.036877,12,17,5723,42,0.8,COMPLETE
2,2,0.700047,2022-08-07 20:27:38.226225,2022-08-07 20:27:40.166525,0 days 00:00:01.940300,0.001203,0.7,0.263373,0.071405,6,270,6670,42,0.6,COMPLETE
3,3,0.714234,2022-08-07 20:27:40.167525,2022-08-07 20:27:42.215902,0 days 00:00:02.048377,0.171909,0.7,0.055511,0.104656,12,95,7492,777,0.4,COMPLETE
4,4,0.748668,2022-08-07 20:27:42.216902,2022-08-07 20:28:06.086336,0 days 00:00:23.869434,1.585109,0.5,0.005982,0.012099,6,56,5674,20171184,0.9,COMPLETE
5,5,0.708306,2022-08-07 20:28:06.087336,2022-08-07 20:28:07.275901,0 days 00:00:01.188565,0.007079,0.9,0.47958,0.150178,10,252,6278,42,0.6,COMPLETE
6,6,0.701703,2022-08-07 20:28:07.276901,2022-08-07 20:28:09.242081,0 days 00:00:01.965180,0.013467,0.4,1.301185,0.045453,8,173,5276,20171184,0.4,COMPLETE
7,7,0.772409,2022-08-07 20:28:09.243081,2022-08-07 20:28:20.280772,0 days 00:00:11.037691,1.02279,0.8,0.003741,0.05579,9,8,9091,777,0.7,COMPLETE
8,8,0.701893,2022-08-07 20:28:20.281772,2022-08-07 20:28:21.945184,0 days 00:00:01.663412,0.289915,0.7,0.005408,0.039005,8,196,7575,777,0.5,COMPLETE
9,9,0.698486,2022-08-07 20:28:21.946184,2022-08-07 20:28:23.665575,0 days 00:00:01.719391,0.014635,0.5,0.003193,0.077908,8,270,8797,20171184,0.5,COMPLETE


In [32]:
Best_trial = study.best_trial.params
Best_trial["tree_method"] = 'gpu_hist'
Best_trial

{'lambda': 0.003741189672443248,
 'alpha': 1.022789580397131,
 'colsample_bytree': 0.8,
 'subsample': 0.7,
 'learning_rate': 0.0557903704656932,
 'n_estimators': 9091,
 'max_depth': 9,
 'random_state': 777,
 'min_child_weight': 8,
 'tree_method': 'gpu_hist'}

In [33]:
preds = np.zeros(test.shape[0])
kf = KFold(n_splits=3, random_state=42, shuffle=True)
auc=[]
n=0
for trn_idx, test_idx in kf.split(train[columns], train['nerdiness']):
    X_tr, X_val = train[columns].iloc[trn_idx], train[columns].iloc[test_idx]
    y_tr, y_val = train['nerdiness'].iloc[trn_idx], train['nerdiness'].iloc[test_idx]
    model = xgb.XGBClassifier(**Best_trial)
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], early_stopping_rounds=100, verbose=False)
    preds += model.predict(test[columns])/kf.n_splits
    auc.append(roc_auc_score(y_val, model.predict(X_val)))
    print(f'fold: {n+1} ==> auc: {auc[n]}')
    n += 1



fold: 1 ==> auc: 0.7710090495539977




fold: 2 ==> auc: 0.7895254603574805




fold: 3 ==> auc: 0.7877452236604655




fold: 4 ==> auc: 0.7799394261151256




fold: 5 ==> auc: 0.7755677983337262


In [34]:
np.mean(auc)

0.7807573916041591

In [None]:
preds

In [None]:
len(preds)

In [None]:
sub = pd.read_csv('../data/sample_submission.csv')

In [None]:
sub.head()

In [None]:
sub['nerdiness']=preds
sub.to_csv('../submission/xgboost_optuna.csv, index=False)

In [None]:
sub.shape