In [1]:
import pandas as pd

import xgboost as xgb

import optuna

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold

from sklearn.metrics import log_loss

import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_labels = pd.read_csv('../data/train_labels.csv')
train_metadata = pd.read_csv('../data/train_metadata.csv')

train = train_metadata.merge(train_labels, on='filename', how='inner')

print(train.shape)

(1342, 14)


In [3]:
def process_age(age_str):
    age_str = age_str.replace('[', '').replace(']', '').split(':')
    return (int(age_str[0]) + int(age_str[1])) / 2

train.age = train.age.apply(lambda x: process_age(x)).astype(int)
train.body_site = train.body_site.replace('trunk', 'trunc')

train.head()

Unnamed: 0,filename,age,sex,body_site,melanoma_history,breslow,ulceration,resolution,tif_cksum,tif_size,us_tif_url,eu_tif_url,as_tif_url,relapse
0,1u4lhlqb.tif,33,2,thigh,YES,<0.8,NO,0.264384,3028450373,747151312,s3://drivendata-competition-visiomel-public-us...,s3://drivendata-competition-visiomel-public-eu...,s3://drivendata-competition-visiomel-public-as...,0
1,rqumqnfp.tif,47,1,trunc,NO,[1 : 2[,NO,0.264384,1294832049,591027450,s3://drivendata-competition-visiomel-public-us...,s3://drivendata-competition-visiomel-public-eu...,s3://drivendata-competition-visiomel-public-as...,0
2,bu5xt1xm.tif,65,2,face,NO,<0.8,NO,0.264384,774102360,465947458,s3://drivendata-competition-visiomel-public-us...,s3://drivendata-competition-visiomel-public-eu...,s3://drivendata-competition-visiomel-public-as...,0
3,dibvu7wk.tif,63,2,forearm,,[2 : 4[,YES,0.22649,515827065,568174704,s3://drivendata-competition-visiomel-public-us...,s3://drivendata-competition-visiomel-public-eu...,s3://drivendata-competition-visiomel-public-as...,0
4,qsza4coh.tif,91,2,face,NO,[1 : 2[,NO,0.22649,1541795099,1042691978,s3://drivendata-competition-visiomel-public-us...,s3://drivendata-competition-visiomel-public-eu...,s3://drivendata-competition-visiomel-public-as...,0


In [4]:
X = train[['age', 'sex', 'body_site',  'melanoma_history']]
y = train.relapse.values

In [5]:
categorical_pipeline = Pipeline(
    steps=[
        ("oh-encode", OneHotEncoder(handle_unknown="ignore")),
    ]
)

cat_cols = ['sex', 'body_site', 'melanoma_history']

full_processor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_pipeline, cat_cols),
    ]
)

X = full_processor.fit_transform(X)
X.shape

(1342, 25)

In [6]:
# Training
N_SPLITS = 30 #10
N_TRIALS = 700 #50

kf = KFold(n_splits=N_SPLITS, random_state=24, shuffle=True)


def objective(trial, X, y, kf):
    
    param = {
        'scale_pos_weight': None, #SCALE_POS_WEIGHT,
        'tree_method': trial.suggest_categorical('tree_method',['gpu_hist']),
        'objective':  trial.suggest_categorical('objective',['binary:logistic']),
        'n_estimators':trial.suggest_int('n_estimators', 1, 20),
        'max_depth':trial.suggest_int('max_depth', 1, 7),
        #'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
        # L2 regularization weight.
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        # L1 regularization weight.
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        # minimum child weight, larger the term more conservative the tree.
        'min_child_weight': trial.suggest_int("min_child_weight", 2, 10),
        # lr
        'eta' : trial.suggest_float("eta", 1e-8, 1.0, log=True),
        'gamma': trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        'grow_policy': trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    }

    loss = 0
    for train_index, test_index in kf.split(X, y):

        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        model = xgb.XGBClassifier(**param)
        
        model.fit(X_train, y_train, verbose=0)
    
        preds = model.predict_proba(X_val)

        loss += log_loss(y_val,preds)

    return (loss/kf.n_splits).round(6)

In [7]:
study = optuna.create_study(direction='minimize')
study.optimize(lambda trial: objective(trial, X, y, kf), n_trials=N_TRIALS)

[32m[I 2023-04-30 22:36:08,710][0m A new study created in memory with name: no-name-e76f731e-8fae-4007-9457-47f2edb0fc78[0m
[32m[I 2023-04-30 22:36:09,888][0m Trial 0 finished with value: 0.693147 and parameters: {'tree_method': 'gpu_hist', 'objective': 'binary:logistic', 'n_estimators': 2, 'max_depth': 5, 'lambda': 1.1829204278276088e-08, 'alpha': 3.1279820394903835e-06, 'subsample': 0.7444510150533334, 'colsample_bytree': 0.6663986729963027, 'min_child_weight': 8, 'eta': 1.9725360464749844e-08, 'gamma': 0.5801827379291463, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.693147.[0m
[32m[I 2023-04-30 22:36:12,394][0m Trial 1 finished with value: 0.693147 and parameters: {'tree_method': 'gpu_hist', 'objective': 'binary:logistic', 'n_estimators': 15, 'max_depth': 4, 'lambda': 0.004853812216012364, 'alpha': 0.00451121407735889, 'subsample': 0.7330284000451344, 'colsample_bytree': 0.7149862465570765, 'min_child_weight': 10, 'eta': 3.936648254276e-08, 'gamma': 0.002527109

In [8]:
N_ENSEMBLE = 10

best_trials = study.trials_dataframe().sort_values('value').head(N_ENSEMBLE).number.tolist()

for idx, trial_id in enumerate(best_trials):
    
    print(study.trials[trial_id])
    model = xgb.XGBClassifier(**study.trials[trial_id].params)
    model.fit(X, y)
    pickle.dump(model, open(f'../submission/assets/model_{idx}.pkl', "wb"))
    
pickle.dump(full_processor, open('../submission/assets/transformer.pkl', "wb"))

FrozenTrial(number=182, state=1, values=[0.410739], datetime_start=datetime.datetime(2023, 4, 30, 22, 42, 28, 518774), datetime_complete=datetime.datetime(2023, 4, 30, 22, 42, 30, 448038), params={'tree_method': 'gpu_hist', 'objective': 'binary:logistic', 'n_estimators': 17, 'max_depth': 1, 'lambda': 0.0003895892018009526, 'alpha': 0.0010603739026819447, 'subsample': 0.7647409873062938, 'colsample_bytree': 0.9268187484272701, 'min_child_weight': 10, 'eta': 0.9507787376693266, 'gamma': 0.15490035668650476, 'grow_policy': 'lossguide'}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'tree_method': CategoricalDistribution(choices=('gpu_hist',)), 'objective': CategoricalDistribution(choices=('binary:logistic',)), 'n_estimators': IntDistribution(high=20, log=False, low=1, step=1), 'max_depth': IntDistribution(high=7, log=False, low=1, step=1), 'lambda': FloatDistribution(high=1.0, log=True, low=1e-08, step=None), 'alpha': FloatDistribution(high=1.0, log=True, low=1e-0

In [9]:
model = pickle.load(open(f'../submission/assets/model_0.pkl', mode='rb'))

In [12]:
pred = model.predict_proba(X[:5])
pred[:,1]

array([0.26022592, 0.22239468, 0.20476389, 0.10268841, 0.20476389],
      dtype=float32)