### Context

#### Experiment Tools

- wandb

In [1]:
# !pip install wandb

- sweep 설정(`sweep_config`)은 코드에 직접 쓰기보다 별도의 JSON 파일로 관리하면 좋음

In [2]:
# Weights & Biases sweep definition: "bayes" search over three hyperparameters
# that train() forwards to XGBClassifier, minimising the logged "cv_loss".
sweep_config = dict(
    name="mdc_sweep",
    method="bayes",
    parameters=dict(
        # Integer-valued search range for max_depth, between 2 and 15.
        max_depth=dict(distribution="int_uniform", min=2, max=15),
        # Float-valued search range for subsample, between 0.5 and 1.0.
        subsample=dict(distribution="uniform", min=0.5, max=1.0),
        # Float-valued search range for colsample_bytree, between 0.5 and 1.0.
        colsample_bytree=dict(distribution="uniform", min=0.5, max=1.0),
    ),
    # Objective of the sweep: the metric name must match what train() logs.
    metric=dict(name="cv_loss", goal="minimize"),
)


In [3]:
!pip install wandb



In [4]:
import os
from os.path import join

import multiprocessing
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import wandb

# Authenticate with the Weights & Biases server. wandb.login() is used instead
# of wandb.init() so that no extra run is opened here and left unfinished;
# every sweep trial opens (and closes) its own run inside train().
wandb.login()

# Register the sweep described by sweep_config and keep its id for wandb.agent.
sweep_id = wandb.sweep(sweep_config, 
                       project="medici wandb test")

# CPU count of the machine; train() uses it to size XGBoost's n_jobs.
n_cpus = multiprocessing.cpu_count()

[34m[1mwandb[0m: Currently logged in as: [33msubinium[0m (use `wandb login --relogin` to force relogin)




Create sweep with ID: 1xap0art
Sweep URL: https://wandb.ai/subinium/medici%20wandb%20test/sweeps/1xap0art


In [5]:
# Directory holding the CSV files, relative to the current working directory.
BASE_DIR = './data'

# Build the three file paths (train / test / sample submission) in one pass.
train_path, test_path, submission_path = (
    os.path.join(BASE_DIR, fname)
    for fname in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Read the training and test tables.
data, test = pd.read_csv(train_path), pd.read_csv(test_path)

# Target column, captured before `credit` is removed from the feature frame.
label = data['credit']

In [6]:
# Remove columns that must not be used as features: `index` from both frames
# (presumably a row id; verify against the data description) and the `credit`
# target from the training frame. Non-inplace drops with errors='ignore' keep
# this cell safe to re-run after the columns are already gone.
data = data.drop(columns=['index', 'credit'], errors='ignore')
test = test.drop(columns=['index'], errors='ignore')

In [7]:
# Split feature names by dtype: object-typed columns are treated as
# categorical, every remaining column as numeric.
cat_columns = [name for name, dtype in data.dtypes.items() if dtype == 'O']
num_columns = [name for name in data.columns if name not in cat_columns]

# Show both groups for a quick sanity check.
print('Categorical Columns: \n{}\n'.format(cat_columns))
print('Numeric Columns: \n{}'.format(num_columns))

Categorical Columns: 
['gender', 'car', 'reality', 'income_type', 'edu_type', 'family_type', 'house_type', 'occyp_type']

Numeric Columns: 
['child_num', 'income_total', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'FLAG_MOBIL', 'work_phone', 'phone', 'email', 'family_size', 'begin_month']


#### 라벨 데이터 인코딩

In [8]:
# Cast the `credit` target to an integer dtype before it is used for
# stratified splitting and model fitting.
label = label.astype(int)

#### 전처리 프로세스 함수로 작성

In [9]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

def preprocess(x_train, x_valid, x_test):
    """Impute, scale and one-hot encode the frames of one CV fold.

    Every transformer is fitted on ``x_train`` only and then applied to the
    validation and test frames. The inputs are not modified. The function
    relies on the module-level ``cat_columns`` and ``num_columns`` lists.

    Args:
        x_train: Training features of the fold (DataFrame).
        x_valid: Validation features of the fold (DataFrame).
        x_test: Test features (DataFrame).

    Returns:
        Tuple ``(train, valid, test)`` of DataFrames with a fresh 0..n-1 row
        index: the non-categorical columns (scaled) followed by the one-hot
        columns, which are labelled with integer positions.
    """
    # reset_index returns new frames, so the callers' data stays untouched.
    # All three frames get a positional index because the column-wise concat
    # at the end aligns rows on the index.
    tmp_x_train = x_train.reset_index(drop=True)
    tmp_x_valid = x_valid.reset_index(drop=True)
    tmp_x_test = x_test.reset_index(drop=True)

    # Missing values: fill categorical columns with the most frequent value
    # seen in the training frame.
    imputer = SimpleImputer(strategy='most_frequent')
    tmp_x_train[cat_columns] = imputer.fit_transform(tmp_x_train[cat_columns])
    tmp_x_valid[cat_columns] = imputer.transform(tmp_x_valid[cat_columns])
    tmp_x_test[cat_columns] = imputer.transform(tmp_x_test[cat_columns])

    # Scaling: standardise numeric columns with training-frame statistics.
    scaler = StandardScaler()
    tmp_x_train[num_columns] = scaler.fit_transform(tmp_x_train[num_columns])
    tmp_x_valid[num_columns] = scaler.transform(tmp_x_valid[num_columns])
    tmp_x_test[num_columns] = scaler.transform(tmp_x_test[num_columns])

    # Encoding: handle_unknown='ignore' encodes a category that is absent from
    # the training fold as all zeros instead of raising. The default (sparse)
    # output is densified with .toarray(), which avoids the `sparse` keyword
    # that recent scikit-learn versions no longer accept.
    ohe = OneHotEncoder(handle_unknown='ignore')
    ohe.fit(tmp_x_train[cat_columns])

    def _encode_and_join(frame):
        # Replace the categorical columns of `frame` by their one-hot columns.
        encoded = pd.DataFrame(ohe.transform(frame[cat_columns]).toarray())
        return pd.concat([frame.drop(columns=cat_columns), encoded], axis=1)

    tmp_x_train = _encode_and_join(tmp_x_train)
    tmp_x_valid = _encode_and_join(tmp_x_valid)
    tmp_x_test = _encode_and_join(tmp_x_test)

    return tmp_x_train, tmp_x_valid, tmp_x_test

### Ensemble

In [10]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from xgboost import XGBClassifier

def train():
    """Run one sweep trial: 5-fold CV of an XGBoost classifier.

    Invoked without arguments by ``wandb.agent``. The hyperparameters
    ``max_depth``, ``subsample`` and ``colsample_bytree`` are read from the
    config of the run opened here. Uses the module-level ``data``, ``label``,
    ``test``, ``n_cpus`` and ``preprocess``. Logs the mean validation log loss
    over the folds as ``cv_loss``, the metric named in ``sweep_config``.
    """
    n_splits = 5

    with wandb.init() as run:
        params = run.config
        val_scores = []

        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

        for i, (trn_idx, val_idx) in enumerate(skf.split(data, label)):
            x_train, y_train = data.iloc[trn_idx, :], label.iloc[trn_idx]
            x_valid, y_valid = data.iloc[val_idx, :], label.iloc[val_idx]

            # Preprocess with transformers fitted on this fold's training part.
            # The transformed test frame is not used during the sweep.
            x_train, x_valid, _ = preprocess(x_train, x_valid, test)

            # Model definition.
            model = XGBClassifier(n_estimators=1000,
                                  max_depth=params['max_depth'],
                                  subsample=params['subsample'],
                                  colsample_bytree=params['colsample_bytree'],
                                  # tree_method='gpu_hist',  # optional GPU training
                                  n_jobs=n_cpus-1)

            # Model fitting with early stopping on the last eval_set entry.
            # NOTE(review): recent XGBoost releases expect eval_metric and
            # early_stopping_rounds on the estimator constructor rather than
            # on fit(); confirm against the installed version before upgrading.
            model.fit(x_train, y_train,
                      eval_metric='mlogloss',
                      eval_set=[[x_train, y_train], [x_valid, y_valid]],
                      early_stopping_rounds=100,
                      verbose=100)

            # Log loss on the training and validation parts of the fold.
            trn_logloss = log_loss(y_train, model.predict_proba(x_train))
            val_logloss = log_loss(y_valid, model.predict_proba(x_valid))
            # The original format string had a stray "4" after the train loss.
            print('{} Fold, train logloss : {:.4f}, validation logloss : {:.4f}'.format(i, trn_logloss, val_logloss))

            val_scores.append(val_logloss)

        # Report the objective of the sweep for this trial.
        metrics = {"cv_loss": np.mean(val_scores)}
        run.log(metrics)
# Number of sweep trials the local agent executes; each trial calls train().
count = 5
wandb.agent(sweep_id=sweep_id, function=train, count=count)

[34m[1mwandb[0m: Agent Starting Run: jnw3p3al with config:
[34m[1mwandb[0m: 	colsample_bytree: 0.6862110204160192
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	subsample: 0.8986802757967174


[0]	validation_0-mlogloss:0.99585	validation_1-mlogloss:0.99541
[100]	validation_0-mlogloss:0.73846	validation_1-mlogloss:0.77927
[200]	validation_0-mlogloss:0.69112	validation_1-mlogloss:0.76142
[300]	validation_0-mlogloss:0.65333	validation_1-mlogloss:0.75074
[400]	validation_0-mlogloss:0.62475	validation_1-mlogloss:0.74419
[500]	validation_0-mlogloss:0.60003	validation_1-mlogloss:0.74039
[600]	validation_0-mlogloss:0.57768	validation_1-mlogloss:0.73733
[700]	validation_0-mlogloss:0.55772	validation_1-mlogloss:0.73379
[800]	validation_0-mlogloss:0.54025	validation_1-mlogloss:0.73267
[900]	validation_0-mlogloss:0.52406	validation_1-mlogloss:0.73123
[984]	validation_0-mlogloss:0.51162	validation_1-mlogloss:0.73111
0 Fold, train logloss : 0.52634, validation logloss : 0.7306
[0]	validation_0-mlogloss:0.99458	validation_1-mlogloss:0.99629
[100]	validation_0-mlogloss:0.73250	validation_1-mlogloss:0.78808
[200]	validation_0-mlogloss:0.68779	validation_1-mlogloss:0.77454
[300]	validation_0-

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
cv_loss,0.73938
_runtime,269.0
_timestamp,1628321330.0
_step,0.0


0,1
cv_loss,▁
_runtime,▁
_timestamp,▁
_step,▁


[34m[1mwandb[0m: Agent Starting Run: g9u2nk4c with config:
[34m[1mwandb[0m: 	colsample_bytree: 0.7981342456812505
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	subsample: 0.8888067665412711


[0]	validation_0-mlogloss:0.97889	validation_1-mlogloss:0.97802
[100]	validation_0-mlogloss:0.73547	validation_1-mlogloss:0.77537
[200]	validation_0-mlogloss:0.68885	validation_1-mlogloss:0.76005
[300]	validation_0-mlogloss:0.65119	validation_1-mlogloss:0.75067
[400]	validation_0-mlogloss:0.62118	validation_1-mlogloss:0.74316
[500]	validation_0-mlogloss:0.59544	validation_1-mlogloss:0.73945
[600]	validation_0-mlogloss:0.57318	validation_1-mlogloss:0.73580
[700]	validation_0-mlogloss:0.55246	validation_1-mlogloss:0.73391
[800]	validation_0-mlogloss:0.53486	validation_1-mlogloss:0.73318
[900]	validation_0-mlogloss:0.51818	validation_1-mlogloss:0.73316
[907]	validation_0-mlogloss:0.51726	validation_1-mlogloss:0.73310
0 Fold, train logloss : 0.53364, validation logloss : 0.7330
[0]	validation_0-mlogloss:0.97784	validation_1-mlogloss:0.98007
[100]	validation_0-mlogloss:0.73160	validation_1-mlogloss:0.78754
[200]	validation_0-mlogloss:0.68690	validation_1-mlogloss:0.77397
[300]	validation_0-

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
cv_loss,0.73959
_runtime,296.0
_timestamp,1628321633.0
_step,0.0


0,1
cv_loss,▁
_runtime,▁
_timestamp,▁
_step,▁


[34m[1mwandb[0m: Agent Starting Run: uga5ljzo with config:
[34m[1mwandb[0m: 	colsample_bytree: 0.688392987630204
[34m[1mwandb[0m: 	max_depth: 12
[34m[1mwandb[0m: 	subsample: 0.9184617067357335


[0]	validation_0-mlogloss:0.97315	validation_1-mlogloss:0.98673
[100]	validation_0-mlogloss:0.20001	validation_1-mlogloss:0.76651
[138]	validation_0-mlogloss:0.14424	validation_1-mlogloss:0.81374
0 Fold, train logloss : 0.38154, validation logloss : 0.7210
[0]	validation_0-mlogloss:0.97386	validation_1-mlogloss:0.99059
[100]	validation_0-mlogloss:0.19118	validation_1-mlogloss:0.78765
[139]	validation_0-mlogloss:0.13630	validation_1-mlogloss:0.84209
1 Fold, train logloss : 0.37074, validation logloss : 0.7321
[0]	validation_0-mlogloss:0.97923	validation_1-mlogloss:0.99421
[100]	validation_0-mlogloss:0.18986	validation_1-mlogloss:0.79280
[133]	validation_0-mlogloss:0.14338	validation_1-mlogloss:0.83899
2 Fold, train logloss : 0.40354, validation logloss : 0.7325
[0]	validation_0-mlogloss:0.98134	validation_1-mlogloss:0.99596
[100]	validation_0-mlogloss:0.19536	validation_1-mlogloss:0.78406
[134]	validation_0-mlogloss:0.14205	validation_1-mlogloss:0.83362
3 Fold, train logloss : 0.41944, 

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
cv_loss,0.72703
_runtime,194.0
_timestamp,1628321834.0
_step,0.0


0,1
cv_loss,▁
_runtime,▁
_timestamp,▁
_step,▁


[34m[1mwandb[0m: Agent Starting Run: ap42h48m with config:
[34m[1mwandb[0m: 	colsample_bytree: 0.6593670209065594
[34m[1mwandb[0m: 	max_depth: 15
[34m[1mwandb[0m: 	subsample: 0.9240215396452902


[0]	validation_0-mlogloss:0.96128	validation_1-mlogloss:0.98286
[100]	validation_0-mlogloss:0.11091	validation_1-mlogloss:0.84140
[123]	validation_0-mlogloss:0.08841	validation_1-mlogloss:0.89005
0 Fold, train logloss : 0.36124, validation logloss : 0.7169
[0]	validation_0-mlogloss:0.95833	validation_1-mlogloss:0.98533
[100]	validation_0-mlogloss:0.11087	validation_1-mlogloss:0.87285
[119]	validation_0-mlogloss:0.09349	validation_1-mlogloss:0.90959
1 Fold, train logloss : 0.39444, validation logloss : 0.7343
[0]	validation_0-mlogloss:0.95825	validation_1-mlogloss:0.98616
[100]	validation_0-mlogloss:0.10863	validation_1-mlogloss:0.87591
[118]	validation_0-mlogloss:0.09066	validation_1-mlogloss:0.91602
2 Fold, train logloss : 0.39324, validation logloss : 0.7288
[0]	validation_0-mlogloss:0.96342	validation_1-mlogloss:0.98752
[100]	validation_0-mlogloss:0.10910	validation_1-mlogloss:0.86463
[122]	validation_0-mlogloss:0.08866	validation_1-mlogloss:0.91177
3 Fold, train logloss : 0.37034, 

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
cv_loss,0.72505
_runtime,250.0
_timestamp,1628322089.0
_step,0.0


0,1
cv_loss,▁
_runtime,▁
_timestamp,▁
_step,▁


[34m[1mwandb[0m: Agent Starting Run: 6c54fw0r with config:
[34m[1mwandb[0m: 	colsample_bytree: 0.5441274017917261
[34m[1mwandb[0m: 	max_depth: 15
[34m[1mwandb[0m: 	subsample: 0.8010343387488805


[0]	validation_0-mlogloss:0.96988	validation_1-mlogloss:0.99057
[100]	validation_0-mlogloss:0.12110	validation_1-mlogloss:0.83665
[123]	validation_0-mlogloss:0.09763	validation_1-mlogloss:0.88015
0 Fold, train logloss : 0.38154, validation logloss : 0.7145
[0]	validation_0-mlogloss:0.96786	validation_1-mlogloss:0.99007
[100]	validation_0-mlogloss:0.12022	validation_1-mlogloss:0.86750
[120]	validation_0-mlogloss:0.09967	validation_1-mlogloss:0.91148
1 Fold, train logloss : 0.41104, validation logloss : 0.7277
[0]	validation_0-mlogloss:0.97078	validation_1-mlogloss:0.99228
[100]	validation_0-mlogloss:0.11756	validation_1-mlogloss:0.86687
[119]	validation_0-mlogloss:0.10060	validation_1-mlogloss:0.90608
2 Fold, train logloss : 0.42174, validation logloss : 0.7249
[0]	validation_0-mlogloss:0.97095	validation_1-mlogloss:0.98989
[100]	validation_0-mlogloss:0.11597	validation_1-mlogloss:0.85715
[123]	validation_0-mlogloss:0.09534	validation_1-mlogloss:0.90298
3 Fold, train logloss : 0.38654, 

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
cv_loss,0.7202
_runtime,202.0
_timestamp,1628322297.0
_step,0.0


0,1
cv_loss,▁
_runtime,▁
_timestamp,▁
_step,▁


In [11]:
# FIXME: `submit` is not defined in any visible cell of this notebook, so this
# cell raises NameError; the submission frame has to be built before it can be
# written to disk.
submit.to_csv('oof_first_submit.csv', index=False)

NameError: name 'submit' is not defined