<a href="https://colab.research.google.com/github/ShotaArima/kaggle/blob/main/books/Kaggle%E3%81%A7%E5%8B%9D%E3%81%A4%E3%83%87%E3%83%BC%E3%82%BF%E5%88%86%E6%9E%90%E3%81%AE%E6%8A%80%E8%A1%93/02-%E3%82%BF%E3%82%B9%E3%82%AF%E3%81%A8%E8%A9%95%E4%BE%A1%E6%8C%87%E6%A8%99/04-custom_evaluation_metrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# カスタム評価指標とカスタム目的関数

## 準備

In [2]:
import numpy as np
import pandas as pd

In [8]:
# 自分のディレクトリのsample_dataディレクトリURL
url = 'https://raw.githubusercontent.com/ShotaArima/kaggle/refs/heads/main/books/Kaggle%E3%81%A7%E5%8B%9D%E3%81%A4%E3%83%87%E3%83%BC%E3%82%BF%E5%88%86%E6%9E%90%E3%81%AE%E6%8A%80%E8%A1%93/sample_data/'

In [14]:
# train_xは学習データ、train_yは目的変数、test_xはテストデータ
# pandasのDataFrame, Seriesで保持します。（numpyのarrayで保持することもあります

train = pd.read_csv(url + 'train_preprocessed_data.csv')
train_x = train.drop(['target'], axis=1)
train_y = train['target']
test_x = pd.read_csv(url+'test_preprocessed.csv')

from sklearn.model_selection import KFold

kf = KFold(n_splits=4, shuffle=True, random_state=71)
tr_idx, va_idx = list(kf.split(train_x))[0]

# 学習データを学習データとバリデーションデータに分ける
tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

### データの中身

In [25]:
print(f"tr_x:\n", tr_x.head())
print(f"tr_y:\n", tr_y.head())

tr_x:
    age  sex      height     weight  product  ...  medical_keyword_10  year  month  day  yearmonth
0   50    1  166.445608  65.016732        9  ...                   0  2015      2    3      24182
1   68    0  164.334615  56.544217        0  ...                   0  2015      5    9      24185
2   77    1  167.462917  54.242267        2  ...                   0  2016      2   13      24194
3   17    1  177.097725  71.147762        3  ...                   0  2015      7    6      24187
4   62    0  158.165788  65.240697        1  ...                   0  2016      9   17      24201

[5 rows x 28 columns]
tr_y:
 0    0
1    0
2    1
3    0
4    1
Name: target, dtype: int64


In [26]:
print(f"va_x:\n", va_x.head())
print(f"va_y:\n", va_y.head())

va_x:
     age  sex      height     weight  product  ...  medical_keyword_10  year  month  day  yearmonth
6    63    1  181.146801  63.982878        2  ...                   0  2015      8   19      24188
10   54    1  175.576514  52.470671        7  ...                   0  2016     12   24      24204
11   17    0  162.283723  51.544568        6  ...                   0  2016      2   28      24194
12    5    1  160.458369  55.488517        4  ...                   0  2016      8   20      24200
23   35    0  147.256907  44.591515        2  ...                   0  2015     12   15      24192

[5 rows x 28 columns]
va_y:
 6     1
10    0
11    0
12    0
23    1
Name: target, dtype: int64


# サンプルコード

In [15]:
import xgboost as xgb
from sklearn.metrics import log_loss

# 特徴量と目的変数をxgboostのデータ構造に変換する
# 学習データの特徴量と目的変数がtr_x, tr_y
# バリデーションデータの特徴量と目的変数がva_x, va_y
dtrain = xgb.DMatrix(tr_x, label=tr_y)
dvalid = xgb.DMatrix(va_x, label=va_y)

In [37]:
# カスタム目的関数(この場合は、loglossであり、xgboostの'binary:logistic'と等価)
def logregobj(preds, dtrain):
    labels = dtrain.get_label() # 真の値のラベルを取得
    preds = 1.0 / (1.0 + np.exp(-preds)) # シグモイド関数
    grad = preds - labels # 勾配
    hass = preds * (1.0 - preds) # 二階微分値
    return grad, hass

In [38]:
# カスタム評価指標 (この場合は誤答率)
def evalerror(preds, dtrain):
    labels = dtrain.get_label() # 真の値のラベルを取得
    return 'cstom-error', float(sum(labels != (preds > 0.0))) / len(labels)

In [44]:
# ハイパーパラメータの設定
params = {'silent':1,
          'random_state':71}
num_round = 50
watchlist = [(dtrain, 'dtrain'), (dvalid, 'eval')]

# モデルの学習の実行
bst = xgb.train(params, dtrain, num_round, watchlist, obj = logregobj, feval = evalerror)

# 目的関数に binary:logistic を指定した時と違い、確率に変換する前の値で予測値が出力されるので変換が必要
pred_val = bst.predict(dvalid)
pred = 1.0 / (1.0 + np.exp(-pred_val))
logloss = log_loss(va_y, pred)
print(logloss)

[0]	dtrain-rmse:0.40096	dtrain-cstom-error:0.17067	eval-rmse:0.42510	eval-cstom-error:0.19160
[1]	dtrain-rmse:0.70012	dtrain-cstom-error:0.11627	eval-rmse:0.71947	eval-cstom-error:0.14880
[2]	dtrain-rmse:0.98004	dtrain-cstom-error:0.10707	eval-rmse:0.99417	eval-cstom-error:0.14120
[3]	dtrain-rmse:1.22553	dtrain-cstom-error:0.09853	eval-rmse:1.23620	eval-cstom-error:0.13680


Parameters: { "silent" } are not used.



[4]	dtrain-rmse:1.43887	dtrain-cstom-error:0.09307	eval-rmse:1.45035	eval-cstom-error:0.13280
[5]	dtrain-rmse:1.62458	dtrain-cstom-error:0.09027	eval-rmse:1.62992	eval-cstom-error:0.13160
[6]	dtrain-rmse:1.79119	dtrain-cstom-error:0.08507	eval-rmse:1.79517	eval-cstom-error:0.13080
[7]	dtrain-rmse:1.92439	dtrain-cstom-error:0.08133	eval-rmse:1.92730	eval-cstom-error:0.13720
[8]	dtrain-rmse:2.05722	dtrain-cstom-error:0.07693	eval-rmse:2.05313	eval-cstom-error:0.12880
[9]	dtrain-rmse:2.16512	dtrain-cstom-error:0.07427	eval-rmse:2.15662	eval-cstom-error:0.12320
[10]	dtrain-rmse:2.25427	dtrain-cstom-error:0.07227	eval-rmse:2.24850	eval-cstom-error:0.12160
[11]	dtrain-rmse:2.34389	dtrain-cstom-error:0.06680	eval-rmse:2.33289	eval-cstom-error:0.11640
[12]	dtrain-rmse:2.43563	dtrain-cstom-error:0.06373	eval-rmse:2.42562	eval-cstom-error:0.11920
[13]	dtrain-rmse:2.50021	dtrain-cstom-error:0.06013	eval-rmse:2.49314	eval-cstom-error:0.11720
[14]	dtrain-rmse:2.57911	dtrain-cstom-error:0.05693	eval

In [45]:
# 参考 通常の方法で学習を行う場合
params = {'silent': 1, 'random_state': 71, 'objective': 'binary:logistic'}
bst = xgb.train(params, dtrain, num_round, watchlist)

pred = bst.predict(dvalid)
logloss2 = log_loss(va_y, pred)
print(logloss2)

[0]	dtrain-logloss:0.41663	eval-logloss:0.43550
[1]	dtrain-logloss:0.37126	eval-logloss:0.39889
[2]	dtrain-logloss:0.33889	eval-logloss:0.37205
[3]	dtrain-logloss:0.31320	eval-logloss:0.35606


Parameters: { "silent" } are not used.



[4]	dtrain-logloss:0.29062	eval-logloss:0.33709
[5]	dtrain-logloss:0.27315	eval-logloss:0.32549
[6]	dtrain-logloss:0.25735	eval-logloss:0.31328
[7]	dtrain-logloss:0.24299	eval-logloss:0.30434
[8]	dtrain-logloss:0.22850	eval-logloss:0.29755
[9]	dtrain-logloss:0.21984	eval-logloss:0.29490
[10]	dtrain-logloss:0.21094	eval-logloss:0.28867
[11]	dtrain-logloss:0.20280	eval-logloss:0.28290
[12]	dtrain-logloss:0.19173	eval-logloss:0.27823
[13]	dtrain-logloss:0.18626	eval-logloss:0.27372
[14]	dtrain-logloss:0.17750	eval-logloss:0.27174
[15]	dtrain-logloss:0.17111	eval-logloss:0.26724
[16]	dtrain-logloss:0.16269	eval-logloss:0.26364
[17]	dtrain-logloss:0.15749	eval-logloss:0.26171
[18]	dtrain-logloss:0.15286	eval-logloss:0.25777
[19]	dtrain-logloss:0.14977	eval-logloss:0.25564
[20]	dtrain-logloss:0.14463	eval-logloss:0.25519
[21]	dtrain-logloss:0.13899	eval-logloss:0.25306
[22]	dtrain-logloss:0.13512	eval-logloss:0.25066
[23]	dtrain-logloss:0.13177	eval-logloss:0.24852
[24]	dtrain-logloss:0.1269

In [48]:
print(f"logloss\t\t",logloss)
print(f"logloss2\t", logloss2)

logloss		 0.21992204652445954
logloss2	 0.22572590332195305
