In [140]:
# ライブラリの読み込み
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import japanize_matplotlib
from ydata_profiling import ProfileReport # EDA用ライブラリ

# 前処理
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
# バリデーション
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
# ラベルエンコーディング
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# 評価指標
from sklearn.metrics import f1_score, log_loss
# モデリング
import lightgbm as lgb
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# パイプライン
from sklearn.pipeline import Pipeline
# パラメータチューニング
import optuna

%matplotlib inline

# Let pandas display up to 100 columns when showing a DataFrame
pd.set_option("display.max_columns", 100)

In [141]:
# Load the dataset
# Training data
df_train = pd.read_csv("./dataset/train.csv")
# Keep the id column in its own variable
train_id = df_train["id"]
print(df_train.shape)
df_train.head()

(1235, 29)


Unnamed: 0,id,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,peristalsis,abdominal_distention,nasogastric_tube,nasogastric_reflux,nasogastric_reflux_ph,rectal_exam_feces,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data,outcome
0,0,yes,adult,530001,38.1,132.0,24.0,cool,reduced,dark_cyanotic,more_3_sec,depressed,absent,slight,slight,less_1_liter,6.5,decreased,distend_small,57.0,8.5,serosanguious,3.4,yes,2209,0,0,no,died
1,1,yes,adult,533836,37.5,88.0,12.0,cool,normal,pale_cyanotic,more_3_sec,mild_pain,absent,moderate,none,more_1_liter,2.0,absent,distend_small,33.0,64.0,serosanguious,2.0,yes,2208,0,0,no,euthanized
2,2,yes,adult,529812,38.3,120.0,28.0,cool,reduced,pale_pink,less_3_sec,extreme_pain,hypomotile,moderate,slight,none,3.5,,distend_large,37.0,6.4,serosanguious,3.4,yes,5124,0,0,no,lived
3,3,yes,adult,5262541,37.1,72.0,30.0,cold,reduced,pale_pink,more_3_sec,mild_pain,hypomotile,moderate,slight,more_1_liter,2.0,decreased,distend_small,53.0,7.0,cloudy,3.9,yes,2208,0,0,yes,lived
4,4,no,adult,5299629,38.0,52.0,48.0,normal,normal,normal_pink,less_3_sec,alert,hypomotile,none,slight,less_1_liter,7.0,normal,normal,47.0,7.3,cloudy,2.6,no,0,0,0,yes,lived


In [142]:
# Test data
df_test = pd.read_csv("./dataset/test.csv")
# Keep the test ids; they become the "id" column of the submission files
test_id = df_test["id"]
print(df_test.shape)
df_test.head()

(824, 28)


Unnamed: 0,id,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,peristalsis,abdominal_distention,nasogastric_tube,nasogastric_reflux,nasogastric_reflux_ph,rectal_exam_feces,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,1235,no,adult,534053,38.6,40.0,20.0,normal,normal,normal_pink,less_3_sec,mild_pain,hypomotile,slight,none,none,7.0,normal,distend_small,42.0,7.5,clear,2.3,no,0,0,0,no
1,1236,yes,adult,528469,38.2,112.0,48.0,cool,reduced,bright_pink,more_3_sec,depressed,hypomotile,moderate,slight,none,3.5,decreased,distend_small,44.0,6.0,serosanguious,2.6,no,2208,0,0,yes
2,1237,yes,adult,528178,37.7,66.0,12.0,cool,normal,bright_red,less_3_sec,mild_pain,hypomotile,slight,slight,none,3.0,normal,distend_small,31.5,6.0,cloudy,1.6,yes,2205,0,0,yes
3,1238,no,adult,534784,37.1,88.0,20.0,cool,reduced,pale_cyanotic,less_3_sec,depressed,absent,severe,slight,more_1_liter,2.0,absent,distend_large,75.0,81.0,,1.0,yes,1400,0,0,no
4,1239,yes,adult,529840,38.3,50.0,12.0,,normal,bright_pink,less_3_sec,mild_pain,absent,slight,slight,none,3.0,decreased,distend_small,37.0,6.8,cloudy,2.6,yes,2208,0,0,yes


In [143]:
# Combine the data: stack the training rows on top of the test rows and
# renumber the index from 0 (ignore_index=True)
df_concat = pd.concat([df_train, df_test], ignore_index=True)
print(df_concat.shape)
df_concat.head()

(2059, 29)


Unnamed: 0,id,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,peristalsis,abdominal_distention,nasogastric_tube,nasogastric_reflux,nasogastric_reflux_ph,rectal_exam_feces,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data,outcome
0,0,yes,adult,530001,38.1,132.0,24.0,cool,reduced,dark_cyanotic,more_3_sec,depressed,absent,slight,slight,less_1_liter,6.5,decreased,distend_small,57.0,8.5,serosanguious,3.4,yes,2209,0,0,no,died
1,1,yes,adult,533836,37.5,88.0,12.0,cool,normal,pale_cyanotic,more_3_sec,mild_pain,absent,moderate,none,more_1_liter,2.0,absent,distend_small,33.0,64.0,serosanguious,2.0,yes,2208,0,0,no,euthanized
2,2,yes,adult,529812,38.3,120.0,28.0,cool,reduced,pale_pink,less_3_sec,extreme_pain,hypomotile,moderate,slight,none,3.5,,distend_large,37.0,6.4,serosanguious,3.4,yes,5124,0,0,no,lived
3,3,yes,adult,5262541,37.1,72.0,30.0,cold,reduced,pale_pink,more_3_sec,mild_pain,hypomotile,moderate,slight,more_1_liter,2.0,decreased,distend_small,53.0,7.0,cloudy,3.9,yes,2208,0,0,yes,lived
4,4,no,adult,5299629,38.0,52.0,48.0,normal,normal,normal_pink,less_3_sec,alert,hypomotile,none,slight,less_1_liter,7.0,normal,normal,47.0,7.3,cloudy,2.6,no,0,0,0,yes,lived


In [144]:
# Check missing values: number of nulls per column of the combined frame
df_concat.isnull().sum()

id                         0
surgery                    0
age                        0
hospital_number            0
rectal_temp                0
pulse                      0
respiratory_rate           0
temp_of_extremities       74
peripheral_pulse         107
mucous_membrane           34
capillary_refill_time     12
pain                      73
peristalsis               39
abdominal_distention      45
nasogastric_tube         144
nasogastric_reflux        35
nasogastric_reflux_ph      0
rectal_exam_feces        315
abdomen                  367
packed_cell_volume         0
total_protein              0
abdomo_appearance         79
abdomo_protein             0
surgical_lesion            0
lesion_1                   0
lesion_2                   0
lesion_3                   0
cp_data                    0
outcome                  824
dtype: int64

In [145]:
# Frequency of each category in the "age" column
df_concat["age"].value_counts()

age
adult    1942
young     117
Name: count, dtype: int64

In [146]:
# # サンプルデータを作成
# df = pd.DataFrame(
#     {"status": ["lived", "died", "euthanized", "lived", "died", "euthanized"]}
# )

# # マッピング辞書を作成
# status_map = {"lived": 0, "died": 1, "euthanized": 2}

# # マッピングを適用
# df["status_encoded"] = df["status"].map(status_map)

# # One-Hotエンコーディングを適用
# df_onehot = pd.get_dummies(df["status_encoded"], prefix="status")

# # 元のデータフレームとOne-Hotエンコードされたデータを結合
# df_final = pd.concat([df, df_onehot], axis=1)

# print(df_final)
# print(df_final["status_encoded"])

In [147]:
### Label-encode the target variable

# Show the original target values
display(df_train["outcome"].head())

# Create the label encoder instance
label_encoder = LabelEncoder()

# Label-encode the target.
# NOTE: this overwrites df_train["outcome"] in place, so the original string
# labels are no longer available from df_train after this cell has run.
df_train["outcome"] = label_encoder.fit_transform(df_train["outcome"])
display(df_train["outcome"].head())

0          died
1    euthanized
2         lived
3         lived
4         lived
Name: outcome, dtype: object

0    0
1    1
2    2
3    2
4    2
Name: outcome, dtype: int64

In [148]:
# Dataset used for modelling
# Join df_train["outcome"] with the ["surgery", "age", "hospital_number"]
# columns of df_concat. The concat is column-wise (axis=1), so rows of
# df_concat that have no matching row in df_train get no outcome value.
df_concat1 = pd.concat(
    [df_train[["outcome"]], df_concat[["surgery", "age", "hospital_number"]]], axis=1
)
# Alternative (disabled): features only, without the outcome column
# df_concat1 = df_concat[["surgery", "age", "hospital_number"]]
display(df_concat1.head())

# Alternative (disabled): label encoding of surgery and age
# for c in ["surgery", "age"]:
#     le = LabelEncoder()
#     df_concat1.loc[:, c] = le.fit_transform(df_concat1[c])

# One-hot (dummy) encoding of surgery and age
df_concat1 = pd.get_dummies(df_concat1, columns=["surgery", "age"])

display(df_concat1.head())

Unnamed: 0,outcome,surgery,age,hospital_number
0,0.0,yes,adult,530001
1,1.0,yes,adult,533836
2,2.0,yes,adult,529812
3,2.0,yes,adult,5262541
4,2.0,no,adult,5299629


Unnamed: 0,outcome,hospital_number,surgery_no,surgery_yes,age_adult,age_young
0,0.0,530001,False,True,True,False
1,1.0,533836,False,True,True,False
2,2.0,529812,False,True,True,False
3,2.0,5262541,False,True,True,False
4,2.0,5299629,True,False,True,False


In [149]:
### Define the features and the target
# Split the combined frame back into training and test rows.
# The training rows come first, so the split point is the number of training
# ids captured when train.csv was loaded (instead of a hard-coded row count).
n_train = len(train_id)
df_train = df_concat1[:n_train]
# drop() returns a new frame, so the slice of df_concat1 is not mutated
df_test = df_concat1[n_train:].drop(columns=["outcome"])

# Training features and target
x_train = df_train.drop(["outcome"], axis=1)
y_train = df_train["outcome"]
print("訓練データ", x_train.shape, y_train.shape)

# Test features (the test rows have no target)
x_test = df_test
print("テストデータ", x_test.shape)

訓練データ (1235, 5) (1235,)
テストデータ (824, 5)


In [150]:
# Preview the training features
x_train.head()

Unnamed: 0,hospital_number,surgery_no,surgery_yes,age_adult,age_young
0,530001,False,True,True,False
1,533836,False,True,True,False
2,529812,False,True,True,False
3,5262541,False,True,True,False
4,5299629,True,False,True,False


In [151]:
# Preview the training target
y_train.head()

0    0.0
1    1.0
2    2.0
3    2.0
4    2.0
Name: outcome, dtype: float64

In [152]:
# Preview the test features
x_test.head()

Unnamed: 0,hospital_number,surgery_no,surgery_yes,age_adult,age_young
1235,534053,True,False,True,False
1236,528469,False,True,True,False
1237,528178,False,True,True,False
1238,534784,True,False,True,False
1239,529840,False,True,True,False


In [153]:
### Custom evaluation metric for LightGBM (micro-averaged F1 score)


# Takes the true labels and the per-class predictions
def micro_f1_metric(y_true, y_pred):
    """Score predictions with the micro-averaged F1.

    Args:
        y_true: True class labels.
        y_pred: Two-dimensional per-class scores, one row per sample; the
            column index of the row maximum is used as the predicted label.

    Returns:
        tuple: ``("micro_f1", score, True)`` -- metric name, metric value and
        a flag saying that larger values are better.
    """
    # Hard prediction = class with the highest score in each row
    predicted_labels = np.argmax(y_pred, axis=1)
    return (
        "micro_f1",
        f1_score(y_true, predicted_labels, average="micro"),
        True,
    )

In [154]:
### LightGBM: wrap model building and prediction in a function

# Parameter settings
params = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "num_class": 3,
    "metric": "multi_logloss",  # also use log loss as an evaluation metric
    "learning_rate": 0.1,
    "num_leaves": 16,
    "n_estimators": 100000,  # large upper bound; lgb_train_eval fits with an early-stopping callback
    "random_state": 123,
    "importance_type": "gain",
    "verbose": -1,
}


# Model training and evaluation wrapped in a function
def lgb_train_eval(input_x, input_y, input_test, params, n_splits=5):
    """Train LightGBM with stratified K-fold CV and predict the test set.

    For each fold an ``lgb.LGBMClassifier(**params)`` is fitted on the
    training part of the fold. Both the training and the validation part are
    passed as ``eval_set``, ``micro_f1_metric`` is passed as ``eval_metric``,
    and an early-stopping callback with 100 rounds is used.
    NOTE(review): the early-stopping callback sees the metric configured in
    ``params`` as well as ``micro_f1`` -- confirm which one is meant to drive
    the stop.

    Args:
        input_x: Training features (rows are selected with ``.iloc``).
        input_y: Training labels (rows are selected with ``.iloc``).
        input_test: Test features, passed to ``predict_proba`` in every fold.
        params: Keyword arguments for ``lgb.LGBMClassifier``. Must contain the
            key ``"num_class"``, which sizes the prediction arrays.
        n_splits: Number of stratified folds.

    Returns:
        tuple ``(metrics, imp, train_pred, test_pred)`` where

        * ``metrics`` is an array with one row ``[fold, train micro-F1,
          validation micro-F1]`` per fold,
        * ``imp`` is a DataFrame with columns ``columns``, ``imp_mean`` and
          ``imp_std`` (feature importance aggregated over the folds),
        * ``train_pred`` holds the out-of-fold class probabilities of the
          training rows plus ``pred_class`` / ``pred_class_submit``,
        * ``test_pred`` holds the fold-averaged class probabilities of the
          test rows plus ``pred_class`` / ``pred_class_submit``.
    """
    n_classes = params["num_class"]
    proba_columns = [f"クラス_{i}" for i in range(n_classes)]
    # Dictionary used to turn the integer labels back into outcome names
    label_map = {0: "died", 1: "euthanized", 2: "lived"}

    fold_scores = []
    # Collect one importance frame per fold and concatenate once at the end
    # instead of growing a DataFrame inside the loop.
    fold_importances = []
    oof_proba = np.zeros((input_x.shape[0], n_classes))
    test_proba = np.zeros((input_test.shape[0], n_classes, n_splits))

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123)

    for nfold, (tr_idx, val_idx) in enumerate(skf.split(input_x, input_y)):
        x_tr, x_val = input_x.iloc[tr_idx], input_x.iloc[val_idx]
        y_tr, y_val = input_y.iloc[tr_idx], input_y.iloc[val_idx]

        model = lgb.LGBMClassifier(**params)
        model.fit(
            x_tr,
            y_tr,
            eval_set=[(x_tr, y_tr), (x_val, y_val)],
            eval_metric=micro_f1_metric,
            callbacks=[
                lgb.early_stopping(stopping_rounds=100, verbose=True),
                lgb.log_evaluation(100),
            ],
        )

        # predict_proba returns one probability per class for every row
        y_tr_pred = model.predict_proba(x_tr)
        y_val_pred = model.predict_proba(x_val)

        # Validation rows receive their out-of-fold prediction; the test
        # prediction of this fold goes into its own slice for later averaging.
        oof_proba[val_idx] = y_val_pred
        test_proba[:, :, nfold] = model.predict_proba(input_test)

        # argmax over the class axis gives the predicted label
        tr_score = f1_score(y_tr, np.argmax(y_tr_pred, axis=1), average="micro")
        val_score = f1_score(y_val, np.argmax(y_val_pred, axis=1), average="micro")
        fold_scores.append([nfold, tr_score, val_score])

        fold_importances.append(
            pd.DataFrame(
                {
                    "columns": input_x.columns,
                    "importance": model.feature_importances_,
                    "nfold": nfold,
                }
            )
        )

    metrics = np.array(fold_scores)

    train_pred = pd.DataFrame(oof_proba, columns=proba_columns)
    train_pred["pred_class"] = np.argmax(oof_proba, axis=1)
    train_pred["pred_class_submit"] = train_pred["pred_class"].map(label_map)

    # Average the per-fold test probabilities before taking the argmax
    test_mean_proba = np.mean(test_proba, axis=2)
    test_pred = pd.DataFrame(test_mean_proba, columns=proba_columns)
    test_pred["pred_class"] = np.argmax(test_mean_proba, axis=1)
    test_pred["pred_class_submit"] = test_pred["pred_class"].map(label_map)

    imp = (
        pd.concat(fold_importances)
        .groupby("columns")["importance"]
        .agg(["mean", "std"])
        .reset_index()
    )
    imp.columns = ["columns", "imp_mean", "imp_std"]

    return metrics, imp, train_pred, test_pred

In [155]:
# Run the 5-fold LightGBM training/evaluation and keep all four results
lgb_metrics, lgb_imp, lgb_tr_pred, lgb_test_pred = lgb_train_eval(
    x_train, y_train, x_test, params, 5
)

Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.680161	training's micro_f1: 0.689271	valid_1's multi_logloss: 0.819553	valid_1's micro_f1: 0.639676
Early stopping, best iteration is:
[50]	training's multi_logloss: 0.724473	training's micro_f1: 0.667004	valid_1's multi_logloss: 0.792376	valid_1's micro_f1: 0.643725
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.65357	training's micro_f1: 0.706478	valid_1's multi_logloss: 0.940447	valid_1's micro_f1: 0.562753
Early stopping, best iteration is:
[22]	training's multi_logloss: 0.757793	training's micro_f1: 0.654858	valid_1's multi_logloss: 0.881002	valid_1's micro_f1: 0.562753
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.64141	training's micro_f1: 0.715587	valid_1's multi_logloss: 0.997426	valid_1's micro_f1: 0.566802
Early stopping, best iteration is:
[18]	training's multi_logloss: 0.760309	tra

In [156]:
# Report the LightGBM cross-validation results.
# Each row of lgb_metrics is [fold, train micro-F1, validation micro-F1].
print("===== LightGBM =====")
print("【マイクロF1スコア】")
print(lgb_metrics)
# Mean and standard deviation over the folds (column 1 = train)
print(
    "train：{:.2f} ± {:.2f}".format(lgb_metrics[:, 1].mean(), lgb_metrics[:, 1].std())
)
# Column 2 = validation
print("val：{:.2f} ± {:.2f}".format(lgb_metrics[:, 2].mean(), lgb_metrics[:, 2].std()))

# Feature importance, highest mean first
print("\n【特徴量重要度】")
print(lgb_imp.sort_values(by="imp_mean", ascending=False))

===== LightGBM =====
【マイクロF1スコア】
[[0.         0.66700405 0.6437247 ]
 [1.         0.6548583  0.56275304]
 [2.         0.67105263 0.52226721]
 [3.         0.65688259 0.65182186]
 [4.         0.65991903 0.56275304]]
train：0.66 ± 0.01
val：0.59 ± 0.05

【特徴量重要度】
           columns     imp_mean     imp_std
2  hospital_number  2653.178101  144.629957
3       surgery_no   262.962435   30.858824
0        age_adult   109.727314   17.933012
4      surgery_yes    19.791843   10.287798
1        age_young     0.000000    0.000000


In [157]:
# Preview the LightGBM test predictions
lgb_test_pred.head()

Unnamed: 0,クラス_0,クラス_1,クラス_2,pred_class,pred_class_submit
0,0.048092,0.666813,0.285095,1,euthanized
1,0.550938,0.056037,0.393025,0,died
2,0.254438,0.067933,0.677629,2,lived
3,0.046758,0.530119,0.423124,1,euthanized
4,0.417986,0.082099,0.499915,2,lived


In [158]:
# Create the submission file.
# Both columns are pandas Series, so the DataFrame constructor aligns them on
# their index labels rather than by position.
submit = pd.DataFrame(
    data={"id": test_id, "outcome": lgb_test_pred["pred_class_submit"]}
)
display(submit.head(), submit.shape)
# NOTE(review): presumably the ./submit directory must already exist -- confirm
submit.to_csv("./submit/baseline_lgb.csv", index=False)

Unnamed: 0,id,outcome
0,1235,euthanized
1,1236,died
2,1237,lived
3,1238,euthanized
4,1239,lived


(824, 2)

In [168]:
### Likewise, wrap random-forest model building and prediction in a function
# Model training and evaluation wrapped in a function
def rf_train_eval(input_x, input_y, input_test, num_class, n_splits=5):
    """Train a random forest with stratified K-fold CV and predict the test set.

    For each fold a ``RandomForestClassifier`` with fixed hyper-parameters is
    fitted on the training part of the fold and used to predict class
    probabilities for the training part, the validation part and the test set.

    Args:
        input_x: Training features (rows are selected with ``.iloc``).
        input_y: Training labels (rows are selected with ``.iloc``).
        input_test: Test features, passed to ``predict_proba`` in every fold.
        num_class: Number of classes. Sizes the prediction arrays and the
            probability columns of the returned frames.
        n_splits: Number of stratified folds.

    Returns:
        tuple ``(metrics, imp, train_pred, test_pred)`` where

        * ``metrics`` is an array with one row ``[fold, train micro-F1,
          validation micro-F1]`` per fold,
        * ``imp`` is a DataFrame with columns ``columns``, ``imp_mean`` and
          ``imp_std`` (feature importance aggregated over the folds),
        * ``train_pred`` holds the out-of-fold class probabilities of the
          training rows plus ``pred_class`` / ``pred_class_submit``,
        * ``test_pred`` holds the fold-averaged class probabilities of the
          test rows plus ``pred_class`` / ``pred_class_submit``.
    """
    # Column names come from the num_class argument, not from any global
    # parameter dictionary, so the function does not depend on other cells.
    proba_columns = [f"クラス_{i}" for i in range(num_class)]
    # Dictionary used to turn the integer labels back into outcome names
    label_map = {0: "died", 1: "euthanized", 2: "lived"}

    fold_scores = []
    # Collect one importance frame per fold and concatenate once at the end
    # instead of growing a DataFrame inside the loop.
    fold_importances = []
    oof_proba = np.zeros((input_x.shape[0], num_class))
    test_proba = np.zeros((input_test.shape[0], num_class, n_splits))

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123)

    for nfold, (tr_idx, val_idx) in enumerate(skf.split(input_x, input_y)):
        x_tr, x_val = input_x.iloc[tr_idx], input_x.iloc[val_idx]
        y_tr, y_val = input_y.iloc[tr_idx], input_y.iloc[val_idx]

        model = RandomForestClassifier(
            n_estimators=100,  # default value, keeps the run fast
            # n_estimators=1000,  # more trees for more stable results
            max_depth=10,  # moderate depth to limit over-fitting
            min_samples_split=5,  # minimum samples required to split a node
            min_samples_leaf=2,  # minimum samples in a leaf
            max_features="sqrt",  # sqrt of the feature count per split
            class_weight="balanced",  # compensate for class imbalance
            bootstrap=True,  # bagging
            random_state=42,
            n_jobs=-1,  # run in parallel
        )
        model.fit(x_tr, y_tr)

        # predict_proba returns one probability per class for every row
        y_tr_pred = model.predict_proba(x_tr)
        y_val_pred = model.predict_proba(x_val)

        # Validation rows receive their out-of-fold prediction; the test
        # prediction of this fold goes into its own slice for later averaging.
        oof_proba[val_idx] = y_val_pred
        test_proba[:, :, nfold] = model.predict_proba(input_test)

        # argmax over the class axis gives the predicted label
        tr_score = f1_score(y_tr, np.argmax(y_tr_pred, axis=1), average="micro")
        val_score = f1_score(y_val, np.argmax(y_val_pred, axis=1), average="micro")
        fold_scores.append([nfold, tr_score, val_score])

        fold_importances.append(
            pd.DataFrame(
                {
                    "columns": input_x.columns,
                    "importance": model.feature_importances_,
                    "nfold": nfold,
                }
            )
        )

    metrics = np.array(fold_scores)

    train_pred = pd.DataFrame(oof_proba, columns=proba_columns)
    train_pred["pred_class"] = np.argmax(oof_proba, axis=1)
    train_pred["pred_class_submit"] = train_pred["pred_class"].map(label_map)

    # Average the per-fold test probabilities before taking the argmax
    test_mean_proba = np.mean(test_proba, axis=2)
    test_pred = pd.DataFrame(test_mean_proba, columns=proba_columns)
    test_pred["pred_class"] = np.argmax(test_mean_proba, axis=1)
    test_pred["pred_class_submit"] = test_pred["pred_class"].map(label_map)

    imp = (
        pd.concat(fold_importances)
        .groupby("columns")["importance"]
        .agg(["mean", "std"])
        .reset_index()
    )
    imp.columns = ["columns", "imp_mean", "imp_std"]

    return metrics, imp, train_pred, test_pred

In [169]:
# Run the 5-fold random-forest training/evaluation for 3 classes
rf_metrics, rf_imp, rf_tr_pred, rf_test_pred = rf_train_eval(
    x_train, y_train, x_test, 3, 5
)

In [170]:
# Report the random-forest cross-validation results.
# Each row of rf_metrics is [fold, train micro-F1, validation micro-F1].
print("===== Random Forest =====")
print("【マイクロF1スコア】")
print(rf_metrics)
# Mean and standard deviation over the folds (column 1 = train, column 2 = validation)
print("train：{:.2f} ± {:.2f}".format(rf_metrics[:, 1].mean(), rf_metrics[:, 1].std()))
print("val：{:.2f} ± {:.2f}".format(rf_metrics[:, 2].mean(), rf_metrics[:, 2].std()))

# Feature importance, highest mean first
print("\n【特徴量重要度】")
print(rf_imp.sort_values(by="imp_mean", ascending=False))

===== Random Forest =====
【マイクロF1スコア】
[[0.         0.6902834  0.62348178]
 [1.         0.69838057 0.55465587]
 [2.         0.70445344 0.53441296]
 [3.         0.68927126 0.59109312]
 [4.         0.70647773 0.57894737]]
train：0.70 ± 0.01
val：0.58 ± 0.03

【特徴量重要度】
           columns  imp_mean   imp_std
2  hospital_number  0.899240  0.002203
1        age_young  0.026797  0.001722
4      surgery_yes  0.026166  0.001164
0        age_adult  0.024801  0.002120
3       surgery_no  0.022996  0.002593


In [171]:
# Preview the random-forest test predictions
rf_test_pred.head()

Unnamed: 0,クラス_0,クラス_1,クラス_2,pred_class,pred_class_submit
0,0.003432,0.60831,0.388258,1,euthanized
1,0.762354,0.042951,0.194695,0,died
2,0.32676,0.034743,0.638497,2,lived
3,0.001425,0.649791,0.348783,1,euthanized
4,0.539082,0.093027,0.367891,0,died


In [172]:
# Create the submission file.
# Both columns are pandas Series, so the DataFrame constructor aligns them on
# their index labels rather than by position.
submit = pd.DataFrame(
    data={"id": test_id, "outcome": rf_test_pred["pred_class_submit"]}
)
display(submit.head(), submit.shape)
# NOTE(review): presumably the ./submit directory must already exist -- confirm
submit.to_csv("./submit/baseline_rf.csv", index=False)

Unnamed: 0,id,outcome
0,1235,euthanized
1,1236,died
2,1237,lived
3,1238,euthanized
4,1239,died


(824, 2)