In [88]:
!pip install optuna
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif
import re
import pickle
import gc
import optuna
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

# scikit-learn
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# LightGBM
import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")



In [89]:
# Run this cell as well when working in Google Colaboratory: it mounts
# Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# After %cd, specify the directory in which this notebook is stored.
%cd "/content/drive/MyDrive"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive


In [90]:
# Load the competition data.
# Point INPUT_DIR at the directory that holds train.csv and the other data files.
INPUT_DIR = "/content/drive/My Drive/GCI/cmp2/"

# Read the three CSV files in one pass; the unpacking order matches the file order.
train, test, sample_sub = (
    pd.read_csv(INPUT_DIR + csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [91]:
# Separate the target column from the explanatory variables.
y_train = train["TARGET"]
X_train = train.drop(columns="TARGET")

# Keep the record IDs in their own single-column frame.
id_train = train[["SK_ID_CURR"]]

# Cast every object-dtype column to pandas' category dtype in one astype call.
object_cols = [name for name in X_train.columns if X_train[name].dtype == "O"]
X_train = X_train.astype({name: "category" for name in object_cols})

In [92]:
# Build the list of (train indices, validation indices) pairs from a
# shuffled, stratified 5-fold split.
fold_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
cv = list(fold_splitter.split(X_train, y_train))

# Inspect the indices of fold 0: training part first, then validation part.
fold0_train_idx, fold0_valid_idx = cv[0][0], cv[0][1]
print("index(train):", fold0_train_idx)
print("index(valid):", fold0_valid_idx)

index(train): [     0      1      2 ... 171196 171197 171198]
index(valid): [     7     18     19 ... 171199 171200 171201]


In [93]:
# Names of the categorical columns (object or category dtype).
categorical_columns = list(X_train.select_dtypes(include=["object", "category"]).columns)

# Provisional list of continuous-valued columns: every column that is not categorical.
categorical_lookup = frozenset(categorical_columns)
numerical_columns = [name for name in X_train.columns if name not in categorical_lookup]

# Preprocessing definition: numeric columns pass through untouched, categorical
# columns are ordinal-encoded.
passthrough_numeric = ("num", "passthrough", numerical_columns)
ordinal_categorical = ("cat", OrdinalEncoder(), categorical_columns)
preprocessor = ColumnTransformer(transformers=[passthrough_numeric, ordinal_categorical])

In [94]:
# Work on the first fold only.
nfold = 0
idx_tr, idx_va = cv[nfold]

# Split into training and validation parts.
# StratifiedKFold.split yields positional row numbers, so rows are selected with
# .iloc; label-based selection only agrees with it while the index is the
# default 0..n-1 range.
x_tr, y_tr, id_tr = X_train.iloc[idx_tr], y_train.iloc[idx_tr], id_train.iloc[idx_tr]
x_va, y_va, id_va = X_train.iloc[idx_va], y_train.iloc[idx_va], id_train.iloc[idx_va]
print(x_tr.shape, y_tr.shape, id_tr.shape)
print(x_va.shape, y_va.shape, id_va.shape)
# Number of missing values per column in the training part.
print(x_tr.isnull().sum())

(136961, 50) (136961,) (136961, 1)
(34241, 50) (34241,) (34241, 1)
SK_ID_CURR                         0
NAME_CONTRACT_TYPE                 0
CODE_GENDER                        0
FLAG_OWN_CAR                   27396
FLAG_OWN_REALTY                27396
CNT_CHILDREN                       0
AMT_INCOME_TOTAL                   0
AMT_CREDIT                         0
AMT_ANNUITY                        4
AMT_GOODS_PRICE                  130
NAME_TYPE_SUITE                  604
NAME_INCOME_TYPE                   0
NAME_EDUCATION_TYPE                0
NAME_FAMILY_STATUS                 0
NAME_HOUSING_TYPE                  0
REGION_POPULATION_RELATIVE         0
DAYS_BIRTH                         0
DAYS_EMPLOYED                      0
DAYS_REGISTRATION                  0
DAYS_ID_PUBLISH                    0
OWN_CAR_AGE                    90528
FLAG_MOBIL                         0
FLAG_EMP_PHONE                     0
FLAG_WORK_PHONE                    0
FLAG_CONT_MOBILE                   0
FLAG_PHO

In [95]:
# Train a single LightGBM classifier on fold 0.
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "learning_rate": 0.05,
    "num_leaves": 32,
    "n_estimators": 1000,
    "max_depth": 10,
    "min_child_samples": 100,
    "random_state": 123,
    "importance_type": "gain",
}

# Early-stopping callback with a patience of 100 rounds on the validation score.
early_stopping_callback = lgb.early_stopping(stopping_rounds=100, verbose=True)

# Fit on the training part and monitor the validation part.
fit_options = {
    "eval_set": [(x_va, y_va)],
    "callbacks": [early_stopping_callback],
}
model = lgb.LGBMClassifier(**params)
model.fit(x_tr, y_tr, **fit_options)

# Save the fitted model to disk.
fold0_model_path = "model_lgb_fold0.pickle"
with open(fold0_model_path, "wb") as model_file:
    pickle.dump(model, model_file, protocol=4)

[LightGBM] [Info] Number of positive: 11057, number of negative: 125904
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.094425 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3642
[LightGBM] [Info] Number of data points in the train set: 136961, number of used features: 49
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080731 -> initscore=-2.432456
[LightGBM] [Info] Start training from score -2.432456
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[159]	valid_0's auc: 0.75036


In [96]:
'''モデル評価'''
# Model evaluation: predicted positive-class probabilities for the training
# part and the validation part, in that order.
y_tr_pred, y_va_pred = (model.predict_proba(part)[:, 1] for part in (x_tr, x_va))

# ROC-AUC on each part.
metric_tr = roc_auc_score(y_tr, y_tr_pred)
metric_va = roc_auc_score(y_va, y_va_pred)

# Start the score table (done for the first fold only) with this fold's row.
metrics = [[nfold, metric_tr, metric_va]]

# Show the result.
print("[auc] tr:{:.4f}, va:{:.4f}".format(metric_tr, metric_va))

[auc] tr:0.8113, va:0.7504


In [97]:
'''説明変数の重要度取得（学習したモデルから説明変数の重要度を取得）'''
# Feature importance: read the importance of each explanatory variable from
# the fitted model and tag the rows with the fold number.
imp_fold = pd.DataFrame(
    {
        "col": X_train.columns,
        "imp": model.feature_importances_,
        "nfold": nfold,
    }
)
# Show the table from most to least important
# (slice the sorted frame with [:10] to see only the top 10).
ranked_imp = imp_fold.sort_values("imp", ascending=False)
display(ranked_imp)
# Create the frame that collects importances for the 5 folds and put this
# fold's rows into it.
imp = pd.concat([pd.DataFrame(), imp_fold])

Unnamed: 0,col,imp,nfold
39,EXT_SOURCE_2,34901.259294,0
40,EXT_SOURCE_3,28725.852855,0
37,ORGANIZATION_TYPE,14246.540099,0
38,EXT_SOURCE_1,7310.62465,0
7,AMT_CREDIT,5505.036798,0
27,OCCUPATION_TYPE,5172.902002,0
16,DAYS_BIRTH,4914.762548,0
9,AMT_GOODS_PRICE,4392.840714,0
8,AMT_ANNUITY,4227.517249,0
17,DAYS_EMPLOYED,3317.708879,0


In [98]:
# Importance threshold: a feature whose importance is at or below this value is
# listed as low-importance; the listed columns are dropped by later cells.
# Keeping the value in one named constant stops the filter and the printed
# message from drifting apart.
# NOTE(review): the output recorded under this cell reads "imp <= 100", so it
# came from a run with a threshold of 100, not 0 — re-run the notebook or set
# the value to 100 so that code and recorded outputs agree.
IMP_THRESHOLD = 0

# Unique names of the columns at or below the threshold.
low_importance_cols = imp.loc[imp["imp"] <= IMP_THRESHOLD, "col"].unique()

# Show the names of the low-importance features.
print("Low importance features (imp <= {}):".format(IMP_THRESHOLD), low_importance_cols)

Low importance features (imp <= 100): ['FLAG_OWN_CAR' 'CNT_CHILDREN' 'FLAG_MOBIL' 'FLAG_EMP_PHONE'
 'FLAG_CONT_MOBILE' 'FLAG_PHONE' 'FLAG_EMAIL' 'CNT_FAM_MEMBERS'
 'REGION_RATING_CLIENT' 'REG_REGION_NOT_LIVE_REGION'
 'REG_REGION_NOT_WORK_REGION' 'LIVE_REGION_NOT_WORK_REGION'
 'REG_CITY_NOT_WORK_CITY' 'LIVE_CITY_NOT_WORK_CITY'
 'AMT_REQ_CREDIT_BUREAU_HOUR']


In [101]:
# Hyperparameters shared by every fold.
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "learning_rate": 0.05,
    "num_leaves": 32,
    "n_estimators": 1000,
    "max_depth": 10,
    "min_child_samples": 10,
    "random_state": 123,
    "importance_type": "gain",
}

list_nfold = [0, 1, 2, 3, 4]
n_splits = 5
# Out-of-fold predictions, filled in one validation part at a time.
oof_pred = np.zeros(len(X_train))
metrics = []
# Per-fold importance frames; concatenated once after the loop instead of
# growing a DataFrame inside it.
imp_per_fold = []

# cross-validation
cv = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123).split(X_train, y_train))
for nfold in list_nfold:
    print("-"*20, nfold, "-"*20)

    # make dataset
    # The split yields positional row numbers, hence .iloc. The non-in-place
    # drop returns new frames, so no slice of X_train is modified in place.
    idx_tr, idx_va = cv[nfold]
    x_tr = X_train.iloc[idx_tr].drop(columns=low_importance_cols)
    x_va = X_train.iloc[idx_va].drop(columns=low_importance_cols)
    y_tr, y_va = y_train.iloc[idx_tr], y_train.iloc[idx_va]
    id_tr, id_va = id_train.iloc[idx_tr], id_train.iloc[idx_va]
    print(x_tr.shape, x_va.shape)

    # train
    # A fresh early-stopping callback per fold: the callback object tracks the
    # best score seen so far, and depending on the LightGBM version that state
    # may not be reset when the same object is passed to another fit().
    early_stopping_callback = lgb.early_stopping(stopping_rounds=100, verbose=True)
    model = lgb.LGBMClassifier(**params)
    model.fit(x_tr, y_tr,
              eval_set=[(x_va, y_va)],
              callbacks=[early_stopping_callback]
             )
    fname_lgb = "model_lgb_fold{}.pickle".format(nfold)
    with open(fname_lgb, "wb") as f:
        pickle.dump(model, f, protocol=4)

    # evaluate
    y_tr_pred = model.predict_proba(x_tr)[:, 1]
    y_va_pred = model.predict_proba(x_va)[:, 1]
    metric_tr = roc_auc_score(y_tr, y_tr_pred)
    metric_va = roc_auc_score(y_va, y_va_pred)
    metrics.append([nfold, metric_tr, metric_va])
    print("[auc] tr:{:.4f}, va:{:.4f}".format(metric_tr, metric_va))

    # oof
    oof_pred[idx_va] = y_va_pred

    # imp
    imp_per_fold.append(
        pd.DataFrame({"col": x_tr.columns, "imp": model.feature_importances_, "nfold": nfold})
    )

print("-"*20, "result", "-"*20)
# metric: one row per fold -> [fold, train AUC, validation AUC]
metrics = np.array(metrics)
print(metrics)
print("[cv] tr:{:.4f}+-{:.4f}, va:{:.4f}+-{:.4f}".format(
    metrics[:,1].mean(), metrics[:,1].std(),
    metrics[:,2].mean(), metrics[:,2].std(),
))
print("[oof] {:.4f}".format(
    roc_auc_score(y_train, oof_pred)
))

# oof: attach the predictions to the IDs row by row (positional, no index alignment)
train_oof = id_train.assign(pred=oof_pred)

# importance: mean and standard deviation over the folds for each feature
imp = pd.concat(imp_per_fold)
imp = imp.groupby("col")["imp"].agg(["mean", "std"]).reset_index(drop=False)
imp.columns = ["col", "imp", "imp_std"]


-------------------- 0 --------------------
(136961, 35) (34241, 35)
[LightGBM] [Info] Number of positive: 11057, number of negative: 125904
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.053428 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3588
[LightGBM] [Info] Number of data points in the train set: 136961, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080731 -> initscore=-2.432456
[LightGBM] [Info] Start training from score -2.432456
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[154]	valid_0's auc: 0.74934
[auc] tr:0.8170, va:0.7493
-------------------- 1 --------------------
(136961, 35) (34241, 35)
[LightGBM] [Info] Number of positive: 11056, number of negative: 125905
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of tes

In [102]:

# Build the test-set inputs.
id_test = test[["SK_ID_CURR"]]
# Drop the low-importance features with a non-in-place drop: `test` itself stays
# intact, so this cell can be re-run without a KeyError for columns that are
# already gone.
x_test = test.drop(columns=low_importance_cols)
# Convert object-dtype (categorical) columns to the category dtype.
object_cols = [name for name in x_test.columns if x_test[name].dtype == "O"]
x_test = x_test.astype({name: "category" for name in object_cols})

In [103]:
# One column of test predictions per fold model. Columns are addressed by
# position in list_nfold, so the fold ids do not have to be exactly 0..k-1.
fold_preds = np.zeros((len(x_test), len(list_nfold)))
for col_pos, nfold in enumerate(list_nfold):
    print("-"*20, nfold, "-"*20)
    fname_lgb = "model_lgb_fold{}.pickle".format(nfold)
    # NOTE: pickle.load can execute arbitrary code; only load model files that
    # were written by this notebook.
    with open(fname_lgb, "rb") as f:
        model = pickle.load(f)
    fold_preds[:, col_pos] = model.predict_proba(x_test)[:, 1]

# Average the fold predictions and attach them to the IDs row by row.
pred = id_test.assign(pred=fold_preds.mean(axis=1))

# Build the submission frame.
df_submit = pred.rename(columns={"pred": "TARGET"})
print(df_submit.shape)
display(df_submit.head())

# Write the submission file without the index column.
df_submit.to_csv("submission_baseline.csv", index=False)

-------------------- 0 --------------------
-------------------- 1 --------------------
-------------------- 2 --------------------
-------------------- 3 --------------------
-------------------- 4 --------------------
(61500, 2)


Unnamed: 0,SK_ID_CURR,TARGET
0,171202,0.03314
1,171203,0.137486
2,171204,0.140614
3,171205,0.104902
4,171206,0.22122


In [None]:
# Colab only: download the submission file written by the previous cell
# through the google.colab files helper.
from google.colab import files
files.download('submission_baseline.csv')