In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd /content/drive/My Drive/Colab Notebooks/Home Credit Default Risk

/content/drive/My Drive/Colab Notebooks/Home Credit Default Risk


In [4]:
import numpy as np
import pandas as pd
import re
import pickle
import gc

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [5]:
application_train = pd.read_csv('application_train.csv')
application_test = pd.read_csv('application_test.csv')

In [6]:
print(application_train.shape)
application_train.head()

(307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# prompt: メモリ削減のための関数

def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [8]:
application_train = reduce_mem_usage(application_train)
application_test = reduce_mem_usage(application_test)

Memory usage of dataframe is 286.23 MB
Memory usage after optimization is: 59.54 MB
Decreased by 79.2%
Memory usage of dataframe is 45.00 MB
Memory usage after optimization is: 9.40 MB
Decreased by 79.1%


In [9]:
x_train = application_train.drop(columns=['TARGET', 'SK_ID_CURR'])
y_train = application_train['TARGET']
id_train = application_train['SK_ID_CURR']

In [10]:
for col in x_train.columns:
    if x_train[col].dtype == 'object':
        x_train[col] = x_train[col].astype('category')

In [11]:
print("mean: {:.4f}".format(y_train.mean()))
y_train.value_counts()

mean: 0.0807


Unnamed: 0_level_0,count
TARGET,Unnamed: 1_level_1
0,282686
1,24825


In [12]:
cv = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=123).split(x_train, y_train))

print("index(train):", cv[0][0])

print("index(valid):", cv[0][1])

index(train): [     0      1      3 ... 307508 307509 307510]
index(valid): [     2     11     22 ... 307488 307495 307497]


In [13]:
cv = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=123).split(x_train, y_train))

nfold = 0
idx_tr, idx_va = cv[nfold][0], cv[nfold][1]

x_tr, y_tr, id_tr = x_train.loc[idx_tr, :], y_train[idx_tr], id_train[idx_tr]
x_va, y_va, id_va = x_train.loc[idx_va, :], y_train.iloc[idx_va], id_train.loc[idx_va]
print(x_tr.shape, y_tr.shape, id_tr.shape)
print(x_va.shape, y_va.shape, id_va.shape)

(246008, 120) (246008,) (246008,)
(61503, 120) (61503,) (61503,)


In [14]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.05,
    'num_leaves': 32,
    'n_estitators': 100000,
    "importance_type": "gain",
    'verbosity': 100  # Change 'verbose' to 'verbosity'
}

model = lgb.LGBMClassifier(**params)
model.fit(x_tr,
          y_tr,
          eval_set=[(x_tr, y_tr), (x_va, y_va)],
          callbacks=[lgb.early_stopping(stopping_rounds=100)],
          )  # Remove the verbose argument from fit()

with open("model_lgb_fold0.pickle", "wb") as f:
  pickle.dump(model, f, protocol=4)

[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.881107
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.382379
[LightGBM] [Debug] init for col-wise cost 0.155778 seconds, init for row-wise cost 0.571481 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.264473 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 11298
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482
[LightGBM] [Debug] Trained a tree with leaves = 32 and depth = 7
Training until validation scores don't improve for 100 rounds
[LightGB

In [15]:
y_tr_pred = model.predict_proba(x_tr)[:, 1]
metric_tr = roc_auc_score(y_tr, y_tr_pred)
print("train:", metric_tr)

y_va_pred = model.predict_proba(x_va)[:, 1]
metric_va = roc_auc_score(y_va, y_va_pred)
print("valid:", metric_va)

metrics = []

metrics.append([metric_tr, metric_va])

print("[auc] tr:{:.4f}, va:{:.4f}".format(metric_tr, metric_va))

train: 0.7820721869151412
valid: 0.7557975908119366
[auc] tr:0.7821, va:0.7558


In [16]:
train_oof = np.zeros(len(x_train))

train_oof[idx_va] = y_va_pred

In [17]:
imp_fold = pd.DataFrame({'col': x_train.columns, 'imp': model.feature_importances_, 'nfold': nfold})

display(imp_fold.sort_values('imp', ascending=False).head(10))

imp = pd.DataFrame()

imp = pd.concat([imp, imp_fold])

imp = pd.concat([imp, imp_fold])

Unnamed: 0,col,imp,nfold
41,EXT_SOURCE_3,63524.643824,0
40,EXT_SOURCE_2,50765.783345,0
39,EXT_SOURCE_1,17820.682039,0
38,ORGANIZATION_TYPE,14231.274089,0
6,AMT_CREDIT,6279.08711,0
8,AMT_GOODS_PRICE,5640.688594,0
15,DAYS_BIRTH,5060.815312,0
7,AMT_ANNUITY,4687.708477,0
26,OCCUPATION_TYPE,3839.209475,0
16,DAYS_EMPLOYED,3690.827181,0


In [18]:
metrics = np.array(metrics)
print(metrics)

print("[cv] tr:{:.4f}+-{:.4f}, va:{:.4f}+-{:.4f}".format(
    metrics[:, 0].mean(), metrics[:, 0].std(),
    metrics[:, 1].mean(), metrics[:, 1].std() # Change index 2 to 1
))

print("[oof] {:.4f}".format(
    roc_auc_score(y_train, train_oof)
))

[[0.78207219 0.75579759]]
[cv] tr:0.7821+-0.0000, va:0.7558+-0.0000
[oof] 0.5102


In [19]:
train_oof = pd.concat([
    id_train,
    pd.DataFrame({"true": y_train, "pred": train_oof}),
], axis=1)
train_oof.head()

Unnamed: 0,SK_ID_CURR,true,pred
0,100002,1,0.0
1,100003,0,0.0
2,100004,0,0.035208
3,100006,0,0.0
4,100007,0,0.0


In [20]:
imp = imp.groupby('col')['imp'].agg(['mean', 'std']).reset_index(drop=False)
imp.columns = ['col', 'imp', 'imp_std']
imp = imp.sort_values('imp', ascending=False).reset_index(drop=True)
imp.head(10)

Unnamed: 0,col,imp,imp_std
0,EXT_SOURCE_3,63524.643824,0.0
1,EXT_SOURCE_2,50765.783345,0.0
2,EXT_SOURCE_1,17820.682039,0.0
3,ORGANIZATION_TYPE,14231.274089,0.0
4,AMT_CREDIT,6279.08711,0.0
5,AMT_GOODS_PRICE,5640.688594,0.0
6,DAYS_BIRTH,5060.815312,0.0
7,AMT_ANNUITY,4687.708477,0.0
8,OCCUPATION_TYPE,3839.209475,0.0
9,DAYS_EMPLOYED,3690.827181,0.0


In [21]:
# prompt: 学習関数の定義

def train_model(x_train, y_train, id_train, params, nfold=0):
    cv = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=123).split(x_train, y_train))
    idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
    x_tr, y_tr, id_tr = x_train.loc[idx_tr, :], y_train[idx_tr], id_train[idx_tr]
    x_va, y_va, id_va = x_train.loc[idx_va, :], y_train.iloc[idx_va], id_train.loc[idx_va]

    model = lgb.LGBMClassifier(**params)
    model.fit(x_tr, y_tr, eval_set=[(x_tr, y_tr), (x_va, y_va)], callbacks=[lgb.early_stopping(stopping_rounds=100)])

    y_tr_pred = model.predict_proba(x_tr)[:, 1]
    metric_tr = roc_auc_score(y_tr, y_tr_pred)
    y_va_pred = model.predict_proba(x_va)[:, 1]
    metric_va = roc_auc_score(y_va, y_va_pred)

    print("[auc] tr:{:.4f}, va:{:.4f}".format(metric_tr, metric_va))

    train_oof = np.zeros(len(x_train))
    train_oof[idx_va] = y_va_pred

    imp_fold = pd.DataFrame({'col': x_train.columns, 'imp': model.feature_importances_, 'nfold': nfold})

    return model, train_oof, imp_fold, metric_tr, metric_va

In [22]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.05,
    'num_leaves': 32,
    'n_estitators': 100000,
    "importance_type": "gain",
    'verbosity': 100  # Change 'verbose' to 'verbosity'
}

# Assuming you want to train the model for 5 folds (0 to 4):
train_oof_list = []
imp_list = []
metrics_list = []

for nfold in [0, 1, 2, 3, 4]:
    model, train_oof, imp_fold, metric_tr, metric_va = train_model(
        x_train, y_train, id_train, params, nfold=nfold
    )
    train_oof_list.append(train_oof)  # Store train_oof for each fold
    imp_list.append(imp_fold)        # Store imp_fold for each fold
    metrics_list.append([metric_tr, metric_va]) # Store metrics for each fold

# Combine results from all folds:
train_oof = np.mean(train_oof_list, axis=0)  # Average predictions across folds
imp = pd.concat(imp_list)                     # Concatenate importance dataframes
metrics = np.array(metrics_list)               # Combine metrics into an array

[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.881107
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.382379
[LightGBM] [Debug] init for col-wise cost 0.123773 seconds, init for row-wise cost 0.382100 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.194881 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 11298
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482
[LightGBM] [Debug] Trained a tree with leaves = 32 and depth = 7
Training until validation scores don't improve for 100 rounds
[LightGB

In [23]:
imp.sort_values("imp", ascending=False)[:10]

Unnamed: 0,col,imp,nfold
41,EXT_SOURCE_3,63718.149445,3
41,EXT_SOURCE_3,63524.643824,0
41,EXT_SOURCE_3,62306.357008,4
41,EXT_SOURCE_3,60845.638296,2
41,EXT_SOURCE_3,60233.747977,1
40,EXT_SOURCE_2,53726.046979,1
40,EXT_SOURCE_2,53474.56208,4
40,EXT_SOURCE_2,51547.22876,2
40,EXT_SOURCE_2,51521.429276,3
40,EXT_SOURCE_2,50765.783345,0


In [24]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            # Convert to categorical with order
            df[col] = df[col].astype('category').cat.as_ordered()

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [25]:
with open("model_lgb_fold0.pickle", "rb") as f:
  model = pickle.load(f)

In [26]:
# application_test から x_test と id_test を作成する
x_test = application_test.drop(columns=['SK_ID_CURR']) # 'TARGET' は application_test に含まれていません
id_test = application_test['SK_ID_CURR']

# カテゴリカル変数をcategory型に変換
for col in x_test.columns:
    if x_test[col].dtype == 'object':
        x_test[col] = x_test[col].astype('category')

# ... (予測部分を含む、残りのコード) ...

train_test_fold = model.predict_proba(x_test)[:,1]

test_pred = np.zeros((len(x_test), 5))
test_pred[:, 0] = train_test_fold



In [27]:
test_pred_mean = test_pred.mean(axis=1)

df_test_pred = pd.concat([
    id_test,
    pd.DataFrame({"pred": test_pred_mean}),
], axis=1)
df_test_pred.head()

Unnamed: 0,SK_ID_CURR,pred
0,100001,0.008379
1,100005,0.021002
2,100013,0.005116
3,100028,0.008861
4,100038,0.028775


In [28]:
def predict_lgb(input_x,
                input_id,
                list_nfold=[0,1,2,3,4],
                ):
  pred = np.zeros((len(input_x), len(list_nfold)))
  for nfold in list_nfold:
    print("-"*20, nfold, "-"*20)
    fname_lgb = "model_lgb_fold{}.pickle".format(nfold)
    with open(fname_lgb, "rb") as f:
      model = pickle.load(f)
    pred[:, nfold] = model.predict_proba(input_x)[:,1]

  pred = pd.concat([
      input_id,
      pd.DataFrame({"pred": pred.mean(axis=1)}),
  ], axis=1)

  print("Done.")

  return pred

In [29]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.05,
    'num_leaves': 32,
    'n_estitators': 100000,
    "importance_type": "gain",
    'verbosity': 100  # 'verbose' を 'verbosity' に変更
}

# 5fold（0〜4）でモデルをトレーニングすると仮定します。
train_oof_list = []
imp_list = []
metrics_list = []

for nfold in [0, 1, 2, 3, 4]:
    model, train_oof, imp_fold, metric_tr, metric_va = train_model(
        x_train, y_train, id_train, params, nfold=nfold
    )
    train_oof_list.append(train_oof)  # 各foldのtrain_oofを保存
    imp_list.append(imp_fold)        # 各foldのimp_foldを保存
    metrics_list.append([metric_tr, metric_va]) # 各foldのメトリックを保存

    # 現在のfoldのモデルを保存
    with open(f"model_lgb_fold{nfold}.pickle", "wb") as f:
        pickle.dump(model, f, protocol=4)

# すべてのfoldの結果を結合：
train_oof = np.mean(train_oof_list, axis=0)  # fold間の予測の平均
imp = pd.concat(imp_list)                     # importanceデータフレームを連結
metrics = np.array(metrics_list)               # メトリックを配列に結合

[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.881107
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.382379
[LightGBM] [Debug] init for col-wise cost 0.154243 seconds, init for row-wise cost 0.607579 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.264041 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 11298
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482
[LightGBM] [Debug] Trained a tree with leaves = 32 and depth = 7
Training until validation scores don't improve for 100 rounds
[LightGB

In [30]:
test_pred = predict_lgb(x_test,
                        id_test,
                        list_nfold=[0,1,2,3,4],
                        )

-------------------- 0 --------------------
-------------------- 1 --------------------
-------------------- 2 --------------------
-------------------- 3 --------------------
-------------------- 4 --------------------
Done.


In [31]:
df_submit = test_pred.rename(columns={"pred": "TARGET"})
print(df_submit.shape)
display(df_submit.head())

df_submit.to_csv("HCDRsubmit_lgb.csv", index=False)

(48744, 2)


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.047446
1,100005,0.111351
2,100013,0.027422
3,100028,0.043999
4,100038,0.140377


In [32]:
display(application_train["DAYS_EMPLOYED"].value_counts())
print("正の値の割合: {:.4f}".format((application_train["DAYS_EMPLOYED"]>0).mean()))
print("正の値の個数: {:.4f}".format((application_train["DAYS_EMPLOYED"]>0).sum()))

Unnamed: 0_level_0,count
DAYS_EMPLOYED,Unnamed: 1_level_1
365243,55374
-200,156
-224,152
-230,151
-199,151
...,...
-13961,1
-11827,1
-10176,1
-9459,1


正の値の割合: 0.1801
正の値の個数: 55374.0000


In [33]:
application_train["DAYS_EMPLOYED"] = application_train["DAYS_EMPLOYED"].replace(365243, np.nan)

In [34]:
application_train['INCOME_div_PERSON'] = application_train['AMT_INCOME_TOTAL'] / application_train['CNT_FAM_MEMBERS']

application_train['INCOME_div_EMPLOYED'] = application_train['AMT_INCOME_TOTAL'] / application_train['DAYS_EMPLOYED']

application_train["EXT_SOURCE_mean"] = application_train[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].mean(axis=1)
application_train["EXT_SOURCE_max"] = application_train[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].mean(axis=1)
application_train["EXT_SOURCE_min"] = application_train[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].mean(axis=1)
application_train["EXT_SOURCE_std"] = application_train[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].mean(axis=1)
application_train["EXT_SOURCE_count"] = application_train[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].mean(axis=1)

application_train['INCOME_div_BIRTH'] = application_train['AMT_INCOME_TOTAL'] / application_train['DAYS_BIRTH']

application_train['ANNUITY_div_INCOME'] = application_train['AMT_ANNUITY'] / application_train['AMT_INCOME_TOTAL']

application_train['ANNUITY_div_BIRTH'] = application_train['AMT_ANNUITY'] / application_train['DAYS_BIRTH']

application_train['CREDIT_div_CREDIT'] = application_train['AMT_ANNUITY'] / application_train['AMT_CREDIT']

In [35]:
x_train = application_train.drop(columns=['TARGET', 'SK_ID_CURR'])
y_train = application_train['TARGET']
id_train = application_train['SK_ID_CURR']

for col in x_train.columns:
    if x_train[col].dtype == 'object':
        x_train[col] = x_train[col].astype('category')

In [38]:

# train_lgb を train_model に置き換えます。
train_oof_list = []
imp_list = []
metrics_list = []

for nfold in [0, 1, 2, 3, 4]:
    model, train_oof, imp_fold, metric_tr, metric_va = train_model(
        x_train, y_train, id_train, params, nfold=nfold
    )
    train_oof_list.append(train_oof)  # 各foldのtrain_oofを保存
    imp_list.append(imp_fold)        # 各foldのimp_foldを保存
    metrics_list.append([metric_tr, metric_va]) # 各foldのメトリックを保存


# すべてのfoldの結果を結合：
train_oof = np.mean(train_oof_list, axis=0)  # fold間の予測の平均
imp = pd.concat(imp_list)                     # importanceデータフレームを連結
metrics = np.array(metrics_list)               # メトリックを配列に結合

[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.881107
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.349259
[LightGBM] [Debug] init for col-wise cost 0.109883 seconds, init for row-wise cost 0.423251 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.186544 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 14117
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 127
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482
[LightGBM] [Debug] Trained a tree with leaves = 32 and depth = 8
Training until validation scores don't improve for 100 rounds
[LightGB

In [39]:
imp.sort_values("imp", ascending=False)[:10]

Unnamed: 0,col,imp,nfold
122,EXT_SOURCE_mean,116972.242828,4
122,EXT_SOURCE_mean,114968.093173,1
122,EXT_SOURCE_mean,114890.324317,3
122,EXT_SOURCE_mean,114534.749065,0
122,EXT_SOURCE_mean,113116.959892,2
130,CREDIT_div_CREDIT,18899.820207,0
130,CREDIT_div_CREDIT,17560.800329,1
130,CREDIT_div_CREDIT,16808.490193,3
130,CREDIT_div_CREDIT,16252.086881,2
130,CREDIT_div_CREDIT,16211.878327,4


In [40]:
application_test["DAYS_EMPLOYED"] = application_test["DAYS_EMPLOYED"].replace(365243, np.nan)

application_test['INCOME_div_PERSON'] = application_test['AMT_INCOME_TOTAL'] / application_test['CNT_FAM_MEMBERS']
application_test['INCOME_div_EMPLOYED'] = application_test['AMT_INCOME_TOTAL'] / application_test['DAYS_EMPLOYED']
application_test["EXT_SOURCE_mean"] = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].mean(axis=1)
application_test["EXT_SOURCE_max"] = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].mean(axis=1)
application_test["EXT_SOURCE_min"] = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].mean(axis=1)
application_test["EXT_SOURCE_std"] = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].std(axis=1)
application_test["EXT_SOURCE_count"] = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].notnull().sum(axis=1)
application_test['INCOME_div_BIRTH'] = application_test['AMT_INCOME_TOTAL'] / application_test['DAYS_BIRTH']
application_test['ANNUITY_div_INCOME'] = application_test['AMT_ANNUITY'] / application_test['AMT_INCOME_TOTAL']
application_test['ANNUITY_div_CREDIT'] = application_test['AMT_ANNUITY'] / application_test['AMT_CREDIT']

x_test = application_test.drop(columns=['SK_ID_CURR'])
id_test = application_test['SK_ID_CURR']

for col in x_test.columns:
    if x_test[col].dtype == 'object':
        x_test[col] = x_test[col].astype('category')

In [42]:
application_test["DAYS_EMPLOYED"] = application_test["DAYS_EMPLOYED"].replace(365243, np.nan)

application_test['INCOME_div_PERSON'] = application_test['AMT_INCOME_TOTAL'] / application_test['CNT_FAM_MEMBERS']
application_test['INCOME_div_EMPLOYED'] = application_test['AMT_INCOME_TOTAL'] / application_test['DAYS_EMPLOYED']

# 以下の行を修正: mean, max, min を適切に計算
application_test["EXT_SOURCE_mean"] = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].mean(axis=1)
application_test["EXT_SOURCE_max"] = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].max(axis=1)
application_test["EXT_SOURCE_min"] = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].min(axis=1)

application_test["EXT_SOURCE_std"] = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].std(axis=1)
application_test["EXT_SOURCE_count"] = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].notnull().sum(axis=1)
application_test['INCOME_div_BIRTH'] = application_test['AMT_INCOME_TOTAL'] / application_test['DAYS_BIRTH']
application_test['ANNUITY_div_INCOME'] = application_test['AMT_ANNUITY'] / application_test['AMT_INCOME_TOTAL']

# ANNUTIY_div_BIRTH は application_test に存在しないので削除
# application_test['ANNUITY_div_BIRTH'] = application_test['AMT_ANNUITY'] / application_test['DAYS_BIRTH']

application_test['ANNUITY_div_CREDIT'] = application_test['AMT_ANNUITY'] / application_test['AMT_CREDIT']

x_test = application_test.drop(columns=['SK_ID_CURR'])
id_test = application_test['SK_ID_CURR']

for col in x_test.columns:
    if x_test[col].dtype == 'object':
        x_test[col] = x_test[col].astype('category')

In [43]:
df_submit = test_pred.rename(columns={"pred": "TARGET"})
print(df_submit.shape)
display(df_submit.head())

df_submit.to_csv("HCDRsubmit_lgb2.csv", index=False)

(48744, 2)


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.047446
1,100005,0.111351
2,100013,0.027422
3,100028,0.043999
4,100038,0.140377


In [44]:
pos = pd.read_csv("POS_CASH_balance.csv")
pos = reduce_mem_usage(pos)
print(pos.shape)
pos.head()

Memory usage of dataframe is 610.43 MB
Memory usage after optimization is: 171.69 MB
Decreased by 71.9%
(10001358, 8)


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,1803195,182943,-31,48.0,45.0,Active,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0
3,1903291,269225,-35,48.0,42.0,Active,0,0
4,2341044,334279,-35,36.0,35.0,Active,0,0


In [47]:
pos_ohe = pd.get_dummies(pos, columns=["NAME_CONTRACT_STATUS"], dummy_na=True)
col_ohe = sorted(list(set(pos_ohe.columns) - set(pos.columns)))
print(len(col_ohe))
col_ohe

10


['NAME_CONTRACT_STATUS_Active',
 'NAME_CONTRACT_STATUS_Amortized debt',
 'NAME_CONTRACT_STATUS_Approved',
 'NAME_CONTRACT_STATUS_Canceled',
 'NAME_CONTRACT_STATUS_Completed',
 'NAME_CONTRACT_STATUS_Demand',
 'NAME_CONTRACT_STATUS_Returned to the store',
 'NAME_CONTRACT_STATUS_Signed',
 'NAME_CONTRACT_STATUS_XNA',
 'NAME_CONTRACT_STATUS_nan']

In [54]:
pos_ohe_agg = pos_ohe.groupby("SK_ID_CURR").agg(
    {
        "MONTHS_BALANCE": ["mean", "std", "min", "max"],
        "CNT_INSTALMENT": ["mean", "std", "min", "max"],
        "CNT_INSTALMENT_FUTURE": ["mean", "std", "min", "max"],
        "SK_DPD": ["mean", "std", "min", "max"],
        "SK_DPD_DEF": ["mean", "std", "min", "max"],
        "NAME_CONTRACT_STATUS_Active": ["mean"],
        "NAME_CONTRACT_STATUS_Amortized debt": ["mean"],
        "NAME_CONTRACT_STATUS_Approved": ["mean"],
        "NAME_CONTRACT_STATUS_Canceled": ["mean"],
        "NAME_CONTRACT_STATUS_Completed": ["mean"],
        "NAME_CONTRACT_STATUS_Demand": ["mean"],
        "NAME_CONTRACT_STATUS_Returned to the store": ["mean"],
        "NAME_CONTRACT_STATUS_Signed": ["mean"],
        "NAME_CONTRACT_STATUS_XNA": ["mean"],
        "NAME_CONTRACT_STATUS_nan": ["mean"],
        "SK_ID_PREV": ["count", "nunique"],
    }
)

pos_ohe_agg.columns = ["POS_" + "_".join(x) for x in pos_ohe_agg.columns]
pos_ohe_agg = pos_ohe_agg.reset_index(drop=False)

print(pos_ohe_agg.shape)
pos_ohe_agg.head()

(337252, 33)


Unnamed: 0,SK_ID_CURR,POS_MONTHS_BALANCE_mean,POS_MONTHS_BALANCE_std,POS_MONTHS_BALANCE_min,POS_MONTHS_BALANCE_max,POS_CNT_INSTALMENT_mean,POS_CNT_INSTALMENT_std,POS_CNT_INSTALMENT_min,POS_CNT_INSTALMENT_max,POS_CNT_INSTALMENT_FUTURE_mean,...,POS_NAME_CONTRACT_STATUS_Approved_mean,POS_NAME_CONTRACT_STATUS_Canceled_mean,POS_NAME_CONTRACT_STATUS_Completed_mean,POS_NAME_CONTRACT_STATUS_Demand_mean,POS_NAME_CONTRACT_STATUS_Returned to the store_mean,POS_NAME_CONTRACT_STATUS_Signed_mean,POS_NAME_CONTRACT_STATUS_XNA_mean,POS_NAME_CONTRACT_STATUS_nan_mean,POS_SK_ID_PREV_count,POS_SK_ID_PREV_nunique
0,100001,-72.555556,20.863312,-96,-53,4.0,0.0,4.0,4.0,1.444444,...,0.0,0.0,0.222222,0.0,0.0,0.0,0.0,0.0,9,2
1,100002,-10.0,5.627314,-19,-1,24.0,0.0,24.0,24.0,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19,1
2,100003,-43.785714,24.640162,-77,-18,10.107142,2.806597,6.0,12.0,5.785714,...,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,28,3
3,100004,-25.5,1.290994,-27,-24,3.75,0.5,3.0,4.0,2.25,...,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,4,1
4,100005,-20.0,3.316625,-25,-15,11.7,0.948683,9.0,12.0,7.2,...,0.0,0.0,0.090909,0.0,0.0,0.090909,0.0,0.0,11,1


In [55]:
df_train = pd.merge(application_train, pos_ohe_agg, on="SK_ID_CURR", how="left")
print(df_train.shape)
df_train.head()

(307511, 165)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,POS_NAME_CONTRACT_STATUS_Approved_mean,POS_NAME_CONTRACT_STATUS_Canceled_mean,POS_NAME_CONTRACT_STATUS_Completed_mean,POS_NAME_CONTRACT_STATUS_Demand_mean,POS_NAME_CONTRACT_STATUS_Returned to the store_mean,POS_NAME_CONTRACT_STATUS_Signed_mean,POS_NAME_CONTRACT_STATUS_XNA_mean,POS_NAME_CONTRACT_STATUS_nan_mean,POS_SK_ID_PREV_count,POS_SK_ID_PREV_nunique
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,28.0,3.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,4.0,1.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0.0,0.0,0.095238,0.0,0.047619,0.0,0.0,0.0,21.0,3.0
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0.0,0.0,0.045455,0.0,0.0,0.015152,0.0,0.0,66.0,5.0


In [56]:
x_train = df_train.drop(columns=['TARGET', 'SK_ID_CURR'])
y_train = df_train['TARGET']
id_train = df_train['SK_ID_CURR']

for col in x_train.columns:
    if x_train[col].dtype == 'object':
        x_train[col] = x_train[col].astype('category')

In [59]:
train_oof_list = []
imp_list = []
metrics_list = []

for nfold in [0, 1, 2, 3, 4]:
    model, train_oof, imp_fold, metric_tr, metric_va = train_model(
        x_train, y_train, id_train, params, nfold=nfold
    )
    train_oof_list.append(train_oof)  # 各foldのtrain_oofを保存
    imp_list.append(imp_fold)        # 各foldのimp_foldを保存
    metrics_list.append([metric_tr, metric_va]) # 各foldのメトリックを保存

# すべてのfoldの結果を結合：
train_oof = np.mean(train_oof_list, axis=0)  # fold間の予測の平均
imp = pd.concat(imp_list)                     # importanceデータフレームを連結
metrics = np.array(metrics_list)               # メトリックを配列に結合

[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.877691
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.374000
[LightGBM] [Debug] init for col-wise cost 0.149031 seconds, init for row-wise cost 0.510999 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.239413 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 18779
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 159
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482
[LightGBM] [Debug] Trained a tree with leaves = 32 and depth = 8
Training until validation scores don't improve for 100 rounds
[LightGB

In [61]:
df_submit = test_pred.rename(columns={"pred": "TARGET"})
print(df_submit.shape)
display(df_submit.head())

df_submit.to_csv("HCDRsubmit_FeatureEngineering3.csv", index=None)

(48744, 2)


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.047446
1,100005,0.111351
2,100013,0.027422
3,100028,0.043999
4,100038,0.140377
