<a href="https://colab.research.google.com/github/Taise-Kojima/HomeCredit_cmp/blob/main/Taise_Kojima_HomeCredit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [148]:
# Run this cell as well when working in Google Colaboratory.
from google.colab import drive
drive.mount('/content/drive')
# After %cd, specify the directory in which this notebook is stored.
%cd "/content/drive/MyDrive"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive


In [149]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgbm
from sklearn.utils import resample

In [150]:
# Load the data.
# Set INPUT_DIR to the directory that contains train.csv and the other data files.
INPUT_DIR = "/content/drive/My Drive/GCI/cmp2/"

train = pd.read_csv(INPUT_DIR + "train.csv")
test = pd.read_csv(INPUT_DIR + "test.csv")
sample_submission = pd.read_csv(INPUT_DIR + "sample_submission.csv")

# Stack train and test row-wise so that encoding and feature engineering are
# applied to both at once. Row labels are not reset, so they repeat across parts.
df_all = pd.concat([train, test], axis=0)


In [151]:
def downsampling(train, test, ratio=7, random_state=42):
    """Downsample the majority class of ``train`` and stack the result on ``test``.

    Rows with ``TARGET == 0`` are randomly reduced (without replacement) to at
    most ``ratio`` times the number of ``TARGET == 1`` rows; every
    ``TARGET == 1`` row is kept. The reduced training set is shuffled,
    re-indexed from 0 and concatenated on top of ``test``.

    Args:
        train: DataFrame with a ``TARGET`` column holding 0/1 labels.
        test: DataFrame appended unchanged below the resampled training rows.
        ratio: Desired number of class-0 rows per class-1 row (default 7).
        random_state: Seed used for both the sampling and the shuffle.

    Returns:
        The concatenation of the downsampled, shuffled ``train`` and ``test``.
    """
    # Split the training data by class.
    class_0 = train[train['TARGET'] == 0]
    class_1 = train[train['TARGET'] == 1]

    # Sampling without replacement cannot return more rows than exist, so the
    # request is capped at the size of class 0 instead of raising ValueError.
    n_samples_class_0 = min(len(class_0), int(len(class_1) * ratio))

    if n_samples_class_0 > 0:
        class_0_downsampled = resample(class_0,
                                       replace=False,  # no duplicated rows
                                       n_samples=n_samples_class_0,
                                       random_state=random_state)
    else:
        # Nothing to sample: keep an empty frame with the same columns.
        class_0_downsampled = class_0.iloc[:0]

    # Recombine both classes, then shuffle so the classes are interleaved.
    train_downsampled = pd.concat([class_0_downsampled, class_1])
    train = train_downsampled.sample(frac=1, random_state=random_state).reset_index(drop=True)

    df_all = pd.concat([train, test], axis=0)
    return df_all

In [152]:
def one_hot_encoding(df):
    """Return ``df`` with its categorical columns expanded into dummy columns.

    Delegates to ``pd.get_dummies`` with ``drop_first=True``, so the first
    level of every encoded column is omitted from the result.
    """
    return pd.get_dummies(df, drop_first=True)

In [153]:
from sklearn.preprocessing import LabelEncoder
def label_encoding(df):
    """Return a copy of ``df`` whose object-dtype columns are integer-encoded.

    Every column with dtype ``object`` is treated as categorical: its missing
    values are replaced by the empty string and the column is then transformed
    by a freshly fitted ``LabelEncoder``. All other columns are copied as-is,
    and the frame passed in is not modified.
    """
    encoded = df.copy()
    for name in encoded.columns:
        is_categorical = encoded[name].dtype == 'object'
        if not is_categorical:
            continue
        # LabelEncoder is applied after NaN has been mapped to ''.
        filled = encoded[name].fillna('')
        encoded[name] = LabelEncoder().fit_transform(filled)
    return encoded

In [154]:
def add_feature(df):
    """Add engineered features to ``df`` in place and return the same object.

    Besides adding new columns, the value 365243 in ``DAYS_EMPLOYED`` is
    replaced by NaN before any feature that uses that column is computed.

    Args:
        df: DataFrame containing the columns read below (``EXT_SOURCE_1..3``,
            ``AMT_*``, ``DAYS_BIRTH``, ``DAYS_EMPLOYED``, ``OWN_CAR_AGE``,
            ``CNT_FAM_MEMBERS``, ``CNT_CHILDREN``, ``REGION_RATING_CLIENT``,
            ``REGION_RATING_CLIENT_W_CITY``).

    Returns:
        ``df`` itself, with the new columns appended in the order they are
        created here.
    """
    # Snapshot of the three external scores; these columns are never modified
    # below, so the row-wise statistics can all be taken from this one selection.
    ext = df[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]]

    # Means and absolute differences of the external scores. Plain arithmetic is
    # used on purpose: a missing score makes the result NaN.
    df['EXT_123_mean'] = (df['EXT_SOURCE_1'] + df['EXT_SOURCE_2'] + df['EXT_SOURCE_3']) / 3
    df['EXT_23_mean'] = (df['EXT_SOURCE_2'] + df['EXT_SOURCE_3']) / 2
    df['EXT_12_mean'] = (df['EXT_SOURCE_1'] + df['EXT_SOURCE_2']) / 2
    df['EXT_13_mean'] = (df['EXT_SOURCE_1'] + df['EXT_SOURCE_3']) / 2
    df['EXT_23_sabun'] = abs(df['EXT_SOURCE_2'] - df['EXT_SOURCE_3'])
    df['EXT_12_sabun'] = abs(df['EXT_SOURCE_1'] - df['EXT_SOURCE_2'])
    df['EXT_13_sabun'] = abs(df['EXT_SOURCE_1'] - df['EXT_SOURCE_3'])

    # Ratios between annuity, credit, goods price and income.
    df['CREDIT_ANNUITY'] = df['AMT_ANNUITY'] / df['AMT_CREDIT'] * 100
    df['CREDIT_GOODS_PRICE'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']
    df['INCOME_TOTAL_ANNUITY'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['INCOME_TOTAL_CREDIT'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']

    # NOTE(review): DAYS_BIRTH is negative in the displayed data, so this is
    # -(age in years) - car age; confirm that this sign is intended.
    df['DAYS_BIRTH_365_OWN_CAR_AGE'] = (df['DAYS_BIRTH'] / 365) - df['OWN_CAR_AGE']

    # Treat 365243 as missing. Assign the result back instead of calling
    # ``replace(..., inplace=True)`` on the selected column: that chained form
    # does not update ``df`` when pandas Copy-on-Write is active.
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)

    # Created here (not with the other statistics) to keep the column order.
    df["EXT_SOURCE_median"] = ext.median(axis=1)

    # Difference between days employed and days since birth.
    df['DAYS_EMPLOYED_REC'] = df['DAYS_EMPLOYED'] - df['DAYS_BIRTH']

    # Income per family member.
    df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']

    # Share of children in the family.
    df['CHILDREM_RAITON'] = df['CNT_CHILDREN'] / df['CNT_FAM_MEMBERS']

    # Row-wise statistics of the external scores.
    df["EXT_SOURCE_max"] = ext.max(axis=1)
    df["EXT_SOURCE_min"] = ext.min(axis=1)
    df["EXT_SOURCE_std"] = ext.std(axis=1)
    df["EXT_SOURCE_count"] = ext.notnull().sum(axis=1)  # number of available scores
    df["EXT_SOURCE_mode"] = ext.mode(axis=1)[0]  # first mode of each row
    df["EXT_SOURCE_range"] = df["EXT_SOURCE_max"] - df["EXT_SOURCE_min"]
    df["EXT_SOURCE_var"] = ext.var(axis=1)
    df["EXT_SOURCE_skew"] = ext.skew(axis=1)
    df["EXT_SOURCE_kurtosis"] = ext.kurt(axis=1)

    # Down-payment features.
    df['DOWN_PAYMENT'] = df['AMT_GOODS_PRICE'] - df['AMT_CREDIT']
    df['DOWN_PAYMENT_RATE'] = df['DOWN_PAYMENT'] / df['AMT_CREDIT'] * 100
    # 100 when DOWN_PAYMENT_RATE is -30 or lower, otherwise 0.
    df['DOWN_PAYMENT_RATE_ALART'] = (df['DOWN_PAYMENT_RATE'] <= -30).astype(int) * 100

    # Age at which repayment ends. Floor division is applied to the (negative)
    # day count before the sign flip, so a fractional age is rounded up.
    df_client_age = df['DAYS_BIRTH'] // 365 * (-1)
    df_loan_years = df['AMT_CREDIT'] / df['AMT_ANNUITY']
    df['FINISH_AGE'] = df_client_age + df_loan_years

    # Difference between the two region ratings (with and without city).
    df['CITY_EVALUATION'] = df['REGION_RATING_CLIENT_W_CITY'] - df['REGION_RATING_CLIENT']

    # 1 when the annuity-to-credit percentage exceeds 4.65, otherwise 0.
    df['High_Repayment'] = (df['CREDIT_ANNUITY'] > 4.65).astype(int)

    return df


In [155]:
def drop(df):
    """Remove the rejected columns from ``df`` in place and return ``df``.

    The list holds the columns that lowered accuracy when they were tried one
    at a time. Every listed column must be present in ``df``; a missing one
    makes ``DataFrame.drop`` raise ``KeyError``.
    """
    rejected_columns = [
        # Columns of the input data.
        'FLAG_MOBIL',
        'FLAG_CONT_MOBILE',
        'REG_REGION_NOT_LIVE_REGION',
        'LIVE_REGION_NOT_WORK_REGION',
        # Engineered columns.
        'DAYS_EMPLOYED_REC',
        'INCOME_PER_PERSON',
        'CHILDREM_RAITON',
        'EXT_SOURCE_max',
        'EXT_SOURCE_min',
        'EXT_SOURCE_std',
        'EXT_SOURCE_count',
        'EXT_SOURCE_mode',
        'EXT_SOURCE_range',
        'EXT_SOURCE_var',
        'EXT_SOURCE_skew',
        'EXT_SOURCE_kurtosis',
        'DOWN_PAYMENT',
        'DOWN_PAYMENT_RATE',
        'DOWN_PAYMENT_RATE_ALART',
        'FINISH_AGE',
        'CITY_EVALUATION',
        'High_Repayment',
    ]
    df.drop(columns=rejected_columns, inplace=True)
    return df

In [156]:
# To use downsampling:
#df_all = downsampling(train,test) # not adopted

# One-hot encoding
df_encoded = one_hot_encoding(df_all)

# Label encoding
#df_encoded = label_encoding(df_all) # not adopted

# add_feature and drop both modify the frame they receive and return it, so the
# names df_encoded, added_features_df and all_features_df refer to one object.
added_features_df = add_feature(df_encoded)
all_features_df = drop(added_features_df)

In [157]:
# Sanity check: no preprocessing step may add or remove rows.
assert len(df_all) == len(df_encoded)
assert len(df_all) == len(added_features_df)
assert len(df_all) == len(all_features_df)

In [158]:
# Show the final feature table (train and test rows together).
display(all_features_df)

Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,EXT_13_mean,EXT_23_sabun,EXT_12_sabun,EXT_13_sabun,CREDIT_ANNUITY,CREDIT_GOODS_PRICE,INCOME_TOTAL_ANNUITY,INCOME_TOTAL_CREDIT,DAYS_BIRTH_365_OWN_CAR_AGE,EXT_SOURCE_median
0,0,0.0,0,112500.0,755190.0,36328.5,675000.0,0.010032,-9233,-878.0,...,,,,,4.810511,1.118800,0.322920,0.148969,,0.372591
1,1,0.0,0,225000.0,585000.0,16893.0,585000.0,0.008019,-20148,,...,,0.103598,,,2.887692,1.000000,0.075080,0.384615,,0.501366
2,2,0.0,0,54000.0,334152.0,18256.5,270000.0,0.004960,-18496,-523.0,...,,,,,5.463532,1.237600,0.338083,0.161603,,0.569503
3,3,0.0,0,67500.0,152820.0,8901.0,135000.0,0.005002,-24177,,...,,0.662289,,,5.824499,1.132000,0.131867,0.441696,,0.436379
4,4,1.0,0,157500.0,271066.5,21546.0,234000.0,0.006296,-10685,-697.0,...,0.505700,0.466567,0.139854,0.326713,7.948603,1.158404,0.136800,0.581038,,0.342344
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61495,232697,,0,315000.0,1288350.0,37800.0,1125000.0,0.007020,-11430,-792.0,...,0.285708,0.289564,0.245506,0.044058,2.933985,1.145200,0.120000,0.244499,-43.315068,0.263678
61496,232698,,0,90000.0,273636.0,15408.0,247500.0,0.006671,-17181,-839.0,...,,0.233845,,,5.630838,1.105600,0.171200,0.328904,,0.551656
61497,232699,,0,144000.0,291384.0,26725.5,270000.0,0.018801,-14515,-722.0,...,,,0.063924,,9.171917,1.079200,0.185594,0.494193,,0.542189
61498,232700,,1,193500.0,746280.0,59094.0,675000.0,0.002042,-16914,-8756.0,...,,,0.126582,,7.918476,1.105600,0.305395,0.259286,,0.290005


In [159]:
# Split the combined frame back into train and test rows by SK_ID_CURR.
# NOTE(review): assumes every train row has SK_ID_CURR <= 171201 and every test
# row a larger ID — confirm against the raw files.
train = all_features_df[all_features_df.loc[:, 'SK_ID_CURR'] < 171202]
test = all_features_df[all_features_df.loc[:, 'SK_ID_CURR'] > 171201]

In [160]:
# Separate the features from the label; SK_ID_CURR is not used as a feature.
train_x = train.drop(columns=['TARGET', 'SK_ID_CURR'])
train_y = train['TARGET']
test_x = test.drop(columns=['TARGET', 'SK_ID_CURR'])

In [161]:
# Plain NumPy arrays, so that fit_lgbm can select rows with the fold index arrays.
X = train_x.values
y = train_y.values

In [162]:
# 8 stratified, shuffled folds with a fixed seed; the splits are materialised as
# a list of (train_idx, valid_idx) pairs.
fold = StratifiedKFold(n_splits=8, shuffle=True, random_state=69)
cv = list(fold.split(X, y))

In [163]:
# Further search around the hyperparameters that scored best. Skipped because it
# takes a long time: the code below is kept inside a string and is not executed.
"""
from lightgbm import early_stopping
!pip install optuna
import optuna
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm  # tqdmのautoをインポート

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

def objective(trial):
    param = {
        'reg_lambda': trial.suggest_float('reg_lambda', 1.0, 1.3),
        'reg_alpha': trial.suggest_float('reg_alpha', 9.5, 10.3),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.48, 0.52),
        'subsample': trial.suggest_float('subsample', 0.58, 0.62),
        'max_depth': trial.suggest_int('max_depth', 2, 4),
        'min_child_weight': trial.suggest_float('min_child_weight', 30, 50),
        'learning_rate': trial.suggest_float('learning_rate', 0.07, 0.09)
    }

    model = lgb.LGBMClassifier(**param, random_state=42, n_estimators=1000)
    early_stopping_callback = lgb.early_stopping(stopping_rounds=30, verbose=True)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric='auc',
              callbacks=[early_stopping_callback])
    preds = model.predict_proba(X_valid)[:, 1]
    auc_score = roc_auc_score(y_valid, preds)
    return auc_score

def tqdm_callback(study, trial):
    tqdm.write(f"Trial {trial.number}: AUC = {trial.value}, Params = {trial.params}")
    pbar.update(1)

import optuna
from tqdm.auto import tqdm


study = optuna.create_study(direction='maximize')
pbar = tqdm(total=1000, desc="Optimization Progress")
study.optimize(objective, n_trials=1000, callbacks=[tqdm_callback])
pbar.close()
"""

'\nfrom lightgbm import early_stopping\n!pip install optuna\nimport optuna\nimport lightgbm as lgb\nfrom sklearn.metrics import roc_auc_score\nfrom sklearn.model_selection import train_test_split\nfrom tqdm.auto import tqdm  # tqdmのautoをインポート\n\nX_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)\n\ndef objective(trial):\n    param = {\n        \'reg_lambda\': trial.suggest_float(\'reg_lambda\', 1.0, 1.3),\n        \'reg_alpha\': trial.suggest_float(\'reg_alpha\', 9.5, 10.3),\n        \'colsample_bytree\': trial.suggest_float(\'colsample_bytree\', 0.48, 0.52),\n        \'subsample\': trial.suggest_float(\'subsample\', 0.58, 0.62),\n        \'max_depth\': trial.suggest_int(\'max_depth\', 2, 4),\n        \'min_child_weight\': trial.suggest_float(\'min_child_weight\', 30, 50),\n        \'learning_rate\': trial.suggest_float(\'learning_rate\', 0.07, 0.09)\n    }\n\n    model = lgb.LGBMClassifier(**param, random_state=42, n_estimators=1000)\n    early

In [171]:
#lgbm_best_param = study.best_params

# Result of the 1000-trial search.
# To reproduce it, uncomment the line above and use that as the parameters.
# Google Colab may not allow enough execution time for the search.
lgbm_best_param = {'reg_lambda': 1.1709377818572475, 'reg_alpha': 9.621067135888334, 'colsample_bytree': 0.4873461282978851, 'subsample': 0.6005281454559444, 'max_depth': 3, 'min_child_weight': 43.62076258157694, 'learning_rate': 0.08631997025832076}


In [172]:
def fit_lgbm(X, y, cv, params: dict=None, verbose=100):
    """Train one LightGBM classifier per CV fold and collect out-of-fold predictions.

    Each fold is trained with AUC as the evaluation metric on its validation
    part and stops early after 100 rounds without improvement. The per-fold
    AUC and the AUC over all out-of-fold predictions are printed.

    Args:
        X: Feature matrix with a ``shape`` attribute that can be indexed with
            integer index arrays (e.g. a NumPy array).
        y: Label array aligned with the rows of ``X``.
        cv: Iterable of ``(train_idx, valid_idx)`` pairs.
        params: Keyword arguments for ``LGBMClassifier``. ``random_state=71``
            and ``n_estimators=10000`` are used unless ``params`` overrides them.
        verbose: When truthy (the default), the early-stopping callback prints
            its messages.

    Returns:
        ``(oof_preds, models)``: the class-1 probability for every row of ``X``
        predicted by the model that did not train on it, and the list of fitted
        classifiers in fold order.
    """
    # Defaults first, so an explicit entry in ``params`` overrides them instead
    # of raising "got multiple values for keyword argument".
    clf_params = {'random_state': 71, 'n_estimators': 10000}
    if params is not None:
        clf_params.update(params)

    oof_preds = np.zeros(X.shape[0])
    models = []

    for i, (idx_train, idx_valid) in enumerate(cv):
        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]

        # A fresh callback per fold: the callback tracks the best score seen so
        # far, and sharing one object across folds relies on LightGBM resetting
        # that state between fits (NOTE: version-dependent — not guaranteed).
        early_stopping_callback = lgbm.early_stopping(stopping_rounds=100,
                                                      verbose=bool(verbose))

        clf = lgbm.LGBMClassifier(**clf_params)
        clf.fit(x_train, y_train,
                eval_set=[(x_valid, y_valid)],
                callbacks=[early_stopping_callback],
                eval_metric='auc')

        models.append(clf)
        # Predict with the best iteration found by early stopping.
        oof_preds[idx_valid] = clf.predict_proba(x_valid, num_iteration=clf.best_iteration_)[:, 1]
        print('Fold %2d AUC : %.6f' % (i + 1, roc_auc_score(y_valid, oof_preds[idx_valid])))

    score = roc_auc_score(y, oof_preds)
    print('Full AUC score %.6f' % score)
    return oof_preds, models

In [173]:
# Train one model per fold with the tuned parameters; oof holds the out-of-fold predictions.
oof, models = fit_lgbm(X, y, cv=cv, params=lgbm_best_param)

[LightGBM] [Info] Number of positive: 12093, number of negative: 137708
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.247304 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6819
[LightGBM] [Info] Number of data points in the train set: 149801, number of used features: 147
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080727 -> initscore=-2.432509
[LightGBM] [Info] Start training from score -2.432509
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[778]	valid_0's auc: 0.762387	valid_0's binary_logloss: 0.245397
Fold  1 AUC : 0.762385
[LightGBM] [Info] Number of positive: 12093, number of negative: 137708
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.166487 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6815
[LightGBM] [Info] Number of data points in the tra

In [167]:
# Predict the class-1 probability of the test rows with every fold model,
# then average the predictions across models.
pred = np.array([model.predict_proba(test_x.values)[:, 1] for model in models])
pred = np.mean(pred, axis=0)

In [168]:
# Fill a copy of the sample submission with the averaged predictions.
# NOTE(review): assumes sample_submission rows are in the same order as test_x — confirm.
submission = sample_submission.copy()
submission['TARGET'] = pred

In [169]:
# ファイル出力
submission.to_csv("submission.csv", index=None)

from google.colab import files
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>