In [43]:
!pip install optuna
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif
import re
import pickle
import gc
import optuna
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

# scikit-learn
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# LightGBM
import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")



In [44]:
# Run this cell as well when working in Google Colaboratory.
from google.colab import drive
drive.mount('/content/drive')
# After %cd, specify the directory that contains this notebook.
%cd "/content/drive/MyDrive"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive


In [None]:
# Load the competition data.
# Point INPUT_DIR at the directory that holds train.csv and the other files.
INPUT_DIR = "/content/drive/My Drive/GCI/cmp2/"

# Read the three CSV files in one pass; the unpacking order matches the file order.
train, test, sample_sub = (
    pd.read_csv(INPUT_DIR + filename)
    for filename in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:

# Ratio features derived from the loan amount columns of the training frame.
# Annuity as a percentage of total income.
train['DSCR'] = train['AMT_ANNUITY'].div(train['AMT_INCOME_TOTAL']).mul(100)
# Annuity as a percentage of the credit amount (repayment rate).
train['Repayment_Rate'] = train['AMT_ANNUITY'].div(train['AMT_CREDIT']).mul(100)
# Credit amount relative to total income (plain ratio, not a percentage).
train['DTI'] = train['AMT_CREDIT'].div(train['AMT_INCOME_TOTAL'])
# 1 when the repayment rate is above 4.65%, otherwise 0.
train['High_Repayment'] = train['Repayment_Rate'].gt(4.65).astype(int)


In [None]:

# Row masks shared by the counts below.
is_high_repayment = train['High_Repayment'] == 1
is_target_1 = train['TARGET'] == 1

# Rows where both High_Repayment and TARGET are 1, and how many there are.
target_1_high_repayment = train[is_high_repayment & is_target_1]
count_target_1 = len(target_1_high_repayment)

# Number of rows with High_Repayment == 1.
count_high_repayment = len(train[is_high_repayment])

# Number of rows with TARGET == 1.
count_target_1_total = len(train[is_target_1])

# Total number of rows, and number of non-missing TARGET values.
total_count = len(train)
total_target_count = train['TARGET'].count()

# Share of TARGET == 1 over all rows, in percent.
percentage_target_1 = count_target_1_total / total_count * 100

# Report both rates.
print(f"全データの中でTARGETが1のデータの数: {count_target_1_total}, TARGETの総数:{total_target_count}, 全データにおけるTARGETが1の割合: {percentage_target_1:.2f}%")
print(f"High Repaymentの中でTARGETが1のデータの数: {count_target_1}, High Repaymentの総数: {count_high_repayment}, 割合は: {count_target_1/count_high_repayment * 100:.2f}%")


In [None]:

# Ratio features for the test frame. They must be computed exactly as the
# training-frame features are, otherwise the model sees shifted inputs at
# prediction time.
# Annuity as a percentage of total income.
test['DSCR'] = test['AMT_ANNUITY'] / test['AMT_INCOME_TOTAL'] * 100
# Annuity as a percentage of the credit amount (repayment rate).
test['Repayment_Rate'] = test['AMT_ANNUITY'] / test['AMT_CREDIT'] * 100
# Credit amount relative to total income. This is a plain ratio (no "* 100") so
# that it is on the same scale as train['DTI'].
test['DTI'] = test['AMT_CREDIT'] / test['AMT_INCOME_TOTAL']
# 1 when the repayment rate is above 4.65%, otherwise 0.
test['High_Repayment'] = (test['Repayment_Rate'] > 4.65).astype(int)


In [None]:
# [Hypothesis] A higher average score from the external sources should go with a
# lower risk of default.
ext_source_cols = ["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]
# Row-wise summary statistics over the three external scores.
for stat in ("mean", "max", "min", "std"):
    train[f"EXT_SOURCE_{stat}"] = getattr(train[ext_source_cols], stat)(axis=1)
# Number of external scores that are present for each row.
train["EXT_SOURCE_count"] = train[ext_source_cols].notna().sum(axis=1)

In [None]:
# [Hypothesis] A higher average score from the external sources should go with a
# lower risk of default.
ext_source_cols = ["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]
# Row-wise summary statistics over the three external scores.
for stat in ("mean", "max", "min", "std"):
    test[f"EXT_SOURCE_{stat}"] = getattr(test[ext_source_cols], stat)(axis=1)
# Number of external scores that are present for each row.
test["EXT_SOURCE_count"] = test[ext_source_cols].notna().sum(axis=1)

In [None]:
# Bucket OWN_CAR_AGE into groups of 10 (floor division).
train["OWN_CAR_AGE"] = train["OWN_CAR_AGE"] // 10
test["OWN_CAR_AGE"] = test["OWN_CAR_AGE"] // 10

# One-hot encode the buckets.
train_car_age_ohe = pd.get_dummies(train["OWN_CAR_AGE"]).add_prefix("OWN_CAR_AGE_")
# get_dummies only creates columns for values that occur in the frame it is given,
# so the test frame can end up with a different column set than the training frame.
# Align the test dummies to the training columns: buckets missing from test are
# filled with 0, buckets unseen in train are dropped, and dtypes follow train.
test_car_age_ohe = (
    pd.get_dummies(test["OWN_CAR_AGE"])
    .add_prefix("OWN_CAR_AGE_")
    .reindex(columns=train_car_age_ohe.columns, fill_value=0)
    .astype(train_car_age_ohe.dtypes.to_dict())
)

# Append the dummy columns and drop the original bucket column.
train = pd.concat([train, train_car_age_ohe], axis=1).drop(columns="OWN_CAR_AGE")
test = pd.concat([test, test_car_age_ohe], axis=1).drop(columns="OWN_CAR_AGE")

In [None]:
# The same three columns are added to both frames.
for frame in (train, test):
    # Goods price minus credit amount.
    frame['DOWN_PAYMENT'] = frame['AMT_GOODS_PRICE'] - frame['AMT_CREDIT']
    # The same difference as a percentage of the credit amount.
    frame['DOWN_PAYMENT_RATE'] = frame['DOWN_PAYMENT'] / frame['AMT_CREDIT'] * 100
    # 100 when DOWN_PAYMENT_RATE is at or below -30, otherwise 0.
    frame['DOWN_PAYMENT_RATE_ALART'] = (frame['DOWN_PAYMENT_RATE'] <= -30).astype(int) * 100

In [None]:
# DAYS_EMPLOYED divided by DAYS_BIRTH, added to both frames.
for frame in (train, test):
    frame['EMPLOYMENT_TO_AGE_RATIO'] = frame['DAYS_EMPLOYED'].div(frame['DAYS_BIRTH'])

In [None]:
# DEF_30 count divided by OBS_30 count, added to both frames.
for frame in (train, test):
    # A zero observation count is replaced by 1 so the division never divides by zero.
    observed = frame['OBS_30_CNT_SOCIAL_CIRCLE'].replace(0, 1)
    frame['SOCIAL_CIRCLE_DEFAULT_RATE'] = frame['DEF_30_CNT_SOCIAL_CIRCLE'] / observed

In [None]:
# Show (rows, columns) of the training frame, then of the test frame.
for frame in (train, test):
    print(frame.shape)

In [None]:
# Categorical columns: everything stored with object or category dtype.
categorical_columns = train.select_dtypes(include=['object', 'category']).columns.tolist()

# Every other column except TARGET is passed through unchanged as a numeric column.
non_numeric = set(categorical_columns) | {'TARGET'}
numerical_columns = [col for col in train.columns if col not in non_numeric]

# Ordinal-encode the categorical columns. The encoder is fitted on the training
# frame only; a category that appears only in the test frame would make a default
# OrdinalEncoder raise in transform(), so unknown categories are encoded as NaN.
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)

# Fit on train and keep the original index so the concat below aligns row by row.
train_cat = pd.DataFrame(
    encoder.fit_transform(train[categorical_columns]),
    columns=categorical_columns,
    index=train.index,
)

# Apply the fitted encoder to the test frame.
test_cat = pd.DataFrame(
    encoder.transform(test[categorical_columns]),
    columns=categorical_columns,
    index=test.index,
)

# Numeric columns first, encoded categorical columns after them.
train_processed = pd.concat([train[numerical_columns], train_cat], axis=1)
test_processed = pd.concat([test[numerical_columns], test_cat], axis=1)

In [None]:
# Feature matrix, target, and ID column used for LightGBM training.
# Note: X_train is the same object as train_processed (no copy is made here).
X_train = train_processed
y_train = train["TARGET"]
id_train = train_processed[["SK_ID_CURR"]]

# Cast any column that is still object dtype to category.
object_cols = [col for col in X_train.columns if X_train[col].dtype == "O"]
for col in object_cols:
    X_train[col] = X_train[col].astype("category")

In [None]:
# Build the list of (train indices, validation indices) pairs of a stratified 5-fold split.
splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
cv = list(splitter.split(X_train, y_train))

# Inspect fold 0: training indices first, validation indices second.
fold0_train_idx, fold0_valid_idx = cv[0]
print("index(train):", fold0_train_idx)
print("index(valid):", fold0_valid_idx)

In [None]:
# Show which columns were selected as categorical (and ordinal-encoded) earlier.
print("categorical_columns:", categorical_columns)

In [None]:
"""
nfold = 0
idx_tr, idx_va = cv[nfold][0], cv[nfold][1]

# 学習データと検証データに分離
x_tr, y_tr, id_tr = X_train.loc[idx_tr, :], y_train[idx_tr], id_train.loc[idx_tr, :]
x_va, y_va, id_va = X_train.loc[idx_va, :], y_train[idx_va], id_train.loc[idx_va, :]
print(x_tr.shape, y_tr.shape, id_tr.shape)
print(x_va.shape, y_va.shape, id_va.shape)
print(x_tr.isnull().sum())
"""

In [None]:
# Train a baseline LightGBM model on fold 0.
nfold = 0
idx_tr, idx_va = cv[nfold]

# Split into training and validation parts.
# The fold indices produced by StratifiedKFold.split are positions, so rows are
# selected with .iloc; label-based .loc only gives the same rows while the frame
# index happens to be 0..n-1.
x_tr, y_tr, id_tr = X_train.iloc[idx_tr], y_train.iloc[idx_tr], id_train.iloc[idx_tr]
x_va, y_va, id_va = X_train.iloc[idx_va], y_train.iloc[idx_va], id_train.iloc[idx_va]

# Model hyperparameters.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.05,
    'num_leaves': 32,
    'n_estimators': 1000,
    'max_depth': 10,
    'min_child_samples': 100,
    "random_state": 123,
    # feature_importances_ is reported as total gain rather than split count.
    "importance_type": "gain",
}

model = lgb.LGBMClassifier(**params)

# Stop when the validation metric has not improved for 100 rounds.
early_stopping_callback = lgb.early_stopping(stopping_rounds=100, verbose=True)

model.fit(x_tr, y_tr,
          eval_set=[(x_va, y_va)],
          callbacks=[early_stopping_callback]
         )

# Save the fitted model.
with open("model_lgb_fold0.pickle", "wb") as f:
    pickle.dump(model, f, protocol=4)

In [None]:
'''モデル評価'''
# Predicted positive-class probabilities for the training and validation parts.
y_tr_pred = model.predict_proba(x_tr)[:, 1]
y_va_pred = model.predict_proba(x_va)[:, 1]

# ROC AUC on each part.
metric_tr = roc_auc_score(y_tr, y_tr_pred)
metric_va = roc_auc_score(y_va, y_va_pred)

# Start the metric log with this fold's [fold, train AUC, validation AUC] row.
metrics = [[nfold, metric_tr, metric_va]]

# Show the result.
print("[auc] tr:{:.4f}, va:{:.4f}".format(metric_tr, metric_va))

In [None]:
'''説明変数の重要度取得（学習したモデルから説明変数の重要度を取得）'''
# Feature importances of the fitted model, tagged with the fold number.
imp_fold = pd.DataFrame(
    {"col": X_train.columns, "imp": model.feature_importances_, "nfold": nfold}
)
# Show every feature, most important first.
display(imp_fold.sort_values(by="imp", ascending=False))
# Accumulator frame for per-fold importances, seeded with this fold.
imp = pd.concat([pd.DataFrame(), imp_fold])

In [None]:
# Names of the features whose importance is 100 or less.
low_importance_cols = imp.loc[imp['imp'] <= 100, 'col'].unique()

# Show the low-importance feature names.
print("Low importance features (imp <= 100):", low_importance_cols)

In [None]:
# Objective-function setup.
# Hyperparameters that are held fixed (not searched).
params_base = dict(
    boosting_type="gbdt",
    objective="binary",
    metric="auc",
    verbosity=-1,
    learning_rate=0.05,
    n_estimators=1000,
    bagging_freq=1,
    random_state=123,
)

# 目的関数の定義
def objective(trial):
    """Optuna objective: train LightGBM with sampled hyperparameters and score it.

    Reads the notebook globals ``X_train``, ``y_train`` and ``params_base``.

    Args:
        trial: Optuna trial used to sample the searched hyperparameters.

    Returns:
        Mean validation ROC AUC over the evaluated folds (only fold 0 is evaluated).
    """
    # Hyperparameters that are searched.
    params_tuning = {
        "num_leaves": trial.suggest_int("num_leaves", 8, 256),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 200),
        "min_sum_hessian_in_leaf": trial.suggest_float("min_sum_hessian_in_leaf", 1e-5, 1e-2, log=True),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-2, 1e+2, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-2, 1e+2, log=True),
    }
    # Fixed parameters are merged last, so they win on any key collision.
    params_tuning.update(params_base)

    # Train and evaluate.
    list_metrics = []
    cv = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=123).split(X_train, y_train))
    list_fold = [0]  # Only the first fold is used, to keep each trial fast.
    for nfold in list_fold:
        idx_tr, idx_va = cv[nfold]
        # The fold indices are positions, so select rows with .iloc rather than
        # label-based .loc (which only matches while the index is 0..n-1).
        x_tr, y_tr = X_train.iloc[idx_tr], y_train.iloc[idx_tr]
        x_va, y_va = X_train.iloc[idx_va], y_train.iloc[idx_va]
        model = lgb.LGBMClassifier(**params_tuning)
        # Stop when the validation metric has not improved for 30 rounds.
        early_stopping_callback = lgb.early_stopping(stopping_rounds=30, verbose=True)

        model.fit(x_tr,
                  y_tr,
                  eval_set=[(x_va, y_va)],
                  callbacks=[early_stopping_callback]
                 )
        y_va_pred = model.predict_proba(x_va)[:, 1]
        metric_va = roc_auc_score(y_va, y_va_pred)  # The evaluation metric is AUC.
        list_metrics.append(metric_va)

    # Average the metric over the evaluated folds.
    metrics = np.mean(list_metrics)

    return metrics

In [None]:
# Run the hyperparameter search.
# Trials run sequentially (n_jobs=1). With parallel trials the order in which
# results reach the seeded TPE sampler is not deterministic, so the fixed seed
# would no longer make the search reproducible; LightGBM already uses multiple
# threads inside each trial, so running trials in parallel also oversubscribes
# the CPU.
sampler = optuna.samplers.TPESampler(seed=123)
study = optuna.create_study(sampler=sampler, direction="maximize")
study.optimize(objective, n_trials=100, n_jobs=1)

In [None]:
# Inspect the search result.
trial = study.best_trial
# The objective returns validation ROC AUC, so the value is labelled as AUC
# (it was previously mislabelled "acc").
print("auc(best)={:.4f}".format(trial.value))
display(trial.params)

In [None]:
# Build the test inputs: the ID column, and test_processed without the
# low-importance columns.
id_test = test_processed["SK_ID_CURR"].copy()
x_test = test_processed.drop(columns=low_importance_cols)

# Cast any column that is still object dtype to category.
object_cols = [col for col in x_test.columns if x_test[col].dtype == "O"]
for col in object_cols:
    x_test[col] = x_test[col].astype("category")

In [None]:
# Combine the best searched hyperparameters with the fixed ones.
best_params = study.best_params
best_params.update(params_base)

# Retrain one model per fold without the low-importance columns.
cv = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=123).split(X_train, y_train))
for nfold in range(5):
    idx_tr, idx_va = cv[nfold]
    # The fold indices are positions, so rows are selected with .iloc. drop()
    # returns a new frame, which avoids mutating a slice of X_train in place.
    x_tr = X_train.iloc[idx_tr].drop(columns=low_importance_cols)
    y_tr = y_train.iloc[idx_tr]
    x_va = X_train.iloc[idx_va].drop(columns=low_importance_cols)
    y_va = y_train.iloc[idx_va]
    model = lgb.LGBMClassifier(**best_params)
    model.fit(x_tr, y_tr,
              eval_set=[(x_va, y_va)],
              callbacks=[lgb.early_stopping(stopping_rounds=30, verbose=True)]
             )
    # Persist the fold model; the prediction loop below reloads it from disk.
    fname_lgb = "model_lgb_fold{}.pickle".format(nfold)
    with open(fname_lgb, "wb") as f:
        pickle.dump(model, f, protocol=4)

list_nfold=[0,1,2,3,4]

# Predict the test data with every fold model (one column per fold).
pred = np.zeros((len(x_test), len(cv)))
for nfold in range(5):
    print("-"*20, nfold, "-"*20)
    fname_lgb = "model_lgb_fold{}.pickle".format(nfold)
    # These pickles were written by the loop above; do not point this at
    # untrusted files, since unpickling can execute arbitrary code.
    with open(fname_lgb, "rb") as f:
        model = pickle.load(f)
    pred[:, nfold] = model.predict_proba(x_test)[:, 1]

# Average the fold predictions.
pred_lgb = pred.mean(axis=1)

# Disabled single-model submission code, kept as a bare string literal so it is
# never executed.
"""
# 平均予測の計算
pred = pd.concat([
    id_test,
    pd.DataFrame({"pred": pred.mean(axis=1)}),
], axis=1)

# 提出ファイルの作成
df_submit = pred.rename(columns={"pred": "TARGET"})
print(df_submit.shape)
display(df_submit.head())

# ファイル出力
df_submit.to_csv("submission_baseline.csv", index=None)
"""

In [None]:
from google.colab import files
#files.download('submission_baseline.csv')

In [None]:
# Feature frames built from the raw (not ordinal-encoded) data; object columns
# are turned into pandas categoricals.
X_train = train.drop(["TARGET"], axis=1)
y_train = train["TARGET"]
id_train = train[["SK_ID_CURR"]]

# Test inputs.
x_test = test.copy()
id_test = test[["SK_ID_CURR"]].copy()

for col in X_train.columns:
    if X_train[col].dtype == "O":
        X_train[col] = X_train[col].astype("category")
        if col in x_test.columns:
            # Reuse the training categories for the test column. Converting each
            # frame independently derives the category set (and so the integer
            # codes) from that frame's own values, so the same label could map to
            # different codes in train and test. Test values not seen in training
            # become missing.
            x_test[col] = x_test[col].astype(X_train[col].dtype)

# Any object column that exists only in the test frame is still cast to category.
for col in x_test.columns:
    if x_test[col].dtype == "O":
        x_test[col] = x_test[col].astype("category")


In [None]:
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Booster parameters for the native xgb.train API.
# The number of boosting rounds is set through num_boost_round below. The
# scikit-learn-wrapper keys 'n_estimators' and 'use_label_encoder' are not booster
# parameters (xgb.train does not use them), so they are left out.
xgb_params = {
    'objective': 'binary:logistic',
    'learning_rate': 0.05,
    'max_depth': 6,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42,
}

# Hold out 20% of the training data for validation.
x_tr, x_va, y_tr, y_va = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
print(x_tr.shape)

# Convert to DMatrix; enable_categorical lets category-dtype columns through.
dtrain = xgb.DMatrix(x_tr, label=y_tr, enable_categorical=True)
dvalid = xgb.DMatrix(x_va, label=y_va, enable_categorical=True)


# Train for a fixed 100 boosting rounds.
xgb_model = xgb.train(xgb_params, dtrain, num_boost_round=100)

# Evaluate on the held-out part.
y_valid_pred = xgb_model.predict(dvalid)
auc_score = roc_auc_score(y_va, y_valid_pred)
print("XGBoost AUC: {:.4f}".format(auc_score))

# Convert the test data and predict.
dtest = xgb.DMatrix(x_test, enable_categorical=True)
pred_xgb = xgb_model.predict(dtest)

# Show the first 10 test predictions as a sample.
print("Test predictions (first 10):", pred_xgb[:10])

In [None]:
# Switch back to the ordinal-encoded frames (copies, so the originals stay untouched).
X_train = train_processed.copy()
y_train = train["TARGET"].copy()
id_train = train_processed[["SK_ID_CURR"]]

x_test = test_processed.copy()
id_test = test[["SK_ID_CURR"]].copy()

# Cast any column that is still object dtype to category, in both frames.
for frame in (X_train, x_test):
    for col in frame.columns:
        if frame[col].dtype == "O":
            frame[col] = frame[col].astype("category")

In [None]:
!pip install catboost
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

# Model settings.
cat_params = {
    'iterations': 500,
    'learning_rate': 0.1,
    'depth': 6,
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'verbose': 10,
    'random_seed': 42
}

# Hold out 20% of the training data for validation.
x_tr, x_va, y_tr, y_va = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Convert the data to CatBoost Pools.
# Note: valid_pool is created here but not referenced again in this cell;
# eval_set below is given the raw (x, y) tuples instead.
train_pool = Pool(data=x_tr, label=y_tr)
valid_pool = Pool(data=x_va, label=y_va)

# Train the model.
# NOTE(review): two eval sets are passed (training part first, validation part
# second) — confirm which one CatBoost uses for early stopping / best-model selection.
cat_model = CatBoostClassifier(**cat_params)
cat_model.fit(train_pool, eval_set=[(x_tr, y_tr), (x_va, y_va)], use_best_model=True, early_stopping_rounds=50)

# Plot the learning curves.
evals_result = cat_model.get_evals_result()

# 'validation_0' / 'validation_1' are read as the first / second eval_set entries,
# i.e. the training part and the validation part, matching the labels below.
plt.figure(figsize=(10, 6))
plt.plot(evals_result['validation_1']['AUC'], label='Validation AUC')
plt.plot(evals_result['validation_0']['AUC'], label='Training AUC')
plt.xlabel('Iterations')
plt.ylabel('AUC')
plt.title('Learning Curve')
plt.legend()
plt.show()

# Evaluate the model on the validation part.
y_valid_pred = cat_model.predict_proba(x_va)[:, 1]
auc_score = roc_auc_score(y_va, y_valid_pred)
print("CatBoost Validation AUC: {:.4f}".format(auc_score))

In [None]:
# Wrap the test features in a CatBoost Pool.
test_pool = Pool(data=x_test)

# Class probabilities for the test data; column 1 is kept as the prediction.
cat_test_proba = cat_model.predict_proba(test_pool)
pred_cat = cat_test_proba[:, 1]
print(pred_cat)

In [None]:
# Blend the model predictions with equal weights.
# Only the LightGBM and CatBoost predictions are averaged; pred_xgb is not included.
final_pred = 0.5 * (pred_lgb + pred_cat)

In [None]:
# Build the submission frame: test IDs next to the blended prediction.
submission = id_test[['SK_ID_CURR']].assign(TARGET=final_pred)

# Save the file (without the index column) and download it from Colab.
submission.to_csv('submission_ensemble.csv', index=False)
from google.colab import files
files.download('submission_ensemble.csv')

In [None]:
# Collect the per-model test predictions next to the test identifier.
df_predictions = pd.DataFrame(dict(
    SK_ID_CURR=id_test['SK_ID_CURR'],  # test identifier
    pred_lgb=pred_lgb,                 # LightGBM prediction
    pred_xgb=pred_xgb,                 # XGBoost prediction
    pred_cat=pred_cat,                 # CatBoost prediction
))

# Print the predictions.
print(df_predictions)