### 必要なモジュールのimport

In [53]:
import warnings
import logging
import time
import pickle
import xgboost as xgb
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from scipy.stats import norm
import seaborn as sns
import lightgbm as lgb
import optuna
import optuna.integration.lightgbm as lgbo
from lightgbm import early_stopping
from lightgbm import log_evaluation
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_score, recall_score, fbeta_score, confusion_matrix, precision_recall_curve, accuracy_score, mean_squared_error
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import make_pipeline
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, KFold, GridSearchCV
from collections import Counter

warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_columns', 500)

In [54]:
# データの読み込み(02_data_preparation.ipynbで作成したデータ)
train = pd.read_csv('../data/processed/processed20240627_train.csv')
test = pd.read_csv('../data/processed/processed20240627_test.csv')

## targetの分布の確認

In [55]:

# 日付型に変換
train['purchase_month_allallmode'] = pd.to_datetime(train['purchase_month_allallmode'], errors='coerce')
# 月の部分を取得して整数に変換する
train['purchase_month_allallmode'] = train['purchase_month_allallmode'].dt.month.astype('Int64')

train['purchase_month_authallmode'] = pd.to_datetime(train['purchase_month_authallmode'], errors='coerce')
train['purchase_month_authallmode'] = train['purchase_month_authallmode'].dt.month.astype('Int64')

train['purchase_month_histallmode'] = pd.to_datetime(train['purchase_month_histallmode'], errors='coerce')
train['purchase_month_histallmode'] = train['purchase_month_histallmode'].dt.month.astype('Int64')

train['purchase_month_newallmode'] = pd.to_datetime(train['purchase_month_newallmode'], errors='coerce')
train['purchase_month_newallmode'] = train['purchase_month_newallmode'].dt.month.astype('Int64')

In [56]:
# 日付型に変換
test['purchase_month_allallmode'] = pd.to_datetime(test['purchase_month_allallmode'], errors='coerce')
# 月の部分を取得して整数に変換する
test['purchase_month_allallmode'] = test['purchase_month_allallmode'].dt.month.astype('Int64')

test['purchase_month_authallmode'] = pd.to_datetime(test['purchase_month_authallmode'], errors='coerce')
test['purchase_month_authallmode'] = test['purchase_month_authallmode'].dt.month.astype('Int64')

test['purchase_month_histallmode'] = pd.to_datetime(test['purchase_month_histallmode'], errors='coerce')
test['purchase_month_histallmode'] = test['purchase_month_histallmode'].dt.month.astype('Int64')

test['purchase_month_newallmode'] = pd.to_datetime(test['purchase_month_newallmode'], errors='coerce')
test['purchase_month_newallmode'] = test['purchase_month_newallmode'].dt.month.astype('Int64')

In [57]:
# # 日付型をドロップ
# columns_to_drop = [
#     'purchase_month_allallmode', 
#     'purchase_month_authallmode', 
#     'purchase_month_histallmode', 
#     'purchase_month_newallmode'
# ]

# # 指定されたカラムをドロップします
# train = train.drop(columns=columns_to_drop)
# test = test.drop(columns=columns_to_drop)

## lightGBMモデルの作成

In [58]:
# 目的変数と説明変数の作成
target = train['target']
del train['target']

In [59]:
# 特徴量の情報
features = [c for c in train.columns if c not in ['card_id', 'first_active_month']]
categorical_feats = ['feature_2', 'feature_3']

In [60]:
# folds = KFold(n_splits=5, shuffle=True, random_state=15)
# oof_lgb = np.zeros(len(train))
# oof_xgb = np.zeros(len(train))
# predictions_lgb = np.zeros(len(test))
# predictions_xgb = np.zeros(len(test))
# feature_importance_df = pd.DataFrame()

# start = time.time()

In [61]:
# # GRID SEARCH
# # LightGBM
# lgb_reg = lgb.LGBMRegressor(objective='regression', metric='rmse', n_estimators=1000, random_state=42)
# grid_search = GridSearchCV(estimator=lgb_reg, param_grid=param, cv=folds, scoring='neg_mean_squared_error', verbose=2)
# # グリッドサーチの実行
# grid_search.fit(train[features], target)
# # 最良のパラメータとそのスコアの取得
# lgbm_best_params = grid_search.best_params_
# print("Best parameters found: ", lgbm_best_params)
# lgbm_best_score = np.sqrt(-grid_search.best_score_)
# print("Best RMSE score: ", lgbm_best_score)
# # XGBoost
# xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
# grid_search = GridSearchCV(estimator=xgb_reg, param_grid=param2, cv=folds,
#                            scoring='neg_mean_squared_error', verbose=0)
# grid_search.fit(train, target)

# xgb_best_params = grid_search.best_params_
# print("Best parameters found: ", xgb_best_params)
# xgb_best_score = np.sqrt(-grid_search.best_score_)
# print("Best RMSE score: ", xgb_best_score)

# optuna

#### xgboost

In [62]:

# # 目的関数
# def objective(trial, df_X, df_y):
#     params = {
#         'objective': 'reg:squarederror',
#         'eval_metric': 'rmse',
#         'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.1),
#         'n_estimators': trial.suggest_int('n_estimators', 100, 400),
#         'max_depth': trial.suggest_int('max_depth', 3, 14),
#         'min_child_weight': trial.suggest_int('min_child_weight', 1, 50),
#         'subsample': trial.suggest_uniform('subsample', 0.2, 1.0),
#         'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.2, 1.0),
#         'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
#         'seed': 42
#     }

#     xgb_reg = xgb.XGBRegressor(
#         n_estimators=params['n_estimators'],
#         learning_rate=params['learning_rate'],
#         max_depth=params['max_depth'],
#         min_child_weight=params['min_child_weight'],
#         subsample=params['subsample'],
#         colsample_bytree=params['colsample_bytree'],
#         reg_lambda=params['lambda'],
#         random_state=42,
#         objective='reg:squarederror'
#     )

#     kf = KFold(n_splits=5, shuffle=True, random_state=42)
#     scores = []
#     for train_index, val_index in kf.split(df_X):
#         X_train, X_val = df_X.iloc[train_index], df_X.iloc[val_index]
#         y_train, y_val = df_y.iloc[train_index], df_y.iloc[val_index]

#         xgb_reg.fit(X_train, y_train)
#         y_pred = xgb_reg.predict(X_val)
#         score = np.sqrt(mean_squared_error(y_val, y_pred))
#         scores.append(score)

#     return np.mean(scores)

# def preprocess_data(df):
#     # 不要な列の削除
#     df = df.drop(columns=['card_id'])

#     # 日付型の列を適切に変換
#     df['first_active_month'] = pd.to_datetime(df['first_active_month'], errors='coerce')
#     df['first_active_year'] = df['first_active_month'].dt.year
#     df['first_active_month'] = df['first_active_month'].dt.month

#     # NaNを0で埋める（必要に応じて適切な方法を選択）
#     df = df.fillna(0)

#     return df

# # データの準備
# train_processed = preprocess_data(train)

# # Optunaのスタディを作成
# study = optuna.create_study(direction='minimize')
# study.optimize(lambda trial: objective(trial, train_processed, target), n_trials=50)

# # 最良のパラメータとそのスコアの取得
# xgb_best_params = study.best_params
# print("Best parameters found: ", xgb_best_params)
# xgb_best_score = study.best_value
# print("Best RMSE score: ", xgb_best_score)

#### lightgbm

In [63]:
xgb_param = {'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'learning_rate': 0.021479177791852513, 'max_depth': 7, 'min_child_weight': 34, 'subsample': 0.5594130759358893, 'colsample_bytree': 0.422564749821534, 'lambda': 2.7868241524176528e-08,
    'verbosity': 1
  }

In [64]:
# train_data = lgb.Dataset(train[features], label=target)

# # パラメータ設定
# params = {
#     'objective': 'regression',
#     'metric': 'rmse',
#     'verbose': -1
# }

# # LightGBMTunerCVのインスタンスを作成
# tuner = lgbo.LightGBMTunerCV(
#     params=params,
#     train_set=train_data,
#     folds=KFold(n_splits=5),
#     optuna_seed=0,
#     callbacks=[lgbo.early_stopping(100), lgbo.log_evaluation(100)]
# )
# # ハイパーパラメータ探索の実行
# tuner.run()

# # サーチしたパラメータの表示
# lgb_best_params = tuner.best_params
# print("Best parameters found: ")
# for key, value in best_params.items():
#     print(f"    {key}: {value}")

In [65]:
# lightgbmのパラメータ
lgb_param = {'num_leaves': 52,
         'min_data_in_leaf': 149,
         'objective':'regression',
         'max_depth': 9,
         "min_child_samples": 100,
         'learning_rate': 0.005,
         "boosting": "gbdt",
         "feature_fraction": 1,
         "bagging_freq": 0,
         "bagging_fraction": 1,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 1.0685408701201727e-08,
         "lambda_l2" : 1.1902670969137922e-06,
         "random_state": 133,
         "verbosity": -1}

In [66]:
# for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
#     print("fold n°{}".format(fold_))

#     # LightGBMのデータセット
#     trn_data_lgb = lgb.Dataset(train.iloc[trn_idx][features],
#                                label=target.iloc[trn_idx],
#                                categorical_feature=categorical_feats)
#     val_data_lgb = lgb.Dataset(train.iloc[val_idx][features],
#                                label=target.iloc[val_idx],
#                                categorical_feature=categorical_feats)

#     # LightGBMモデルのトレーニング
#     num_round = 10000
#     clf_lgb = lgb.train(params=lgb_param,
#                         train_set=trn_data_lgb,
#                         num_boost_round=num_round,
#                         valid_sets=[val_data_lgb],
#                         callbacks=[lgb.early_stopping(stopping_rounds=200),
#                                    lgb.log_evaluation(100)])

#     oof_lgb[val_idx] = clf_lgb.predict(train.iloc[val_idx][features], num_iteration=clf_lgb.best_iteration)

#     fold_importance_df = pd.DataFrame()
#     fold_importance_df["feature"] = features
#     fold_importance_df["importance"] = clf_lgb.feature_importance()
#     fold_importance_df["fold"] = fold_ + 1
#     feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
#     predictions_lgb += clf_lgb.predict(test[features], num_iteration=clf_lgb.best_iteration) / folds.n_splits

#     # モデルを保存
#     with open(f'../src/models/model_lgb_fold_{fold_}.pkl', 'wb') as f:
#         pickle.dump(clf_lgb, f)

#     # XGBoostのデータセット
#     trn_data_xgb = xgb.DMatrix(train.iloc[trn_idx][features], label=target.iloc[trn_idx])
#     val_data_xgb = xgb.DMatrix(train.iloc[val_idx][features], label=target.iloc[val_idx])

#     # XGBoostモデルのトレーニング
#     clf_xgb = xgb.train(xgb_param, trn_data_xgb,
#                         num_boost_round=num_round,
#                         evals=[(val_data_xgb, 'valid')],
#                         early_stopping_rounds=200,
#                         verbose_eval=100)

#     oof_xgb[val_idx] = clf_xgb.predict(val_data_xgb, iteration_range=(0, clf_xgb.best_iteration))
#     predictions_xgb += clf_xgb.predict(xgb.DMatrix(test[features]), iteration_range=(0, clf_xgb.best_iteration)) / folds.n_splits

#     # モデルを保存
#     with open(f'../src/models/model_ensamble_fold_{fold_}.pkl', 'wb') as f:
#         pickle.dump(clf_xgb, f)



In [67]:
# from catboost import CatBoostRegressor, Pool
# def objective(trial, df_X, df_y):
#     params = {
#         'iterations': 10000,
#         'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.05),
#         'depth': trial.suggest_int('depth', 3, 10),
#         'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
#         'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 0.01, 10.0),
#         'border_count': trial.suggest_int('border_count', 1, 255),
#         'random_strength': trial.suggest_loguniform('random_strength', 1e-3, 10.0),
#         'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
#         'od_wait': 200,
#         'random_seed': 42
#     }

#     model = CatBoostRegressor(**params, logging_level='Silent')

#     kf = KFold(n_splits=5, shuffle=True, random_state=42)
#     scores = []
#     for train_index, val_index in kf.split(df_X):
#         X_train, X_val = df_X.iloc[train_index], df_X.iloc[val_index]
#         y_train, y_val = df_y.iloc[train_index], df_y.iloc[val_index]

#         train_pool = Pool(data=X_train, label=y_train)
#         val_pool = Pool(data=X_val, label=y_val)

#         model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50, verbose=False)
#         y_pred = model.predict(X_val)
#         score = np.sqrt(mean_squared_error(y_val, y_pred))
#         scores.append(score)

#     return np.mean(scores)

# def preprocess_data(df):
#     # 不要な列の削除
#     df = df.drop(columns=['card_id'])

#     # 日付型の列を適切に変換
#     df['first_active_month'] = pd.to_datetime(df['first_active_month'], errors='coerce')
#     df['first_active_year'] = df['first_active_month'].dt.year
#     df['first_active_month'] = df['first_active_month'].dt.month

#     # NaNを0で埋める（必要に応じて適切な方法を選択）
#     df = df.fillna(0)

#     return df

# # データの準備
# train_processed = preprocess_data(train)

# # Optunaのスタディを作成
# study = optuna.create_study(direction='minimize')
# study.optimize(lambda trial: objective(trial, train_processed, target), n_trials=30)

# # 最良のパラメータとそのスコアの取得
# catboost_best_params = study.best_params
# print("Best parameters found: ", catboost_best_params)
# catboost_best_score = study.best_value
# print("Best RMSE score: ", catboost_best_score)

In [68]:
catboost_best_params={'iterations': 100000,
                      'learning_rate': 0.024281517788928442, 'depth': 8, 'l2_leaf_reg': 1.2220358642424707, 'bagging_temperature': 3.350428488837416, 'border_count': 53, 'random_strength': 0.02028193905811996, 'od_type': 'IncToDec',
                      'od_wait': 200}

In [69]:
import catboost as ctb

folds = KFold(n_splits=5, shuffle=True, random_state=42)
oof_lgb = np.zeros(len(train))
oof_xgb = np.zeros(len(train))
oof_ctb = np.zeros(len(train))
predictions_lgb = np.zeros(len(test))
predictions_xgb = np.zeros(len(test))
predictions_ctb = np.zeros(len(test))
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print("fold n°{}".format(fold_))

    # LightGBMのデータセット
    trn_data_lgb = lgb.Dataset(train.iloc[trn_idx][features],
                               label=target.iloc[trn_idx],
                               categorical_feature=categorical_feats)
    val_data_lgb = lgb.Dataset(train.iloc[val_idx][features],
                               label=target.iloc[val_idx],
                               categorical_feature=categorical_feats)

    # LightGBMモデルのトレーニング
    clf_lgb = lgb.train(params=lgb_param,
                        train_set=trn_data_lgb,
                        num_boost_round=10000,
                        valid_sets=[val_data_lgb],
                        callbacks=[lgb.early_stopping(stopping_rounds=300),
                                   lgb.log_evaluation(100)])

    oof_lgb[val_idx] = clf_lgb.predict(train.iloc[val_idx][features], num_iteration=clf_lgb.best_iteration)
    predictions_lgb += clf_lgb.predict(test[features], num_iteration=clf_lgb.best_iteration) / folds.n_splits

    # モデルを保存
    with open(f'../src/models/model_lgb_fold_{fold_}.pkl', 'wb') as f:
        pickle.dump(clf_lgb, f)

    # XGBoostのデータセット
    trn_data_xgb = xgb.DMatrix(train.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data_xgb = xgb.DMatrix(train.iloc[val_idx][features], label=target.iloc[val_idx])

    # XGBoostモデルのトレーニング
    clf_xgb = xgb.train(xgb_param, trn_data_xgb,
                        num_boost_round=10000,
                        evals=[(val_data_xgb, 'valid')],
                        early_stopping_rounds=200,
                        verbose_eval=100)

    oof_xgb[val_idx] = clf_xgb.predict(val_data_xgb, iteration_range=(0, clf_xgb.best_iteration))
    predictions_xgb += clf_xgb.predict(xgb.DMatrix(test[features]), iteration_range=(0, clf_xgb.best_iteration)) / folds.n_splits

    # モデルを保存
    with open(f'../src/models/model_xgb_fold_{fold_}.pkl', 'wb') as f:
        pickle.dump(clf_xgb, f)

    # CatBoostのデータセット
    trn_data_ctb = ctb.Pool(data=train.iloc[trn_idx][features], label=target.iloc[trn_idx], cat_features=categorical_feats)
    val_data_ctb = ctb.Pool(data=train.iloc[val_idx][features], label=target.iloc[val_idx], cat_features=categorical_feats)

    # CatBoostモデルのトレーニング
    clf_ctb = ctb.CatBoostRegressor(**catboost_best_params)
    clf_ctb.fit(trn_data_ctb, eval_set=val_data_ctb, use_best_model=True, early_stopping_rounds=200)

    oof_ctb[val_idx] = clf_ctb.predict(val_data_ctb)
    predictions_ctb += clf_ctb.predict(test[features]) / folds.n_splits

    # モデルを保存
    with open(f'../src/models/model_ctb_fold_{fold_}.pkl', 'wb') as f:
        pickle.dump(clf_ctb, f)


fold n°0
Training until validation scores don't improve for 300 rounds
[100]	valid_0's rmse: 3.77838
[200]	valid_0's rmse: 3.73779
[300]	valid_0's rmse: 3.71739
[400]	valid_0's rmse: 3.70657
[500]	valid_0's rmse: 3.69876
[600]	valid_0's rmse: 3.69429
[700]	valid_0's rmse: 3.69145
[800]	valid_0's rmse: 3.68924
[900]	valid_0's rmse: 3.68826
[1000]	valid_0's rmse: 3.6874
[1100]	valid_0's rmse: 3.68674
[1200]	valid_0's rmse: 3.68621
[1300]	valid_0's rmse: 3.68561
[1400]	valid_0's rmse: 3.68525
[1500]	valid_0's rmse: 3.68506
[1600]	valid_0's rmse: 3.68498
[1700]	valid_0's rmse: 3.68491
[1800]	valid_0's rmse: 3.68494
[1900]	valid_0's rmse: 3.68489
Early stopping, best iteration is:
[1643]	valid_0's rmse: 3.68478
[0]	valid-rmse:3.87001
[100]	valid-rmse:3.70167
[200]	valid-rmse:3.68807
[300]	valid-rmse:3.68486
[400]	valid-rmse:3.68345
[500]	valid-rmse:3.68348
[598]	valid-rmse:3.68664
0:	learn: 3.8369805	test: 3.8697048	best: 3.8697048 (0)	total: 922ms	remaining: 1d 1h 36m 43s
1:	learn: 3.83023

In [77]:
be_param = {}
best_score = float("inf")
best_weights = None

for i in np.arange(0.0, 2.1, 0.05):
    for j in np.arange(0.0, 2.1, 0.05):
        for k in np.arange(0.0, 2.1, 0.05):
            if abs(i + j + k - 1.0) < 1e-6:  # 重みの合計が2になるようにする
                y_pred = (oof_lgb * i + oof_xgb * j + oof_ctb * k) / (i + j + k)
                score = np.log(mean_squared_error(target, y_pred))
                be_param[score] = [i, j, k]
                if score < best_score:
                    best_score = score
                    best_weights = [i, j, k]

print(f"Best log MSE: {best_score}")
print(f"Best weights: {best_weights}")

# 最適な重みを使って最終予測を計算
final_predictions = (predictions_lgb * best_weights[0] +
                     predictions_xgb * best_weights[1] +
                     predictions_ctb * best_weights[2]) / sum(best_weights)

submission = pd.DataFrame({
    'card_id': test['card_id'],
    'target': final_predictions
})
submission.to_csv("../data/row/submit.csv", index=False)


Best log MSE: 2.590486848772002
Best weights: [0.30000000000000004, 0.25, 0.45]


In [71]:
# lgb_weight = 0.9
# xgb_weight = 0.9
# # 予測のアンサンブル
# final_oof = oof_lgb*lgb_weight + oof_xgb*xgb_weight
# final_predictions = predictions_lgb*lgb_weight + predictions_xgb*xgb_weight

# # 評価
# cv_score = mean_squared_error(target, final_oof)**0.5
# print("CV score: {:<8.5f}".format(cv_score))

# # 必要に応じて最終的な予測結果を保存
# submission = pd.DataFrame({'card_id': test['card_id'], 'target': final_predictions})
# submission.to_csv("../data/row/submit.csv", index=False)

In [72]:
# cv_score = mean_squared_error(target, oof_lgb)**0.5
# print("CV score: {:<8.5f}".format(cv_score))
# submission = pd.DataFrame({'card_id': test['card_id'], 'target': predictions_lgb})
# submission.to_csv("../data/row/submit.csv", index=False)

In [73]:
# # 予測のアンサンブル
# final_oof = (oof_lgb + oof_xgb) / 2
# final_predictions = (predictions_lgb + predictions_xgb) / 2

# # 評価
# cv_score = mean_squared_error(target, final_oof)**0.5
# print("CV score: {:<8.5f}".format(cv_score))


In [74]:
# submission = pd.DataFrame({'card_id': test['card_id'], 'target': final_predictions})
# submission.to_csv("../data/row/submit.csv", index=False)

In [75]:
# # データをKFoldで5分割して学習
# folds = KFold(n_splits=5, shuffle=True, random_state=15)
# oof = np.zeros(len(train))
# predictions = np.zeros(len(test))
# start = time.time()
# feature_importance_df = pd.DataFrame()

# for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
#     print("fold n°{}".format(fold_))
#     trn_data = lgb.Dataset(train.iloc[trn_idx][features],
#                            label=target.iloc[trn_idx],
#                            categorical_feature=categorical_feats
#                           )
#     val_data = lgb.Dataset(train.iloc[val_idx][features],
#                            label=target.iloc[val_idx],
#                            categorical_feature=categorical_feats
#                           )

#     num_round = 10000
#     clf = lgb.train(params=param,
#                     train_set=trn_data,
#                     num_boost_round=num_round,
#                     valid_sets=[val_data],
#                     callbacks=[lgb.early_stopping(stopping_rounds=200),
#                                lgb.log_evaluation(100)])

#     oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)

#     fold_importance_df = pd.DataFrame()
#     fold_importance_df["feature"] = features
#     fold_importance_df["importance"] = clf.feature_importance()
#     fold_importance_df["fold"] = fold_ + 1
#     feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
#     predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

#     # モデルを保存
#     with open(f'../src/models/model_fold_{fold_}.pkl', 'wb') as f:
#         pickle.dump(clf, f)

# print("CV score: {:<8.5f}".format(mean_squared_error(oof, target)**0.5))

## 特徴量重要度

In [76]:
# 特徴量重要度の可視化
cols = (feature_importance_df[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)[:1000].index)

best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]

plt.figure(figsize=(14,50))
sns.barplot(x="importance",
            y="feature",
            data=best_features.sort_values(by="importance",
                                           ascending=False))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
# plt.savefig('lgbm_importances.png')

KeyError: "None of [Index(['feature', 'importance'], dtype='object')] are in the [columns]"

## 提出ファイルの作成

In [None]:
# sub_df = pd.DataFrame({"card_id":test["card_id"].values})
# sub_df["target"] = predictions
# sub_df.to_csv("submit.csv", index=False)