In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pylab as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import gc 
import time
import optuna

plt.style.use('ggplot') # Lets make our plots pretty

path = os.getcwd()

print(path)  

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


C:\Users\Naoki Tomita\Desktop\kaggle_elo\Models


In [2]:
# Read in the dataframes
train = pd.read_csv('../input/train_1.csv')
test = pd.read_csv('../input/test_1.csv')

print(train.columns)

target = train['target']
del train['target']
#del train['outliners']
features = [c for c in train.columns if c not in ['card_id', 'first_active_month']]
categorical_feats = [c for c in features if 'feature_' in c]

Index(['Unnamed: 0', 'first_active_month', 'card_id', 'feature_1', 'feature_2',
       'feature_3', 'target', 'elapsed_time', 'outliers',
       'hist_transactions_count',
       ...
       'installments_purchase_amount_max', 'installments_purchase_amount_std',
       'city_id_purchase_amount_mean', 'city_id_purchase_amount_min',
       'city_id_purchase_amount_max', 'city_id_purchase_amount_std',
       'category_1_installments_mean', 'category_1_installments_min',
       'category_1_installments_max', 'category_1_installments_std'],
      dtype='object', length=226)


In [3]:
def kfold_lightgbm(trial):
    FEATS_EXCLUDED = ['first_active_month', 'target', 'card_id', 'outliers',
                  'hist_purchase_date_max', 'hist_purchase_date_min', 'hist_card_id_size',
                  'new_purchase_date_max', 'new_purchase_date_min', 'new_card_id_size',
                  'OOF_PRED', 'month_0']
    seed=20190208
        
    # params optimized by optuna
    learning_rate_tuna = trial.suggest_uniform('learning_rate', 0, 1.0)
    toprate_tuna = trial.suggest_uniform('top_rate', 0, 1.0)
    num_leaves_tuna = trial.suggest_uniform('learning_rate', 0, 1.0)
    min_child_weight_tuna = trial.suggest_int('min_child_weight', 5, 500)
    other_rate_tuna=trial.suggest_uniform('other_rate', 0.0, 1.0)
    num_leaves_tuna=trial.suggest_int('num_leaves', 5, 1000)
    min_gain_split_tuna=trial.suggest_uniform('min_gain_split', 5, 500)
    reg_lambda_tuna=trial.suggest_uniform('reg_lambda', 5, 500)
        
    param ={'task': 'train',
            'boosting': 'dart',
            'objective': 'regression',
            'metric': 'rmse',
            'learning_rate': learning_rate_tuna ,
            'subsample': 0.9855232997390695,
            'max_depth': 7,
            'top_rate': toprate_tuna ,
            'num_leaves': num_leaves_tuna,
            'min_child_weight': min_child_weight_tuna,
            'other_rate': other_rate_tuna,
            'reg_alpha': 9.677537745007898,
            'colsample_bytree': 0.5665320670155495,
            'min_split_gain': min_gain_split_tuna,
            'reg_lambda': reg_lambda_tuna,
            'min_data_in_leaf': 21,
            'verbose': -1,
            'seed':seed,
            'bagging_seed':seed,
            'drop_seed':seed,
            'max_bin':255,
            'device':'gpu'
            }

    # Create arrays and dataframes to store results
    folds = KFold(n_splits=5, shuffle=True, random_state=15)
    oof = np.zeros(len(train))
    predictions = np.zeros(len(test))
    start = time.time()
    feature_importance_df = pd.DataFrame()
                  
    # k-fold
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values)):
        print("fold n°{}".format(fold_))
        trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=target.iloc[trn_idx], categorical_feature=categorical_feats)
        val_data = lgb.Dataset(train.iloc[val_idx][features], label=target.iloc[val_idx], categorical_feature=categorical_feats)

        num_round = 10000
        clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 200)
        oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = features
        fold_importance_df["importance"] = clf.feature_importance()
        fold_importance_df["fold"] = fold_ + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
        predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits
    
    sub_df = pd.read_csv("../input/sample_submission.csv")
    sub_df["target"] = predictions
    sub_df.to_csv("submit_lgb"+str(n)+"_1.csv", index=False)


In [4]:
study = optuna.create_study()
study.optimize(kfold_lightgbm, n_trials=10)

fold n°0




[100]	training's rmse: 1.72592	valid_1's rmse: 1.72472
[200]	training's rmse: 1.57719	valid_1's rmse: 1.57223
[300]	training's rmse: 1.57978	valid_1's rmse: 1.57436
[400]	training's rmse: 1.56544	valid_1's rmse: 1.55929
[500]	training's rmse: 1.56402	valid_1's rmse: 1.55762
[600]	training's rmse: 1.56403	valid_1's rmse: 1.55753
[700]	training's rmse: 1.56785	valid_1's rmse: 1.56116
[800]	training's rmse: 1.56378	valid_1's rmse: 1.55673
[900]	training's rmse: 1.56465	valid_1's rmse: 1.55752
[1000]	training's rmse: 1.56393	valid_1's rmse: 1.55671
[1100]	training's rmse: 1.56438	valid_1's rmse: 1.55713
[1200]	training's rmse: 1.56431	valid_1's rmse: 1.55688
[1300]	training's rmse: 1.56422	valid_1's rmse: 1.55687
[1400]	training's rmse: 1.56396	valid_1's rmse: 1.55646
[1500]	training's rmse: 1.56441	valid_1's rmse: 1.55681
[1600]	training's rmse: 1.56482	valid_1's rmse: 1.55704
[1700]	training's rmse: 1.56446	valid_1's rmse: 1.55662
[1800]	training's rmse: 1.56448	valid_1's rmse: 1.55673
[

[W 2019-02-11 12:12:47,053] Setting trial status as TrialState.FAIL because of the following error: KeyError("['outliers'] not in index",)
Traceback (most recent call last):
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\optuna\study.py", line 409, in _run_trial
    result = func(trial)
  File "<ipython-input-3-b04324314658>", line 65, in kfold_lightgbm
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\pandas\core\frame.py", line 1958, in __getitem__
    return self._getitem_array(key)
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\pandas\core\frame.py", line 2002, in _getitem_array
    indexer = self.loc._convert_to_indexer(key, axis=1)
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\pandas\core\indexing.py", line 1231, in _convert_to_indexer
    raise KeyError('%s not in index' % objarr[mask])
KeyErro

fold n°0
[100]	training's rmse: 1.57065	valid_1's rmse: 1.56055
[200]	training's rmse: 1.57213	valid_1's rmse: 1.56148
[300]	training's rmse: 1.57365	valid_1's rmse: 1.5634
[400]	training's rmse: 1.5728	valid_1's rmse: 1.56246
[500]	training's rmse: 1.57241	valid_1's rmse: 1.56186
[600]	training's rmse: 1.5729	valid_1's rmse: 1.56259
[700]	training's rmse: 1.57079	valid_1's rmse: 1.56111
[800]	training's rmse: 1.57221	valid_1's rmse: 1.56223
[900]	training's rmse: 1.57178	valid_1's rmse: 1.56185
[1000]	training's rmse: 1.5723	valid_1's rmse: 1.56247
[1100]	training's rmse: 1.57207	valid_1's rmse: 1.56213
[1200]	training's rmse: 1.57198	valid_1's rmse: 1.56191
[1300]	training's rmse: 1.57181	valid_1's rmse: 1.5615
[1400]	training's rmse: 1.57209	valid_1's rmse: 1.56211
[1500]	training's rmse: 1.57118	valid_1's rmse: 1.56148
[1600]	training's rmse: 1.57218	valid_1's rmse: 1.56228
[1700]	training's rmse: 1.5722	valid_1's rmse: 1.56232
[1800]	training's rmse: 1.57176	valid_1's rmse: 1.5618

[W 2019-02-11 12:16:48,753] Setting trial status as TrialState.FAIL because of the following error: KeyError("['outliers'] not in index",)
Traceback (most recent call last):
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\optuna\study.py", line 409, in _run_trial
    result = func(trial)
  File "<ipython-input-3-b04324314658>", line 65, in kfold_lightgbm
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\pandas\core\frame.py", line 1958, in __getitem__
    return self._getitem_array(key)
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\pandas\core\frame.py", line 2002, in _getitem_array
    indexer = self.loc._convert_to_indexer(key, axis=1)
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\pandas\core\indexing.py", line 1231, in _convert_to_indexer
    raise KeyError('%s not in index' % objarr[mask])
KeyErro

fold n°0
[100]	training's rmse: 1.57613	valid_1's rmse: 1.56675
[200]	training's rmse: 1.57215	valid_1's rmse: 1.56239
[300]	training's rmse: 1.5724	valid_1's rmse: 1.56267
[400]	training's rmse: 1.57269	valid_1's rmse: 1.563
[500]	training's rmse: 1.5731	valid_1's rmse: 1.56334
[600]	training's rmse: 1.57388	valid_1's rmse: 1.56429
[700]	training's rmse: 1.57348	valid_1's rmse: 1.56373
[800]	training's rmse: 1.57312	valid_1's rmse: 1.56362
[900]	training's rmse: 1.57271	valid_1's rmse: 1.56296
[1000]	training's rmse: 1.57314	valid_1's rmse: 1.56339
[1100]	training's rmse: 1.57293	valid_1's rmse: 1.5631
[1200]	training's rmse: 1.57287	valid_1's rmse: 1.56322
[1300]	training's rmse: 1.57219	valid_1's rmse: 1.56273
[1400]	training's rmse: 1.57212	valid_1's rmse: 1.56248
[1500]	training's rmse: 1.57276	valid_1's rmse: 1.56292
[1600]	training's rmse: 1.57263	valid_1's rmse: 1.56292
[1700]	training's rmse: 1.57254	valid_1's rmse: 1.56295
[1800]	training's rmse: 1.57326	valid_1's rmse: 1.563

[W 2019-02-11 12:22:06,972] Setting trial status as TrialState.FAIL because of the following error: KeyError("['outliers'] not in index",)
Traceback (most recent call last):
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\optuna\study.py", line 409, in _run_trial
    result = func(trial)
  File "<ipython-input-3-b04324314658>", line 65, in kfold_lightgbm
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\pandas\core\frame.py", line 1958, in __getitem__
    return self._getitem_array(key)
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\pandas\core\frame.py", line 2002, in _getitem_array
    indexer = self.loc._convert_to_indexer(key, axis=1)
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\pandas\core\indexing.py", line 1231, in _convert_to_indexer
    raise KeyError('%s not in index' % objarr[mask])
KeyErro

fold n°0
[100]	training's rmse: 1.58778	valid_1's rmse: 1.58237
[200]	training's rmse: 1.56748	valid_1's rmse: 1.56096
[300]	training's rmse: 1.56397	valid_1's rmse: 1.55725
[400]	training's rmse: 1.56383	valid_1's rmse: 1.55677
[500]	training's rmse: 1.56383	valid_1's rmse: 1.55655
[600]	training's rmse: 1.56365	valid_1's rmse: 1.55618
[700]	training's rmse: 1.56478	valid_1's rmse: 1.55727
[800]	training's rmse: 1.56482	valid_1's rmse: 1.55739
[900]	training's rmse: 1.56402	valid_1's rmse: 1.55671
[1000]	training's rmse: 1.56476	valid_1's rmse: 1.55732
[1100]	training's rmse: 1.56418	valid_1's rmse: 1.55682
[1200]	training's rmse: 1.56439	valid_1's rmse: 1.55698
[1300]	training's rmse: 1.56479	valid_1's rmse: 1.5571
[1400]	training's rmse: 1.56478	valid_1's rmse: 1.55711
[1500]	training's rmse: 1.56473	valid_1's rmse: 1.55718
[1600]	training's rmse: 1.56465	valid_1's rmse: 1.55696
[1700]	training's rmse: 1.56503	valid_1's rmse: 1.55711
[1800]	training's rmse: 1.56493	valid_1's rmse: 1

[W 2019-02-11 12:29:02,021] Setting trial status as TrialState.FAIL because of the following error: KeyError("['outliers'] not in index",)
Traceback (most recent call last):
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\optuna\study.py", line 409, in _run_trial
    result = func(trial)
  File "<ipython-input-3-b04324314658>", line 65, in kfold_lightgbm
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\pandas\core\frame.py", line 1958, in __getitem__
    return self._getitem_array(key)
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\pandas\core\frame.py", line 2002, in _getitem_array
    indexer = self.loc._convert_to_indexer(key, axis=1)
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\pandas\core\indexing.py", line 1231, in _convert_to_indexer
    raise KeyError('%s not in index' % objarr[mask])
KeyErro

fold n°0
[100]	training's rmse: 1.54695	valid_1's rmse: 1.55151
[200]	training's rmse: 1.54497	valid_1's rmse: 1.54894
[300]	training's rmse: 1.54512	valid_1's rmse: 1.54871
[400]	training's rmse: 1.54417	valid_1's rmse: 1.54788
[500]	training's rmse: 1.54356	valid_1's rmse: 1.54743
[600]	training's rmse: 1.54412	valid_1's rmse: 1.54743
[700]	training's rmse: 1.54472	valid_1's rmse: 1.54742
[800]	training's rmse: 1.54434	valid_1's rmse: 1.54697
[900]	training's rmse: 1.54398	valid_1's rmse: 1.54661
[1000]	training's rmse: 1.54412	valid_1's rmse: 1.54686
[1100]	training's rmse: 1.54437	valid_1's rmse: 1.54675
[1200]	training's rmse: 1.54443	valid_1's rmse: 1.54644
[1300]	training's rmse: 1.5444	valid_1's rmse: 1.54638
[1400]	training's rmse: 1.54446	valid_1's rmse: 1.54659
[1500]	training's rmse: 1.54431	valid_1's rmse: 1.54647
[1600]	training's rmse: 1.54475	valid_1's rmse: 1.54692
[1700]	training's rmse: 1.54525	valid_1's rmse: 1.54703
[1800]	training's rmse: 1.54487	valid_1's rmse: 1

[W 2019-02-11 12:37:42,320] Setting trial status as TrialState.FAIL because of the following error: KeyError("['outliers'] not in index",)
Traceback (most recent call last):
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\optuna\study.py", line 409, in _run_trial
    result = func(trial)
  File "<ipython-input-3-b04324314658>", line 65, in kfold_lightgbm
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\pandas\core\frame.py", line 1958, in __getitem__
    return self._getitem_array(key)
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\pandas\core\frame.py", line 2002, in _getitem_array
    indexer = self.loc._convert_to_indexer(key, axis=1)
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\pandas\core\indexing.py", line 1231, in _convert_to_indexer
    raise KeyError('%s not in index' % objarr[mask])
KeyErro

fold n°0
[100]	training's rmse: 1.61962	valid_1's rmse: 1.61529
[200]	training's rmse: 1.58297	valid_1's rmse: 1.57734
[300]	training's rmse: 1.56247	valid_1's rmse: 1.55607
[400]	training's rmse: 1.56632	valid_1's rmse: 1.55974
[500]	training's rmse: 1.56128	valid_1's rmse: 1.5546
[600]	training's rmse: 1.56226	valid_1's rmse: 1.55539
[700]	training's rmse: 1.56312	valid_1's rmse: 1.556
[800]	training's rmse: 1.56256	valid_1's rmse: 1.5556
[900]	training's rmse: 1.56297	valid_1's rmse: 1.55582
[1000]	training's rmse: 1.56329	valid_1's rmse: 1.55614
[1100]	training's rmse: 1.56396	valid_1's rmse: 1.55673
[1200]	training's rmse: 1.56329	valid_1's rmse: 1.55601
[1300]	training's rmse: 1.56335	valid_1's rmse: 1.55615
[1400]	training's rmse: 1.56354	valid_1's rmse: 1.55624
[1500]	training's rmse: 1.56376	valid_1's rmse: 1.55633
[1600]	training's rmse: 1.56356	valid_1's rmse: 1.55612
[1700]	training's rmse: 1.56397	valid_1's rmse: 1.55636
[1800]	training's rmse: 1.56381	valid_1's rmse: 1.55

[W 2019-02-11 12:45:16,654] Setting trial status as TrialState.FAIL because of the following error: KeyError("['outliers'] not in index",)
Traceback (most recent call last):
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\optuna\study.py", line 409, in _run_trial
    result = func(trial)
  File "<ipython-input-3-b04324314658>", line 65, in kfold_lightgbm
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\pandas\core\frame.py", line 1958, in __getitem__
    return self._getitem_array(key)
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\pandas\core\frame.py", line 2002, in _getitem_array
    indexer = self.loc._convert_to_indexer(key, axis=1)
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\pandas\core\indexing.py", line 1231, in _convert_to_indexer
    raise KeyError('%s not in index' % objarr[mask])
KeyErro

fold n°0
[100]	training's rmse: 1.61475	valid_1's rmse: 1.62913
[200]	training's rmse: 1.52872	valid_1's rmse: 1.54867
[300]	training's rmse: 1.52431	valid_1's rmse: 1.54267
[400]	training's rmse: 1.52518	valid_1's rmse: 1.54263
[500]	training's rmse: 1.52476	valid_1's rmse: 1.54151
[600]	training's rmse: 1.52889	valid_1's rmse: 1.54511
[700]	training's rmse: 1.52706	valid_1's rmse: 1.54271
[800]	training's rmse: 1.5266	valid_1's rmse: 1.54151
[900]	training's rmse: 1.52795	valid_1's rmse: 1.5421
[1000]	training's rmse: 1.52882	valid_1's rmse: 1.54255
[1100]	training's rmse: 1.52827	valid_1's rmse: 1.54181
[1200]	training's rmse: 1.52818	valid_1's rmse: 1.54141
[1300]	training's rmse: 1.52872	valid_1's rmse: 1.54155
[1400]	training's rmse: 1.52907	valid_1's rmse: 1.5418
[1500]	training's rmse: 1.52904	valid_1's rmse: 1.54153
[1600]	training's rmse: 1.52965	valid_1's rmse: 1.54207
[1700]	training's rmse: 1.52954	valid_1's rmse: 1.54183
[1800]	training's rmse: 1.52996	valid_1's rmse: 1.5

[W 2019-02-11 12:54:37,577] Setting trial status as TrialState.FAIL because of the following error: KeyError("['outliers'] not in index",)
Traceback (most recent call last):
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\optuna\study.py", line 409, in _run_trial
    result = func(trial)
  File "<ipython-input-3-b04324314658>", line 65, in kfold_lightgbm
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\pandas\core\frame.py", line 1958, in __getitem__
    return self._getitem_array(key)
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\pandas\core\frame.py", line 2002, in _getitem_array
    indexer = self.loc._convert_to_indexer(key, axis=1)
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\pandas\core\indexing.py", line 1231, in _convert_to_indexer
    raise KeyError('%s not in index' % objarr[mask])
KeyErro

fold n°0
[100]	training's rmse: 1.58628	valid_1's rmse: 1.57391
[200]	training's rmse: 1.58569	valid_1's rmse: 1.57329
[300]	training's rmse: 1.58467	valid_1's rmse: 1.57253
[400]	training's rmse: 1.58479	valid_1's rmse: 1.57255
[500]	training's rmse: 1.58523	valid_1's rmse: 1.57301
[600]	training's rmse: 1.5851	valid_1's rmse: 1.57234
[700]	training's rmse: 1.58646	valid_1's rmse: 1.57368
[800]	training's rmse: 1.58583	valid_1's rmse: 1.57334
[900]	training's rmse: 1.58667	valid_1's rmse: 1.57388
[1000]	training's rmse: 1.58465	valid_1's rmse: 1.57191
[1100]	training's rmse: 1.58444	valid_1's rmse: 1.57207
[1200]	training's rmse: 1.58382	valid_1's rmse: 1.57805
[1300]	training's rmse: 1.58436	valid_1's rmse: 1.57812
[1400]	training's rmse: 1.58513	valid_1's rmse: 1.57859
[1500]	training's rmse: 1.58501	valid_1's rmse: 1.57857
[1600]	training's rmse: 1.58559	valid_1's rmse: 1.57961
[1700]	training's rmse: 1.58481	valid_1's rmse: 1.5797
[1800]	training's rmse: 1.58587	valid_1's rmse: 1.

[W 2019-02-11 12:57:29,216] Setting trial status as TrialState.FAIL because of the following error: KeyError("['outliers'] not in index",)
Traceback (most recent call last):
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\optuna\study.py", line 409, in _run_trial
    result = func(trial)
  File "<ipython-input-3-b04324314658>", line 65, in kfold_lightgbm
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\pandas\core\frame.py", line 1958, in __getitem__
    return self._getitem_array(key)
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\pandas\core\frame.py", line 2002, in _getitem_array
    indexer = self.loc._convert_to_indexer(key, axis=1)
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\pandas\core\indexing.py", line 1231, in _convert_to_indexer
    raise KeyError('%s not in index' % objarr[mask])
KeyErro

fold n°0
[100]	training's rmse: 1.78563	valid_1's rmse: 1.80024
[200]	training's rmse: 1.56482	valid_1's rmse: 1.57759
[300]	training's rmse: 1.54246	valid_1's rmse: 1.55372
[400]	training's rmse: 1.54117	valid_1's rmse: 1.55131
[500]	training's rmse: 1.53795	valid_1's rmse: 1.54709
[600]	training's rmse: 1.5429	valid_1's rmse: 1.55154
[700]	training's rmse: 1.5411	valid_1's rmse: 1.54914
[800]	training's rmse: 1.54073	valid_1's rmse: 1.54808
[900]	training's rmse: 1.54188	valid_1's rmse: 1.54858
[1000]	training's rmse: 1.54244	valid_1's rmse: 1.54875
[1100]	training's rmse: 1.54173	valid_1's rmse: 1.54754
[1200]	training's rmse: 1.54116	valid_1's rmse: 1.5466
[1300]	training's rmse: 1.54121	valid_1's rmse: 1.5463
[1400]	training's rmse: 1.54211	valid_1's rmse: 1.5469
[1500]	training's rmse: 1.54138	valid_1's rmse: 1.54601
[1600]	training's rmse: 1.54199	valid_1's rmse: 1.5462
[1700]	training's rmse: 1.54206	valid_1's rmse: 1.54604
[1800]	training's rmse: 1.542	valid_1's rmse: 1.54568


[W 2019-02-11 13:05:53,549] Setting trial status as TrialState.FAIL because of the following error: KeyError("['outliers'] not in index",)
Traceback (most recent call last):
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\optuna\study.py", line 409, in _run_trial
    result = func(trial)
  File "<ipython-input-3-b04324314658>", line 65, in kfold_lightgbm
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\pandas\core\frame.py", line 1958, in __getitem__
    return self._getitem_array(key)
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\pandas\core\frame.py", line 2002, in _getitem_array
    indexer = self.loc._convert_to_indexer(key, axis=1)
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\pandas\core\indexing.py", line 1231, in _convert_to_indexer
    raise KeyError('%s not in index' % objarr[mask])
KeyErro

fold n°0
[100]	training's rmse: 1.59325	valid_1's rmse: 1.59439
[200]	training's rmse: 1.58935	valid_1's rmse: 1.60193
[300]	training's rmse: 1.5899	valid_1's rmse: 1.70593
[400]	training's rmse: 1.58918	valid_1's rmse: 2.16416
[500]	training's rmse: 1.59064	valid_1's rmse: 2.16555
[600]	training's rmse: 1.5905	valid_1's rmse: 2.3314
[700]	training's rmse: 1.58947	valid_1's rmse: 2.63284
[800]	training's rmse: 1.591	valid_1's rmse: 2.65755
[900]	training's rmse: 1.58915	valid_1's rmse: 2.72426
[1000]	training's rmse: 1.58872	valid_1's rmse: 2.83453
[1100]	training's rmse: 1.58974	valid_1's rmse: 3.03318
[1200]	training's rmse: 1.58956	valid_1's rmse: 3.07837
[1300]	training's rmse: 1.5888	valid_1's rmse: 3.09247
[1400]	training's rmse: 1.5889	valid_1's rmse: 3.21582
[1500]	training's rmse: 1.59348	valid_1's rmse: 3.27885
[1600]	training's rmse: 1.58686	valid_1's rmse: 3.31782
[1700]	training's rmse: 1.58919	valid_1's rmse: 3.35662
[1800]	training's rmse: 1.58693	valid_1's rmse: 3.56702

[W 2019-02-11 13:08:16,432] Setting trial status as TrialState.FAIL because of the following error: KeyError("['outliers'] not in index",)
Traceback (most recent call last):
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\optuna\study.py", line 409, in _run_trial
    result = func(trial)
  File "<ipython-input-3-b04324314658>", line 65, in kfold_lightgbm
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\pandas\core\frame.py", line 1958, in __getitem__
    return self._getitem_array(key)
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\pandas\core\frame.py", line 2002, in _getitem_array
    indexer = self.loc._convert_to_indexer(key, axis=1)
  File "C:\Users\Naoki Tomita\Anaconda3\envs\kaggle\lib\site-packages\pandas\core\indexing.py", line 1231, in _convert_to_indexer
    raise KeyError('%s not in index' % objarr[mask])
KeyErro

In [5]:
cols = (feature_importance_df[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)[:20].index)

best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]

plt.figure(figsize=(14,25))
sns.barplot(x="importance",
            y="feature",
            data=best_features.sort_values(by="importance",
                                           ascending=False))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.savefig('lgbm_importances.png')

NameError: name 'feature_importance_df' is not defined

In [None]:
n=10
sub_df = pd.read_csv("../input/sample_submission.csv")
sub_df["target"] = predictions
sub_df.to_csv("../output/submit_lgb"+str(n)+"_optuna.csv", index=False)