In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pylab as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import gc 
import time
import optuna

plt.style.use('ggplot') # Lets make our plots pretty

path = os.getcwd()

print(path)  

/home/tomita/kaggle/kaggle_elo/Models


In [2]:
# Read in the dataframes
train = pd.read_csv('../input/train_1.csv')
test = pd.read_csv('../input/test_1.csv')

print(train.columns)

Index(['Unnamed: 0', 'first_active_month', 'card_id', 'feature_1', 'feature_2',
       'feature_3', 'target', 'elapsed_time', 'outliers',
       'hist_transactions_count',
       ...
       'installments_purchase_amount_max', 'installments_purchase_amount_std',
       'city_id_purchase_amount_mean', 'city_id_purchase_amount_min',
       'city_id_purchase_amount_max', 'city_id_purchase_amount_std',
       'category_1_installments_mean', 'category_1_installments_min',
       'category_1_installments_max', 'category_1_installments_std'],
      dtype='object', length=226)


In [None]:
target = train['target']
del train['target']
del train['outliers']
features = [c for c in train.columns if c not in ['card_id', 'first_active_month']]
categorical_feats = [c for c in features if 'feature_' in c]

param = {'num_leaves': 31,
         'min_data_in_leaf': 32, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.005,
         "min_child_samples": 20,
         "boosting": "dart",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.7 ,
         "bagging_seed": 2015,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "nthread": 4,
         "verbosity": -1,
          "device":'GPU',
        "max_bin":63}

In [None]:
folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros(len(train))
predictions = np.zeros(len(test))
start = time.time()
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=target.iloc[trn_idx], categorical_feature=categorical_feats)
    val_data = lgb.Dataset(train.iloc[val_idx][features], label=target.iloc[val_idx], categorical_feature=categorical_feats)

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 200)
    oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

end=time.time()
elapsed_time = end-start
print(f"経過時間：{elapsed_time}")
print("CV score: {:<8.5f}".format(mean_squared_error(oof, target)**0.5))

fold n°0




Training until validation scores don't improve for 200 rounds.


In [None]:
def kfold_lightgbm(train_df, test_df, num_folds, stratified = False, debug= False):
        FEATS_EXCLUDED = ['first_active_month', 'target', 'card_id', 'outliers',
                  'hist_purchase_date_max', 'hist_purchase_date_min', 'hist_card_id_size',
                  'new_purchase_date_max', 'new_purchase_date_min', 'new_card_id_size',
                  'OOF_PRED', 'month_0']
        seed=20190208
        
        # params optimized by optuna
        learning_rate_tuna = trial.suggest_uniform('learning_rate', 0, 1.0)
        toprate_tuna = trial.suggest_uniform('top_rate', 0, 1.0)
        num_leaves_tuna = trial.suggest_uniform('learning_rate', 0, 1.0)
        min_child_weight_tuna = trial.suggest_int('min_child_weight', 5, 500)
        other_rate_tuna=trial.suggest_uniform('other_rate', 5, 500)
        num_leaves_tuna=trial.suggest_int('num_leaves', 5, 1000)
        min_gain_split_tuna=trial.suggest_uniform('min_gain_split', 5, 500)
        reg_lambda_tuna=trial.suggest_uniform('reg_lambda', 5, 500)
        
        params ={
                'task': 'train',
                'boosting': 'dart',
                'objective': 'regression',
                'metric': 'rmse',
                'learning_rate': learning_rate_tuna ,
                'subsample': 0.9855232997390695,
                'max_depth': 7,
                'top_rate': toprate_tuna ,
                'num_leaves': num_leaves_tuna,
                'min_child_weight': min_child_weight_tuna,
                'other_rate': other_rate_tuna,
                'reg_alpha': 9.677537745007898,
                'colsample_bytree': 0.5665320670155495,
                'min_split_gain': min_gain_split_tuna,
                'reg_lambda': reg_lambda_tuna,
                'min_data_in_leaf': 21,
                'verbose': -1,
                'seed':seed,
                'bagging_seed':seed,
                'drop_seed':seed,
                'max_bin':255
                }

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]
    best_trees=[]

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['outliers'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['target'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['target'].iloc[valid_idx]

        # set data structure
        lgb_train = lgb.Dataset(train_x,
                                label=train_y,
                                free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x,
                               label=valid_y,
                               free_raw_data=False)

        reg = lgb.train(
                        params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        num_boost_round=10000,
                        early_stopping_rounds= 200,
                        verbose_eval=100
                        )

        oof_preds[valid_idx] = reg.predict(valid_x, num_iteration=reg.best_iteration)
        sub_preds += reg.predict(test_df[feats], num_iteration=reg.best_iteration) / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = np.log1p(reg.feature_importance(importance_type='gain', iteration=reg.best_iteration))
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d RMSE : %.6f' % (n_fold + 1, rmse(valid_y, oof_preds[valid_idx])))
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()


In [None]:
study = optuna.create_study()
study.optimize(kfold_lightgbm, n_trials=10)

In [None]:
cols = (feature_importance_df[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)[:20].index)

best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]

plt.figure(figsize=(14,25))
sns.barplot(x="importance",
            y="feature",
            data=best_features.sort_values(by="importance",
                                           ascending=False))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.savefig('lgbm_importances.png')

In [None]:
n=10
sub_df = pd.read_csv("../input/sample_submission.csv")
sub_df["target"] = predictions
sub_df.to_csv("../output/submit_lgb"+str(n)+"_optuna.csv", index=False)