In [1]:
# Record the interpreter, platform and library versions this notebook was run with,
# so the results below can be tied to a concrete environment.
%load_ext watermark
# -v: Python/IPython versions, -n: date, -m: machine info, -p: versions of the listed packages
%watermark -v -n -m -p numpy,sklearn,pandas

Wed Jan 30 2019 

CPython 3.6.5
IPython 5.3.0

numpy 1.16.0
sklearn 0.19.1
pandas 0.24.0

compiler   : GCC 4.8.2 20140120 (Red Hat 4.8.2-15)
system     : Linux
release    : 4.15.0-43-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 8
interpreter: 64bit


In [2]:
import os
import numpy as np 
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold
import seaborn as sns
import warnings
import time
import sys
import datetime
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

from contextlib import contextmanager
from pandas.core.common import SettingWithCopyWarning
import gc

In [3]:
pd.options.display.max_columns = None
warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
PROJ_ROOT = os.path.abspath(os.path.join(os.pardir))
print(PROJ_ROOT)
%cd ..

/media/DATA/elo_merchant
/media/DATA/elo_merchant


In [4]:
from src.features import read_train, read_test

In [5]:
# Columns of the training frame that must never be fed to the model as features:
# 'target' is the regression label and 'outliers' is a flag derived from it
# (it is only passed to the fold splitter as `y`).
FEATS_EXCLUDED = ['target', 'outliers']

# Root mean squared error
def rmse(y_true, y_pred):
    """Return the square root of the mean squared error of `y_pred` against `y_true`."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)


# Display/plot feature importance
def display_importances(feature_importance_df_, top_n=40,
                        out_path='./reports/figures/lgbm_importances.png'):
    """Plot the most important features and save the figure to disk.

    Parameters
    ----------
    feature_importance_df_ : pandas.DataFrame
        Long-format frame with at least the columns "feature" and "importance"
        (one row per feature per fold).
    top_n : int, default 40
        Number of features, ranked by mean importance over all rows of the
        same feature, to include in the plot.
    out_path : str, default './reports/figures/lgbm_importances.png'
        Where the figure is written. The parent directory is created if it
        does not exist, so a missing folder cannot abort the caller after a
        long training run.

    Returns
    -------
    None
    """
    # Rank features by their mean importance and keep the `top_n` best names.
    mean_importance = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)
    )
    cols = mean_importance[:top_n].index
    # Keep every per-fold row of the selected features so the bar plot can
    # aggregate them itself.
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()

    # Make sure the target directory exists before saving.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    plt.savefig(out_path)


def kfold_lightgbm(train_df, test_df, num_folds, stratified = False, debug= False):
    """Train one LightGBM regressor per CV fold, plot importances, write a submission.

    For every fold a model is trained on the training split with early stopping
    monitored on the validation split. Its validation predictions are stored as
    out-of-fold predictions and its test predictions are averaged (equal weight
    per fold) into the final test prediction.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data. Must contain the label column 'target' and the column
        'outliers' (passed to the fold splitter as `y`). Every column not
        listed in FEATS_EXCLUDED is used as a feature.
    test_df : pandas.DataFrame
        Test data; must contain all feature columns of `train_df`. After
        `reset_index()` it must expose a 'card_id' column. The frame passed in
        is not modified.
    num_folds : int
        Number of cross-validation folds.
    stratified : bool, default False
        If True, use StratifiedKFold on 'outliers'; otherwise plain KFold.
    debug : bool, default False
        If True, skip writing './reports/submission.csv'.

    Returns
    -------
    None
        Results are reported through prints, the saved importance figure and
        (unless `debug`) the submission CSV.
    """
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=326)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=326)

    # Create arrays and containers to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    # One importance frame per fold; concatenated once after the loop instead
    # of growing a DataFrame on every iteration.
    fold_importances = []
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]
    # Loop-invariant: select the test features once rather than once per fold.
    # This also fails fast if the test frame lacks a feature column.
    test_x = test_df[feats]

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['outliers'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['target'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['target'].iloc[valid_idx]

        # set data structure
        lgb_train = lgb.Dataset(train_x,
                                label=train_y,
                                free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x,
                               label=valid_y,
                               free_raw_data=False)

        # params optimized by optuna
        # The seeds depend on the fold number so every fold uses a different,
        # but reproducible, random state.
        params ={
                'task': 'train',
                'boosting': 'goss',
                'objective': 'regression',
                'metric': 'rmse',
                'learning_rate': 0.01,
                'subsample': 0.9855232997390695,
                'max_depth': 7,
                'top_rate': 0.9064148448434349,
                'num_leaves': 63,
                'min_child_weight': 41.9612869171337,
                'other_rate': 0.0721768246018207,
                'reg_alpha': 9.677537745007898,
                'colsample_bytree': 0.5665320670155495,
                'min_split_gain': 9.820197773625843,
                'reg_lambda': 8.2532317400459,
                'min_data_in_leaf': 21,
                'verbose': -1,
                'seed':int(2**n_fold),
                'bagging_seed':int(2**n_fold),
                'drop_seed':int(2**n_fold)
                }

        reg = lgb.train(
                        params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        num_boost_round=10000,
                        early_stopping_rounds= 200,
                        verbose_eval=100
                        )

        # Predict with the best iteration found by early stopping.
        oof_preds[valid_idx] = reg.predict(valid_x, num_iteration=reg.best_iteration)
        sub_preds += reg.predict(test_x, num_iteration=reg.best_iteration) / folds.n_splits

        # Gain importances are log1p-transformed before being stored.
        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = np.log1p(reg.feature_importance(importance_type='gain', iteration=reg.best_iteration))
        fold_importance_df["fold"] = n_fold + 1
        fold_importances.append(fold_importance_df)
        print('Fold %2d RMSE : %.6f' % (n_fold + 1, rmse(valid_y, oof_preds[valid_idx])))
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()

    # display importances
    feature_importance_df = pd.concat(fold_importances, axis=0)
    display_importances(feature_importance_df)

    if not debug:
        # save submission file
        # Work on a new frame so the caller's `test_df` does not silently gain
        # a 'target' column.
        submission = test_df.reset_index()
        submission['target'] = sub_preds
        os.makedirs('./reports', exist_ok=True)
        submission[['card_id', 'target']].to_csv('./reports/submission.csv', index=False)

In [6]:
# Load the training features and flag rows whose target is below -30 as outliers (1), all others 0.
train_df = read_train()
train_df['outliers'] = (train_df['target'] < -30).astype('int64')

# Load the test features.
test_df = read_test()

# 11-fold cross-validated LightGBM; debug=False so the submission file is written.
# With stratified=False a plain KFold is used, so 'outliers' does not influence the split.
kfold_lightgbm(train_df=train_df, test_df=test_df, num_folds=11, stratified=False, debug=False)

Starting LightGBM. Train shape: (201917, 129), test shape: (123623, 128)
Training until validation scores don't improve for 200 rounds.
[100]	train's rmse: 3.69404	test's rmse: 3.76117
[200]	train's rmse: 3.62598	test's rmse: 3.72714
[300]	train's rmse: 3.58296	test's rmse: 3.71422
[400]	train's rmse: 3.55318	test's rmse: 3.70796
[500]	train's rmse: 3.53247	test's rmse: 3.70552
[600]	train's rmse: 3.5149	test's rmse: 3.70437
[700]	train's rmse: 3.49872	test's rmse: 3.70372
[800]	train's rmse: 3.48207	test's rmse: 3.7035
[900]	train's rmse: 3.46916	test's rmse: 3.70331
[1000]	train's rmse: 3.45475	test's rmse: 3.70351
[1100]	train's rmse: 3.43982	test's rmse: 3.70402
Early stopping, best iteration is:
[905]	train's rmse: 3.46849	test's rmse: 3.70323
Fold  1 RMSE : 3.703229
Training until validation scores don't improve for 200 rounds.
[100]	train's rmse: 3.6904	test's rmse: 3.81433
[200]	train's rmse: 3.62013	test's rmse: 3.783
[300]	train's rmse: 3.57787	test's rmse: 3.77373
[400]	trai