In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pylab as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import gc 
import time
plt.style.use('ggplot') # Lets make our plots pretty

path = os.getcwd()

print(path)  

/home/tomita/kaggle/kaggle_elo/Models


In [2]:
# Read in the dataframes
train = pd.read_csv('../input/train_1.csv')
test = pd.read_csv('../input/test_1.csv')

print(train.columns)

Index(['Unnamed: 0', 'first_active_month', 'card_id', 'feature_1', 'feature_2',
       'feature_3', 'target', 'elapsed_time', 'outliers',
       'hist_transactions_count',
       ...
       'installments_purchase_amount_max', 'installments_purchase_amount_std',
       'city_id_purchase_amount_mean', 'city_id_purchase_amount_min',
       'city_id_purchase_amount_max', 'city_id_purchase_amount_std',
       'category_1_installments_mean', 'category_1_installments_min',
       'category_1_installments_max', 'category_1_installments_std'],
      dtype='object', length=226)


In [3]:
target = train['target']
del train['target']
del train['outliers']
features = [c for c in train.columns if c not in ['card_id', 'first_active_month']]
categorical_feats = [c for c in features if 'feature_' in c]

param = {'num_leaves': 31,
         'min_data_in_leaf': 32, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.005,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.7 ,
         "bagging_seed": 2015,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "nthread": 4,
         "verbosity": -1,
          "device":'GPU',
        "max_bin":63}

In [4]:
folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros(len(train))
predictions = np.zeros(len(test))
start = time.time()
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=target.iloc[trn_idx], categorical_feature=categorical_feats)
    val_data = lgb.Dataset(train.iloc[val_idx][features], label=target.iloc[val_idx], categorical_feature=categorical_feats)

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 200)
    oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

end=time.time()
elapsed_time = end-start
print(f"経過時間：{elapsed_time}")
print("CV score: {:<8.5f}".format(mean_squared_error(oof, target)**0.5))

fold n°0




Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 3.72531	valid_1's rmse: 3.79299
[200]	training's rmse: 3.6569	valid_1's rmse: 3.75052
[300]	training's rmse: 3.60946	valid_1's rmse: 3.72862
[400]	training's rmse: 3.57401	valid_1's rmse: 3.71573
[500]	training's rmse: 3.54592	valid_1's rmse: 3.70706
[600]	training's rmse: 3.52244	valid_1's rmse: 3.70074
[700]	training's rmse: 3.50158	valid_1's rmse: 3.69662
[800]	training's rmse: 3.48306	valid_1's rmse: 3.69329
[900]	training's rmse: 3.46646	valid_1's rmse: 3.69003
[1000]	training's rmse: 3.45123	valid_1's rmse: 3.68809
[1100]	training's rmse: 3.43661	valid_1's rmse: 3.68647
[1200]	training's rmse: 3.42331	valid_1's rmse: 3.68469
[1300]	training's rmse: 3.41009	valid_1's rmse: 3.68313
[1400]	training's rmse: 3.39743	valid_1's rmse: 3.68214
[1500]	training's rmse: 3.38602	valid_1's rmse: 3.68136
[1600]	training's rmse: 3.37419	valid_1's rmse: 3.68074
[1700]	training's rmse: 3.36291	valid_1's rmse: 3.6

[1600]	training's rmse: 3.38734	valid_1's rmse: 3.6165
[1700]	training's rmse: 3.37582	valid_1's rmse: 3.61614
[1800]	training's rmse: 3.36454	valid_1's rmse: 3.61597
[1900]	training's rmse: 3.35348	valid_1's rmse: 3.6158
[2000]	training's rmse: 3.34278	valid_1's rmse: 3.61564
[2100]	training's rmse: 3.33238	valid_1's rmse: 3.61522
[2200]	training's rmse: 3.32226	valid_1's rmse: 3.61497
[2300]	training's rmse: 3.31217	valid_1's rmse: 3.61491
[2400]	training's rmse: 3.30231	valid_1's rmse: 3.61483
[2500]	training's rmse: 3.29247	valid_1's rmse: 3.61445
[2600]	training's rmse: 3.28278	valid_1's rmse: 3.61453
[2700]	training's rmse: 3.27317	valid_1's rmse: 3.61455
[2800]	training's rmse: 3.26384	valid_1's rmse: 3.61425
[2900]	training's rmse: 3.25445	valid_1's rmse: 3.6142
[3000]	training's rmse: 3.24462	valid_1's rmse: 3.61425
Early stopping, best iteration is:
[2885]	training's rmse: 3.25588	valid_1's rmse: 3.61408
経過時間：290.1821677684784
CV score: 3.65642 


In [5]:
cols = (feature_importance_df[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)[:20].index)

best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]

plt.figure(figsize=(14,25))
sns.barplot(x="importance",
            y="feature",
            data=best_features.sort_values(by="importance",
                                           ascending=False))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.savefig('lgbm_importances.png')

In [6]:
seed=20190208
lgbparam ={'task': 'train',
            'boosting': 'dart',
            'objective': 'regression',
            'metric': 'rmse',
            'learning_rate': 0.75836372582243783,
            'subsample': 0.91158142068248083,
            'max_depth': 29,
            'top_rate': 0.7790255922489042,
            'num_leaves': 562,
            'min_child_weight': 27,
            'other_rate':0.03564199289369395,
            'reg_alpha': 0.72876375065913579,
            'colsample_bytree':0.83435723889734326  ,
            'min_split_gain': 27.378180277455101,
            'reg_lambda': 94.549009291544877,
            'min_data_in_leaf': 21,
            'verbose': -1,
            'seed':seed,
            'bagging_seed':seed,
            'drop_seed':seed,
            'max_bin':255,
            'device':'gpu'
            }

In [7]:
from sklearn.model_selection import RepeatedKFold
folds = RepeatedKFold(n_splits=5, n_repeats=2, random_state=4520)

oof_lgb = np.zeros(len(train))
predictions_dart = np.zeros(len(test))
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=target.iloc[trn_idx], categorical_feature=categorical_feats)
    val_data = lgb.Dataset(train.iloc[val_idx][features], label=target.iloc[val_idx], categorical_feature=categorical_feats)

    num_round = 11000
    clf = lgb.train(lgbparam, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 100)
    oof_lgb[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions_dart += clf.predict(test[features], num_iteration=clf.best_iteration) / (5 * 2)

print("CV score: {:<8.5f}".format(mean_squared_error(oof_lgb, target)**0.5))

fold n°0




Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 2.66306	valid_1's rmse: 3.74491
Early stopping, best iteration is:
[98]	training's rmse: 2.65497	valid_1's rmse: 3.74842
fold n°1
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 2.69232	valid_1's rmse: 3.59117
Early stopping, best iteration is:
[93]	training's rmse: 2.67624	valid_1's rmse: 3.59881
fold n°2
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 2.62485	valid_1's rmse: 3.87793
Early stopping, best iteration is:
[57]	training's rmse: 2.60778	valid_1's rmse: 3.90702
fold n°3
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 2.66732	valid_1's rmse: 3.7029
Early stopping, best iteration is:
[59]	training's rmse: 2.62978	valid_1's rmse: 3.73347
fold n°4
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 2.69718	valid_1's rmse: 3.65804
Early stopping, bes

In [8]:
cols = (feature_importance_df[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)[:25].index)

best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]

plt.figure(figsize=(14,25))
sns.barplot(x="importance",
            y="feature",
            data=best_features.sort_values(by="importance",
                                           ascending=False))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.savefig('lgbm_importances.png')

In [9]:
sub_df = pd.read_csv("../input//sample_submission.csv")
sub_df["target"] = predictions
sub_df.to_csv("submit_lgb3.csv", index=False)

In [10]:
n=20190219
sub_df = pd.read_csv("../input/sample_submission.csv")
sub_df["target"] = predictions
sub_df.to_csv("submit_lgb"+str(n)+"_1.csv", index=False)

sub_df1 = pd.read_csv("../input/sample_submission.csv")
sub_df1["target"] = predictions_dart
sub_df1.to_csv("submit_lgb"+str(n)+"_dart.csv", index=False)

In [11]:
from sklearn.linear_model import BayesianRidge

train_stack = np.vstack([oof,oof_lgb]).transpose()
test_stack = np.vstack([predictions,predictions_dart]).transpose()

folds = RepeatedKFold(n_splits=5,n_repeats=1,random_state=4520)
oof_stack = np.zeros(train_stack.shape[0])
predictions_stack = np.zeros(test_stack.shape[0])

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_stack, target)):
    print("fold n°{}".format(fold_))
    trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values
    val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values

    print("-" * 10 + "Stacking " + str(fold_) + "-" * 10)
    
    clf = BayesianRidge()
    clf.fit(trn_data, trn_y)
    
    oof_stack[val_idx] = clf.predict(val_data)
    predictions_stack += clf.predict(test_stack) / 5

np.sqrt(mean_squared_error(target.values, oof_stack))

fold n°0
----------Stacking 0----------
fold n°1
----------Stacking 1----------
fold n°2
----------Stacking 2----------
fold n°3
----------Stacking 3----------
fold n°4
----------Stacking 4----------


3.654572988272359

In [12]:
sample_submission = pd.read_csv('../input/sample_submission.csv')
sample_submission['target'] = predictions_stack
sample_submission.to_csv('Bayesian_Ridge_Stacking.csv', index=False)

In [13]:
sample_submission['target'] = (predictions * 0.125+ predictions_dart*0.5+predictions_stack*0.325)*1.1
sample_submission.to_csv("../output/submit"+str(n)+".csv", index = False)