In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pylab as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import gc 
import time
plt.style.use('ggplot') # Lets make our plots pretty

path = os.getcwd()

print(path)  

C:\Users\Naoki Tomita\Desktop\kaggle_elo\Models


In [2]:
# Read in the dataframes
train = pd.read_csv('../input/train_1.csv')
test = pd.read_csv('../input/test_1.csv')
merchants = pd.read_csv('../input/merchants.csv')
#new_merchants = pd.read_csv('../input/new_merchant_transactions.csv')
delete_id = pd.read_csv('../input/delete_id.csv')
ss = pd.read_csv('../input/sample_submission.csv')

print(train.columns)

Index(['Unnamed: 0', 'first_active_month', 'card_id', 'feature_1', 'feature_2',
       'feature_3', 'target', 'elapsed_time', 'outliers',
       'hist_transactions_count',
       ...
       'installments_purchase_amount_max', 'installments_purchase_amount_std',
       'city_id_purchase_amount_mean', 'city_id_purchase_amount_min',
       'city_id_purchase_amount_max', 'city_id_purchase_amount_std',
       'category_1_installments_mean', 'category_1_installments_min',
       'category_1_installments_max', 'category_1_installments_std'],
      dtype='object', length=226)


In [3]:
target = train['target']
del train['target']
del train['outliers']
features = [c for c in train.columns if c not in ['card_id', 'first_active_month']]
categorical_feats = [c for c in features if 'feature_' in c]

param = {'num_leaves': 31,
         'min_data_in_leaf': 32, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.005,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.7 ,
         "bagging_seed": 2015,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "nthread": 4,
         "verbosity": -1,
          "device":'GPU',
        "max_bin":63}

In [4]:
folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros(len(train))
predictions = np.zeros(len(test))
start = time.time()
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=target.iloc[trn_idx], categorical_feature=categorical_feats)
    val_data = lgb.Dataset(train.iloc[val_idx][features], label=target.iloc[val_idx], categorical_feature=categorical_feats)

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 200)
    oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

end=time.time()
elapsed_time = end-start
print(f"経過時間：{elapsed_time}")
print("CV score: {:<8.5f}".format(mean_squared_error(oof, target)**0.5))

fold n°0




Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 3.72458	valid_1's rmse: 3.79324
[200]	training's rmse: 3.65587	valid_1's rmse: 3.75054
[300]	training's rmse: 3.60855	valid_1's rmse: 3.72882
[400]	training's rmse: 3.57337	valid_1's rmse: 3.71601
[500]	training's rmse: 3.5456	valid_1's rmse: 3.70775
[600]	training's rmse: 3.52197	valid_1's rmse: 3.70178
[700]	training's rmse: 3.50139	valid_1's rmse: 3.69769
[800]	training's rmse: 3.48274	valid_1's rmse: 3.6943
[900]	training's rmse: 3.46605	valid_1's rmse: 3.69162
[1000]	training's rmse: 3.45118	valid_1's rmse: 3.68955
[1100]	training's rmse: 3.43659	valid_1's rmse: 3.68801
[1200]	training's rmse: 3.42313	valid_1's rmse: 3.68618
[1300]	training's rmse: 3.40996	valid_1's rmse: 3.68497
[1400]	training's rmse: 3.39729	valid_1's rmse: 3.68375
[1500]	training's rmse: 3.38579	valid_1's rmse: 3.68319
[1600]	training's rmse: 3.37355	valid_1's rmse: 3.68279
[1700]	training's rmse: 3.3622	valid_1's rmse: 3.682

[700]	training's rmse: 3.51391	valid_1's rmse: 3.62888
[800]	training's rmse: 3.49557	valid_1's rmse: 3.62555
[900]	training's rmse: 3.47841	valid_1's rmse: 3.62302
[1000]	training's rmse: 3.46294	valid_1's rmse: 3.62121
[1100]	training's rmse: 3.44903	valid_1's rmse: 3.61965
[1200]	training's rmse: 3.43558	valid_1's rmse: 3.61852
[1300]	training's rmse: 3.42205	valid_1's rmse: 3.61784
[1400]	training's rmse: 3.40955	valid_1's rmse: 3.61733
[1500]	training's rmse: 3.3981	valid_1's rmse: 3.61642
[1600]	training's rmse: 3.38662	valid_1's rmse: 3.61591
[1700]	training's rmse: 3.37515	valid_1's rmse: 3.61543
[1800]	training's rmse: 3.36375	valid_1's rmse: 3.61524
[1900]	training's rmse: 3.35289	valid_1's rmse: 3.61485
[2000]	training's rmse: 3.34231	valid_1's rmse: 3.61483
[2100]	training's rmse: 3.33177	valid_1's rmse: 3.61454
[2200]	training's rmse: 3.32144	valid_1's rmse: 3.61449
[2300]	training's rmse: 3.31153	valid_1's rmse: 3.61431
[2400]	training's rmse: 3.30185	valid_1's rmse: 3.61

In [5]:
cols = (feature_importance_df[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)[:20].index)

best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]

plt.figure(figsize=(14,25))
sns.barplot(x="importance",
            y="feature",
            data=best_features.sort_values(by="importance",
                                           ascending=False))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.savefig('lgbm_importances.png')

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


In [6]:
lgbparam = {'num_leaves': 31,
            'boosting_type': 'dart',
             'min_data_in_leaf': 30, 
             'objective':'regression',
             'max_depth': -1,
             'learning_rate': 0.01,
             "min_child_samples": 20,
             "feature_fraction": 0.9,
             "bagging_freq": 1,
             "bagging_fraction": 0.9 ,
             "bagging_seed": 11,
             "metric": 'rmse',
             "lambda_l1": 0.1,
             "verbosity": -1,
             "nthread": 4,
             "random_state": 4590,
           "device":'GPU',
           "max_bin":63}

In [None]:
from sklearn.model_selection import RepeatedKFold
folds = RepeatedKFold(n_splits=5, n_repeats=2, random_state=4520)

oof_lgb = np.zeros(len(train))
predictions_lgb = np.zeros(len(test))
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=target.iloc[trn_idx], categorical_feature=categorical_feats)
    val_data = lgb.Dataset(train.iloc[val_idx][features], label=target.iloc[val_idx], categorical_feature=categorical_feats)

    num_round = 11000
    clf = lgb.train(lgbparam, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 100)
    oof_lgb[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions_lgb += clf.predict(test[features], num_iteration=clf.best_iteration) / (5 * 2)

print("CV score: {:<8.5f}".format(mean_squared_error(oof_lgb, target)**0.5))

fold n°0




[100]	training's rmse: 3.7586	valid_1's rmse: 3.79616
[200]	training's rmse: 3.7006	valid_1's rmse: 3.76131
[300]	training's rmse: 3.66957	valid_1's rmse: 3.7459
[400]	training's rmse: 3.64916	valid_1's rmse: 3.73768
[500]	training's rmse: 3.61876	valid_1's rmse: 3.72547
[600]	training's rmse: 3.60357	valid_1's rmse: 3.72117
[700]	training's rmse: 3.58243	valid_1's rmse: 3.7149
[800]	training's rmse: 3.56184	valid_1's rmse: 3.70808
[900]	training's rmse: 3.54666	valid_1's rmse: 3.70644
[1000]	training's rmse: 3.53085	valid_1's rmse: 3.70478
[1100]	training's rmse: 3.51293	valid_1's rmse: 3.70207
[1200]	training's rmse: 3.49539	valid_1's rmse: 3.69959
[1300]	training's rmse: 3.47747	valid_1's rmse: 3.69718
[1400]	training's rmse: 3.46194	valid_1's rmse: 3.69465


In [None]:
cols = (feature_importance_df[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)[:25].index)

best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]

plt.figure(figsize=(14,25))
sns.barplot(x="importance",
            y="feature",
            data=best_features.sort_values(by="importance",
                                           ascending=False))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.savefig('lgbm_importances.png')

In [None]:
sub_df = pd.read_csv("../input//sample_submission.csv")
sub_df["target"] = predictions
sub_df.to_csv("submit_lgb3.csv", index=False)

In [None]:
n=6
sub_df = pd.read_csv("../input/sample_submission.csv")
sub_df["target"] = predictions
sub_df.to_csv("submit_lgb"+str(n)+"_1.csv", index=False)

sub_df1 = pd.read_csv("../input/sample_submission.csv")
sub_df1["target"] = predictions_lgb
sub_df1.to_csv("submit_lgb"+str(n)+"_2.csv", index=False)

In [None]:
from sklearn.linear_model import BayesianRidge

train_stack = np.vstack([oof,oof_lgb]).transpose()
test_stack = np.vstack([predictions,predictions_lgb]).transpose()

folds = RepeatedKFold(n_splits=5,n_repeats=1,random_state=4520)
oof_stack = np.zeros(train_stack.shape[0])
predictions_stack = np.zeros(test_stack.shape[0])

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_stack, target)):
    print("fold n°{}".format(fold_))
    trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values
    val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values

    print("-" * 10 + "Stacking " + str(fold_) + "-" * 10)
    
    clf = BayesianRidge()
    clf.fit(trn_data, trn_y)
    
    oof_stack[val_idx] = clf.predict(val_data)
    predictions_stack += clf.predict(test_stack) / 5

np.sqrt(mean_squared_error(target.values, oof_stack))

In [None]:
sample_submission = pd.read_csv('../input/sample_submission.csv')
sample_submission['target'] = predictions_stack
sample_submission.to_csv('Bayesian_Ridge_Stacking.csv', index=False)

In [None]:
sample_submission['target'] = (predictions * 0.425 + predictions_lgb*0.15+predictions_stack*0.425)*1.1
sample_submission.to_csv("../output/submit"+str(n)+".csv", index = False)