# 사전 작업

## 모듈 로드

In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
import gc
from tqdm import tqdm_notebook

In [33]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold, StratifiedKFold

In [34]:
from sklearn.preprocessing import LabelEncoder

In [35]:
import warnings
# Install a catch-all "ignore" filter so warning messages emitted by later
# cells do not clutter the output.
warnings.filterwarnings(action='ignore')

In [36]:
# Allow up to 400 columns to be shown when a DataFrame is displayed.
pd.options.display.max_columns = 400

## 데이터 로드

In [37]:
# Directory prefix for the CSV files; the trailing slash lets file names be
# appended with plain string concatenation.
path = "./data/"

In [38]:
# Read the train and test CSVs (the *_v3 files) from the data directory.
train, test = (
    pd.read_csv(path + csv_name)
    for csv_name in ('train_v3.csv', 'test_v3.csv')
)

## Feature Elimination

In [39]:
# Columns left out of the model's feature list (the training cell filters
# train.columns against this list).
FEATS_EXCLUDED = [
    'first_active',
    'card_id',
    'target',    # regression label
    'outliers',  # used to stratify the CV folds, not as a feature
    # purchase-date min/max columns of the hist_* and new_* groups
    'hist_purchase_date_max',
    'hist_purchase_date_min',
    'new_purchase_date_max',
    'new_purchase_date_min',
]

In [49]:
# LightGBM training parameters passed to lgb.train().
#
# The original dict set both 'min_data_in_leaf' (30) and 'min_child_samples'
# (20). LightGBM documents 'min_child_samples' as an alias of
# 'min_data_in_leaf'; when both are given the primary name takes effect and
# the alias is ignored, so the alias entry was dead and misleading. It is
# dropped here and the effective value (30) is kept.
param = {
    'num_leaves': 31,
    'min_data_in_leaf': 30,
    'objective': 'regression',
    'max_depth': -1,            # -1 = no depth limit
    'learning_rate': 0.01,
    'boosting': 'gbdt',
    'feature_fraction': 0.9,    # column subsampling
    'bagging_freq': 1,          # row subsampling applied every iteration ...
    'bagging_fraction': 0.9,    # ... with this fraction of rows
    'bagging_seed': 11,
    'metric': 'rmse',
    'lambda_l1': 0.1,           # L1 regularisation
    'verbosity': -1,
    'nthread': 8,
    'random_state': 4590,
}

In [87]:
# 5-fold cross-validated LightGBM training.
# Folds are stratified on the `outliers` column so every fold receives a
# similar share of outlier rows; the shuffle is seeded for repeatable splits.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=15)

# Results filled in by the loop below:
#   oof_lgb            - out-of-fold prediction for every training row
#   predictions_lgb    - test prediction averaged over the folds
#   feature_importance - one row per (feature, fold) with its importance
oof_lgb = np.zeros(len(train))
predictions_lgb = np.zeros(len(test))
feature_importance = pd.DataFrame()

# Feature list. When a previous run has already produced `trainable_feature`
# (features with non-zero averaged importance) train on that subset;
# otherwise (first pass, or a fresh top-to-bottom run) fall back to every
# column not listed in FEATS_EXCLUDED. The original code overwrote the
# fallback unconditionally and raised NameError on a fresh run.
try:
    train_columns = list(trainable_feature)
except NameError:
    train_columns = [f for f in train.columns if f not in FEATS_EXCLUDED]

# Upper bound on boosting rounds; early stopping can end training sooner.
num_round = 10000

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['outliers'].values)):
    print("fold n°{}".format(fold_))

    # Slice each partition once and reuse it for features and labels.
    trn_part = train.iloc[trn_idx]
    val_part = train.iloc[val_idx]
    trn_data = lgb.Dataset(trn_part[train_columns], label=trn_part['target'])
    val_data = lgb.Dataset(val_part[train_columns], label=val_part['target'])

    # NOTE(review): `verbose_eval` / `early_stopping_rounds` are keyword
    # arguments of older LightGBM releases; newer releases expect callbacks
    # instead - confirm against the installed version before upgrading.
    clf = lgb.train(param, trn_data, num_round,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=100, early_stopping_rounds=200)

    # Predict with the best iteration found by early stopping.
    oof_lgb[val_idx] = clf.predict(val_part[train_columns], num_iteration=clf.best_iteration)
    predictions_lgb += clf.predict(test[train_columns], num_iteration=clf.best_iteration) / folds.n_splits

    # Appended fold by fold (rather than once after the loop) so the
    # importances of finished folds survive an interrupted run.
    fold_importance = pd.DataFrame()
    fold_importance["Feature"] = train_columns
    fold_importance["importance"] = clf.feature_importance()
    fold_importance["fold"] = fold_ + 1  # folds are numbered from 1 here
    feature_importance = pd.concat([feature_importance, fold_importance], axis=0)

# RMSE of the out-of-fold predictions over the whole training set.
print("CV score: {:<8.5f}".format(mean_squared_error(train.target.values, oof_lgb)**0.5))

fold n°0
Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 3.65195	valid_1's rmse: 3.71912
[200]	training's rmse: 3.56745	valid_1's rmse: 3.68735
[300]	training's rmse: 3.51187	valid_1's rmse: 3.67545
[400]	training's rmse: 3.46779	valid_1's rmse: 3.6698
[500]	training's rmse: 3.43192	valid_1's rmse: 3.66621
[600]	training's rmse: 3.40063	valid_1's rmse: 3.66442
[700]	training's rmse: 3.37318	valid_1's rmse: 3.66252
[800]	training's rmse: 3.34851	valid_1's rmse: 3.66127
[900]	training's rmse: 3.32576	valid_1's rmse: 3.66068
[1000]	training's rmse: 3.30495	valid_1's rmse: 3.6608
Early stopping, best iteration is:
[896]	training's rmse: 3.32662	valid_1's rmse: 3.66066
fold n°1
Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 3.64877	valid_1's rmse: 3.72618
[200]	training's rmse: 3.56139	valid_1's rmse: 3.69832
[300]	training's rmse: 3.50561	valid_1's rmse: 3.6851
[400]	training's rmse: 3.46233	valid_1's rmse: 3.6767

KeyboardInterrupt: 

In [83]:
# Seed the aggregate table with fold 1's feature names (keeping that fold's
# row index) and a zeroed `importance` column to accumulate into.
meta = feature_importance.loc[feature_importance.fold == 1, ['Feature']].assign(importance=0)

In [84]:
# Average each feature's importance over the folds actually present in
# `feature_importance`. The fold count used to be hard-coded to 5; when a
# training run stopped early, the missing folds selected no rows and the
# index-aligned addition turned every importance into NaN. With all five
# folds present the result is unchanged.
# The per-fold frames share the same row index, so the addition lines up
# feature by feature.
fold_ids = feature_importance.fold.unique()
for fold_id in fold_ids:
    fold_rows = feature_importance[feature_importance.fold == fold_id]
    meta.importance += fold_rows.importance / len(fold_ids)

# Ascending order: least important features first.
meta = meta.sort_values('importance')

In [86]:
# Keep only the features whose averaged importance is strictly positive;
# the result is an array of feature names.
trainable_feature = meta.loc[meta.importance > 0, 'Feature'].values

In [46]:
# Load the submission template. Built from `path` like the train/test loads,
# so the data directory is defined in one place only.
sub = pd.read_csv(path + 'sample_submission.csv')

In [47]:
# Write the fold-averaged test predictions into the `target` column.
# Bracket assignment is used because attribute assignment only updates an
# existing column: if `target` were missing it would set a plain Python
# attribute and the predictions would never reach the CSV.
sub['target'] = predictions_lgb

In [48]:
# Save the submission without the DataFrame index. The location is built
# from `path` so the data directory is defined in one place only.
sub.to_csv(path + 'sub_3-65132.csv', index=False)