# Base Model 2
* FE V1 이후 기본적인 스코어 확인

## 모듈 임포트

In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time

In [25]:
import warnings
warnings.filterwarnings('ignore')

## 데이터 로드

In [26]:
path = './data/'

In [31]:
train = pd.read_csv(path + 'new_train_v1.csv')
test = pd.read_csv(path + 'new_test_v1.csv')
# history = pd.read_csv(path + 'historical_transactions.csv')
# new_history = pd.read_csv(path + 'new_merchant_transactions.csv')
# merchant = pd.read_csv(path + 'merchants.csv')

## 선처리

In [32]:
for df in [train, test]:
    df.set_index("card_id", inplace=True)
    df.drop(['first_active_month'], axis = 1, inplace=True)

In [29]:
train['outliers'] = 0
train.loc[train['target'] < -30, 'outliers'] = 1
train['outliers'].value_counts()

0    199710
1      2207
Name: outliers, dtype: int64

In [39]:
train.index[train.target < -20]

Index(['C_ID_8186f3fcc1', 'C_ID_b9379a30ea', 'C_ID_e9120f535c',
       'C_ID_65715cb80d', 'C_ID_ae77d244b6', 'C_ID_c4262c902e',
       'C_ID_6a2c823e5f', 'C_ID_98e1cb10b6', 'C_ID_eb8175721a',
       'C_ID_a2580006bf',
       ...
       'C_ID_85cd7bbd0c', 'C_ID_22d40c59ff', 'C_ID_f28fb61248',
       'C_ID_9789d0a73d', 'C_ID_d8920df04d', 'C_ID_e34ea29439',
       'C_ID_081082a629', 'C_ID_6f61bed11c', 'C_ID_d6c86b18cd',
       'C_ID_09250aa6bf'],
      dtype='object', name='card_id', length=2207)

In [41]:
train = train.drop(train.index[train.target < -20])

## 모델 모듈 임포트

In [42]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error as mse

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

In [43]:
lgb = LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
                    importance_type='split', learning_rate=0.1, max_depth=8,
                    min_child_samples=20, min_child_weight=0.001,
                    min_split_gain=0.0, n_estimators=10, n_jobs=-1, num_leaves=31,
                    random_state=0)

In [45]:
cv = KFold(5, shuffle=True, random_state=0)

In [46]:
for i, (idx_train, idx_cv) in enumerate(cv.split(train)):
    df_train = train.iloc[idx_train]
    df_cv = train.iloc[idx_cv]
    
    x_train = df_train.drop(['target'], axis=1)
    y_train = df_train['target']
    
    x_cv = df_cv.drop(['target'], axis=1)
    y_cv = df_cv['target']
    
    model = lgb
    model.fit(x_train, y_train)
    print("[K = {}] train rmse = {}, cv_rmse = {}".format(i, np.sqrt(np.mean((model.predict(x_train) - y_train) ** 2)), np.sqrt(np.mean((model.predict(x_cv) - y_cv) ** 2))))

[K = 0] train rmse = 1.7004060265594525, cv_rmse = 1.7071903686511614
[K = 1] train rmse = 1.7007295191804148, cv_rmse = 1.7059109618335895
[K = 2] train rmse = 1.6969847768787572, cv_rmse = 1.7202453449619661
[K = 3] train rmse = 1.7011612654742774, cv_rmse = 1.7043420941668572
[K = 4] train rmse = 1.7053409038312954, cv_rmse = 1.6877648193613277


In [53]:
sample_submission = pd.read_csv('./data/sample_submission.csv')
sample_submission['target'] = model.predict(test)
sample_submission.to_csv('./data/sub.csv', index=False)

## 모델 모듈 임포트

In [49]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error as mse

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge

import lightgbm as lgb
import xgboost as xgb


In [39]:
train_columns = [c for c in train.columns if c not in ['card_id', 'first_active_month','target','outliers']]

In [43]:
param = {'num_leaves': 31,
         'min_data_in_leaf': 30, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.01,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 8,
         "random_state": 4590}

In [44]:
oof_lgb_3 = np.zeros(len(train))
predictions_lgb_3 = np.zeros(len(test))
start = time.time()

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=15)

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train,train['outliers'].values)):    
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx][train_columns], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train.iloc[val_idx][train_columns], label=target.iloc[val_idx])

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 200)
    oof_lgb_3[val_idx] = clf.predict(train.iloc[val_idx][train_columns], num_iteration=clf.best_iteration)
    
    predictions_lgb_3 += clf.predict(test[train_columns], num_iteration=clf.best_iteration) / folds.n_splits

np.save('oof_lgb_3', oof_lgb_3)
np.save('predictions_lgb_3', predictions_lgb_3)
print("CV score: {:<8.5f}".format(mean_squared_error(oof_lgb_3, target)**0.5))

fold n°0
Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 3.72414	valid_1's rmse: 3.76795
[200]	training's rmse: 3.66914	valid_1's rmse: 3.75358
[300]	training's rmse: 3.63287	valid_1's rmse: 3.75054
[400]	training's rmse: 3.60695	valid_1's rmse: 3.74932
[500]	training's rmse: 3.58779	valid_1's rmse: 3.74898
[600]	training's rmse: 3.57146	valid_1's rmse: 3.74864
[700]	training's rmse: 3.55696	valid_1's rmse: 3.74886
[800]	training's rmse: 3.54252	valid_1's rmse: 3.74916
Early stopping, best iteration is:
[638]	training's rmse: 3.56589	valid_1's rmse: 3.74851
fold n°1
Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 3.72046	valid_1's rmse: 3.77741
[200]	training's rmse: 3.66356	valid_1's rmse: 3.76519
[300]	training's rmse: 3.62734	valid_1's rmse: 3.7632
[400]	training's rmse: 3.60267	valid_1's rmse: 3.76285
[500]	training's rmse: 3.58478	valid_1's rmse: 3.76293
Early stopping, best iteration is:
[394]	training's 

In [46]:
xgb_params = {'eta': 0.005, 'max_depth': 10, 'subsample': 0.8, 'colsample_bytree': 0.8, 
          'objective': 'reg:linear', 'eval_metric': 'rmse', 'silent': True}

In [47]:
oof_xgb_3 = np.zeros(len(train))
predictions_xgb_3 = np.zeros(len(test))

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=15)

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train,train['outliers'].values)):    
    print('-')
    print("Fold {}".format(fold_ + 1))
    trn_data = xgb.DMatrix(data=train.iloc[trn_idx][train_columns], label=target.iloc[trn_idx])
    val_data = xgb.DMatrix(data=train.iloc[val_idx][train_columns], label=target.iloc[val_idx])
    watchlist = [(trn_data, 'train'), (val_data, 'valid')]
    print("xgb " + str(fold_) + "-" * 50)
    num_round = 10000
    xgb_model = xgb.train(xgb_params, trn_data, num_round, watchlist, early_stopping_rounds=50, verbose_eval=1000)
    oof_xgb_3[val_idx] = xgb_model.predict(xgb.DMatrix(train.iloc[val_idx][train_columns]), ntree_limit=xgb_model.best_ntree_limit+50)

    predictions_xgb_3 += xgb_model.predict(xgb.DMatrix(test[train_columns]), ntree_limit=xgb_model.best_ntree_limit+50) / folds.n_splits
    
np.save('oof_xgb_3', oof_xgb_3)
np.save('predictions_xgb_3', predictions_xgb_3)
np.sqrt(mean_squared_error(target.values, oof_xgb_3))

-
Fold 1
xgb 0--------------------------------------------------
[0]	train-rmse:3.94883	valid-rmse:3.95491
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
Stopping. Best iteration:
[608]	train-rmse:3.22901	valid-rmse:3.752

-
Fold 2
xgb 1--------------------------------------------------
[0]	train-rmse:3.94872	valid-rmse:3.95413
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
Stopping. Best iteration:
[514]	train-rmse:3.28388	valid-rmse:3.76508

-
Fold 3
xgb 2--------------------------------------------------
[0]	train-rmse:3.95191	valid-rmse:3.9422
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
Stopping. Best iteration:
[503]	train-rmse:3.29795	valid-rmse:3.75636

-
Fold 4
xgb 3--------------------

3.7542866203548657

In [50]:
train_stack = np.vstack([oof_lgb_3, oof_xgb_3]).transpose()
test_stack = np.vstack([predictions_lgb_3, predictions_xgb_3]).transpose()

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros(train_stack.shape[0])
predictions = np.zeros(test_stack.shape[0])

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_stack, train['outliers'].values)):
    print("fold n°{}".format(fold_))
    trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values
    val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values

    clf = Ridge(alpha=1)
    clf.fit(trn_data, trn_y)
    
    oof[val_idx] = clf.predict(val_data)
    predictions += clf.predict(test_stack) / folds.n_splits


np.sqrt(mean_squared_error(target.values, oof))

fold n°0
fold n°1
fold n°2
fold n°3
fold n°4


3.749507598369251

In [54]:
sample_submission = pd.read_csv('./data/sample_submission.csv')
sample_submission['target'] = predictions
sample_submission.to_csv('./data/sub.csv', index=False)