# 사전 작업

## 모듈 로드

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
import gc

In [2]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold, StratifiedKFold

In [3]:
import warnings

# Install a blanket "ignore" filter (default category: Warning) so warnings
# raised by the libraries used below are not printed.
warnings.filterwarnings(action='ignore')

In [4]:
# Let pandas show up to 400 columns before it truncates wide frames.
pd.options.display.max_columns = 400

# 데이터 로드

In [5]:
# Directory prefix (trailing slash included) that is concatenated with the
# train/test CSV file names when they are loaded.
path = './data/'

In [6]:
# Read train_v3.csv and test_v3.csv from the `path` directory, in that order.
train, test = (
    pd.read_csv(path + csv_name)
    for csv_name in ('train_v3.csv', 'test_v3.csv')
)

In [14]:
# Column names that are filtered out when the model's feature list is built.
FEATS_EXCLUDED = [
    # identifier / label-like columns
    'first_active',
    'card_id',
    'target',
    'outliers',
    # purchase-date min/max columns (historical and new)
    'hist_purchase_date_max',
    'hist_purchase_date_min',
    'new_purchase_date_max',
    'new_purchase_date_min',
]

In [16]:
def cat_encoding(data):
    """Binarise a scalar by magnitude.

    Returns 1 when ``|data| <= 1`` and 0 otherwise. NaN compares false
    against any bound, so it is encoded as 0.
    """
    return 1 if np.abs(data) <= 1 else 0

In [18]:
# Replace the target column in place with its 0/1 encoding from cat_encoding.
# NOTE(review): not idempotent - re-running this cell feeds the already
# encoded 0/1 values back through cat_encoding, which maps both to 1.
train.target = train.target.map(cat_encoding)

# Lgb

In [24]:
# LightGBM parameters for the binary classifier trained below.
#
# The original dict set both 'min_data_in_leaf': 30 and
# 'min_child_samples': 20. LightGBM documents `min_child_samples` as an alias
# of `min_data_in_leaf`, so the two entries contradicted each other and only
# one could take effect. The alias entry is removed and the primary name
# (value 30) is kept as the single source of truth.
param = {'num_leaves': 31,
         'min_data_in_leaf': 30,
         'objective': 'binary',
         'max_depth': -1,            # -1 = no explicit depth limit
         'learning_rate': 0.01,
         "boosting": "gbdt",
         "feature_fraction": 0.9,    # fraction of columns sampled per tree
         "bagging_freq": 1,          # re-sample rows on every iteration
         "bagging_fraction": 0.9,
         "bagging_seed": 11,
         "metric": 'binary_logloss',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 4,
         "random_state": 4590}

In [25]:
# Stratified 5-fold cross-validation of the LightGBM binary classifier.
#
# Results produced at module level:
#   oof_lgb            - out-of-fold predicted probability for every train row
#   predictions_lgb    - test-set probability averaged over the folds
#   feature_importance - per-fold feature importances stacked into one frame
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=15)

oof_lgb = np.zeros(len(train))
predictions_lgb = np.zeros(len(test))
feature_importance = pd.DataFrame()

train_columns = [f for f in train.columns if f not in FEATS_EXCLUDED]

# Loop invariants: upper bound on boosting rounds (early stopping normally
# ends training sooner) and the test feature matrix scored in every fold.
num_round = 10000
test_features = test[train_columns]

# Per-fold importance frames are collected here and concatenated once after
# the loop instead of growing a DataFrame on every iteration.
fold_importances = []

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train.target.values)):
    print("fold n°{}".format(fold_))

    # Slice each partition once and reuse it for both the Dataset and predict.
    trn_part = train.iloc[trn_idx]
    val_part = train.iloc[val_idx]
    val_features = val_part[train_columns]

    trn_data = lgb.Dataset(trn_part[train_columns], label=trn_part['target'])
    val_data = lgb.Dataset(val_features, label=val_part['target'])

    # NOTE(review): `verbose_eval` / `early_stopping_rounds` are the keyword
    # form this notebook was run with; newer LightGBM releases (4.x) expect
    # the callback equivalents instead - confirm against the installed version.
    clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data],
                    verbose_eval=100, early_stopping_rounds=200)

    oof_lgb[val_idx] = clf.predict(val_features, num_iteration=clf.best_iteration)

    # Each fold contributes 1/n_splits of the final test prediction.
    predictions_lgb += clf.predict(test_features, num_iteration=clf.best_iteration) / folds.n_splits

    fold_importances.append(pd.DataFrame({
        "Feature": train_columns,
        "importance": clf.feature_importance(),
        "fold": fold_ + 1,
    }))

if fold_importances:
    feature_importance = pd.concat(fold_importances, axis=0)

# Report the same metric the model is trained and early-stopped on
# (binary log loss). The previous RMSE came from a regression setup and is
# not the objective being optimised here.
print("CV score: {:<8.5f}".format(log_loss(train.target.values, oof_lgb)))

fold n°0
Training until validation scores don't improve for 200 rounds.
[100]	training's binary_logloss: 0.646569	valid_1's binary_logloss: 0.646705
[200]	training's binary_logloss: 0.634989	valid_1's binary_logloss: 0.636504
[300]	training's binary_logloss: 0.628706	valid_1's binary_logloss: 0.632082
[400]	training's binary_logloss: 0.624229	valid_1's binary_logloss: 0.629681
[500]	training's binary_logloss: 0.62065	valid_1's binary_logloss: 0.628264
[600]	training's binary_logloss: 0.617623	valid_1's binary_logloss: 0.627374
[700]	training's binary_logloss: 0.614935	valid_1's binary_logloss: 0.62688
[800]	training's binary_logloss: 0.612468	valid_1's binary_logloss: 0.62651
[900]	training's binary_logloss: 0.61014	valid_1's binary_logloss: 0.62633
[1000]	training's binary_logloss: 0.607899	valid_1's binary_logloss: 0.626132
[1100]	training's binary_logloss: 0.605694	valid_1's binary_logloss: 0.625987
[1200]	training's binary_logloss: 0.603576	valid_1's binary_logloss: 0.625893
[1300]

KeyboardInterrupt: 

In [26]:
# Load the three zztmp CSV files; their `target` columns are blended below.
z1, z2, z3 = map(
    pd.read_csv,
    (
        './data/zztmp (1).csv',
        './data/zztmp (2).csv',
        './data/zztmp (3).csv',
    ),
)

In [34]:
# Sample submission frame; its `target` column is overwritten with the blend
# and the frame is then written back out as the final submission.
sub = pd.read_csv(filepath_or_buffer='./data/sample_submission.csv')

In [44]:
# Existing submission file whose `target` is averaged with z1's in the next cell.
subm = pd.read_csv(filepath_or_buffer='./data/submission_combining_neg.csv')

In [45]:
# Two-way blend, step 1: row-wise sum of z1 and subm targets (pandas aligns
# the two Series on their index, not on any id column).
# NOTE(review): the In [36]/In [38] cells further down reassign sub.target;
# run top to bottom, they replace this result - confirm the intended order.
sub.target = z1.target.add(subm.target)

In [47]:
# Two-way blend, step 2: divide the summed targets by 2 to get their mean.
sub.target = sub.target.div(2)

In [36]:
# Three-way blend, step 1: row-wise sum of the z1, z2 and z3 targets
# (index-aligned by pandas).
# NOTE(review): this cell's execution counter (In [36]) is earlier than the
# two-way blend cell above (In [45]), yet in file order it runs later and
# overwrites that result - confirm which blend should reach z4.csv.
sub.target = z1.target.add(z2.target).add(z3.target)

In [38]:
# Three-way blend, step 2: divide the summed targets by 3 to get their mean.
sub.target = sub.target.div(3)

In [49]:
# Write the blended submission to disk without the DataFrame index column.
sub.to_csv(path_or_buf='./data/z4.csv', index=False)