## 머신러닝 알고리즘 적용 _ 심화
    - 데이터 : 사용자 데이터 + 상품 데이터(과거 상품 구매 이력)
    - 신규 모델 : RandomForest, ExtraTrees, BaggingClassifier, (XGBoost)
    - 업데이트된 데이터 + 기존 모델(DT, LR) 평가척도 
    - 업데이트된 데이터 + 신규 모델 평가척도
    - [+2] 피쳐 엔지니어링
    - [+2] 매개변수 조정
    - 캐글 제출 
    - 머신러닝 파이프라인 흐름도 기록

In [12]:
import pandas as pd
import numpy as np
import pickle
import time
import operator
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import log_loss, f1_score, accuracy_score

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [13]:
# Load the new train/test tables and the pickled targets; missing values become 0.

trn = pd.read_csv('../input/train_append_lb_lag.csv').fillna(0)
# Open the pickle through a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('../input/target.pkl', 'rb') as target_file:
    target = pd.DataFrame(pickle.load(target_file), columns=['target'])
tst = pd.read_csv('../input/test_append_lb_lag.csv').fillna(0)
print(trn.shape, target.shape, tst.shape)

(45619, 246) (45619, 1) (929615, 246)


In [14]:
# Drop low-frequency targets up front (reason: they cannot be used for
# cross-validation and are too rare to be meaningful).
rem_targets = [2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17, 18, 19, 21, 22, 23]  # 18 classes
# Compute the row mask once and apply it to both frames so they stay aligned.
keep_mask = target['target'].isin(rem_targets)
trn = trn[keep_mask]
target = target[keep_mask]
# Encode the 1-D column rather than the one-column DataFrame: passing the
# DataFrame makes scikit-learn emit a column_or_1d DataConversionWarning.
target = LabelEncoder().fit_transform(target['target'])

# Print each encoded class together with its number of rows.
for t, count in zip(*np.unique(target, return_counts=True)):
    print(t, count)

0

  y = column_or_1d(y, warn=True)


 9452
1 1934
2 55
3 349
4 222
5 154
6 503
7 33
8 1085
9 1219
10 246
11 21
12 2942
13 4733
14 159
15 5151
16 8218
17 9119


## 평가용 함수 정의

In [15]:
def evaluate(x, y, model):
    """Fit ``model`` on stratified shuffle splits and collect log-loss scores.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : array-like
        Class labels aligned row-for-row with ``x``.
    model : estimator
        Object exposing ``fit`` and ``predict_proba``. It is refit on every
        split, so it is left fitted on the last split's training part.

    Returns
    -------
    tuple of dict
        ``(trn_scores, vld_scores)``; each maps ``'log loss'`` to a list
        holding one score per split (3 splits, 10% held out each time).
    """
    # The split indices are positional, so index a plain array rather than
    # an object whose ``[]`` may be label-based (e.g. a pandas Series).
    y = np.asarray(y)
    trn_scores = {'log loss': []}
    vld_scores = {'log loss': []}
    sss = StratifiedShuffleSplit(n_splits=3, test_size=0.1, random_state=777)
    for t_ind, v_ind in sss.split(x, y):
        # split data
        x_trn, x_vld = x.iloc[t_ind], x.iloc[v_ind]
        y_trn, y_vld = y[t_ind], y[v_ind]

        # fit model
        model.fit(x_trn, y_trn)

        # Hand log_loss the classes the model was fitted on, so scoring still
        # works when a split's labels do not contain every class. Falls back
        # to log_loss's own inference if the model has no ``classes_``.
        labels = getattr(model, 'classes_', None)

        # eval _ trn
        trn_scores['log loss'].append(
            log_loss(y_trn, model.predict_proba(x_trn), labels=labels))

        # eval _ vld
        vld_scores['log loss'].append(
            log_loss(y_vld, model.predict_proba(x_vld), labels=labels))
    return trn_scores, vld_scores

def print_scores(trn_scores, vld_scores):
    """Print the mean and the raw per-split values of each reported metric.

    The train scores are printed first under a ``TRAIN EVAL`` banner, then
    the validation scores under ``VALID EVAL``. Both arguments are dicts
    mapping a metric name to a list of scores; only ``'log loss'`` is read.
    """
    indent = '        '
    metrics = ['log loss']
    heavy_rule = '=' * 50
    light_rule = '-' * 50

    sections = (('TRAIN EVAL', trn_scores), ('VALID EVAL', vld_scores))
    for title, scores in sections:
        print(heavy_rule)
        print(title)
        for metric in metrics:
            print(light_rule)
            print('# {}'.format(metric))
            values = scores[metric]
            print('# {} Mean : {}'.format(indent, np.mean(values)))
            print('# {} Raw  : {}'.format(indent, values))

def print_time(end, start):
    """Print a separator line followed by ``end - start`` rounded to whole seconds."""
    separator = '=' * 50
    print(separator)
    print('{} secs'.format(round(end - start)))
    
def fit_and_eval(trn, target, model, start=None):
    """Cross-validate ``model``, print its scores, then print the elapsed time.

    Parameters
    ----------
    trn : pandas.DataFrame
        Feature matrix forwarded to ``evaluate``.
    target : array-like
        Class labels forwarded to ``evaluate``.
    model : estimator
        Classifier forwarded to ``evaluate``.
    start : float, optional
        ``time.time()`` timestamp to measure from. When omitted, the
        notebook-level ``st`` variable is used if a cell defined it (the
        original behaviour); otherwise the time of this call is used, so the
        function no longer raises NameError after the whole fit has finished.
    """
    entered = time.time()
    trn_scores, vld_scores = evaluate(trn, target, model)
    print_scores(trn_scores, vld_scores)
    if start is None:
        # Look the global up lazily so a missing ``st`` degrades to timing
        # this call instead of crashing.
        start = globals().get('st', entered)
    print_time(time.time(), start)

## 매개변수 최적화 (모델 최적화) [+1]
    - 사용하는 모델의 매개변수를 직접 정의하여 최적의 매개변수 찾아내기
    - 참고: scikit learn 홈페이지를 통해 모델별 매개변수 확인 가능
    - 힌트: trn/vld logloss 를 비교하여, 모델의 복잡도를 조정하기

In [16]:
# 입력 : none
# 출력: model instance

In [17]:
# Product indicator columns; each has one lagged copy per suffix in `lags`.
cols = [
    'ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1',
    'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1',
    'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1',
    'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
    'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',
    'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
    'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1',
    'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1',
]

print(trn.shape, tst.shape)

lags = ['_lag_one', '_lag_two', '_lag_thr', '_lag_fou', '_lag_fiv']

# Per-product totals: sum one product's indicator over all lag columns.
for col in cols:
    lagged = [col + lag for lag in lags]
    for frame in (trn, tst):
        frame[col + '_sum'] = frame[lagged].sum(axis=1)

# Per-lag totals: sum all product indicators within one lag.
for lag in lags:
    per_lag = [col + lag for col in cols]
    for frame in (trn, tst):
        frame['sum' + lag] = frame[per_lag].sum(axis=1)

print(trn.shape, tst.shape)


(45595, 246) (929615, 246)
(45595, 275) (929615, 275)


In [18]:
# Start the wall-clock timer; fit_and_eval reads it through the global `st`.
# The import below runs after the timer starts, so its cost is included.
st = time.time()
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees, depth capped at 13 and a fixed random seed.
rf_model = RandomForestClassifier(n_estimators=100, max_depth=13, n_jobs=-1, random_state=777)
fit_and_eval(trn, target, rf_model)
# Author's timing note: 5 sec (the captured output of this cell shows 11 secs).

TRAIN EVAL
--------------------------------------------------
# log loss
#          Mean : 0.9044512787244156
#          Raw  : [0.90501505143145322, 0.9035205603068085, 0.90481822443498505]
VALID EVAL
--------------------------------------------------
# log loss
#          Mean : 1.1746878223280186
#          Raw  : [1.1646468431626138, 1.1780926281614648, 1.1813239956599777]
11 secs


## 캐글에 직접 결과물 제출하기
    - MAP@7 평가척도를 기반으로 채점됨 (https://www.kaggle.com/c/santander-product-recommendation/details/evaluation)
    - 유저당 상위 7개의 제품을 추천해야함

In [19]:
# Define the final model used to produce the submission.
# NOTE(review): this uses n_estimators=50, while the cross-validated rf_model
# used n_estimators=100 — confirm the difference is intentional.
model = RandomForestClassifier(n_estimators=50, max_depth=13, n_jobs=-1, random_state=777)

In [20]:
from datetime import datetime
import os

banner = '=' * 50
print(banner)
print('# Test shape : {}'.format(tst.shape))

# Fit on the full (filtered) training data, then score every test row.
model.fit(trn, target)

# Turn class probabilities into class indices ordered from most to least
# probable: ascending argsort per row with the column order reversed.
preds = model.predict_proba(tst)
preds = np.argsort(preds, axis=1)[:, ::-1]

# Test shape : (929615, 275)


In [21]:
# Product indicator columns, in the order their positions are used as target ids.
cols = [
    'ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1',
    'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1',
    'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1',
    'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
    'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',
    'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
    'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1',
    'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1',
]
# Keep only the products whose position is in rem_targets, in ascending
# position order, so list index k lines up with encoded class k.
target_cols = [name for position, name in enumerate(cols) if position in rem_targets]

In [22]:
# For every test row, join the names of its 7 highest-ranked products into
# one space-separated string (slicing replaces the manual enumerate/break).
final_preds = [
    ' '.join(target_cols[product] for product in pred[:7])
    for pred in preds
]

# Only the customer id column is needed, so skip parsing the other columns.
temp = pd.read_csv('../input/test_clean.csv', usecols=['ncodpers'])
test_id = temp['ncodpers']
out_df = pd.DataFrame({'ncodpers': test_id, 'added_products': final_preds})
file_name = datetime.now().strftime("result_%Y%m%d%H%M%S") + '.csv'
# Create the output directory if it is missing so to_csv does not fail.
os.makedirs('../output', exist_ok=True)
out_df.to_csv(os.path.join('../output', file_name), index=False)

결과물 제출은 https://www.kaggle.com/c/santander-product-recommendation/submissions/attach 에서 할 수 있습니다.

예시

- 원천 데이터 
    - Kaggle 경진대회 데이터 train_ver2.csv, test_ver2.csv (Link: https://www.kaggle.com/c/santander-product-recommendation/data)


- 전처리 
    - 결측값을 .fillna 함수를 통해 0으로 대체. (기존 데이터에 0이 존재할 경우 -1로 대체)


- 피쳐 엔지니어링 이전 데이터 dimension:
    - trn : (45619, 246)
    - target : (45619, 1) [18 classes]
    - tst : (929615, 246)


- 피쳐 엔지니어링
    - age_log : log(age + 1)
    - ind..._lag_one : 5월 사용자별 금융상품 보유현황
    - ind..._lag_two : 4월 사용자별 금융상품 보유현황
    - ind..._lag_thr : 3월 사용자별 금융상품 보유현황


- 피쳐 엔지니어링 이후 데이터 dimension:
    - trn : (45619, 250)
    - target : (45619, 1) [18 classes]
    - tst : (929615, 250)


- 모델 튜닝 
    - RandomForest : max_depth = 20 으로 복잡도 조정


- 검증 결과 
    - trn logloss : 1.18
    - vld logloss : 1.28


- 실제 결과 
    - Public Leaderboard : 0.025984
