## Santander-Product-Recommendation-Problem Solver

In [1]:
import pandas as pd
import numpy as np
import pickle
import time
import operator
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import log_loss, f1_score, accuracy_score

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

st = time.time()
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb


## Feature Engineering

In [None]:
# Load the pre-built lag-feature tables; missing values are replaced with 0.
trn = pd.read_csv('../input/train_append_lb_lag.csv').fillna(0)
# NOTE: unpickling can execute arbitrary code -- only load target.pkl from a
# trusted source. The context manager makes sure the file handle is closed.
with open('../input/target.pkl', 'rb') as target_file:
    target = pd.DataFrame(pickle.load(target_file), columns=['target'])
tst = pd.read_csv('../input/test_append_lb_lag.csv').fillna(0)
print(trn.shape, target.shape, tst.shape)

In [None]:
# Iterating a DataFrame yields its column labels; list them one per line.
for col in trn:
    print(col)

In [None]:
# Single True/False answer: do train and test expose the same columns in the
# same order? (An elementwise `==` raises when the lengths differ and otherwise
# returns one boolean per column, which is hard to read.)
trn.columns.equals(tst.columns)

In [None]:
# Raw target ids that are kept for training; rows with any other id are dropped.
rem_targets = [2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17, 18, 19, 21, 22, 23]  # 18 classes

# Build the row mask once so features and labels are filtered identically.
keep_mask = target['target'].isin(rem_targets)
trn = trn[keep_mask]
target = target[keep_mask]

# Re-encode the kept ids as consecutive integers starting at 0. The 1-D column
# is passed (not the whole frame) because LabelEncoder expects a 1-D array.
target = LabelEncoder().fit_transform(target['target'])

# Show how many rows each encoded class has.
for t, count in zip(*np.unique(target, return_counts=True)):
    print(t, count)

## Evaluation function

In [None]:
# RandomForest
def evaluate(x, y, model):
    """Score `model` with log loss on three stratified 90/10 shuffle splits.

    For every split the model is refit on the training part, then its
    `predict_proba` output is scored on both the training and the held-out
    part.

    Args:
        x: feature table; rows are selected with `.iloc`.
        y: labels; indexed directly with the integer index arrays of each
            split (presumably a NumPy array -- confirm against the caller).
        model: estimator exposing `fit` and `predict_proba`.

    Returns:
        Tuple `(trn_scores, vld_scores)`; each is a dict whose 'log loss'
        entry is the list of per-split scores.
    """
    metric = 'log loss'
    trn_scores, vld_scores = {}, {}
    splitter = StratifiedShuffleSplit(n_splits=3, test_size=0.1, random_state=777)

    for fit_idx, hold_idx in splitter.split(x, y):
        # carve out this split's train / validation parts
        x_trn, y_trn = x.iloc[fit_idx], y[fit_idx]
        x_vld, y_vld = x.iloc[hold_idx], y[hold_idx]

        # refit on the training part only
        model.fit(x_trn, y_trn)

        # score on the rows the model was fit on
        trn_loss = log_loss(y_trn, model.predict_proba(x_trn))
        trn_scores.setdefault(metric, []).append(trn_loss)

        # score on the held-out rows
        vld_loss = log_loss(y_vld, model.predict_proba(x_vld))
        vld_scores.setdefault(metric, []).append(vld_loss)

    return trn_scores, vld_scores

def print_scores(trn_scores, vld_scores):
    """Print the mean and the raw per-split log loss for train and validation.

    Args:
        trn_scores: dict with a 'log loss' list of training scores.
        vld_scores: dict with a 'log loss' list of validation scores.
    """
    indent = '        '
    metrics = ['log loss']
    sections = (('TRAIN EVAL', trn_scores), ('VALID EVAL', vld_scores))

    for title, scores in sections:
        print('=' * 50)
        print(title)
        for metric in metrics:
            print('-' * 50)
            print('# {}'.format(metric))
            values = scores[metric]
            print('# {} Mean : {}'.format(indent, np.mean(values)))
            print('# {} Raw  : {}'.format(indent, values))

def print_time(end, start):
    """Print a separator line, then the rounded number of seconds `end - start`."""
    print('=' * 50)
    seconds = round(end - start)
    print('{} secs'.format(seconds))
    
def fit_and_eval(trn, target, model):
    """Cross-validate `model`, print its scores, then print the time since `st`.

    Args:
        trn: feature table handed to `evaluate`.
        target: labels handed to `evaluate`.
        model: estimator handed to `evaluate`.
    """
    train_result, valid_result = evaluate(trn, target, model)
    print_scores(train_result, valid_result)
    # `st` is the module-level timestamp taken when the notebook started.
    finished_at = time.time()
    print_time(finished_at, st)

In [None]:
# Identifier embedded in the log file name ("sol_<SOLUTION_NUM>_log.result").
SOLUTION_NUM = "4"

# XGB Model Param
# Number of boosting rounds passed to xgb.train.
num_round = 5
# Early-stopping patience; presumably consumed by evaluate_xgb -- TODO confirm.
early_stop = 10
xgb_params = {
    'booster': 'gbtree',

    # model complexity
    'max_depth': 5, # higher = more complex
    'gamma': 3,    # lower = more complex
    'min_child_weight': 2, # lower = more complex

    # regularization through random sampling
    'colsample_bylevel': 0.7,
    'colsample_bytree': 1,
    'subsample': 0.8,

    # regularization
    'reg_alpha': 2,
    'reg_lambda': 3,

    # learning rate
    'learning_rate': 0.02,

    # basic settings
    'nthread': 4,
    # 18 matches the number of target ids kept in rem_targets ("# 18 classes").
    'num_class': 18,
    'objective': 'multi:softprob',
    # NOTE(review): newer XGBoost releases use 'verbosity' instead of 'silent'
    # -- confirm against the installed version.
    'silent': 1,
    'eval_metric': 'mlogloss',
    'seed': 777,
}


In [None]:
#XGB Model score
def print_scores_with_logs(trn_scores, vld_scores, log_path=None):
    """Print the train/validation log-loss report and append it to a log file.

    The console output is one item per line. The same lines, each terminated
    by a newline, are appended to the log file, so successive reports stay
    readable instead of running together on a single line.

    Args:
        trn_scores: dict with a 'log loss' list of training scores.
        vld_scores: dict with a 'log loss' list of validation scores.
        log_path: file to append to. Defaults to
            "sol_<SOLUTION_NUM>_log.result" in the working directory.

    Raises:
        KeyError: if 'log loss' is missing from either score dict.
        OSError: if the log file cannot be opened for appending.
    """
    prefix = '        '
    cols = ['log loss']
    if log_path is None:
        log_path = "sol_" + SOLUTION_NUM + "_log.result"

    # Build the report once so the console and the file can never disagree.
    lines = []
    for title, scores in (('TRAIN EVAL', trn_scores), ('VALID EVAL', vld_scores)):
        lines.append('=' * 50)
        lines.append(title)
        for col in cols:
            lines.append('-' * 50)
            lines.append('# {}'.format(col))
            lines.append('# {} Mean : {}'.format(prefix, np.mean(scores[col])))
            lines.append('# {} Raw  : {}'.format(prefix, scores[col]))
    report = '\n'.join(lines)

    print(report)
    # `with` closes the handle even if the write fails.
    with open(log_path, 'a', encoding='utf-8') as f:
        f.write(report + '\n')

def print_time(end, start, log_path=None):
    """Print the elapsed seconds between two timestamps and append them to a log.

    Args:
        end: later timestamp, in seconds.
        start: earlier timestamp, in seconds.
        log_path: file to append to. Defaults to
            "sol_<SOLUTION_NUM>_log.result" in the working directory.

    Raises:
        OSError: if the log file cannot be opened for appending.
    """
    if log_path is None:
        log_path = "sol_" + SOLUTION_NUM + "_log.result"

    report = '\n'.join(['=' * 50, '{} secs'.format(round(end - start))])

    print(report)
    # Newline-terminated so the next log entry starts on its own line;
    # `with` closes the handle even if the write fails.
    with open(log_path, 'a', encoding='utf-8') as f:
        f.write(report + '\n')

def fit_and_eval(trn, target, model):
    """Cross-validate `model`, then print and log its scores and the elapsed time.

    Args:
        trn: feature table handed to `evaluate`.
        target: labels handed to `evaluate`.
        model: estimator handed to `evaluate`.
    """
    train_result, valid_result = evaluate(trn, target, model)
    print_scores_with_logs(train_result, valid_result)
    # `st` is the module-level timestamp taken when the notebook started.
    finished_at = time.time()
    print_time(finished_at, st)

## Model
#RandomForestClassifier, Xgboost

In [None]:
# RandomForest
# Baseline forest: depth capped at 10, fixed seed so runs are repeatable.
rf_model = RandomForestClassifier(
    random_state=777,
    n_jobs=-1,
    max_depth=10,
)
fit_and_eval(trn, target, model=rf_model)
# 5 sec

In [None]:
# Xgboost
def evaluate_xgb(x, y):
    """Cross-validate the XGBoost configuration and report log loss.

    Uses three stratified 90/10 shuffle splits. Each split trains a booster
    with `xgb_params` for at most `num_round` rounds, stopping early when the
    validation metric has not improved for `early_stop` rounds. Scores are
    printed and appended to the solution log.

    Args:
        x: feature table; rows are selected with `.iloc`.
        y: encoded labels, indexable with integer index arrays.
    """
    trn_scores = {'log loss': []}
    vld_scores = {'log loss': []}
    # Pin the label set so log_loss lines up with the softprob columns even if
    # a split happens to miss a class.
    labels = np.arange(xgb_params['num_class'])

    sss = StratifiedShuffleSplit(n_splits=3, test_size=0.1, random_state=777)
    for t_ind, v_ind in sss.split(x, y):
        x_trn, x_vld = x.iloc[t_ind], x.iloc[v_ind]
        y_trn, y_vld = y[t_ind], y[v_ind]

        dtrn = xgb.DMatrix(x_trn, label=y_trn)
        dvld = xgb.DMatrix(x_vld, label=y_vld)

        # The last entry of `evals` is the one early stopping monitors.
        bst = xgb.train(xgb_params, dtrn, num_round,
                        evals=[(dtrn, 'train'), (dvld, 'eval')],
                        early_stopping_rounds=early_stop,
                        verbose_eval=True)
        print('# best iteration : {}'.format(getattr(bst, 'best_iteration', None)))

        trn_scores['log loss'].append(log_loss(y_trn, bst.predict(dtrn), labels=labels))
        vld_scores['log loss'].append(log_loss(y_vld, bst.predict(dvld), labels=labels))

    print_scores_with_logs(trn_scores, vld_scores)
    print_time(time.time(), st)


evaluate_xgb(trn, target)

## Parameter Optimization

In [None]:
# TODO: the parameter search itself is not implemented yet. The import keeps
# this placeholder cell from raising NameError when the notebook is run from
# top to bottom.
from sklearn.model_selection import GridSearchCV
GridSearchCV

## 캐글에 직접 결과물 제출하기
    - MAP@7 평가척도를 기반 (https://www.kaggle.com/c/santander-product-recommendation/details/evaluation)
    - 유저당 상위 7개의 제품을 추천해야함

In [None]:
# RandomForest
from datetime import datetime
import os

print('=' * 50)
print('# Test shape : {}'.format(tst.shape))

# Refit on the whole training set with a deeper forest (max_depth=20) than the
# one used in the evaluation cell.
model = RandomForestClassifier(max_depth=20, n_jobs=-1, random_state=777)
model.fit(trn, target)

# For every test row, order the class indices from most to least probable.
class_probs = model.predict_proba(tst)
preds = np.argsort(class_probs, axis=1)[:, ::-1]

In [None]:
# XgBoost
num_round = num_round  # set this to the best round count found by the evaluation function
dtrn = xgb.DMatrix(data=trn, label=target)
bst = xgb.train(xgb_params, dtrn, num_boost_round=num_round, verbose_eval=True)

# For every test row, order the class indices from most to least probable.
dtst = xgb.DMatrix(tst)
class_probs = bst.predict(dtst)
preds = np.argsort(class_probs, axis=1)[:, ::-1]

In [None]:
# Product indicator columns; presumably a column's position in this list is
# its raw target id -- confirm against how target.pkl was built.
cols = [
    'ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1', 'ind_cder_fin_ult1',
    'ind_cno_fin_ult1', 'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1',
    'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
    'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1', 'ind_plan_fin_ult1',
    'ind_pres_fin_ult1', 'ind_reca_fin_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1',
    'ind_viv_fin_ult1', 'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1',
]
# Keep, in list order, only the columns whose position is in rem_targets.
target_cols = [col for i, col in enumerate(cols) if i in rem_targets]

In [None]:
# Imported here as well: the only other imports of these names sit in the
# RandomForest submission cell, which is skipped when submitting XGBoost output.
import os
from datetime import datetime

# The competition is scored with MAP@7, so keep the 7 highest-ranked products
# per user, joined with spaces.
final_preds = [
    ' '.join(target_cols[product] for product in pred[:7])
    for pred in preds
]

# Only the customer id column is needed from the cleaned test file.
temp = pd.read_csv('../input/test_clean.csv', usecols=['ncodpers'])
test_id = temp['ncodpers']
if len(test_id) != len(final_preds):
    raise ValueError('test_clean.csv has {} rows but {} predictions were made'
                     .format(len(test_id), len(final_preds)))
out_df = pd.DataFrame({'ncodpers': test_id, 'added_products': final_preds})

# Create the output directory on first use so to_csv does not fail.
out_dir = '../output'
os.makedirs(out_dir, exist_ok=True)
file_name = datetime.now().strftime("result_%Y%m%d%H%M%S") + '.csv'
out_df.to_csv(os.path.join(out_dir, file_name), index=False)

결과물 제출은 https://www.kaggle.com/c/santander-product-recommendation/submissions/attach 에서 할 수 있습니다.

## 나만의 머신러닝 파이프라인 흐름도(Flow Chart) 기록하기

- 원천 데이터
    - .

- 전처리
    - .

- 피쳐 엔지니어링 이전 데이터 dimension
    - .

- 피쳐 엔지니어링 
    - .

- 피쳐 엔지니어링 이후 데이터 dimension
    - .

- 모델 튜닝 
    - .

- 검증 결과 
    - .

- 실제 결과 
    - .

예시

- 원천 데이터 
    - Kaggle 경진대회 데이터 train_ver2.csv, test_ver2.csv (Link: https://www.kaggle.com/c/santander-product-recommendation/data)


- 전처리 
    - 결측값을 .fillna 함수를 통해 0으로 대체. (기존 데이터에 0이 존재할 경우 -1로 대체)


- 피쳐 엔지니어링 이전 데이터 dimension:
    - trn : (45619, 246)
    - target : (45619, 1) [18 classes]
    - tst : (929615, 246)


- 피쳐 엔지니어링
    - age_log : log(age + 1)
    - ind..._lag_one : 5월 사용자별 금융상품 보유현황
    - ind..._lag_two : 4월 사용자별 금융상품 보유현황
    - ind..._lag_thr : 3월 사용자별 금융상품 보유현황


- 피쳐 엔지니어링 이후 데이터 dimension:
    - trn : (45619, 250)
    - target : (45619, 1) [18 classes]
    - tst : (929615, 250)


- 모델 튜닝 
    - RandomForest : max_depth = 20 로 복잡도 조정


- 검증 결과 
    - trn logloss : 1.18
    - vld logloss : 1.28


- 실제 결과 
    - Public Leaderboard : 0.025984


## Appendix
    - RandomForest vs ExtraTrees 의 차이란?
        - P. Geurts, D. Ernst., and L. Wehenkel, “Extremely randomized trees”, Machine Learning, 63(1), 3-42, 2006
        - 1) ET의 경우, 각 트리를 학습할 때 bootstrap 샘플이 아닌 전체 학습 데이터를 사용한다.
        - 2) ET의 경우, 노드를 분할할 때 최적의 분할점을 탐색하지 않고 분할점(cut-point)을 완전히 임의로 선택한다.