# Feature Generator 

데이터 EDA를 기반으로 새로운 feature을 생성하겠습니다.  
순서는 다음과 같습니다. 
1. 변수 생성: `누적 행동 데이터`
2. 변수 생성: `weekday`, `holiday`
3. 변수 생성: `hour`
4. 변수 생성: `latest`
5. 변수 생성: `prefer_dvc`
6. 변수 생성: `trfc_dvc`
7. 외부데이터: `온도`, `습도`, `강수량`, `강수형태`
<br><br>

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import warnings
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook as tq
pd.set_option('display.max_row',300)
%matplotlib inline
warnings.filterwarnings(action='ignore')

## load data

In [2]:
cks_dtype = {'clnt_id':'int64','sess_id':'int64','hit_seq':'int64','action_type':'int64','biz_unit':'category','sess_dt':'object','hit_tm':'object',
             'hit_pss_tm':'int64','trans_id':'float64','sech_kwd':'object','tot_pag_view_ct':'float64','tot_sess_hr_v':'float64','trfc_src':'category',
             'dvc_ctg_nm':'object','pd_c':'object','de_dt':'object','de_tm':'object','buy_am':'int64','buy_ct':'int64','clnt_gender':'category'}

online = pd.read_csv('./data/online_03.csv', dtype=cks_dtype)
trade = pd.read_csv('./data/trade_01.csv', dtype=cks_dtype)
customer = pd.read_csv('./data/customer_01.csv', dtype=cks_dtype)
item = pd.read_csv('./data/item_01.csv', dtype=cks_dtype)

In [54]:
online.head()

Unnamed: 0,clnt_id,sess_id,hit_seq,action_type,biz_unit,sess_dt,hit_tm,hit_pss_tm,trans_id,sech_kwd,tot_pag_view_ct,tot_sess_hr_v,trfc_src,dvc_ctg_nm
0,1,1,1,0,A01,20190911,16:14,11880,,과일선물세트,14.0,124.0,DIRECT,mobile_app
1,1,1,2,0,A01,20190911,16:15,22432,,과일선물세트 백화점,14.0,124.0,PUSH,mobile_app
2,1,1,3,0,A01,20190911,16:15,36140,,과일바구니,14.0,124.0,DIRECT,mobile_app
3,1,2,1,0,A01,20190922,14:09,41584,,초등가을잠바,45.0,424.0,DIRECT,mobile_app
4,1,2,2,0,A01,20190922,14:10,56113,,초등가을점퍼,45.0,424.0,DIRECT,mobile_app


In [4]:
df1_online = online.copy()

<br>

### 1. 변수 생성: `누적 행동 데이터`
- action_type을 활용하여 누적 행동 정보를 생성하겠습니다. 
- 누적 행동 정보는 고객의 과거 행동 추이를 확인 할 수 있습니다. 

In [5]:
df1_dummy = pd.get_dummies(df1_online['action_type'], prefix='cum_act', prefix_sep='_')
df1_online = pd.concat([df1_online, df1_dummy], axis=1)

In [6]:
df1_online

Unnamed: 0,clnt_id,sess_id,hit_seq,action_type,biz_unit,sess_dt,hit_tm,hit_pss_tm,trans_id,sech_kwd,...,trfc_src,dvc_ctg_nm,cum_act_0,cum_act_1,cum_act_2,cum_act_3,cum_act_4,cum_act_5,cum_act_6,cum_act_7
0,1,1,1,0,A01,20190911,16:14,11880,,과일선물세트,...,DIRECT,mobile_app,1,0,0,0,0,0,0,0
1,1,1,2,0,A01,20190911,16:15,22432,,과일선물세트 백화점,...,PUSH,mobile_app,1,0,0,0,0,0,0,0
2,1,1,3,0,A01,20190911,16:15,36140,,과일바구니,...,DIRECT,mobile_app,1,0,0,0,0,0,0,0
3,1,2,1,0,A01,20190922,14:09,41584,,초등가을잠바,...,DIRECT,mobile_app,1,0,0,0,0,0,0,0
4,1,2,2,0,A01,20190922,14:10,56113,,초등가을점퍼,...,DIRECT,mobile_app,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3196357,72426,1,6,2,A03,20190930,00:21,282807,,,...,DIRECT,mobile_web,0,0,1,0,0,0,0,0
3196358,72426,1,7,0,A03,20190930,00:22,312037,,핫도그,...,DIRECT,mobile_web,1,0,0,0,0,0,0,0
3196359,72426,1,8,1,A03,20190930,00:22,333968,,,...,DIRECT,mobile_web,0,1,0,0,0,0,0,0
3196360,72429,1,1,1,A03,20190919,22:09,839064,,,...,DIRECT,mobile_web,0,1,0,0,0,0,0,0


In [7]:
# 각 행동 유형별로 누적해서 합해주는 함수.
for i in range(7):
    col = 'cum_act_'+str(i)
    df1_online[col] = df1_online.groupby(['clnt_id', ''])[col].cumsum() 
    ## 수정

In [8]:
df1_online.head(205).tail(10)

Unnamed: 0,clnt_id,sess_id,hit_seq,action_type,biz_unit,sess_dt,hit_tm,hit_pss_tm,trans_id,sech_kwd,...,trfc_src,dvc_ctg_nm,cum_act_0,cum_act_1,cum_act_2,cum_act_3,cum_act_4,cum_act_5,cum_act_6,cum_act_7
195,29,16,2,0,A01,20190929,17:10,93307,,아이라인,...,DIRECT,mobile_app,56,0,0,0,0,0,1,0
196,29,16,3,0,A01,20190929,17:11,131229,,쉐이딩,...,PUSH,mobile_app,57,0,0,0,0,0,1,0
197,30,1,1,0,A01,20190823,10:10,306650,,제휴혜택,...,DIRECT,mobile_app,1,0,0,0,0,0,0,0
198,31,1,1,0,A01,20190808,15:24,0,,전동킥보드악세사리,...,PORTAL_1,mobile_web,1,0,0,0,0,0,0,0
199,33,1,1,0,A01,20190807,17:53,8308,,루이까또즈지갑,...,DIRECT,mobile_web,1,0,0,0,0,0,0,0
200,34,1,1,0,A01,20190703,22:14,22622,,등산화,...,PORTAL_1,mobile_web,1,0,0,0,0,0,0,0
201,34,1,2,0,A01,20190703,22:14,49689,,등산화,...,PORTAL_1,mobile_web,2,0,0,0,0,0,0,0
202,34,1,3,0,A01,20190703,22:15,76754,,등산화,...,PORTAL_1,mobile_web,3,0,0,0,0,0,0,0
203,34,1,4,0,A01,20190703,22:16,148359,,등산화,...,PORTAL_1,mobile_web,4,0,0,0,0,0,0,0
204,34,1,5,0,A01,20190703,22:17,215510,,등산화,...,PORTAL_1,mobile_web,5,0,0,0,0,0,0,0


In [9]:
df1_online.to_csv('./data/online_04-1.csv',index=False)

In [10]:
df1_online = pd.read_csv('./data/online_04-1.csv')

<br>

### 2. 변수 생성: `weekday`, `holiday`
- 날짜 데이터를 활용하여 `weekday` 요일을 생성하겠습니다. 
- (예) 요일숫자(0-월, 1-화, 2-수, ....)
<br>

- 주말, 광복절, 추석은 `holiday`로 생성합니다. 
- (예) '2019-08-15'-광복절, '2019-09-12'-추석연휴,'2019-09-13'-추석연휴
    - 주말과 공휴일은 1, 그 외 요일은 0이 됩니다. 
    - hit_seq 1\~4에는 buy_id가 1, hit_seq 5\~10에 buy_id가 2가 됩니다.

In [11]:
# 공휴일 list
holist = ['2019-08-15','2019-09-12','2019-09-13'] 
# sess_dt col의 자료형 변경
df1_online.sess_dt = pd.to_datetime(df1_online.sess_dt, format='%Y%m%d') 
# 요일숫자(0-월, 1-화) (=dayofweek)
df1_online['day'] = df1_online['sess_dt'].dt.weekday 
df1_online['holiday'] = df1_online['day'].apply(lambda x: (x==5)|(x==6)).astype(np.int64)
df1_online['holiday'] = df1_online['holiday'] + df1_online['sess_dt'].apply(lambda x: x.strftime('%Y-%m-%d') in holist)
df1_online.head()

Unnamed: 0,clnt_id,sess_id,hit_seq,action_type,biz_unit,sess_dt,hit_tm,hit_pss_tm,trans_id,sech_kwd,...,cum_act_0,cum_act_1,cum_act_2,cum_act_3,cum_act_4,cum_act_5,cum_act_6,cum_act_7,day,holiday
0,1,1,1,0,A01,2019-09-11,16:14,11880,,과일선물세트,...,1,0,0,0,0,0,0,0,2,0
1,1,1,2,0,A01,2019-09-11,16:15,22432,,과일선물세트 백화점,...,2,0,0,0,0,0,0,0,2,0
2,1,1,3,0,A01,2019-09-11,16:15,36140,,과일바구니,...,3,0,0,0,0,0,0,0,2,0
3,1,2,1,0,A01,2019-09-22,14:09,41584,,초등가을잠바,...,4,0,0,0,0,0,0,0,6,1
4,1,2,2,0,A01,2019-09-22,14:10,56113,,초등가을점퍼,...,5,0,0,0,0,0,0,0,6,1


In [12]:
df1_online.to_csv('./data/online_04-2.csv',index=False)

In [13]:
df1_online = pd.read_csv('./data/online_04-2.csv')

<br>

### 3. 변수 생성: `hour`
- `hit_tm`을 시간대별 분류를 통해 카테고리화하였습니다.
- (예) 23:35 -> 23, 23:47 -> 23

In [14]:
df1_online['hour'] = df1_online['hit_tm'].apply(lambda x: np.int8(x[:2]))
df1_online.head()

Unnamed: 0,clnt_id,sess_id,hit_seq,action_type,biz_unit,sess_dt,hit_tm,hit_pss_tm,trans_id,sech_kwd,...,cum_act_1,cum_act_2,cum_act_3,cum_act_4,cum_act_5,cum_act_6,cum_act_7,day,holiday,hour
0,1,1,1,0,A01,2019-09-11,16:14,11880,,과일선물세트,...,0,0,0,0,0,0,0,2,0,16
1,1,1,2,0,A01,2019-09-11,16:15,22432,,과일선물세트 백화점,...,0,0,0,0,0,0,0,2,0,16
2,1,1,3,0,A01,2019-09-11,16:15,36140,,과일바구니,...,0,0,0,0,0,0,0,2,0,16
3,1,2,1,0,A01,2019-09-22,14:09,41584,,초등가을잠바,...,0,0,0,0,0,0,0,6,1,14
4,1,2,2,0,A01,2019-09-22,14:10,56113,,초등가을점퍼,...,0,0,0,0,0,0,0,6,1,14


In [15]:
df1_online.to_csv('./data/online_04-3.csv',index=False)

In [37]:
df1_online = pd.read_csv('./data/online_04-3.csv')

<br>

### 4. 변수 생성: `latest`
- 고객의 과거 행동을 담은 변수입니다. 
    - `latest_kwd_1`~`latest_kwd_6`: 가장 최근 검색한 여섯 개의 키워드를 담은 변수입니다.
    - `latest_pv_hr`
        - `pv_hr`은 온라인행동데이터의 `세션내총페이지뷰수/세션내총시간`으로 단위 페이지당 머무른 시간을 나타내는 서핑속도입니다.
        - `latest_pv_hr_1` \~ `latest_pv_hr_3`: 최근 쇼핑의 3개의 페이지 서핑 속도를 반영합니다.

In [4]:
# 3번 이상 검색된 키워드를 Type으로 크롤링한 파일 load
key2crawl = pd.read_csv('./data/key2crawl.csv')

In [38]:
# 검색 키워드와 크롤링 type을 dict 형태로 변경
key_dict = key2crawl.set_index('keyword').to_dict()['crawling']

In [48]:
# 3번 이상 검색된 키워드를 type으로 변경
online_key = df1_online['sech_kwd']
for i in range(len(online_key)):
    if online_key[i] in key_dict.keys():
        df1_online.loc[i,['sech_kwd']] = key_dict[online_key[i]]

KeyboardInterrupt: 

In [15]:
df1_online.to_csv('./data/online_04-4_1.csv',index=False)

In [37]:
df1_online = pd.read_csv('./data/online_04-4_1.csv')

In [None]:
# 가장 최근 검색한 여섯 개의 키워드를 담는 함수
def latest_kwds(df):
    for j in range(len(df)):
        if j > 0:
            temp_df = df.iloc[ : j , :]
            temp_idx = temp_df.tail(1).index
            kwd_list = temp_df[~temp_df['sech_kwd'].isnull()]['sech_kwd'].tail(6).tolist()
            kwd_list.reverse()
            if len(kwd_list) > 0:
                for idx, i in enumerate(kwd_list) :
                    df.loc[temp_idx+1, f'latest_kwd_{idx+1}'] = i
    return df

In [None]:
# 먼저 latest_kwd_x 컬럼 생성
for i in range(1, 7):
    df1_online[f'latest_kwd_{i}'] = -1

In [None]:
df1_online = df1_online.groupby(['clnt_id', 'sess_id', 'buy_id']).apply(lambda x: latest_kwds(x))

In [52]:
df1_online.to_csv('./data/online_04-4.csv',index=False)

In [53]:
df1_online = pd.read_csv('./data/online_04-4.csv')

<br>

### 5. 변수 생성 : `prefer_dvc_trfc`
- 기기와 유입 채널을 하나로 묶어줍니다. 
- (예) 

In [54]:
# dvc_ctg_nm, trfc_src에 따라 선호하는 유입 유형인 col (prefer_dvc_trfc) 생성
df1_online['prefer_dvc_trfc'] = df1_online['dvc_ctg_nm'].str.cat(df1_online['trfc_src'],sep="_")
df1_online.head()

Unnamed: 0,clnt_id,sess_id,hit_seq,action_type,biz_unit,sess_dt,hit_tm,hit_pss_tm,trans_id,sech_kwd,...,cum_act_3,cum_act_4,cum_act_5,cum_act_6,cum_act_7,day,holiday,hour,sech_clac_nm2,prefer_dvc_trfc
0,1,1,1,0,A01,2019-09-11,16:14,11880,,과일선물세트,...,0,0,0,0,0,2,0,16,,mobile_app_DIRECT
1,1,1,2,0,A01,2019-09-11,16:15,22432,,과일선물세트 백화점,...,0,0,0,0,0,2,0,16,,mobile_app_PUSH
2,1,1,3,0,A01,2019-09-11,16:15,36140,,과일바구니,...,0,0,0,0,0,2,0,16,,mobile_app_DIRECT
3,1,2,1,0,A01,2019-09-22,14:09,41584,,초등가을잠바,...,0,0,0,0,0,6,1,14,Jewelry,mobile_app_DIRECT
4,1,2,2,0,A01,2019-09-22,14:10,56113,,초등가을점퍼,...,0,0,0,0,0,6,1,14,Women's Shoes,mobile_app_DIRECT


In [55]:
df1_online['prefer_dvc_trfc'].value_counts()

mobile_app_DIRECT      1797039
mobile_app_PUSH         458757
mobile_web_DIRECT       314019
mobile_web_PUSH         267935
PC_DIRECT               161054
mobile_web_WEBSITE       38207
PC_WEBSITE               35969
PC_PORTAL_1              31605
PC_PORTAL_2              31465
mobile_web_PORTAL_1      27007
mobile_web_PORTAL_2      16656
mobile_web_PORTAL_3       9112
PC_PORTAL_3               5811
mobile_app_WEBSITE        1640
PC_PUSH                     86
Name: prefer_dvc_trfc, dtype: int64

In [57]:
df1_online.to_csv('./data/online_04-5.csv',index=False)

In [3]:
df1_online = pd.read_csv('./data/online_04-5.csv')

<br>

### 6. 외부데이터: `기온`, `습도`, `강수형태`, `강수량`
- 기상청 데이터를 활용하여 7월-9월의 데이터를 추가하겠습니다. 
- 습도, 기온, 강수형태, 강수량 feature을 생성합니다. 

In [68]:
# 외부 데이터 파일 load
TEMP = pd.read_csv('./data/기온_201907_201909.csv')
HUM = pd.read_csv('./data/습도_201907_201909.csv')
PTY = pd.read_csv('./data/강수형태_201907_201909.csv')
R06 = pd.read_csv('./data/강수량_201907_201909.csv')

In [69]:
# column명 변경
HUM.columns = ['sess_dt', 'hour','hum']
TEMP.columns =  ['sess_dt', 'hour','temp']
PTY.columns =  ['sess_dt', 'hour','pty']
R06.columns =  ['sess_dt', 'hour','r06']

In [70]:
# 월별로 나누기 위해서 각 월이 시작하는 index를 불러옴.
index_08 = HUM[HUM['sess_dt'] == ' Start : 20190801 '].index
index_09 = HUM[HUM['sess_dt'] == ' Start : 20190901 '].index

print('start 08 index:', index_08)
print('start 09 index:', index_09)

start 08 index: Int64Index([744], dtype='int64')
start 09 index: Int64Index([1489], dtype='int64')


In [71]:
# 경계 index를 기준으로 7, 8, 9월로 나눔
from functools import reduce
dfs_07 = [HUM[:744], TEMP[:744], PTY[:744], R06[:744]]
dfs_08 = [HUM[744:1489], TEMP[744:1489], PTY[744:1489], R06[744:1489]]
dfs_09 = [HUM[1489:], TEMP[1489:], PTY[1489:], R06[1489:]]

merge_07 = reduce(lambda left, right: pd.merge(left, right, on=['sess_dt', 'hour'], how='left'), dfs_07)
merge_08 = reduce(lambda left, right: pd.merge(left, right, on=['sess_dt', 'hour'], how='left'), dfs_08)
merge_09 = reduce(lambda left, right: pd.merge(left, right, on=['sess_dt', 'hour'], how='left'), dfs_09)

merge_08 = merge_08.drop([0]).reset_index(drop=True)
merge_09 = merge_09.drop([0]).reset_index(drop=True)

In [72]:
# 총 4개의 기상 데이터 merge한 예시
merge_07.head()

Unnamed: 0,sess_dt,hour,hum,temp,pty,r06
0,1,0.0,59.0,24.700001,0.0,0.0
1,1,100.0,54.0,25.799999,0.0,0.0
2,1,200.0,48.0,26.9,0.0,0.0
3,1,300.0,44.0,27.5,0.0,0.0
4,1,400.0,43.0,28.299999,0.0,0.0


In [73]:
# 기상 데이터를 합친 데이터를 online 데이터와 merge하기 위해서 처리
merge_07['sess_dt'] = merge_07['sess_dt'].apply(lambda x: '201907'+x.lstrip().zfill(2))
merge_08['sess_dt'] = merge_08['sess_dt'].apply(lambda x: '201908'+x.lstrip().zfill(2))
merge_09['sess_dt'] = merge_09['sess_dt'].apply(lambda x: '201909'+x.lstrip().zfill(2))

merge_07.sess_dt = pd.to_datetime(merge_07.sess_dt, format='%Y%m%d').astype(str)
merge_08.sess_dt = pd.to_datetime(merge_08.sess_dt, format='%Y%m%d').astype(str)
merge_09.sess_dt = pd.to_datetime(merge_09.sess_dt, format='%Y%m%d').astype(str)

merge_07['hour'] = merge_07['hour'].apply(lambda x: np.int8(x/100))
merge_08['hour'] = merge_08['hour'].apply(lambda x: np.int8(x/100))
merge_09['hour'] = merge_09['hour'].apply(lambda x: np.int8(x/100))

In [75]:
# online 데이터를 월 단위로 분리
df1_online.sort_values(by='sess_dt')
online_07 = df1_online[df1_online['sess_dt'] <= '2019-07-31']
online_08 = df1_online[df1_online['sess_dt'].apply(lambda x: x.startswith('2019-08'))]
online_09 = df1_online[df1_online['sess_dt'] > '2019-08-31']

In [76]:
# online 데이터와 기상 데이터를 월 별로 merge
online_07 = pd.merge(online_07, merge_07, on=['sess_dt', 'hour'], how='left')
online_08 = pd.merge(online_08, merge_08, on=['sess_dt', 'hour'], how='left')
online_09 = pd.merge(online_09, merge_09, on=['sess_dt', 'hour'], how='left')

In [77]:
# 다시 합쳐줌.
df2_online = pd.concat([online_07, online_08, online_09], ignore_index=True)
df2_online = df2_online.sort_values(by=['clnt_id', 'sess_id']).reset_index(drop=True)

In [78]:
df2_online.head()

Unnamed: 0,clnt_id,sess_id,hit_seq,action_type,biz_unit,sess_dt,hit_tm,hit_pss_tm,trans_id,sech_kwd,...,cum_act_7,day,holiday,hour,sech_clac_nm2,prefer_dvc_trfc,hum,temp,pty,r06
0,1,1,1,0,A01,2019-09-11,16:14,11880,,과일선물세트,...,0,2,0,16,,mobile_app_DIRECT,67.0,22.1,0.0,0.0
1,1,1,2,0,A01,2019-09-11,16:15,22432,,과일선물세트 백화점,...,0,2,0,16,,mobile_app_PUSH,67.0,22.1,0.0,0.0
2,1,1,3,0,A01,2019-09-11,16:15,36140,,과일바구니,...,0,2,0,16,,mobile_app_DIRECT,67.0,22.1,0.0,0.0
3,1,2,1,0,A01,2019-09-22,14:09,41584,,초등가을잠바,...,0,6,1,14,Jewelry,mobile_app_DIRECT,58.0,18.200001,0.0,0.0
4,1,2,2,0,A01,2019-09-22,14:10,56113,,초등가을점퍼,...,0,6,1,14,Women's Shoes,mobile_app_DIRECT,58.0,18.200001,0.0,0.0


In [81]:
df2_online.to_csv('./data/online_04-6.csv',index=False)

In [3]:
df2_online = pd.read_csv('./data/online_04-6.csv')

<br>

### 데이터 처리: `Trade + Item`
- 거래 ID와 Item을 `pd_c` 기준으로 합쳐줍니다. 


In [4]:
df3_trade = trade.merge(item, how = 'left')
df3_trade.head()

Unnamed: 0,clnt_id,trans_id,trans_seq,biz_unit,pd_c,de_dt,de_tm,buy_am,buy_ct,clac_nm1,clac_nm2,clac_nm3
0,2,42449.0,1,A02,1015.0,20190704,15:34,46430,1,Men's Clothing,Men's Upper Bodywear / Tops,Men's T-shirts
1,2,62037.0,1,A03,92.0,20190729,23:47,36000,20,Beverages,Coffee Drinks,Coffee Drinks
2,2,64691.0,1,A03,186.0,20190731,21:25,3790,1,Chilled Foods,Fish Cakes and Crab Sticks,Crab Sticks
3,2,64691.0,2,A03,151.0,20190731,21:25,3990,1,Canned / Jarred Foods,Canned Agricultural Foods,Canned Vegetable Foods
4,2,64691.0,3,A03,351.0,20190731,21:25,4690,1,Dairy Products,Processed Dairy Products,Cream and Condensed milk


In [None]:
df3_trade = df3_trade[['trans_id', 'clac_nm2']]

In [5]:
df3_dummy = pd.get_dummies(df3_trade['clac_nm2'])

In [6]:
df3_trade = pd.concat([df3_trade, df3_dummy], axis=1)
df3_trade.drop('clac_nm2', axis=1, inplace=True)
# df3_trade['trans_id'] = df3_trade['trans_id'].astype(str)

df3_trade.head()

Unnamed: 0,trans_id,Adults' Bedding,Air Freshners / Dehumidifiers / Odor Eliminators,Air Purifiers / Humidifiers / Dehumidifiers,Alcoholic Beverage Sets,Animal Clinics,Arts / Crafts Supplies,Audios,Australian Imported Beefs,Automotive Replacement Repair / Maintanance Kits,...,Women's Socks and Hosiery,Women's Special Materials Clothing,Women's Special Use Clothing,Women's Sport Shoes,Women's Underwear,Women's Upper Bodywear / Tops,Women's Wallets,Writing Pads,Writing Supplies,Yogurt
0,42449.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,62037.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,64691.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,64691.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,64691.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
df3_trade = df3_trade.groupby(df3_trade['trans_id']).sum()

In [8]:
df3_trade.to_csv('./data/test1.csv')
df3_trade = pd.read_csv('./data/test1.csv')

df3_trade.head()

Unnamed: 0,trans_id,Adults' Bedding,Air Freshners / Dehumidifiers / Odor Eliminators,Air Purifiers / Humidifiers / Dehumidifiers,Alcoholic Beverage Sets,Animal Clinics,Arts / Crafts Supplies,Audios,Australian Imported Beefs,Automotive Replacement Repair / Maintanance Kits,...,Women's Socks and Hosiery,Women's Special Materials Clothing,Women's Special Use Clothing,Women's Sport Shoes,Women's Underwear,Women's Upper Bodywear / Tops,Women's Wallets,Writing Pads,Writing Supplies,Yogurt
0,1.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<br>

### 데이터 처리: `Online + Customer`
- online 데이터와 고객 데이터를 `clnt_id` 기준으로 합쳐줍니다. 


In [9]:
df3_online = df2_online.merge(customer, how = 'left')
df3_online.head()

Unnamed: 0,clnt_id,sess_id,hit_seq,action_type,biz_unit,sess_dt,hit_tm,hit_pss_tm,trans_id,sech_kwd,...,holiday,hour,sech_clac_nm2,prefer_dvc_trfc,hum,temp,pty,r06,clnt_gender,clnt_age
0,1,1,1,0,A01,2019-09-11,16:14,11880,,과일선물세트,...,0,16,,mobile_app_DIRECT,67.0,22.1,0.0,0.0,,
1,1,1,2,0,A01,2019-09-11,16:15,22432,,과일선물세트 백화점,...,0,16,,mobile_app_PUSH,67.0,22.1,0.0,0.0,,
2,1,1,3,0,A01,2019-09-11,16:15,36140,,과일바구니,...,0,16,,mobile_app_DIRECT,67.0,22.1,0.0,0.0,,
3,1,2,1,0,A01,2019-09-22,14:09,41584,,초등가을잠바,...,1,14,Jewelry,mobile_app_DIRECT,58.0,18.200001,0.0,0.0,,
4,1,2,2,0,A01,2019-09-22,14:10,56113,,초등가을점퍼,...,1,14,Women's Shoes,mobile_app_DIRECT,58.0,18.200001,0.0,0.0,,


<br>

### 데이터 처리: `Online + Trade`
- merge를 하기 전 trans_id의 중복을 제거하기 위함입니다. 


In [10]:
df3_merge = pd.merge(df3_online, df3_trade, 
                on='trans_id', how='left')

In [11]:
df3_merge.to_csv('./data/df4_merge01.csv',index=False)

In [12]:
df3_merge

Unnamed: 0,clnt_id,sess_id,hit_seq,action_type,biz_unit,sess_dt,hit_tm,hit_pss_tm,trans_id,sech_kwd,...,Women's Socks and Hosiery,Women's Special Materials Clothing,Women's Special Use Clothing,Women's Sport Shoes,Women's Underwear,Women's Upper Bodywear / Tops,Women's Wallets,Writing Pads,Writing Supplies,Yogurt
0,1,1,1,0,A01,2019-09-11,16:14,11880,,과일선물세트,...,,,,,,,,,,
1,1,1,2,0,A01,2019-09-11,16:15,22432,,과일선물세트 백화점,...,,,,,,,,,,
2,1,1,3,0,A01,2019-09-11,16:15,36140,,과일바구니,...,,,,,,,,,,
3,1,2,1,0,A01,2019-09-22,14:09,41584,,초등가을잠바,...,,,,,,,,,,
4,1,2,2,0,A01,2019-09-22,14:10,56113,,초등가을점퍼,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3196357,72428,3,3,0,A01,2019-09-14,21:50,839771,,페레가모가방,...,,,,,,,,,,
3196358,72428,4,1,0,A01,2019-09-14,23:42,243555,,펜디가방,...,,,,,,,,,,
3196359,72428,5,1,0,A01,2019-09-18,06:37,12568,,라인에디션블라우스,...,,,,,,,,,,
3196360,72429,1,1,1,A03,2019-09-19,22:09,839064,,,...,,,,,,,,,,


In [None]:
df3_merge[df3_merge.columns[:33]]

In [None]:
df3_merge.to_csv('./data/df4_merge01.csv',index=False)

In [None]:
df3_merge = pd.read_csv('./data/df4_merge01.csv')