# Wide & Deep Learning model 구현

    * 꿈이 많은 사람의 이야기 블로그 참조하여 Wide & Deep Learning model 구현해보기
      - https://lsjsj92.tistory.com/597

* 라이브러리 불러오기

In [142]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, LabelEncoder

from tensorflow.keras.utils import plot_model

In [143]:
COLUMNS = [
    "clnt_id", "clnt_age", "hit_pss_tm", "action_type", "de_dt", 
    "hit_pss_tm", "trans_id", "tot_pag_view_ct", "tot_sess_hr_v", 
    "trfc_src", "dvc_ctg_nm"
]

CATEGORICAL_COLUMNS = [
    "clnt_gender", "trfc_src", "dvc_ctg_nm"
]

CONTINUOUS_COLUMNS = [
    "clnt_id", "clnt_age", "hit_pss_tm", "action_type", "de_dt", 
    "tot_pag_view_ct", "tot_sess_hr_v"
]

In [144]:
len(COLUMNS)

11

In [145]:
len(CATEGORICAL_COLUMNS)

3

In [146]:
len(CONTINUOUS_COLUMNS)

7

In [147]:
path=os.getenv("HOME") + ('/repo/Lpoint-Hackathon/src/new_data/all_data_nm3.csv')

In [148]:
train_data = pd.read_csv(path)

In [149]:
train_data.head()

Unnamed: 0,clnt_id,action_type,hit_pss_tm,sech_kwd,tot_pag_view_ct,tot_sess_hr_v,trfc_src,dvc_ctg_nm,de_dt,buy_am,buy_ct,clnt_gender,clnt_age,clac_nm3
0,49906,6,627381,,63.0,2211.0,DIRECT,unknown,20190701,1990,1,F,30.0,기능성 우유
1,67265,6,313631,,10.0,313.0,unknown,mobile_app,20190701,56900,1,F,40.0,성인용 침구 세트
2,68972,6,554066,,31.0,652.0,DIRECT,unknown,20190701,9980,2,F,50.0,인스턴트 라이스
3,41763,6,1241857,,39.0,1242.0,DIRECT,unknown,20190701,1290,1,F,40.0,새송이버섯
4,15344,6,120487,,10.0,133.0,DIRECT,unknown,20190701,4100,1,F,40.0,Cokes


In [150]:
train_data.shape

(123095, 14)

In [151]:
data = train_data.copy()

In [152]:
data['clac_nm3'] = data['clac_nm3'].astype(str)

In [153]:
le = LabelEncoder()
data['label'] = le.fit_transform(data['clac_nm3'])

In [154]:
data.head()

Unnamed: 0,clnt_id,action_type,hit_pss_tm,sech_kwd,tot_pag_view_ct,tot_sess_hr_v,trfc_src,dvc_ctg_nm,de_dt,buy_am,buy_ct,clnt_gender,clnt_age,clac_nm3,label
0,49906,6,627381,,63.0,2211.0,DIRECT,unknown,20190701,1990,1,F,30.0,기능성 우유,95
1,67265,6,313631,,10.0,313.0,unknown,mobile_app,20190701,56900,1,F,40.0,성인용 침구 세트,448
2,68972,6,554066,,31.0,652.0,DIRECT,unknown,20190701,9980,2,F,50.0,인스턴트 라이스,730
3,41763,6,1241857,,39.0,1242.0,DIRECT,unknown,20190701,1290,1,F,40.0,새송이버섯,428
4,15344,6,120487,,10.0,133.0,DIRECT,unknown,20190701,4100,1,F,40.0,Cokes,9


In [155]:
for c in CATEGORICAL_COLUMNS:
    le = LabelEncoder()
    data[c] = le.fit_transform(data[c])

In [156]:
data.head()

Unnamed: 0,clnt_id,action_type,hit_pss_tm,sech_kwd,tot_pag_view_ct,tot_sess_hr_v,trfc_src,dvc_ctg_nm,de_dt,buy_am,buy_ct,clnt_gender,clnt_age,clac_nm3,label
0,49906,6,627381,,63.0,2211.0,0,3,20190701,1990,1,0,30.0,기능성 우유,95
1,67265,6,313631,,10.0,313.0,6,1,20190701,56900,1,0,40.0,성인용 침구 세트,448
2,68972,6,554066,,31.0,652.0,0,3,20190701,9980,2,0,50.0,인스턴트 라이스,730
3,41763,6,1241857,,39.0,1242.0,0,3,20190701,1290,1,0,40.0,새송이버섯,428
4,15344,6,120487,,10.0,133.0,0,3,20190701,4100,1,0,40.0,Cokes,9


In [157]:
#data.drop('hit_tm', axis = 1, inplace=True)
data.drop('sech_kwd', axis = 1, inplace=True)

In [158]:
data.head()

Unnamed: 0,clnt_id,action_type,hit_pss_tm,tot_pag_view_ct,tot_sess_hr_v,trfc_src,dvc_ctg_nm,de_dt,buy_am,buy_ct,clnt_gender,clnt_age,clac_nm3,label
0,49906,6,627381,63.0,2211.0,0,3,20190701,1990,1,0,30.0,기능성 우유,95
1,67265,6,313631,10.0,313.0,6,1,20190701,56900,1,0,40.0,성인용 침구 세트,448
2,68972,6,554066,31.0,652.0,0,3,20190701,9980,2,0,50.0,인스턴트 라이스,730
3,41763,6,1241857,39.0,1242.0,0,3,20190701,1290,1,0,40.0,새송이버섯,428
4,15344,6,120487,10.0,133.0,0,3,20190701,4100,1,0,40.0,Cokes,9


## data를 Train, Test set으로 구분

In [159]:
x_train = data[data["de_dt"]<20190901]
x_test = data[data["de_dt"]>20190901]

In [160]:
x_train.head()

Unnamed: 0,clnt_id,action_type,hit_pss_tm,tot_pag_view_ct,tot_sess_hr_v,trfc_src,dvc_ctg_nm,de_dt,buy_am,buy_ct,clnt_gender,clnt_age,clac_nm3,label
0,49906,6,627381,63.0,2211.0,0,3,20190701,1990,1,0,30.0,기능성 우유,95
1,67265,6,313631,10.0,313.0,6,1,20190701,56900,1,0,40.0,성인용 침구 세트,448
2,68972,6,554066,31.0,652.0,0,3,20190701,9980,2,0,50.0,인스턴트 라이스,730
3,41763,6,1241857,39.0,1242.0,0,3,20190701,1290,1,0,40.0,새송이버섯,428
4,15344,6,120487,10.0,133.0,0,3,20190701,4100,1,0,40.0,Cokes,9


In [161]:
x_test.head()

Unnamed: 0,clnt_id,action_type,hit_pss_tm,tot_pag_view_ct,tot_sess_hr_v,trfc_src,dvc_ctg_nm,de_dt,buy_am,buy_ct,clnt_gender,clnt_age,clac_nm3,label
85903,23275,6,2098053,56.0,2103.0,0,3,20190902,4990,1,0,30.0,찹쌀,806
85904,31000,6,4491970,30.0,4492.0,4,3,20190902,6990,1,1,40.0,사과,422
85905,64682,6,1120341,50.0,1120.0,0,3,20190902,3990,1,0,30.0,달걀,254
85906,40955,6,945364,21.0,1042.0,0,0,20190902,27000,1,0,30.0,물,337
85907,54200,6,6870197,134.0,6870.0,4,3,20190902,1580,2,0,40.0,인스턴트 카레 / 검은콩 소스,732


In [162]:
y_train = x_train['label'].values
x_train.drop('label', axis = 1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [163]:
x_train.drop('clac_nm3', axis = 1, inplace=True)

In [164]:
y_test = x_test['label'].values
x_test.drop('label', axis = 1, inplace=True)

In [165]:
print(len(x_train), len(y_train), len(x_test), len(y_test))

84992 84992 37192 37192


## 카테고리 값들과 연속값들을 뽑아냄

In [166]:
x_train_category = np.array(x_train[CATEGORICAL_COLUMNS])
x_test_category = np.array(x_test[CATEGORICAL_COLUMNS])
x_train_continue = np.array(x_train[CONTINUOUS_COLUMNS], dtype='float64')
x_test_continue = np.array(x_test[CONTINUOUS_COLUMNS], dtype='float64')

In [167]:
print(x_train_category[:1] )
print(x_test_category[:1])
print(x_train_continue[:1])
print(x_test_continue[:1])

[[0 0 3]]
[[0 0 3]]
[[4.9906000e+04 3.0000000e+01 6.2738100e+05 6.0000000e+00 2.0190701e+07
  6.3000000e+01 2.2110000e+03]]
[[2.3275000e+04 3.0000000e+01 2.0980530e+06 6.0000000e+00 2.0190902e+07
  5.6000000e+01 2.1030000e+03]]


In [168]:
x_train_continue

array([[4.9906000e+04, 3.0000000e+01, 6.2738100e+05, ..., 2.0190701e+07,
        6.3000000e+01, 2.2110000e+03],
       [6.7265000e+04, 4.0000000e+01, 3.1363100e+05, ..., 2.0190701e+07,
        1.0000000e+01, 3.1300000e+02],
       [6.8972000e+04, 5.0000000e+01, 5.5406600e+05, ..., 2.0190701e+07,
        3.1000000e+01, 6.5200000e+02],
       ...,
       [5.5370000e+04, 3.0000000e+01, 4.2566510e+06, ..., 2.0190831e+07,
        9.7000000e+01, 4.4230000e+03],
       [6.7877000e+04, 5.0000000e+01, 3.3481840e+06, ..., 2.0190831e+07,
        6.8000000e+01, 3.4170000e+03],
       [6.5444000e+04, 4.0000000e+01, 1.2128720e+06, ..., 2.0190831e+07,
        6.7000000e+01, 1.2130000e+03]])

In [169]:
x_train_category.shape

(84992, 3)

In [170]:
x_train_continue.shape

(84992, 7)

## 정규화

In [171]:
scaler = StandardScaler()
x_train_continue = scaler.fit_transform(x_train_continue)
x_test_continue = scaler.transform(x_test_continue)

In [172]:
print(x_train_continue[:1])
print(x_test_continue[:1])

[[ 0.5968563  -1.03754514 -0.69670276 -0.15626785 -1.25113817  0.53265036
   0.32936634]]
[[-0.73341716 -1.03754514  0.42091183 -0.15626785  2.71772985  0.32306148
   0.25392887]]


* 정규화 내용 확인

In [173]:
x_train_continue[0]

array([ 0.5968563 , -1.03754514, -0.69670276, -0.15626785, -1.25113817,
        0.53265036,  0.32936634])

In [174]:
print(x_train_continue[0].sum())
print(x_train_continue[1].sum())
print(x_train_continue[2].sum())
print(x_train_continue[3].sum())
print(x_train_continue[4].sum())

-1.6827809228098114
-2.743623939309396
-0.38698475148351025
-1.794911631889985
-5.60969107382094


 - 정규화 한다고 다 더해서 1이 되거나 하지는 않는듯
 - 그렇다고 개별 값이 0~1은 아님

## Polynomial 하게 바꿔줌 
### (비선형적인 설정으로 선형 회귀를 확장하는 방법. 즉 다항식 함수로 바꿔줌)
    - 카테고리 값을 Polynomial로 바꿔줌

* sklearn.preprocessing.PolynomialFeatures 메소드
    - degree : 다항식 차수
    - interaction_only
        - default는 False
        - ex) degree = 3일 때, interaction_only=false 이면
            - a^2, a^3, b^2, b^3, ab, a^2*b, ab^2 Feature가 추가되고,
        - interaction_only=True 이면
            - ab만 추가됨
        

In [175]:
poly = PolynomialFeatures(degree=2, interaction_only=True)

In [176]:
x_train_category

array([[0, 0, 3],
       [0, 6, 1],
       [0, 0, 3],
       ...,
       [0, 0, 3],
       [0, 0, 3],
       [0, 0, 3]])

In [177]:
x_train_category.shape

(84992, 3)

In [178]:
x_test_category.shape

(37192, 3)

In [179]:
x_train_category_poly = poly.fit_transform(x_train_category)
x_test_category_poly = poly.fit_transform(x_test_category)

In [180]:
print(x_train_category_poly[:1])
print(x_train_category_poly[:1].shape)
print(x_test_category_poly[:1])
print(x_test_category_poly[:1].shape)

[[1. 0. 0. 3. 0. 0. 0.]]
(1, 7)
[[1. 0. 0. 3. 0. 0. 0.]]
(1, 7)


=> feature가 8개 이므로, interaction은 7+6+5+4+3+2+1=28개 이고,
상수항 1을 추가하여 8+28+1=37

In [181]:
data.head()

Unnamed: 0,clnt_id,action_type,hit_pss_tm,tot_pag_view_ct,tot_sess_hr_v,trfc_src,dvc_ctg_nm,de_dt,buy_am,buy_ct,clnt_gender,clnt_age,clac_nm3,label
0,49906,6,627381,63.0,2211.0,0,3,20190701,1990,1,0,30.0,기능성 우유,95
1,67265,6,313631,10.0,313.0,6,1,20190701,56900,1,0,40.0,성인용 침구 세트,448
2,68972,6,554066,31.0,652.0,0,3,20190701,9980,2,0,50.0,인스턴트 라이스,730
3,41763,6,1241857,39.0,1242.0,0,3,20190701,1290,1,0,40.0,새송이버섯,428
4,15344,6,120487,10.0,133.0,0,3,20190701,4100,1,0,40.0,Cokes,9


In [182]:
len(data)

123095

In [183]:
data[CATEGORICAL_COLUMNS[0]]

0         0
1         0
2         0
3         0
4         0
         ..
123090    0
123091    0
123092    0
123093    1
123094    0
Name: clnt_gender, Length: 123095, dtype: int64

In [184]:
data[CATEGORICAL_COLUMNS].head()

Unnamed: 0,clnt_gender,trfc_src,dvc_ctg_nm
0,0,0,3
1,0,6,1
2,0,0,3
3,0,0,3
4,0,0,3


 * np.unique : np.arr 내 중복 제거

In [185]:
from tensorflow.keras.layers import Input, Embedding, Dense, Flatten, Dropout, SpatialDropout1D, Activation, concatenate


In [186]:
from tensorflow.python.keras.layers.normalization import BatchNormalization


In [187]:
from tensorflow.python.keras.layers.advanced_activations import ReLU, PReLU, LeakyReLU, ELU
from tensorflow.python.keras.optimizers import Adam, SGD
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.python.keras.models import Model

In [188]:
def get_deep_model():
    
    category_inputs = []
    category_embeds = []
    
    # Categorical Data Embedding
    for i in range(len(CATEGORICAL_COLUMNS)):
        
        # input - embedding - flatten 순으로 layer 쌓기
        input_i = Input(shape=(1,), dtype='int32')
        
        dim = len(np.unique(data[CATEGORICAL_COLUMNS[i]]))
        # dim : data에서 카테고리별 요소가 몇 종류인지?
        # 예시 data[학력] = '초', '중', '고' => dim = 3
        
        embed_dim = int(np.ceil(dim ** 0.5))
        # embedding 차원을 0.5배 정도로 수행? (연산은 루트 올림인디?)
        # 왜 임베딩 차원을 이렇게 하는지 추후 검토
        
        embed_i = Embedding(dim, embed_dim, input_length=1)(input_i)
        # dim : 데이터가 몇 종류 있는지 = 임베딩 벡터를 몇 개 뽑아낼 것인지
        # embed_dim : 임베딩 처리 후 벡터의 차원 = 임베딩 벡터를 몇 차원 벡터로 뽑아 낼 것인지
        # input_length : 입력 데이터 길이
        
        flatten_i = Flatten()(embed_i)
        # category 값을 임베딩환 벡터들을 flatten
        # 서로 다른 값의 벡터 요소가 합쳐져도 괜찮은지??
        # 어차피 class에 대한 순서가 있으니 상관 없을듯
        
        category_inputs.append(input_i)
        category_embeds.append(flatten_i)
        
    # continuous 데이터 input
    continue_input = Input(shape=(len(CONTINUOUS_COLUMNS),))
    continue_dense = Dense(256, use_bias=False)(continue_input)
    # use_bias = False로 하는 이유는??
    
    # category와 continue를 합침
    concat_embeds = concatenate([continue_dense] + category_embeds)
    concat_embeds = Activation('relu')(concat_embeds)
    # Activation 효과 다시 공부
    # relu 말고 다른 것은 어떤지??
    bn_concat = BatchNormalization()(concat_embeds)
    # Batch Normalization 효과 다시 공부
    
    fc1 = Dense(512, use_bias=False)(bn_concat)
    relu1 = ReLU()(fc1)
    bn1 = BatchNormalization()(relu1)
    fc2 = Dense(256, use_bias=False)(bn1)
    relu2 = ReLU()(fc2)
    bn2 = BatchNormalization()(relu2)
    fc3 = Dense(128)(bn2)
    relu3 = ReLU()(fc3)
    
    return category_inputs, continue_input, relu3

In [189]:
x_train_category

array([[0, 0, 3],
       [0, 6, 1],
       [0, 0, 3],
       ...,
       [0, 0, 3],
       [0, 0, 3],
       [0, 0, 3]])

In [190]:
x_train_category_poly

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 6., ..., 0., 0., 6.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [191]:
x_train_category_poly.shape

(84992, 7)

In [192]:
import tensorflow as tf

In [193]:
def get_wide_model():
    dim = x_train_category_poly.shape[1]
    return tf.keras.layers.Input(shape=(dim,))

# x_train_category_poly : 카테고리 데이터를 숫자로 바꾸고, Poly Feature를 추가한 것
# Poly Feature : a, b, c Feature를 이용해서 ab, bc, ca Feature를 만든것
# 데이터의 shape 만 가져옴

    * input - embedding - flatten 순으로 layer 쌓기

In [194]:
category_inputs, continue_input, deep_model = get_deep_model()
wide_model = get_wide_model()

In [195]:
wide_model

<tf.Tensor 'input_10:0' shape=(None, 7) dtype=float32>

In [196]:
deep_model

<tf.Tensor 're_lu_5/Relu:0' shape=(None, 128) dtype=float32>

### Wide모델과 Deep model을 합치기

* wide model, deep model 합치기

In [197]:
out_layer = concatenate([deep_model, wide_model])

In [198]:
out_layer

<tf.Tensor 'concatenate_3/concat:0' shape=(None, 135) dtype=float32>

* 입력 값들 shape 확인

In [199]:
continue_input

<tf.Tensor 'input_9:0' shape=(None, 7) dtype=float32>

In [200]:
category_inputs

[<tf.Tensor 'input_6:0' shape=(None, 1) dtype=int32>,
 <tf.Tensor 'input_7:0' shape=(None, 1) dtype=int32>,
 <tf.Tensor 'input_8:0' shape=(None, 1) dtype=int32>]

In [201]:
wide_model

<tf.Tensor 'input_10:0' shape=(None, 7) dtype=float32>

* 입력 값들 종합하기

In [202]:
inputs = [continue_input] + category_inputs + [wide_model]

In [203]:
inputs

[<tf.Tensor 'input_9:0' shape=(None, 7) dtype=float32>,
 <tf.Tensor 'input_6:0' shape=(None, 1) dtype=int32>,
 <tf.Tensor 'input_7:0' shape=(None, 1) dtype=int32>,
 <tf.Tensor 'input_8:0' shape=(None, 1) dtype=int32>,
 <tf.Tensor 'input_10:0' shape=(None, 7) dtype=float32>]

In [204]:
len(data['label'].value_counts())

1055

* wide model, deep model 합친 것을 한 점으로 모으기

In [205]:
output = Dense(len(data['label'].value_counts()), activation='sigmoid')(out_layer)
# 여기서는 왜 relu가 아니고 sigmoid를 썻는지???
# 마지막 출력 값 범위를 0~1로 출력하기 위함 일듯?

In [206]:
model = Model(inputs=inputs, outputs=output)

In [207]:
model.summary()

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_7 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_9 (InputLayer)            [(None, 7)]          0                                            
_______________________________________________________________________________________

In [208]:
checkpoint = ModelCheckpoint(filepath='./data/wide-deep.h5', monitor='val_loss', verbose=1, save_best_only=True)
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

### 입력 데이터

    * 위에서 정의한 리스트 변수 inputs에 맞추어
    * continue 데이터 => category 데이터 => poly data 순으로 입력 값을 넣어준다

 * 데이터 확인

* x_train_continue
    - 숫자형 데이터(나이, 학력 수치, gain, loss, 업무시간)를 float형으로 변환하고
    - StandardScaler()를 통해 정규화 한 것

In [209]:
x_train_continue

array([[ 0.5968563 , -1.03754514, -0.69670276, ..., -1.25113817,
         0.53265036,  0.32936634],
       [ 1.4639742 ,  0.18555448, -0.93513226, ..., -1.25113817,
        -1.05423692, -0.99637742],
       [ 1.54924238,  1.4086541 , -0.75241737, ..., -1.25113817,
        -0.42547026, -0.75958757],
       ...,
       [ 0.86979438, -1.03754514,  2.06130521, ...,  1.31579139,
         1.55065353,  1.87443758],
       [ 1.49454487,  1.4086541 ,  1.37092967, ...,  1.31579139,
         0.68235671,  1.17175148],
       [ 1.37301149,  0.18555448, -0.25176787, ...,  1.31579139,
         0.65241544, -0.3677318 ]])

In [210]:
x_train_continue.shape

(84992, 7)

* x_train_category
    - 범주형 데이터(업종, 학력, 결혼 여부, 직업, 가족관계, 인종, 성별, 국적)을  
    - LabelEncoder()를 통해 숫자(정수)로 변경(인코딩)

* x_train_category_poly
    - 각 feature를 2개씩 쌍을 지어 내적한 결과로 Feature를 추가한 데이터

In [211]:
x_train_category

array([[0, 0, 3],
       [0, 6, 1],
       [0, 0, 3],
       ...,
       [0, 0, 3],
       [0, 0, 3],
       [0, 0, 3]])

In [212]:
x_train_category.shape

(84992, 3)

In [213]:
[x_train_category[:, i] for i in range(x_train_category.shape[1])]

[array([0, 0, 0, ..., 0, 0, 0]),
 array([0, 6, 0, ..., 0, 0, 0]),
 array([3, 1, 3, ..., 3, 3, 3])]

In [214]:
[x_train_category_poly]

[array([[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 6., ..., 0., 0., 6.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]])]

In [215]:
x_train_category_poly.shape

(84992, 7)

 * 모델 입력 데이터 설정 부분
 * category랑 category_poly랑 중복 아닌지??? => 나중에 확인할 것

In [216]:
input_data = [x_train_continue] + [x_train_category[:, i] for i in range(x_train_category.shape[1])] + [x_train_category_poly]

In [217]:
len(input_data)

5

In [218]:
# 이게 실수형 데이터 들
input_data[0]

array([[ 0.5968563 , -1.03754514, -0.69670276, ..., -1.25113817,
         0.53265036,  0.32936634],
       [ 1.4639742 ,  0.18555448, -0.93513226, ..., -1.25113817,
        -1.05423692, -0.99637742],
       [ 1.54924238,  1.4086541 , -0.75241737, ..., -1.25113817,
        -0.42547026, -0.75958757],
       ...,
       [ 0.86979438, -1.03754514,  2.06130521, ...,  1.31579139,
         1.55065353,  1.87443758],
       [ 1.49454487,  1.4086541 ,  1.37092967, ...,  1.31579139,
         0.68235671,  1.17175148],
       [ 1.37301149,  0.18555448, -0.25176787, ...,  1.31579139,
         0.65241544, -0.3677318 ]])

In [219]:
print(input_data[1])

[0 0 0 ... 0 0 0]


In [220]:
print(len(input_data[1]))

84992


* 모델 파라미터 설정

In [221]:
epochs = 1
optimizer = 'adam'
batch_size = 128

 * y : 수입이 50k 초과인지 아닌지
 * 초과면 1, 아니면 0

In [222]:
test = np.eye(1055)[y_train]

In [223]:
y_train

array([  95,  448,  730, ..., 1051,  432, 1051])

In [224]:
test[0].argmax()

95

In [225]:
y_train.shape

(84992,)

In [226]:
input_data[0].shape

(84992, 7)

In [227]:
model.compile(optimizer=optimizer, loss='mse', metrics=['TopKCategoricalAccuracy'])
model.fit(input_data, test, epochs=epochs, batch_size=batch_size, validation_split=0.15, callbacks=[checkpoint, early_stopping])

Epoch 00001: val_loss improved from inf to 0.00095, saving model to ./data/wide-deep.h5


<tensorflow.python.keras.callbacks.History at 0x7f061000aa50>

## 평가

In [228]:
# train input data와 같은 방식으로 test data를 input 형식에 맞추어줌
eval_input_data = [x_test_continue] + [x_test_category[:, i] for i in range(x_test_category.shape[1])] + [x_test_category_poly]

In [229]:
test1 = np.eye(1055)[y_test]

In [230]:
loss, acc = model.evaluate(eval_input_data, test1)



In [231]:
print(f'test_loss: {loss} - test_acc: {acc}')
# 문자열 앞에 f는 formating이었나? 확인

test_loss: 0.0009463226888328791 - test_acc: 0.11150247603654861


* 모델 그래프로 그리기
    - 밑에 함수 수행하기 전에 터미널 명령어로 설치해야함
    1. pip install pydot
    2. sudo apt install graphviz

In [232]:
# 정규화된 데이터간 거리 구해보기
from scipy.spatial import distance

a = [0,0,0]
b = [0,3,4]
dist_ab = distance.euclidean(a,b)
print(dist_ab)

5.0


In [233]:
predict_y = model.predict(eval_input_data)

### 정확도 평가

In [264]:
def get_acc(score_matrix, top_n, test_matix):
    avg_acc = 0
    for i in range(len(score_matrix)):
        top = score_matrix.iloc[i].nlargest(top_n).index
        tmp = 0
        for j in range(len(top)):
#             print('test_matix', test_matix["target"][i])
#             print('top', top[j])
            
            if top[j] == test_matix["target"][i]:
                tmp += 1
        acc = tmp / len(top)
        avg_acc += acc / len(score_matrix)

    return avg_acc

In [286]:
dic = {'clnt_id': x_test['clnt_id'],'target' : y_test}
target_matrix = pd.DataFrame(dic).reset_index()
target_matrix = target_matrix.drop(['index'], axis=1)

target_matrix.head()

Unnamed: 0,clnt_id,target
0,23275,806
1,31000,422
2,64682,254
3,40955,337
4,54200,732


In [288]:
small_p = pd.DataFrame(predict_y[:1000])
small_p.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1045,1046,1047,1048,1049,1050,1051,1052,1053,1054
0,0.001101,0.000697,0.001059,0.000112,5.4e-05,0.000416,0.004709,0.000118,0.00058,1.4e-05,...,0.001289,6.7e-05,0.000497,0.000118,0.000186,8.6e-05,0.00039,0.000778,2.6e-05,0.000761
1,0.003764,0.002695,0.002318,0.000195,0.000146,0.001931,0.002673,0.000802,0.001769,0.000139,...,0.001504,0.000259,0.0006,0.000691,0.000288,0.000247,0.000831,0.001781,0.000271,0.00243
2,0.000136,0.000315,0.000467,3.3e-05,1.8e-05,2.5e-05,0.001205,3.3e-05,0.000212,5e-06,...,0.000331,3e-06,7.2e-05,1.5e-05,0.000103,1.4e-05,0.000426,0.00034,3e-06,0.000341
3,0.000209,0.000375,0.00041,4.1e-05,2e-05,2.6e-05,0.001254,3.7e-05,0.00015,3e-06,...,0.000388,6e-06,0.000114,1.1e-05,9.2e-05,1.7e-05,0.000373,0.000352,6e-06,0.000352
4,0.001144,0.00344,0.001801,0.000324,0.000497,0.001939,0.000578,0.000485,0.001677,0.000229,...,0.001035,0.000301,0.00172,0.000649,0.000495,0.000418,0.000785,0.001992,0.000662,0.002127


In [289]:
accuracy  = get_acc(small_p, 5, target_matrix)

print(f"정확도: {accuracy*100}%")

정확도: 2.519999999999996%


### 정확도 평가

In [258]:
def get_pred_list(predict_y, top_n, target_matrix, columns):
    test_matrix = target_matrix.copy()
    pred_matrix = predict_y.copy()
    pred_matrix.rename(columns = columns['hangle'], inplace = True)
    for i in range(len(pred_matrix)):
        top = pred_matrix.iloc[i].nlargest(top_n).index
        top = pd.DataFrame(top.astype(str).to_frame().apply(lambda x: ", ".join(x)))
        test_matrix.loc[i, 'pred'] = top.values
    test_matrix['target'] = test_matrix['target'].apply(lambda x: columns['hangle'][x])
    return test_matrix

In [290]:
d={'hangle': train_data['clac_nm3'], 'label': data['label']}
df = pd.DataFrame(data=d).drop_duplicates()
cate2papago = df.set_index('label').to_dict()

In [246]:
test = get_pred_list(small_p, 5, target_matrix, cate2papago)

In [248]:
test[test['pred'].notna()]

Unnamed: 0,clnt_id,target,pred
0,23275,찹쌀,"[달걀, 우유, 라면, 물, 두부]"
1,31000,사과,"[달걀, 스푼형 요거트, 일반 스낵, 우유, 기타 크랩]"
2,64682,달걀,"[우유, 달걀, 라면, 두부, 물]"
3,40955,물,"[우유, 달걀, 라면, 물, 소세지]"
4,54200,인스턴트 카레 / 검은콩 소스,"[달걀, 일반 스낵, 새우, 국내산 돼지고기 - 삼겹살, 두부]"
...,...,...,...
995,37053,옥수수 스낵,"[달걀, 우유, 두부, 스푼형 요거트, 여자 로퍼]"
996,69408,라면,"[달걀, 우유, 두부, 일반 스낵, 국내 Beefs-생크스.]"
997,71409,크래커,"[우유, 달걀, 두부, 라면, 소년의 역할극 장난감]"
998,28501,고구마,"[우유, 달걀, 스푼형 요거트, 라면, 두부]"
