# Wide & Deep Learning model 구현

    * 꿈이 많은 사람의 이야기 블로그 참조하여 Wide & Deep Learning model 구현해보기
      - https://lsjsj92.tistory.com/597

* 라이브러리 불러오기

In [50]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import plot_model

In [123]:
# Build the CSV path from the user's home directory. expanduser("~") is used
# instead of os.getenv("HOME"), which returns None (and makes the string
# concatenation raise TypeError) when HOME is not set.
path = os.path.join(
    os.path.expanduser("~"),
    'repo/Lpoint-Hackathon/src/new_data/all_data_nm3.csv',
)
# `Input` keeps the untouched raw table; `data` is the working copy that the
# preprocessing cells modify in place.
# NOTE: the later `from tensorflow.keras.layers import *` rebinds the name
# `Input` to the Keras layer factory.
Input = pd.read_csv(path)
data = Input.copy()
data.head()

Unnamed: 0,clnt_id,action_type,hit_pss_tm,sech_kwd,tot_pag_view_ct,tot_sess_hr_v,trfc_src,dvc_ctg_nm,de_dt,buy_am,buy_ct,clnt_gender,clnt_age,clac_nm3
0,49906,6,627381,,63.0,2211.0,DIRECT,unknown,20190701,1990,1,F,30.0,기능성 우유
1,67265,6,313631,,10.0,313.0,unknown,mobile_app,20190701,56900,1,F,40.0,성인용 침구 세트
2,68972,6,554066,,31.0,652.0,DIRECT,unknown,20190701,9980,2,F,50.0,인스턴트 라이스
3,41763,6,1241857,,39.0,1242.0,DIRECT,unknown,20190701,1290,1,F,40.0,새송이버섯
4,15344,6,120487,,10.0,133.0,DIRECT,unknown,20190701,4100,1,F,40.0,Cokes


## 데이터 전처리

In [126]:
# Columns that are label-encoded and fed to the embedding layers.
CATEGORICAL_COLUMNS = [
    "clnt_gender", "trfc_src", "dvc_ctg_nm"
]

# Columns that are cast to float and standardized.
CONTINUOUS_COLUMNS = [
    "clnt_id", "clnt_age", "hit_pss_tm", "action_type", "de_dt",
    "tot_pag_view_ct", "tot_sess_hr_v"
]

# Every feature column used by the model. Derived from the two lists above so
# it cannot drift out of sync: the previous hand-written list repeated
# "hit_pss_tm", named "trans_id" (not a column of the loaded data) and left
# out "clnt_gender".
COLUMNS = CONTINUOUS_COLUMNS + CATEGORICAL_COLUMNS

In [127]:
# Replace every categorical column with integer codes. A fresh encoder is
# fitted per column; only the encoder of the last column stays bound to `le`.
for c in CATEGORICAL_COLUMNS:
    le = LabelEncoder().fit(data[c])
    data[c] = le.transform(data[c])

In [128]:
data.head()

Unnamed: 0,clnt_id,action_type,hit_pss_tm,sech_kwd,tot_pag_view_ct,tot_sess_hr_v,trfc_src,dvc_ctg_nm,de_dt,buy_am,buy_ct,clnt_gender,clnt_age,clac_nm3
0,49906,6,627381,,63.0,2211.0,0,3,20190701,1990,1,0,30.0,기능성 우유
1,67265,6,313631,,10.0,313.0,6,1,20190701,56900,1,0,40.0,성인용 침구 세트
2,68972,6,554066,,31.0,652.0,0,3,20190701,9980,2,0,50.0,인스턴트 라이스
3,41763,6,1241857,,39.0,1242.0,0,3,20190701,1290,1,0,40.0,새송이버섯
4,15344,6,120487,,10.0,133.0,0,3,20190701,4100,1,0,40.0,Cokes


## Train 데이터, Test 데이터 분할

In [136]:
# Encode the target column (product category name) as integer class ids.
# The column is cast to str first so every value is encoded as a string.
le = LabelEncoder()
data['clac_nm3'] = le.fit_transform(data['clac_nm3'].astype(str))
Label = data['clac_nm3']

In [137]:
# Count the distinct classes once and reuse the result for both values.
class_counts = data['clac_nm3'].value_counts()
lab_len = len(class_counts)       # number of distinct target classes
lable_key = class_counts.keys()   # class ids, most frequent first
# Display only: drop() returns a new frame and the result is not assigned, so
# `data` still contains 'clac_nm3' (later cells read it from `data`/`test_x`).
data.drop(columns=['clac_nm3'])

Unnamed: 0,clnt_id,action_type,hit_pss_tm,sech_kwd,tot_pag_view_ct,tot_sess_hr_v,trfc_src,dvc_ctg_nm,de_dt,buy_am,buy_ct,clnt_gender,clnt_age
0,49906,6,627381,,63.0,2211.0,0,3,20190701,1990,1,0,30.0
1,67265,6,313631,,10.0,313.0,6,1,20190701,56900,1,0,40.0
2,68972,6,554066,,31.0,652.0,0,3,20190701,9980,2,0,50.0
3,41763,6,1241857,,39.0,1242.0,0,3,20190701,1290,1,0,40.0
4,15344,6,120487,,10.0,133.0,0,3,20190701,4100,1,0,40.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
123090,64811,6,1158981,,34.0,1159.0,0,0,20190930,1000,1,0,50.0
123091,49,6,803090,,24.0,806.0,0,3,20190930,2000,1,0,50.0
123092,72078,6,951307,,54.0,1200.0,0,3,20190930,3980,1,0,40.0
123093,39156,6,2592159,,68.0,2624.0,0,3,20190930,990,1,1,40.0


In [57]:
# One-hot encode the integer class ids by indexing rows of an identity matrix.
# float32 is requested explicitly: the default float64 doubles the memory of
# this (n_rows x lab_len) matrix for no benefit.
label = np.eye(lab_len, dtype=np.float32)[Label.to_numpy()]

In [58]:
lable_key

Int64Index([254, 275, 688, 337, 739, 267, 410, 221, 509,  95,
            ...
            268, 426, 682, 971, 907, 779, 746, 651, 459,  31],
           dtype='int64', length=1055)

In [59]:
# label = pd.DataFrame(label, columns=lable_key)

In [60]:
train_x, test_x , train_y , test_y = train_test_split(data , label , test_size=0.2, shuffle=False)

In [61]:
print('Train 데이터: ', train_x.shape)
print('Test 데이터: ', test_x.shape)
print('Train 라벨: ', train_y.shape)
print('Test 라벨: ', test_y.shape)

Train 데이터:  (98476, 14)
Test 데이터:  (24619, 14)
Train 라벨:  (98476, 1055)
Test 라벨:  (24619, 1055)


In [62]:
train_x, val_x , train_y , val_y = train_test_split(train_x , train_y , test_size=0.1, shuffle=False)

In [63]:
print('Train 데이터: ', train_x.shape)
print('Val 데이터: ', val_x.shape)
print('Train 라벨: ', train_y.shape)
print('Val 라벨: ', val_y.shape)

Train 데이터:  (88628, 14)
Val 데이터:  (9848, 14)
Train 라벨:  (88628, 1055)
Val 라벨:  (9848, 1055)


In [15]:
import gc
def df_to_dataset(dataframe, label, shuffle=True, batch_size=32):
    """Wrap a feature frame and its labels in a batched ``tf.data.Dataset``.

    Args:
        dataframe: pandas DataFrame of features. Each column becomes one
            entry of the feature dict yielded by the dataset.
        label: labels aligned row by row with ``dataframe``. Anything
            ``np.asarray`` accepts works (DataFrame, Series or ndarray); the
            previous ``label.values`` failed for plain ndarrays.
        shuffle: when True the dataset is shuffled with a buffer covering the
            whole frame and then repeated indefinitely.
        batch_size: number of rows per batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(feature_dict, labels)`` batches.
    """
    features = dict(dataframe.copy())
    labels = np.asarray(label)
    ds = tf.data.Dataset.from_tensor_slices((features, labels))
    if shuffle:
        # Shuffle before repeat so each pass over the data sees every row
        # once; repeating first lets rows from different passes mix.
        ds = ds.shuffle(buffer_size=len(dataframe))
        ds = ds.repeat()
    ds = ds.batch(batch_size)
    gc.collect()
    return ds

In [16]:
batch_size = 128  # A small batch size is used for demonstration purposes
# Pass batch_size explicitly; without it df_to_dataset falls back to its
# default of 32 and the value above is ignored.
train_ds = df_to_dataset(train_x, train_y, batch_size=batch_size)
# Validation/test data must be finite and in a fixed order, so shuffling
# (which also repeats the dataset forever) is switched off for them.
val_ds = df_to_dataset(val_x, val_y, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test_x, test_y, shuffle=False, batch_size=batch_size)

## 카테고리 값들과 연속값들을 뽑아냄

In [17]:
# Categorical features (already label-encoded) as integer arrays.
train_x_category = train_x[CATEGORICAL_COLUMNS].to_numpy()
test_x_category = test_x[CATEGORICAL_COLUMNS].to_numpy()
val_x_category = val_x[CATEGORICAL_COLUMNS].to_numpy()

# Continuous features as float64 arrays, ready for standardization.
train_x_continue = train_x[CONTINUOUS_COLUMNS].to_numpy(dtype='float64')
test_x_continue = test_x[CONTINUOUS_COLUMNS].to_numpy(dtype='float64')
val_x_continue = val_x[CONTINUOUS_COLUMNS].to_numpy(dtype='float64')

## 정규화

In [18]:
# Learn mean/std from the training split only, then apply the same scaling to
# all three splits so no test/validation statistics leak into training.
scaler = StandardScaler().fit(train_x_continue)
train_x_continue, test_x_continue, val_x_continue = (
    scaler.transform(split)
    for split in (train_x_continue, test_x_continue, val_x_continue)
)

* 정규화 내용 확인

In [19]:
print(train_x_continue[0].sum())
print(train_x_continue[1].sum())
print(train_x_continue[2].sum())
print(train_x_continue[3].sum())
print(train_x_continue[4].sum())

-1.6401872159228421
-2.7128231458173024
-0.35443517464351554
-1.7566430544076208
-5.57768293430038


 - StandardScaler는 열(feature)별로 평균 0, 표준편차 1이 되도록 표준화하므로, 한 행(row)의 값을 다 더해도 1이 되지는 않음
 - 개별 값도 0~1 범위로 제한되지 않으며, 음수나 1보다 큰 값이 나올 수 있음

## Polynomial 하게 바꿔줌 
### (비선형적인 설정으로 선형 회귀를 확장하는 방법. 즉 다항식 함수로 바꿔줌)
    - 카테고리 값을 Polynomial로 바꿔줌

* sklearn.preprocessing.PolynomialFeatures 메소드
    - degree : 다항식 차수
    - interaction_only
        - default는 False
        - ex) degree = 3일 때, interaction_only=false 이면
            - a^2, a^3, b^2, b^3, ab, a^2*b, ab^2 Feature가 추가되고,
        - interaction_only=True 이면
            - ab만 추가됨
        

In [20]:
poly = PolynomialFeatures(degree=2, interaction_only=True)

In [21]:
# Fit the expansion once on the training split and only transform the other
# splits, the same fit/transform discipline used for the scaler.
# NOTE(review): despite the "category" in the names, these matrices are built
# from the standardized continuous features — confirm this is intended.
train_x_category_poly = poly.fit_transform(train_x_continue)
test_x_category_poly = poly.transform(test_x_continue)
val_x_category_poly = poly.transform(val_x_continue)

=> feature가 7개(CONTINUOUS_COLUMNS) 이므로, interaction은 6+5+4+3+2+1=21개 이고,
상수항 1을 추가하여 7+21+1=29

 * np.unique : np.arr 내 중복 제거

In [22]:
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.python.keras.layers.advanced_activations import ReLU, PReLU, LeakyReLU, ELU
from tensorflow.python.keras.optimizers import Adam, SGD
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.python.keras.models import Model
import tensorflow.keras.backend as K

In [23]:
def get_deep_model():
    """Build the deep branch of the Wide & Deep network.

    Every column in ``CATEGORICAL_COLUMNS`` gets its own integer input, an
    embedding layer and a flatten layer. The columns in
    ``CONTINUOUS_COLUMNS`` share a single dense projection. Both parts are
    concatenated and passed through a stack of fully connected layers.

    Reads the module-level names ``data``, ``CATEGORICAL_COLUMNS`` and
    ``CONTINUOUS_COLUMNS``.

    Returns:
        A 3-tuple ``(category_inputs, continue_input, relu3)``: the list of
        per-column categorical input tensors, the continuous input tensor,
        and the 128-unit output tensor of the deep branch.
    """
    # All layers are taken from tf.keras.layers explicitly. The bare name
    # `Input` is also used for the raw DataFrame in this notebook, and the
    # star import / private-module imports make it unclear which class a
    # bare layer name refers to.
    layers = tf.keras.layers

    category_inputs = []
    category_embeds = []

    # Categorical data: input -> embedding -> flatten, one chain per column.
    for column in CATEGORICAL_COLUMNS:
        input_i = layers.Input(shape=(1,), dtype='int32')

        # Vocabulary size: number of distinct codes in this column.
        # e.g. data[education] in {'elementary', 'middle', 'high'} => dim = 3
        dim = len(np.unique(data[column]))

        # Embedding width: square root of the vocabulary size, rounded up.
        # TODO: revisit why this particular width is used.
        embed_dim = int(np.ceil(dim ** 0.5))

        # dim       : how many embedding vectors the table holds
        # embed_dim : length of each embedding vector
        # `input_length` is omitted; the sequence length is already fixed to
        # 1 by the Input shape.
        embed_i = layers.Embedding(dim, embed_dim)(input_i)

        # (batch, 1, embed_dim) -> (batch, embed_dim)
        flatten_i = layers.Flatten()(embed_i)

        category_inputs.append(input_i)
        category_embeds.append(flatten_i)

    # Continuous data: a single input projected to 256 units.
    continue_input = layers.Input(shape=(len(CONTINUOUS_COLUMNS),))
    # TODO: confirm why the bias is disabled here.
    continue_dense = layers.Dense(256, use_bias=False)(continue_input)

    # Merge the continuous projection with all categorical embeddings.
    concat_embeds = layers.concatenate([continue_dense] + category_embeds)
    # TODO: try activations other than relu.
    concat_embeds = layers.Activation('relu')(concat_embeds)
    bn_concat = layers.BatchNormalization()(concat_embeds)

    # Fully connected stack: 512 -> 256 -> 128.
    fc1 = layers.Dense(512, use_bias=False)(bn_concat)
    relu1 = layers.ReLU()(fc1)
    bn1 = layers.BatchNormalization()(relu1)
    fc2 = layers.Dense(256, use_bias=False)(bn1)
    relu2 = layers.ReLU()(fc2)
    bn2 = layers.BatchNormalization()(relu2)
    fc3 = layers.Dense(128)(bn2)
    relu3 = layers.ReLU()(fc3)

    return category_inputs, continue_input, relu3

In [24]:
def get_wide_model(poly):
    """Return the Keras input tensor of the wide branch.

    Args:
        poly: 2-D array of polynomial/interaction features. Only its column
            count (``poly.shape[1]``) is used; the values are not read.

    Returns:
        A ``tf.keras.layers.Input`` tensor with one slot per column of
        ``poly``.
    """
    feature_count = poly.shape[1]
    wide_input = tf.keras.layers.Input(shape=(feature_count,))
    return wide_input

# At the call site `poly` is train_x_category_poly, the matrix produced by
# PolynomialFeatures.
# Poly features: from features a, b, c the pairwise products ab, bc, ca are added.
# Only the shape of the data is used here.

    * input - embedding - flatten 순으로 layer 쌓기

In [25]:
category_inputs, continue_input, deep_model = get_deep_model()
wide_model = get_wide_model(train_x_category_poly)

In [26]:
wide_model

<tf.Tensor 'input_5:0' shape=(None, 29) dtype=float32>

In [27]:
deep_model

<tf.Tensor 're_lu_2/Relu:0' shape=(None, 128) dtype=float32>

### Wide모델과 Deep model을 합치기

* wide model, deep model 합치기

In [28]:
out_layer = concatenate([deep_model, wide_model])
out_layer

<tf.Tensor 'concatenate_1/concat:0' shape=(None, 157) dtype=float32>

* 입력 값들 shape 확인

In [29]:
continue_input

<tf.Tensor 'input_4:0' shape=(None, 7) dtype=float32>

In [30]:
category_inputs

[<tf.Tensor 'input_1:0' shape=(None, 1) dtype=int32>,
 <tf.Tensor 'input_2:0' shape=(None, 1) dtype=int32>,
 <tf.Tensor 'input_3:0' shape=(None, 1) dtype=int32>]

In [31]:
wide_model

<tf.Tensor 'input_5:0' shape=(None, 29) dtype=float32>

* 입력 값들 종합하기

In [32]:
# Model input order: continuous block, one tensor per categorical column,
# then the wide (polynomial) block. The data fed to fit/evaluate must follow
# this same order.
inputs = [continue_input, *category_inputs, wide_model]
inputs

[<tf.Tensor 'input_4:0' shape=(None, 7) dtype=float32>,
 <tf.Tensor 'input_1:0' shape=(None, 1) dtype=int32>,
 <tf.Tensor 'input_2:0' shape=(None, 1) dtype=int32>,
 <tf.Tensor 'input_3:0' shape=(None, 1) dtype=int32>,
 <tf.Tensor 'input_5:0' shape=(None, 29) dtype=float32>]

* wide model, deep model 합친 것을 한 점으로 모으기

In [73]:
# One output unit per target class.
n_classes = data['clac_nm3'].nunique()
output = Dense(n_classes, activation='sigmoid')(out_layer)
# Open question from the author: why sigmoid here rather than relu?
# Presumably to keep every output value inside the 0~1 range.

In [74]:
model = Model(inputs=inputs, outputs=output)
model.summary()

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 7)]          0                                            
_______________________________________________________________________________________

In [75]:
# Make sure the target directory exists before training starts; writing the
# .h5 file fails if './data' is missing and the callback does not create it.
os.makedirs('./data', exist_ok=True)
# Keep only the weights of the epoch with the lowest validation loss.
checkpoint = ModelCheckpoint(filepath='./data/wide-deep.h5', monitor='val_loss', verbose=1, save_best_only=True)
# Stop once validation loss has not improved for 5 consecutive epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

### 입력 데이터

    * 위에서 정의한 리스트 변수 inputs에 맞추어
    * continue 데이터 => category 데이터 => poly data 순으로 입력 값을 넣어준다

 * 데이터 확인

* train_x_continue
    - CONTINUOUS_COLUMNS의 숫자형 데이터(clnt_id, clnt_age, hit_pss_tm, action_type, de_dt, tot_pag_view_ct, tot_sess_hr_v)를 float형으로 변환하고
    - StandardScaler()를 통해 정규화 한 것

In [76]:
train_x_continue

array([[ 0.59579673, -1.0332683 , -0.69691289, ..., -1.21881681,
         0.53646717,  0.33240989],
       [ 1.4622614 ,  0.18845219, -0.93592448, ..., -1.21881681,
        -1.05550571, -0.99742673],
       [ 1.54746534,  1.41017268, -0.75276352, ..., -1.21881681,
        -0.424724  , -0.75990586],
       ...,
       [-0.66519163,  0.18845219, -0.55282109, ...,  2.34855346,
        -0.12435176, -0.64429836],
       [ 0.80898128, -1.0332683 , -0.75433357, ...,  2.34855346,
        -0.57491012, -0.73117915],
       [ 1.27782757, -2.25498879, -0.10684869, ...,  2.34855346,
         0.05587158, -0.23441721]])

In [77]:
train_x_continue.shape

(88628, 7)

* train_x_category
    - 범주형 데이터(clnt_gender, trfc_src, dvc_ctg_nm)를  
    - LabelEncoder()를 통해 숫자(정수)로 변경(인코딩)

* train_x_category_poly
    - 각 feature를 2개씩 쌍을 지어 곱한 결과(interaction)를 Feature로 추가한 데이터
    - 이름과 달리 이 노트북에서는 train_x_continue(연속형 데이터)로부터 만들어짐

In [78]:
train_x_category

array([[0, 0, 3],
       [0, 6, 1],
       [0, 0, 3],
       ...,
       [0, 0, 3],
       [0, 0, 3],
       [0, 0, 3]])

In [79]:
train_x_category.shape

(88628, 3)

In [80]:
[train_x_category[:, i] for i in range(train_x_category.shape[1])]

[array([0, 0, 0, ..., 0, 0, 0]),
 array([0, 6, 0, ..., 0, 0, 0]),
 array([3, 1, 3, ..., 3, 3, 3])]

In [81]:
[train_x_category]

[array([[0, 0, 3],
        [0, 6, 1],
        [0, 0, 3],
        ...,
        [0, 0, 3],
        [0, 0, 3],
        [0, 0, 3]])]

In [82]:
train_x_category_poly.shape

(88628, 29)

 * 모델 입력 데이터 설정 부분
 * category랑 category_poly랑 중복 아닌지??? => 나중에 확인할 것

In [83]:
# Training inputs in the order the model expects: continuous block, one 1-D
# array per categorical column (rows of the transpose), polynomial block.
input_data = [train_x_continue, *train_x_category.T, train_x_category_poly]
len(input_data)

5

In [84]:
# 이게 실수형 데이터 들
input_data[0]

array([[ 0.59579673, -1.0332683 , -0.69691289, ..., -1.21881681,
         0.53646717,  0.33240989],
       [ 1.4622614 ,  0.18845219, -0.93592448, ..., -1.21881681,
        -1.05550571, -0.99742673],
       [ 1.54746534,  1.41017268, -0.75276352, ..., -1.21881681,
        -0.424724  , -0.75990586],
       ...,
       [-0.66519163,  0.18845219, -0.55282109, ...,  2.34855346,
        -0.12435176, -0.64429836],
       [ 0.80898128, -1.0332683 , -0.75433357, ...,  2.34855346,
        -0.57491012, -0.73117915],
       [ 1.27782757, -2.25498879, -0.10684869, ...,  2.34855346,
         0.05587158, -0.23441721]])

* 모델 파라미터 설정

 * y : 구매한 상품 소분류(clac_nm3)를 one-hot 인코딩한 벡터 (클래스 1055개)
 * 해당 클래스이면 1, 아니면 0

In [85]:
def binary_accuracy(y_true, y_pred):
    """Return the share of positive labels that are predicted above 0.5.

    Despite its name this is a recall-style score: the numerator counts the
    entries where ``y_true == 1`` and ``y_pred > 0.5``, the denominator counts
    all entries where ``y_true == 1``. Entries with ``y_true != 1`` do not
    influence the result.

    Args:
        y_true: tensor of target values (1.0 marks a positive entry).
        y_pred: tensor of predicted scores, same shape as ``y_true``.

    Returns:
        A scalar tensor. It is 0 when the batch contains no positive entry
        (a plain division would give NaN in that case).
    """
    positives = K.cast(K.equal(y_true, 1.0), dtype='float32')
    predicted_positive = K.cast(K.greater(y_pred, 0.5), dtype='float32')
    hits = K.sum(positives * predicted_positive)
    return tf.math.divide_no_nan(hits, K.sum(positives))

gamma = 2.0            # focusing exponent of the focal loss
epsilon = K.epsilon()  # clipping bound that keeps log() away from 0


def focal_loss(y_true, y_pred):
    """Element-wise binary focal loss, summed over the class axis.

    Adapted from
    https://www.kaggle.com/mathormad/resnet50-v2-keras-focal-loss-mix-up

    Args:
        y_true: tensor of 0/1 targets, shape (batch, classes).
        y_pred: tensor of predicted probabilities, same shape.

    Returns:
        A tensor of shape (batch,) holding one loss value per sample.
    """
    # Probability assigned to the correct outcome of every entry.
    pt = y_pred * y_true + (1 - y_pred) * (1 - y_true)
    pt = K.clip(pt, epsilon, 1 - epsilon)
    cross_entropy = -K.log(pt)
    # Down-weight entries that are already predicted well (pt close to 1).
    focal_term = K.pow(1 - pt, gamma) * cross_entropy
    # The former second `return` after this one was unreachable and has been
    # removed; the returned value is unchanged.
    return K.sum(focal_term, axis=1)

# Train with Adam and the focal loss defined above; 'binary_crossentropy' is
# the alternative loss the author experimented with.
model.compile(
    optimizer='adam',
    loss=focal_loss,
    metrics=[binary_accuracy],
)

In [86]:
# Weights-only checkpoint written at the end of an epoch, and only when the
# monitored quantity improved; the epoch number is formatted into the name.
checkpoint_path = "/ckpt/my_checkpoint/KM-{epoch:04d}.ckpt"
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_path,
    verbose=1,
    save_freq='epoch',
    save_best_only=True,
    save_weights_only=True,
)
# https://www.kaggle.com/rejpalcz/focalloss-for-keras
def step_decay_schedule(initial_lr=1e-3, decay_factor=0.75, step_size=10):
    """Create a LearningRateScheduler callback implementing step decay.

    The learning rate for an epoch is
    ``initial_lr * decay_factor ** floor(epoch / step_size)``.

    Args:
        initial_lr: learning rate used while ``epoch < step_size``.
        decay_factor: multiplier applied once per completed step.
        step_size: number of epochs per step.

    Returns:
        A ``tf.keras.callbacks.LearningRateScheduler`` wrapping the schedule.
    """
    def lr_for_epoch(epoch):
        completed_steps = np.floor(epoch / step_size)
        return initial_lr * (decay_factor ** completed_steps)

    scheduler = tf.keras.callbacks.LearningRateScheduler(lr_for_epoch)
    return scheduler

# Start at 1e-4 and multiply the learning rate by 0.75 every 2 epochs.
lr_sched = step_decay_schedule(step_size=2, decay_factor=0.75, initial_lr=1e-4)

# Stop training when `val_loss` has not improved by at least 1e-4 for 10
# consecutive epochs.
Early = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    min_delta=0.0001,
)

In [87]:
input_data[0].shape

(88628, 7)

In [88]:
# `epochs` was never assigned in this notebook; 1000 matches the recorded
# training log ("Epoch 1/1000"). Early stopping ends training long before.
epochs = 1000
# NOTE: the last 15% of the training rows are held out via validation_split;
# the separately created val_x/val_y split is not used here.
model.fit(input_data, train_y, epochs=epochs, batch_size=batch_size, validation_split=0.15, callbacks=[checkpoint, early_stopping])

Epoch 1/1000
Epoch 00001: val_loss improved from inf to 2.20302, saving model to ./data/wide-deep.h5
Epoch 2/1000
Epoch 00002: val_loss improved from 2.20302 to 2.14471, saving model to ./data/wide-deep.h5
Epoch 3/1000
Epoch 00003: val_loss improved from 2.14471 to 2.14277, saving model to ./data/wide-deep.h5
Epoch 4/1000
Epoch 00004: val_loss did not improve from 2.14277
Epoch 5/1000
Epoch 00005: val_loss did not improve from 2.14277
Epoch 6/1000
Epoch 00006: val_loss did not improve from 2.14277
Epoch 7/1000
Epoch 00007: val_loss did not improve from 2.14277
Epoch 8/1000
Epoch 00008: val_loss did not improve from 2.14277


<tensorflow.python.keras.callbacks.History at 0x7ff8b7b58290>

## 평가

In [90]:
# Assemble the test inputs exactly like the training inputs: continuous
# block, one 1-D array per categorical column, then the polynomial block.
eval_input_data = [test_x_continue, *test_x_category.T, test_x_category_poly]

In [91]:
loss, acc = model.evaluate(eval_input_data, test_y)



In [92]:
print(f'test_loss: {loss} - test_acc: {acc}')
# 문자열 앞에 f는 formating이었나? 확인

test_loss: 2.785388469696045 - test_acc: 0.0005681818001903594


* 모델 그래프로 그리기
    - 밑에 함수 수행하기 전에 터미널 명령어로 설치해야함
    1. pip install pydot
    2. sudo apt install graphviz

In [93]:
# 정규화된 데이터간 거리 구해보기
from scipy.spatial import distance

a = [0,0,0]
b = [0,3,4]
dist_ab = distance.euclidean(a,b)
print(dist_ab)

5.0


In [94]:
predict_y = model.predict(eval_input_data)

### 정확도 평가

In [95]:
def get_acc(score_matrix, top_n, test_matix):
    """Average, over all rows, the share of top-n predictions equal to the target.

    For every row of ``score_matrix`` the ``top_n`` highest-scoring column
    labels are taken and compared with ``test_matix["target"]`` for that row.
    The row score is ``matches / number_of_top_labels``; the function returns
    the mean of the row scores. With a single target per row the row score is
    therefore at most ``1 / top_n``.

    Args:
        score_matrix: DataFrame of scores, one row per sample and one column
            per class label.
        top_n: how many of the highest-scoring labels to consider per row.
        test_matix: DataFrame with a ``"target"`` column, looked up by the
            row number ``0 .. len(score_matrix) - 1``.

    Returns:
        The averaged score (``0`` for an empty ``score_matrix``).
    """
    row_count = len(score_matrix)
    total = 0
    for row_pos in range(row_count):
        top_labels = score_matrix.iloc[row_pos].nlargest(top_n).index
        matches = sum(
            1 for candidate in top_labels
            if candidate == test_matix["target"][row_pos]
        )
        row_score = matches / len(top_labels)
        total += row_score / row_count

    return total

In [110]:
# Ground truth for the test split: customer id plus encoded class label.
dic = {'clnt_id': test_x['clnt_id'], 'target': test_x['clac_nm3']}
# Renumber the rows from 0 so they line up with the rows of the prediction
# matrix; the old index is discarded.
target_matrix = pd.DataFrame(dic).reset_index(drop=True)

target_matrix.head()

Unnamed: 0,clnt_id,target
0,42301,647
1,29233,775
2,58449,364
3,48924,364
4,47681,647


In [111]:
small_p = pd.DataFrame(predict_y[:1000])
small_p.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1045,1046,1047,1048,1049,1050,1051,1052,1053,1054
0,0.00387,0.009187,0.013846,0.001066,0.050542,0.012555,0.017187,0.040968,0.066419,0.03221,...,0.001556,0.003507,0.001765,0.006174,0.009276,0.002257,0.05442,0.00499,0.016043,0.002339
1,0.015519,0.007073,0.029056,0.004541,0.086528,0.004747,0.049089,0.037475,0.037094,0.049752,...,0.002761,0.001996,0.004467,0.002363,0.027529,0.002927,0.103238,0.018922,0.008862,0.00235
2,0.001252,0.048368,0.045424,0.000945,0.090145,0.016341,0.086308,0.094145,0.137432,0.054816,...,0.001171,0.003011,0.005142,0.00155,0.045851,0.0028,0.096974,0.003262,0.068804,0.004877
3,0.022368,0.008663,0.030572,0.00606,0.073585,0.004553,0.042916,0.029611,0.041933,0.05836,...,0.001998,0.001469,0.003201,0.002157,0.029728,0.002877,0.108355,0.019072,0.00663,0.002142
4,0.0609,0.062453,0.007792,0.026579,0.022882,0.006127,0.009927,0.007433,0.020697,0.02888,...,0.004389,0.015503,0.005152,0.001835,0.021648,0.001776,0.030017,0.009923,0.079559,0.000965


In [112]:
accuracy  = get_acc(small_p, 5, target_matrix)

print(f"정확도: {accuracy*100}%")

정확도: 2.0999999999999983%


### 예측 결과 확인

In [113]:
def get_pred_list(predict_y, top_n, target_matrix, columns):
    """Attach readable top-n predictions and target names to the target table.

    Args:
        predict_y: DataFrame of scores, one row per sample and one column per
            encoded class label.
        top_n: number of highest-scoring classes to list per sample.
        target_matrix: DataFrame with a ``'target'`` column of encoded class
            labels. Rows are matched to ``predict_y`` by position.
        columns: dict whose ``'hangle'`` entry maps encoded label -> name.

    Returns:
        A copy of ``target_matrix`` where ``'target'`` holds class names and a
        new ``'pred'`` column holds the top-n class names joined by ``", "``.
        Rows without a prediction (``target_matrix`` longer than
        ``predict_y``) get NaN in ``'pred'``. The inputs are not modified.

    Raises:
        KeyError: if a target label is missing from ``columns['hangle']``.
    """
    label_to_name = columns['hangle']
    test_matrix = target_matrix.copy()
    pred_matrix = predict_y.rename(columns=label_to_name)

    # Only rows present in both tables can be paired.
    n_rows = min(len(pred_matrix), len(test_matrix))
    # Build plain strings; the previous code stored a 2-D array in each cell.
    pred_col = [np.nan] * len(test_matrix)
    for pos in range(n_rows):
        top_labels = pred_matrix.iloc[pos].nlargest(top_n).index
        pred_col[pos] = ", ".join(str(name) for name in top_labels)

    test_matrix['pred'] = pred_col
    test_matrix['target'] = test_matrix['target'].apply(lambda x: label_to_name[x])
    return test_matrix

In [141]:
# Build the lookup {'hangle': {encoded label -> category name}}.
# The names come from the fitted target encoder `le` (classes_[i] is the name
# encoded as label i) rather than from the raw `Input` frame: in a top-to-bottom
# run `Input` has been rebound by `from tensorflow.keras.layers import *`.
d = {'hangle': le.classes_, 'label': np.arange(len(le.classes_))}
df = pd.DataFrame(data=d)
cate2papago = df.set_index('label').to_dict()

In [142]:
test = get_pred_list(small_p, 5, target_matrix, cate2papago)

In [143]:
test[test['pred'].notna()]

Unnamed: 0,clnt_id,target,pred
0,42301,여자 로퍼,"[여자 골프 의류 세트, 여성 가죽 의류, 키즈 우산, 스포츠 가방, 치즈'!]"
1,29233,제너럴 요구르트,"[즉석 죽, 포도, 국내 Beefs-Rounds, 옥수수 스낵, 우유]"
2,58449,밤,"[포도, 아기 매트리스 패드, 냉동 떡볶이, 냉동 튀김 식품, 라면]"
3,48924,밤,"[옥수수 스낵, 즉석 죽, 제너럴 티 드링크, 국내 Beefs-Rounds, 포도]"
4,47681,여자 로퍼,"[여자 로퍼, 여성 청바지, 펫 도그 푸드, 조리 기구 세트, 여성 가죽 의류]"
...,...,...,...
995,16419,여성 스웨터 / 풀오버,"[남성 정장, 여성 청바지, 기타 컴퓨터 액세서리, 다른 영양학적 Supplemen..."
996,70886,팬케이크 믹스,"[옥수수 스낵, 국내 Beefs-Rounds, 라면, 즉석 죽, 제너럴 티 드링크]"
997,16419,여성 스웨터 / 풀오버,"[남성 정장, 여성 청바지, 기타 컴퓨터 액세서리, 다른 영양학적 Supplemen..."
998,47330,기능성 우유,"[즉석 죽, 옥수수 스낵, 포도, 일반 스낵, 라면]"
