In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import tensorflow as tf
import warnings
import math
from math import sqrt
import tensorflow.keras.backend as K
import sys
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Dense
from deepctr.layers.core import PredictionLayer, DNN
from deepctr.layers.utils import add_func
from deepctr.models import WDL

import os

In [2]:
path = os.getenv("HOME")+ "/Documents/LPOINT/data/"

In [3]:
userInfo = pd.read_csv(path+"고객 정보(CKS).csv")
transInfo = pd.read_csv(path+"거래 정보(CKS).csv")
onlineInfo = pd.read_csv(path+"온라인 행동 정보(CKS).csv")
prodInfo = pd.read_csv(path+"04상품분류정보.csv")

In [4]:
df = pd.merge(onlineInfo, transInfo, on=['trans_id', 'clnt_id', 'biz_unit'])
df = pd.merge(df, userInfo, on=['clnt_id'])
df = df.drop(['clnt_id', 'sess_id', 'trans_id', 'sech_kwd', 'hit_tm', 'de_tm'], axis=1)

In [5]:
df = pd.get_dummies(df, columns = ['pd_c'])

In [6]:
df.head()

Unnamed: 0,hit_seq,action_type,biz_unit,sess_dt,hit_pss_tm,tot_pag_view_ct,tot_sess_hr_v,trfc_src,dvc_ctg_nm,trans_seq,...,pd_c_1656.0,pd_c_1658.0,pd_c_1659.0,pd_c_1660.0,pd_c_1661.0,pd_c_1662.0,pd_c_1663.0,pd_c_1664.0,pd_c_1666.0,pd_c_1667.0
0,37,6,A03,20190728,791905,61,911,DIRECT,unknown,1,...,0,0,0,0,0,0,0,0,0,0
1,68,6,A03,20190731,3251794,85,3256,DIRECT,unknown,1,...,0,0,0,0,0,0,0,0,0,0
2,68,6,A03,20190731,3251794,85,3256,DIRECT,unknown,2,...,0,0,0,0,0,0,0,0,0,0
3,68,6,A03,20190731,3251794,85,3256,DIRECT,unknown,3,...,0,0,0,0,0,0,0,0,0,0
4,68,6,A03,20190731,3251794,85,3256,DIRECT,unknown,4,...,0,0,0,0,0,0,0,0,0,0


In [7]:
print(df.shape)

(127358, 1074)


In [8]:
df.columns[:20]

Index(['hit_seq', 'action_type', 'biz_unit', 'sess_dt', 'hit_pss_tm',
       'tot_pag_view_ct', 'tot_sess_hr_v', 'trfc_src', 'dvc_ctg_nm',
       'trans_seq', 'de_dt', 'buy_am', 'buy_ct', 'clnt_gender', 'clnt_age',
       'pd_c_6.0', 'pd_c_7.0', 'pd_c_17.0', 'pd_c_23.0', 'pd_c_30.0'],
      dtype='object')

In [9]:
import re
columns = df.columns.tolist()

In [10]:
target = [ i for i in columns if re.search("pd_c" , i) is not None ]
name = [ i for i in columns if not re.search("pd_c" , i) is not None ]

In [11]:
ALL_FIELDS = ['hit_seq', 'action_type', 'biz_unit', 'sess_dt', 'hit_pss_tm',
       'tot_pag_view_ct', 'tot_sess_hr_v', 'trfc_src', 'dvc_ctg_nm',
       'trans_seq', 'de_dt', 'buy_am', 'buy_ct', 'clnt_gender', 'clnt_age']
CONT_FIELDS = ['hit_seq', 'sess_dt', 'hit_pss_tm','tot_pag_view_ct', 'tot_sess_hr_v', 'trans_seq', 'de_dt', 'buy_am', 'buy_ct','clnt_age']
CAT_FIELDS = list(set(ALL_FIELDS).difference(CONT_FIELDS))

# Hyper-parameters for Experiment
NUM_BIN = 10
BATCH_SIZE = 256
EMBEDDING_SIZE = 5

### Preprocess

In [12]:
# Preprocess
from itertools import repeat
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def get_modified_data(X, all_fields, continuous_fields, categorical_fields, is_bin=False):
    field_dict = dict()
    field_index = []
    X_modified = pd.DataFrame()

    for index, col in enumerate(X.columns):
        if col not in all_fields:
            print("{} not included: Check your column list".format(col))
            raise ValueError

        if col in continuous_fields:
            scaler = MinMaxScaler()

            # 연속형 변수도 구간화 할 것인가?
            if is_bin:
                X_bin = pd.cut(scaler.fit_transform(X[[col]]).reshape(-1, ), NUM_BIN, labels=False)
                X_bin = pd.Series(X_bin).astype('str')

                X_bin_col = pd.get_dummies(X_bin, prefix=col, prefix_sep='-')
                field_dict[index] = list(X_bin_col.columns)
                field_index.extend(repeat(index, X_bin_col.shape[1]))
                X_modified = pd.concat([X_modified, X_bin_col], axis=1)

            else:
                X_cont_col = pd.DataFrame(scaler.fit_transform(X[[col]]), columns=[col])
                field_dict[index] = col
                field_index.append(index)
                X_modified = pd.concat([X_modified, X_cont_col], axis=1)

        if col in categorical_fields:
            X_cat_col = pd.get_dummies(X[col], prefix=col, prefix_sep='-')
            field_dict[index] = list(X_cat_col.columns)
            field_index.extend(repeat(index, X_cat_col.shape[1]))
            X_modified = pd.concat([X_modified, X_cat_col], axis=1)

    print('Data Prepared...')
    print('X shape: {}'.format(X_modified.shape))
    print('# of Feature: {}'.format(len(field_index)))
    print('# of Field: {}'.format(len(field_dict)))

    return field_dict, field_index, X_modified

### FM Layer

In [13]:
class FM_layer(tf.keras.layers.Layer):
    def __init__(self, num_feature, num_field, embedding_size, field_index):
        super(FM_layer, self).__init__()
        self.embedding_size = embedding_size    # k: 임베딩 벡터의 차원(크기)
        self.num_feature = num_feature          # f: 원래 feature 개수
        self.num_field = num_field              # m: grouped field 개수
        self.field_index = field_index          # 인코딩된 X의 칼럼들이 본래 어디 소속이었는지

        # Parameters of FM Layer
        # w: capture 1st order interactions
        # V: capture 2nd order interactions
        self.w = tf.Variable(tf.random.normal(shape=[num_feature],
                                              mean=0.0, stddev=1.0), name='w')
        self.V = tf.Variable(tf.random.normal(shape=(num_field, embedding_size),
                                              mean=0.0, stddev=0.01), name='V')

    def call(self, inputs):
        x_batch = tf.reshape(inputs, [-1, self.num_feature, 1])
        # Parameter V를 field_index에 맞게 복사하여 num_feature에 맞게 늘림
        embeds = tf.nn.embedding_lookup(params=self.V, ids=self.field_index)

        # Deep Component에서 쓸 Input
        # (batch_size, num_feature, embedding_size)
        new_inputs = tf.math.multiply(x_batch, embeds)

        # (batch_size, )
        linear_terms = tf.reduce_sum(
            tf.math.multiply(self.w, inputs), axis=1, keepdims=False)

        # (batch_size, )
        interactions = 0.5 * tf.subtract(
            tf.square(tf.reduce_sum(new_inputs, [1, 2])),
            tf.reduce_sum(tf.square(new_inputs), [1, 2])
        )

        linear_terms = tf.reshape(linear_terms, [-1, 1])
        interactions = tf.reshape(interactions, [-1, 1])

        y_fm = tf.concat([linear_terms, interactions], 1)

        return y_fm, new_inputs

In [92]:
import tensorflow as tf
tf.keras.backend.set_floatx('float32')

class DeepFM(tf.keras.Model):

    def __init__(self, num_feature, num_field, embedding_size, field_index):
        super(DeepFM, self).__init__()
        self.embedding_size = embedding_size    # k: 임베딩 벡터의 차원(크기)
        self.num_feature = num_feature          # f: 원래 feature 개수
        self.num_field = num_field              # m: grouped field 개수
        self.field_index = field_index          # 인코딩된 X의 칼럼들이 본래 어디 소속이었는지

        self.fm_layer = FM_layer(num_feature, num_field, embedding_size, field_index)

        self.layers1 = tf.keras.layers.Dense(units=firDense, activation='relu')
        self.dropout1 = tf.keras.layers.Dropout(rate=firDropout)
        self.layers2 = tf.keras.layers.Dense(units=secDense, activation='relu')
        self.dropout2 = tf.keras.layers.Dropout(rate=secDropout)
        self.layers3 = tf.keras.layers.Dense(units=thrDesne, activation='relu')

        self.final = tf.keras.layers.Dense(units=1059, activation='sigmoid')

    def __repr__(self):
        return "DeepFM Model: #Field: {}, #Feature: {}, ES: {}".format(
            self.num_field, self.num_feature, self.embedding_size)

    def call(self, inputs):
        # 1) FM Component: (num_batch, 2)
        y_fm, new_inputs = self.fm_layer(inputs)

        # retrieve Dense Vectors: (num_batch, num_feature*embedding_size)
        new_inputs = tf.reshape(new_inputs, [-1, self.num_feature*self.embedding_size])

        # 2) Deep Component
        y_deep = self.layers1(new_inputs)
        y_deep = self.dropout1(y_deep)
        y_deep = self.layers2(y_deep)
        y_deep = self.dropout2(y_deep)
        y_deep = self.layers3(y_deep)

        # Concatenation
        y_pred = tf.concat([y_fm, y_deep], 1)
        y_pred = self.final(y_pred)
        y_pred = tf.reshape(y_pred, [-1, 1059])

        return y_pred

In [111]:
import numpy as np
import pandas as pd
from time import perf_counter
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.metrics import BinaryAccuracy, AUC


def get_data(df):
    X = df[name]
    Y = df[target]

    X.columns = ALL_FIELDS
    field_dict, field_index, X_modified = \
        get_modified_data(X, ALL_FIELDS, CONT_FIELDS, CAT_FIELDS, False)

    X_train, X_test, Y_train, Y_test = train_test_split(X_modified, Y, test_size=0.2, shuffle=False)

    X_val, X_test, Y_val, Y_test = train_test_split(X_test, Y_test, test_size=0.5)

    train_ds = tf.data.Dataset.from_tensor_slices(
        (tf.cast(X_train.values, tf.float32), tf.cast(Y_train, tf.float32))) \
        .shuffle(30000).batch(BATCH_SIZE)

    test_ds = tf.data.Dataset.from_tensor_slices(
        (tf.cast(X_test.values, tf.float32), tf.cast(Y_test, tf.float32))) \
        .shuffle(10000).batch(BATCH_SIZE)
    
    val_ds = tf.data.Dataset.from_tensor_slices(
        (tf.cast(X_val.values, tf.float32), tf.cast(Y_val, tf.float32))) \
        .shuffle(10000).batch(BATCH_SIZE)    

    return train_ds, val_ds, test_ds, field_dict, field_index


def focal_loss(y_true, y_pred):
    gamma = 2.0
    epsilon = K.epsilon()
    # https://www.kaggle.com/mathormad/resnet50-v2-keras-focal-loss-mix-up
    pt = y_pred * y_true + (1-y_pred) * (1-y_true)
    pt = K.clip(pt, epsilon, 1-epsilon)
    CE = -K.log(pt)
    FL = K.pow(1-pt, gamma) * CE
    loss = K.sum(FL, axis=1)
    return loss
    return K.mean(K.sum(loss, axis=1))

# Batch 단위 학습
def train_on_batch(model, optimizer, acc, auc, inputs, targets, val_inputs, val_targets):
    with tf.GradientTape() as tape:
        y_pred = model(inputs)
        loss = focal_loss(y_true=targets, y_pred=y_pred)

    grads = tape.gradient(target=loss, sources=model.trainable_variables)

    # apply_gradients()를 통해 processed gradients를 적용함
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # accuracy & auc
    acc.update_state(targets, y_pred)
    auc.update_state(targets, y_pred)
    
    val_y_pred = model(val_inputs)
    val_loss = focal_loss(y_true=val_targets, y_pred=val_y_pred)
    return loss, val_loss

In [112]:
train_ds, val_ds, test_ds, field_dict, field_index = get_data(df[:-138])

Data Prepared...
X shape: (127220, 28)
# of Feature: 28
# of Field: 15


In [117]:
for x, y in train_ds:
    print(x)
    print(y)
    break;

tf.Tensor(
[[0.28899083 1.         0.         ... 0.         1.         0.6       ]
 [0.47706422 1.         0.         ... 1.         0.         1.        ]
 [0.01834862 1.         0.         ... 1.         0.         0.6       ]
 ...
 [0.14678898 1.         0.         ... 1.         0.         0.8       ]
 [0.22018349 1.         0.         ... 1.         0.         0.6       ]
 [0.01834862 1.         0.         ... 0.         1.         0.4       ]], shape=(256, 28), dtype=float32)
tf.Tensor(
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]], shape=(256, 1059), dtype=float32)


In [126]:
a = [[1, 2, 3], [2,3,4], [3,4,5], [4,5,6], [5,6,7]]
b = [[11,12,13], [12,13,14], [13,14,15], [14,15,16], [15,16,17]]

x_first, x_second, y_first, y_second = train_test_split(a, b, test_size=0.2, shuffle=False)

print("x_train:", x_first)
print("x_test:", x_second)
print("y_train:", y_first)
print("y_test:", y_second)

x_train: [[1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, 6]]
x_test: [[5, 6, 7]]
y_train: [[11, 12, 13], [12, 13, 14], [13, 14, 15], [14, 15, 16]]
y_test: [[15, 16, 17]]


### TRIAL 1

In [93]:
firDense = 64
firDropout = 0.2
secDense = 16
secDropout = 0.2
thrDesne = 16
EMBEDDING_SIZE = 5

In [89]:
# 반복 학습 함수
model = DeepFM(embedding_size=EMBEDDING_SIZE, num_feature=len(field_index),
               num_field=len(field_dict), field_index=field_index)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

print("Start Training: Batch Size: {}, Embedding Size: {}".format(BATCH_SIZE, EMBEDDING_SIZE))
start = perf_counter()
for i in range(50):
    acc = BinaryAccuracy(threshold=0.5)
    auc = AUC()
    loss_history = []
    val_loss_history = []
    for train, val in zip(train_ds, val_ds):
        x, y = train
        x_val, y_val = val
        loss, val_loss = train_on_batch(model, optimizer, acc, auc, x, y, x_val, y_val)
        loss_history.append(loss)
        val_loss_history.append(val_loss)
    print("Epoch {:03d}: test_loss: {:.4f}, AUC: {:.4f}, val_loss: {:.4f}".format(
        i+1, np.mean(loss_history[0:-1]), auc.result().numpy(), np.mean(val_loss_history[0:-1])))
    
test_acc = BinaryAccuracy(threshold=0.5)
test_auc = AUC()
for x, y in test_ds:
    y_pred = model(x)
    test_acc.update_state(y, y_pred)
    test_auc.update_state(y, y_pred)

print("테스트 ACC: {:.4f}, AUC: {:.4f}".format(test_acc.result().numpy(), test_auc.result().numpy()))
print("Batch Size: {}, Embedding Size: {}".format(BATCH_SIZE, EMBEDDING_SIZE))
print("걸린 시간: {:.3f}".format(perf_counter() - start))
#     model.save_weights('weights/weights-epoch({})-batch({})-embedding({}).h5'.format(
#         epochs, BATCH_SIZE, EMBEDDING_SIZE))

Data Prepared...
X shape: (127220, 28)
# of Feature: 28
# of Field: 15
Start Training: Batch Size: 256, Embedding Size: 5
Epoch 001: test_loss: 27.3005, AUC: 0.6328, val_loss: 23.4844
Epoch 002: test_loss: 2.2040, AUC: 0.8818, val_loss: 2.1976
Epoch 003: test_loss: 2.0740, AUC: 0.9019, val_loss: 2.0846
Epoch 004: test_loss: 2.0283, AUC: 0.9097, val_loss: 2.0317
Epoch 005: test_loss: 1.9725, AUC: 0.9224, val_loss: 1.9847
Epoch 006: test_loss: 1.9546, AUC: 0.9263, val_loss: 1.9697
Epoch 007: test_loss: 1.9519, AUC: 0.9275, val_loss: 1.9619
Epoch 008: test_loss: 1.9371, AUC: 0.9300, val_loss: 1.9568
Epoch 009: test_loss: 1.9307, AUC: 0.9312, val_loss: 1.9507
Epoch 010: test_loss: 1.9263, AUC: 0.9314, val_loss: 1.9500
Epoch 011: test_loss: 1.9213, AUC: 0.9333, val_loss: 1.9441
Epoch 012: test_loss: 1.9190, AUC: 0.9332, val_loss: 1.9421
Epoch 013: test_loss: 1.9255, AUC: 0.9327, val_loss: 1.9436
Epoch 014: test_loss: 1.9158, AUC: 0.9347, val_loss: 1.9413
Epoch 015: test_loss: 1.9136, AUC: 0

In [96]:
pred_ans = model.predict(test_ds, batch_size=256)

In [97]:
test_matrix = pd.DataFrame(index=range(100))
pred_matrix = pd.DataFrame(pred_ans[:100])
pred_matrix.head()

top_n = 5

for i in range(len(pred_matrix)):
        top = pred_matrix.iloc[i].nlargest(top_n).index
        top = pd.DataFrame(top.astype(str).to_frame().apply(lambda x: ", ".join(x)))
        test_matrix.loc[i, 'pred'] = top.values
test_matrix.head(20)

Unnamed: 0,pred
0,"[628, 900, 901, 59, 358]"
1,"[[628, 900, 901, 358, 387]]"
2,"[[1056, 1044, 1057, 865, 342]]"
3,"[[628, 901, 231, 1023, 358]]"
4,"[[628, 901, 358, 231, 1023]]"
5,"[[628, 59, 901, 231, 358]]"
6,"[[628, 901, 358, 1023, 231]]"
7,"[[628, 900, 901, 358, 59]]"
8,"[[628, 901, 358, 59, 231]]"
9,"[[628, 901, 358, 1023, 231]]"


## trial2

In [100]:
EMBEDDING_SIZE = 15
firDense = 128
firDropout = 0.1
secDense = 64
secDropout = 0.1
thrDesne = 64

In [101]:
# 반복 학습 함수
train_ds, val_ds, test_ds, field_dict, field_index = get_data(df[:-138])

model = DeepFM(embedding_size=EMBEDDING_SIZE, num_feature=len(field_index),
               num_field=len(field_dict), field_index=field_index)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

print("Start Training: Batch Size: {}, Embedding Size: {}".format(BATCH_SIZE, EMBEDDING_SIZE))
start = perf_counter()
for i in range(50):
    acc = BinaryAccuracy(threshold=0.5)
    auc = AUC()
    loss_history = []
    val_loss_history = []
    for train, val in zip(train_ds, val_ds):
        x, y = train
        x_val, y_val = val
        loss, val_loss = train_on_batch(model, optimizer, acc, auc, x, y, x_val, y_val)
        loss_history.append(loss)
        val_loss_history.append(val_loss)
    print("Epoch {:03d}: test_loss: {:.4f}, AUC: {:.4f}, val_loss: {:.4f}".format(
        i+1, np.mean(loss_history[0:-1]), auc.result().numpy(), np.mean(val_loss_history[0:-1])))
    
test_acc = BinaryAccuracy(threshold=0.5)
test_auc = AUC()
for x, y in test_ds:
    y_pred = model(x)
    test_acc.update_state(y, y_pred)
    test_auc.update_state(y, y_pred)

print("테스트 ACC: {:.4f}, AUC: {:.4f}".format(test_acc.result().numpy(), test_auc.result().numpy()))
print("Batch Size: {}, Embedding Size: {}".format(BATCH_SIZE, EMBEDDING_SIZE))
print("걸린 시간: {:.3f}".format(perf_counter() - start))
#     model.save_weights('weights/weights-epoch({})-batch({})-embedding({}).h5'.format(
#         epochs, BATCH_SIZE, EMBEDDING_SIZE))

Data Prepared...
X shape: (127220, 28)
# of Feature: 28
# of Field: 15
Start Training: Batch Size: 256, Embedding Size: 15
Epoch 001: test_loss: 24.2363, AUC: 0.5814, val_loss: 20.5816
Epoch 002: test_loss: 3.1707, AUC: 0.8031, val_loss: 3.0136
Epoch 003: test_loss: 2.0185, AUC: 0.9125, val_loss: 2.0209
Epoch 004: test_loss: 1.9635, AUC: 0.9239, val_loss: 1.9715
Epoch 005: test_loss: 1.9419, AUC: 0.9291, val_loss: 1.9581
Epoch 006: test_loss: 1.9302, AUC: 0.9316, val_loss: 1.9506
Epoch 007: test_loss: 1.9343, AUC: 0.9316, val_loss: 1.9495
Epoch 008: test_loss: 1.9345, AUC: 0.9307, val_loss: 1.9541
Epoch 009: test_loss: 1.9400, AUC: 0.9305, val_loss: 1.9488
Epoch 010: test_loss: 1.9300, AUC: 0.9314, val_loss: 1.9453
Epoch 011: test_loss: 1.9250, AUC: 0.9326, val_loss: 1.9465
Epoch 012: test_loss: 1.9256, AUC: 0.9327, val_loss: 1.9429
Epoch 013: test_loss: 1.9258, AUC: 0.9330, val_loss: 1.9439
Epoch 014: test_loss: 1.9130, AUC: 0.9341, val_loss: 1.9459
Epoch 015: test_loss: 1.9243, AUC: 

In [102]:
pred_ans = model.predict(test_ds, batch_size=256)

In [103]:
pred_ans

array([[1.1886565e-02, 3.0345744e-03, 7.4943946e-06, ..., 1.2637867e-02,
        1.6204301e-02, 9.8752575e-03],
       [4.0835600e-02, 1.0990860e-02, 1.3139497e-03, ..., 3.3399362e-02,
        1.9132830e-02, 2.0829625e-02],
       [1.3667778e-02, 2.7242918e-03, 4.1060393e-06, ..., 1.2782857e-02,
        1.5518939e-02, 9.7792530e-03],
       ...,
       [1.6819661e-02, 3.2440170e-03, 8.7261078e-06, ..., 1.4976841e-02,
        1.6046958e-02, 1.0751506e-02],
       [3.9195511e-02, 5.9467917e-03, 4.9102349e-05, ..., 3.2371514e-02,
        1.8495634e-02, 1.8009597e-02],
       [2.1456510e-02, 4.7085136e-03, 1.7640672e-05, ..., 2.2629205e-02,
        1.8581698e-02, 1.4387791e-02]], dtype=float32)

In [104]:
test_matrix = pd.DataFrame(index=range(100))
pred_matrix = pd.DataFrame(pred_ans[:100])
pred_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1049,1050,1051,1052,1053,1054,1055,1056,1057,1058
0,0.011887,0.003035,7e-06,0.015159,0.017426,0.016392,0.02373,0.022504,0.004966,0.001028,...,0.009796,0.005939,0.001628,0.007994,0.014034,0.006137,0.008348,0.012638,0.016204,0.009875
1,0.040836,0.010991,0.001314,0.037697,0.020673,0.026854,0.017779,0.022432,0.063195,0.003942,...,0.016381,0.025668,0.010678,0.018092,0.022965,0.020557,0.020524,0.033399,0.019133,0.02083
2,0.013668,0.002724,4e-06,0.016725,0.020377,0.018811,0.023393,0.026251,0.005075,0.000748,...,0.010056,0.006559,0.001602,0.006888,0.013283,0.006444,0.008251,0.012783,0.015519,0.009779
3,0.012659,0.003428,3.3e-05,0.027681,0.022349,0.023753,0.014313,0.01863,0.012625,0.000882,...,0.012263,0.00771,0.002244,0.007317,0.017642,0.006856,0.012106,0.021578,0.01136,0.014454
4,0.035347,0.005303,2.4e-05,0.022541,0.094298,0.031021,0.019293,0.047911,0.008607,0.000937,...,0.016675,0.020217,0.003339,0.003498,0.013495,0.012128,0.016464,0.02263,0.018652,0.013604


In [105]:
top_n = 5

for i in range(len(pred_matrix)):
        top = pred_matrix.iloc[i].nlargest(top_n).index
        top = pd.DataFrame(top.astype(str).to_frame().apply(lambda x: ", ".join(x)))
        test_matrix.loc[i, 'pred'] = top.values

In [106]:
test_matrix.head(20)

Unnamed: 0,pred
0,"[628, 232, 900, 901, 790]"
1,"[[901, 358, 628, 232, 1022]]"
2,"[[628, 232, 901, 900, 107]]"
3,"[[901, 628, 358, 790, 232]]"
4,"[[901, 628, 59, 1023, 231]]"
5,"[[901, 59, 628, 358, 232]]"
6,"[[790, 901, 628, 358, 399]]"
7,"[[628, 901, 399, 358, 231]]"
8,"[[901, 628, 358, 231, 232]]"
9,"[[59, 790, 792, 231, 899]]"


## trial3

In [107]:
EMBEDDING_SIZE = 15
firDense = 128
firDropout = 0.1
secDense = 64
secDropout = 0.1
thrDesne = 64
EPOCH = 1000

In [108]:
# 반복 학습 함수
train_ds, val_ds, test_ds, field_dict, field_index = get_data(df[:-138])

model = DeepFM(embedding_size=EMBEDDING_SIZE, num_feature=len(field_index),
               num_field=len(field_dict), field_index=field_index)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

print("Start Training: Batch Size: {}, Embedding Size: {}".format(BATCH_SIZE, EMBEDDING_SIZE))
start = perf_counter()
for i in range(EPOCH):
    acc = BinaryAccuracy(threshold=0.5)
    auc = AUC()
    loss_history = []
    val_loss_history = []
    for train, val in zip(train_ds, val_ds):
        x, y = train
        x_val, y_val = val
        loss, val_loss = train_on_batch(model, optimizer, acc, auc, x, y, x_val, y_val)
        loss_history.append(loss)
        val_loss_history.append(val_loss)
    print("Epoch {:03d}: test_loss: {:.4f}, AUC: {:.4f}, val_loss: {:.4f}".format(
        i+1, np.mean(loss_history[0:-1]), auc.result().numpy(), np.mean(val_loss_history[0:-1])))
    
test_acc = BinaryAccuracy(threshold=0.5)
test_auc = AUC()
for x, y in test_ds:
    y_pred = model(x)
    test_acc.update_state(y, y_pred)
    test_auc.update_state(y, y_pred)

print("테스트 ACC: {:.4f}, AUC: {:.4f}".format(test_acc.result().numpy(), test_auc.result().numpy()))
print("Batch Size: {}, Embedding Size: {}".format(BATCH_SIZE, EMBEDDING_SIZE))
print("걸린 시간: {:.3f}".format(perf_counter() - start))
#     model.save_weights('weights/weights-epoch({})-batch({})-embedding({}).h5'.format(
#         epochs, BATCH_SIZE, EMBEDDING_SIZE))

Data Prepared...
X shape: (127220, 28)
# of Feature: 28
# of Field: 15
Start Training: Batch Size: 256, Embedding Size: 15
Epoch 001: test_loss: 22.9554, AUC: 0.5812, val_loss: 19.2937
Epoch 002: test_loss: 2.3891, AUC: 0.8592, val_loss: 2.3532
Epoch 003: test_loss: 2.0459, AUC: 0.9071, val_loss: 2.0394
Epoch 004: test_loss: 2.0369, AUC: 0.9096, val_loss: 2.0311
Epoch 005: test_loss: 2.0301, AUC: 0.9098, val_loss: 2.0278
Epoch 006: test_loss: 2.0158, AUC: 0.9120, val_loss: 2.0172
Epoch 007: test_loss: 2.0008, AUC: 0.9146, val_loss: 2.0090
Epoch 008: test_loss: 2.0028, AUC: 0.9158, val_loss: 1.9982
Epoch 009: test_loss: 1.9886, AUC: 0.9179, val_loss: 1.9866
Epoch 010: test_loss: 1.9819, AUC: 0.9205, val_loss: 1.9793
Epoch 011: test_loss: 1.9656, AUC: 0.9232, val_loss: 1.9652
Epoch 012: test_loss: 1.9542, AUC: 0.9263, val_loss: 1.9635
Epoch 013: test_loss: 1.9499, AUC: 0.9269, val_loss: 1.9592
Epoch 014: test_loss: 1.9506, AUC: 0.9268, val_loss: 1.9527
Epoch 015: test_loss: 1.9451, AUC: 

Epoch 136: test_loss: 1.7839, AUC: 0.9526, val_loss: 1.9302
Epoch 137: test_loss: 1.7885, AUC: 0.9518, val_loss: 1.9299
Epoch 138: test_loss: 1.7955, AUC: 0.9517, val_loss: 1.9262
Epoch 139: test_loss: 1.7905, AUC: 0.9521, val_loss: 1.9272
Epoch 140: test_loss: 1.7820, AUC: 0.9528, val_loss: 1.9305
Epoch 141: test_loss: 1.7894, AUC: 0.9518, val_loss: 1.9320
Epoch 142: test_loss: 1.7944, AUC: 0.9515, val_loss: 1.9314
Epoch 143: test_loss: 1.7836, AUC: 0.9528, val_loss: 1.9317
Epoch 144: test_loss: 1.7815, AUC: 0.9527, val_loss: 1.9307
Epoch 145: test_loss: 1.7848, AUC: 0.9529, val_loss: 1.9329
Epoch 146: test_loss: 1.7785, AUC: 0.9536, val_loss: 1.9347
Epoch 147: test_loss: 1.7837, AUC: 0.9525, val_loss: 1.9282
Epoch 148: test_loss: 1.7939, AUC: 0.9519, val_loss: 1.9320
Epoch 149: test_loss: 1.7885, AUC: 0.9519, val_loss: 1.9363
Epoch 150: test_loss: 1.7794, AUC: 0.9527, val_loss: 1.9336
Epoch 151: test_loss: 1.7824, AUC: 0.9532, val_loss: 1.9345
Epoch 152: test_loss: 1.7870, AUC: 0.952

Epoch 273: test_loss: 1.5876, AUC: 0.9708, val_loss: 1.8211
Epoch 274: test_loss: 1.5661, AUC: 0.9729, val_loss: 1.8006
Epoch 275: test_loss: 1.5633, AUC: 0.9728, val_loss: 1.7945
Epoch 276: test_loss: 1.5720, AUC: 0.9723, val_loss: 1.7981
Epoch 277: test_loss: 1.5746, AUC: 0.9723, val_loss: 1.8019
Epoch 278: test_loss: 1.5945, AUC: 0.9707, val_loss: 1.8292
Epoch 279: test_loss: 1.5659, AUC: 0.9729, val_loss: 1.8000
Epoch 280: test_loss: 1.5725, AUC: 0.9724, val_loss: 1.8039
Epoch 281: test_loss: 1.5667, AUC: 0.9727, val_loss: 1.7971
Epoch 282: test_loss: 1.5552, AUC: 0.9736, val_loss: 1.7937
Epoch 283: test_loss: 1.5843, AUC: 0.9714, val_loss: 1.8259
Epoch 284: test_loss: 1.5808, AUC: 0.9720, val_loss: 1.8099
Epoch 285: test_loss: 1.5606, AUC: 0.9730, val_loss: 1.7937
Epoch 286: test_loss: 1.5554, AUC: 0.9732, val_loss: 1.8060
Epoch 287: test_loss: 1.5623, AUC: 0.9731, val_loss: 1.8007
Epoch 288: test_loss: 1.5434, AUC: 0.9746, val_loss: 1.7910
Epoch 289: test_loss: 1.5726, AUC: 0.972

Epoch 410: test_loss: 1.5206, AUC: 0.9761, val_loss: 1.8294
Epoch 411: test_loss: 1.5302, AUC: 0.9752, val_loss: 1.8483
Epoch 412: test_loss: 1.5246, AUC: 0.9756, val_loss: 1.8307
Epoch 413: test_loss: 1.5161, AUC: 0.9759, val_loss: 1.8303
Epoch 414: test_loss: 1.5181, AUC: 0.9762, val_loss: 1.8282
Epoch 415: test_loss: 1.5243, AUC: 0.9757, val_loss: 1.8419
Epoch 416: test_loss: 1.5183, AUC: 0.9761, val_loss: 1.8317
Epoch 417: test_loss: 1.5367, AUC: 0.9750, val_loss: 1.8476
Epoch 418: test_loss: 1.5078, AUC: 0.9767, val_loss: 1.8275
Epoch 419: test_loss: 1.5153, AUC: 0.9762, val_loss: 1.8356
Epoch 420: test_loss: 1.5200, AUC: 0.9758, val_loss: 1.8379
Epoch 421: test_loss: 1.5364, AUC: 0.9747, val_loss: 1.8580
Epoch 422: test_loss: 1.5335, AUC: 0.9752, val_loss: 1.8348
Epoch 423: test_loss: 1.5315, AUC: 0.9753, val_loss: 1.8396
Epoch 424: test_loss: 1.5146, AUC: 0.9764, val_loss: 1.8282
Epoch 425: test_loss: 1.5085, AUC: 0.9765, val_loss: 1.8288
Epoch 426: test_loss: 1.5141, AUC: 0.976

Epoch 547: test_loss: 1.5088, AUC: 0.9763, val_loss: 1.8574
Epoch 548: test_loss: 1.5161, AUC: 0.9762, val_loss: 1.8677
Epoch 549: test_loss: 1.4948, AUC: 0.9775, val_loss: 1.8577
Epoch 550: test_loss: 1.5140, AUC: 0.9763, val_loss: 1.8800
Epoch 551: test_loss: 1.4992, AUC: 0.9771, val_loss: 1.8654
Epoch 552: test_loss: 1.5145, AUC: 0.9766, val_loss: 1.8699
Epoch 553: test_loss: 1.4983, AUC: 0.9772, val_loss: 1.8682
Epoch 554: test_loss: 1.5087, AUC: 0.9768, val_loss: 1.8620
Epoch 555: test_loss: 1.4963, AUC: 0.9771, val_loss: 1.8633
Epoch 556: test_loss: 1.5070, AUC: 0.9767, val_loss: 1.8745
Epoch 557: test_loss: 1.5074, AUC: 0.9771, val_loss: 1.8622
Epoch 558: test_loss: 1.5012, AUC: 0.9770, val_loss: 1.8624
Epoch 559: test_loss: 1.5076, AUC: 0.9768, val_loss: 1.8571
Epoch 560: test_loss: 1.5073, AUC: 0.9768, val_loss: 1.8678
Epoch 561: test_loss: 1.5051, AUC: 0.9766, val_loss: 1.8695
Epoch 562: test_loss: 1.5247, AUC: 0.9757, val_loss: 1.8883
Epoch 563: test_loss: 1.5053, AUC: 0.976

Epoch 684: test_loss: 1.4839, AUC: 0.9781, val_loss: 1.8918
Epoch 685: test_loss: 1.4970, AUC: 0.9773, val_loss: 1.9040
Epoch 686: test_loss: 1.5002, AUC: 0.9772, val_loss: 1.8946
Epoch 687: test_loss: 1.4948, AUC: 0.9774, val_loss: 1.8998
Epoch 688: test_loss: 1.4952, AUC: 0.9776, val_loss: 1.8941
Epoch 689: test_loss: 1.4968, AUC: 0.9774, val_loss: 1.8831
Epoch 690: test_loss: 1.5087, AUC: 0.9767, val_loss: 1.8916
Epoch 691: test_loss: 1.5012, AUC: 0.9772, val_loss: 1.8926
Epoch 692: test_loss: 1.4980, AUC: 0.9774, val_loss: 1.8917
Epoch 693: test_loss: 1.5179, AUC: 0.9759, val_loss: 1.9019
Epoch 694: test_loss: 1.5070, AUC: 0.9769, val_loss: 1.8893
Epoch 695: test_loss: 1.4915, AUC: 0.9777, val_loss: 1.8874
Epoch 696: test_loss: 1.5066, AUC: 0.9772, val_loss: 1.8857
Epoch 697: test_loss: 1.4935, AUC: 0.9775, val_loss: 1.8806
Epoch 698: test_loss: 1.4926, AUC: 0.9777, val_loss: 1.8940
Epoch 699: test_loss: 1.5121, AUC: 0.9767, val_loss: 1.9000
Epoch 700: test_loss: 1.4924, AUC: 0.977

Epoch 821: test_loss: 1.4965, AUC: 0.9774, val_loss: 1.9136
Epoch 822: test_loss: 1.4887, AUC: 0.9778, val_loss: 1.9040
Epoch 823: test_loss: 1.4854, AUC: 0.9782, val_loss: 1.9105
Epoch 824: test_loss: 1.4871, AUC: 0.9780, val_loss: 1.9050
Epoch 825: test_loss: 1.5051, AUC: 0.9770, val_loss: 1.9032
Epoch 826: test_loss: 1.4908, AUC: 0.9778, val_loss: 1.9066
Epoch 827: test_loss: 1.4991, AUC: 0.9776, val_loss: 1.9020
Epoch 828: test_loss: 1.4892, AUC: 0.9780, val_loss: 1.9025
Epoch 829: test_loss: 1.5145, AUC: 0.9764, val_loss: 1.9188
Epoch 830: test_loss: 1.4864, AUC: 0.9781, val_loss: 1.8929
Epoch 831: test_loss: 1.4802, AUC: 0.9785, val_loss: 1.8981
Epoch 832: test_loss: 1.4682, AUC: 0.9790, val_loss: 1.9099
Epoch 833: test_loss: 1.5046, AUC: 0.9770, val_loss: 1.9161
Epoch 834: test_loss: 1.5526, AUC: 0.9736, val_loss: 1.9533
Epoch 835: test_loss: 1.6261, AUC: 0.9694, val_loss: 1.9770
Epoch 836: test_loss: 1.5313, AUC: 0.9755, val_loss: 1.8817
Epoch 837: test_loss: 1.4944, AUC: 0.977

Epoch 958: test_loss: 1.4937, AUC: 0.9779, val_loss: 1.9192
Epoch 959: test_loss: 1.4865, AUC: 0.9782, val_loss: 1.9171
Epoch 960: test_loss: 1.5019, AUC: 0.9771, val_loss: 1.9313
Epoch 961: test_loss: 1.4862, AUC: 0.9782, val_loss: 1.9101
Epoch 962: test_loss: 1.5028, AUC: 0.9771, val_loss: 1.9280
Epoch 963: test_loss: 1.4912, AUC: 0.9778, val_loss: 1.9182
Epoch 964: test_loss: 1.4997, AUC: 0.9771, val_loss: 1.9148
Epoch 965: test_loss: 1.4854, AUC: 0.9779, val_loss: 1.9036
Epoch 966: test_loss: 1.4858, AUC: 0.9784, val_loss: 1.9026
Epoch 967: test_loss: 1.4780, AUC: 0.9786, val_loss: 1.9125
Epoch 968: test_loss: 17.5159, AUC: 0.7436, val_loss: 17.7976
Epoch 969: test_loss: 15.5763, AUC: 0.5000, val_loss: 15.5544
Epoch 970: test_loss: 15.5591, AUC: 0.5000, val_loss: 15.5566
Epoch 971: test_loss: 15.5502, AUC: 0.5000, val_loss: 15.5502
Epoch 972: test_loss: 15.5630, AUC: 0.5000, val_loss: 15.5540
Epoch 973: test_loss: 15.5232, AUC: 0.5000, val_loss: 15.5502
Epoch 974: test_loss: 15.556

In [109]:
pred_ans = model.predict(test_ds, batch_size=256)

In [110]:
test_matrix = pd.DataFrame(index=range(100))
pred_matrix = pd.DataFrame(pred_ans[:100])
pred_matrix.head()

top_n = 5

for i in range(len(pred_matrix)):
        top = pred_matrix.iloc[i].nlargest(top_n).index
        top = pd.DataFrame(top.astype(str).to_frame().apply(lambda x: ", ".join(x)))
        test_matrix.loc[i, 'pred'] = top.values
test_matrix.head(20)

Unnamed: 0,pred
0,"[0, 1, 2, 3, 4]"
1,"[[0, 1, 2, 3, 4]]"
2,"[[94, 0, 1, 2, 3]]"
3,"[[0, 1, 2, 3, 4]]"
4,"[[0, 1, 2, 3, 4]]"
5,"[[0, 1, 2, 3, 4]]"
6,"[[0, 1, 2, 3, 4]]"
7,"[[94, 0, 1, 2, 3]]"
8,"[[0, 1, 2, 3, 4]]"
9,"[[0, 1, 2, 3, 4]]"
