## DeepFM+Transformer

In [2]:
import os
import numpy as np
import pandas as pd
from collections import namedtuple

import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.python.keras.callbacks import EarlyStopping
from sklearn.preprocessing import  MinMaxScaler, LabelEncoder

from sklearn import metrics
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report, roc_curve

##### 获取数据

In [3]:
train_df = pd.read_csv('../data/ml-1m/train_df.csv')
test_df = pd.read_csv('../data/ml-1m/test_df.csv')
data = train_df.append(test_df)
print('train_df.shape: {}, test_df.shape: {}'.format(train_df.shape, test_df.shape))
data.head()

train_df.shape: (1956191, 14), test_df.shape: (12078, 14)


Unnamed: 0,user_id,hist_item_id,hist_s1,hist_s2,item_id,label,rating,click_timestamp,hist_len,gender,age,item_date,item_title,item_cate_id
0,1741,"2904,2008,230,1169,2064,3475,1952,1228,943,275...","898,948,3022,1888,2891,1080,2204,618,1372,852,...","3722,1244,3023,1594,3348,288,323,36,1226,1254,...",3266,1,4,974711543,50,1,4,29,456645670000000000000,6
1,1880,"1280,1213,1075,1274,1135,3292,2902,2008,3000,3...","3410,2666,3723,1109,2046,2078,1891,3639,2874,2...","3101,2671,1271,2406,3616,1158,2483,2858,2861,2...",3266,0,0,0,50,1,4,29,456645670000000000000,6
2,3292,"1895,1016,2030,2981,3008,2068,998,1002,1006,28...","3438,1948,3692,1963,1964,1178,1360,2134,3027,2...","2171,2972,3031,3176,1110,2041,2711,1059,3554,3...",3266,1,5,968098376,50,2,6,29,456645670000000000000,6
3,566,"2789,2069,1013,613,3657,3677,774,1965,3219,164...","1248,590,897,605,2118,2694,110,3483,2751,1024,...","1644,643,3377,1557,1599,2058,159,2270,50,1074,...",3266,1,2,976210413,50,1,3,29,456645670000000000000,6
4,1088,"52,1394,2851,2643,2909,416,1665,733,984,2266,1...","1233,2772,1815,539,3431,2761,838,1225,11,3029,...","2703,886,1253,1261,2365,3087,941,305,3061,711,...",3266,1,5,1023534057,50,2,1,29,456645670000000000000,6


##### 模型构建

In [4]:
SparseFeature = namedtuple('SparseFeature', ['name', 'vocabulary_size', 'embedding_size'])
DenseFeature = namedtuple('DenseFeature', ['name', 'dimension'])
VarLenSparseFeature = namedtuple('VarLenSparseFeature', ['name', 'vocabulary_size', 'embedding_size', 'maxlen'])

feature_columns = [
    SparseFeature('user_id', data.user_id.max()+1, embedding_size=4),
    SparseFeature('gender', data.gender.max()+1, embedding_size=4),
    SparseFeature('age', data.age.max()+1, embedding_size=4),
    SparseFeature('item_id', data.item_id.max()+1, embedding_size=4),
    SparseFeature('item_cate_id', data.item_cate_id.max()+1, embedding_size=4),
    DenseFeature('hist_len', 1),
    VarLenSparseFeature('hist_item_id', data.item_id.max()+1, embedding_size=4, maxlen=50)
]

feature_columns

[SparseFeature(name='user_id', vocabulary_size=6041, embedding_size=4),
 SparseFeature(name='gender', vocabulary_size=3, embedding_size=4),
 SparseFeature(name='age', vocabulary_size=8, embedding_size=4),
 SparseFeature(name='item_id', vocabulary_size=3884, embedding_size=4),
 SparseFeature(name='item_cate_id', vocabulary_size=19, embedding_size=4),
 DenseFeature(name='hist_len', dimension=1),
 VarLenSparseFeature(name='hist_item_id', vocabulary_size=3884, embedding_size=4, maxlen=50)]

In [5]:
class FM_Layer(Layer):
    def __init__(self):
        super(FM_Layer, self).__init__()
        
    def call(self, inputs):
        concat_embed_values = inputs
        print('concat_embed_values.shape: ', concat_embed_values.shape) # (None, 26, 4)
        sum_square = tf.square(tf.reduce_sum(concat_embed_values, axis=1, keepdims=True)) # (None, 1, 4)
        print('sum_square.shape: ', sum_square.shape)
        square_sum = tf.reduce_sum(concat_embed_values * concat_embed_values, axis=1, keepdims=True) # (None, 1, 4)
        print('square_sum.shape: ', square_sum.shape)
        output = sum_square - square_sum # 和的平方-平方的和
        output = 0.5 * tf.reduce_sum(output, axis=2, keepdims=False) # (None 1)
        print('output.shape: ', output.shape)
        return output
    
    def compute_output_shape(self, input_shape):
        return (None, 1)
    
def model_metric(prob, label, thr=0.5):
    """ 模型评估 """
    # AUC
    fpr, tpr, threshold = metrics.roc_curve(label, prob)
    auc = metrics.auc(fpr, tpr)
    score = metrics.accuracy_score(label, prob > thr)
    # LogLoss
    logloss = log_loss(label, prob)
    print('模型准确率:{}, AUC得分:{}, LogLoss:{}'.format(score, auc, logloss))
    print(classification_report(label, prob > thr, digits=2))
    print('==========================================================')
    
def build_input_layers(feature_columns):
    """ 构建输入层 """
    dense_input_dict, sparse_input_dict, varlen_sparse_input_dict = {}, {}, {}
    for f in feature_columns:
        if isinstance(f, DenseFeature):
            dense_input_dict[f.name] = Input(shape=(f.dimension, ), name=f.name)
        elif isinstance(f, SparseFeature):
            sparse_input_dict[f.name] = Input(shape=(1, ), name=f.name)
        elif isinstance(f, VarLenSparseFeature):
            varlen_sparse_input_dict[f.name] = Input(shape=(f.maxlen, ), name=f.name)
    return dense_input_dict, sparse_input_dict, varlen_sparse_input_dict
    
def build_embedding_layers(feature_columns, is_linear):
    """ 构建embedding层 """
    embedding_layer_dict = {}
    sparse_feature_columns = list(filter(lambda x: isinstance(x, SparseFeature), feature_columns))
    varlen_sparse_feature_columns = list(filter(lambda x: isinstance(x, VarLenSparseFeature), feature_columns))
    if is_linear:
        # 序列特征不参与线性模型的运算
        for f in sparse_feature_columns:
            embedding_layer_dict[f.name] = Embedding(f.vocabulary_size+1, 1, name='1d_emb_' + f.name)
    else:
        for f in sparse_feature_columns:
            embedding_layer_dict[f.name] = Embedding(f.vocabulary_size+1, f.embedding_size, name='kd_emb_' + f.name)
        for f in varlen_sparse_feature_columns:
            embedding_layer_dict[f.name] = Embedding(f.vocabulary_size+1, f.embedding_size, name='var_emb_' + f.name, mask_zero=True)
    return embedding_layer_dict
    
def get_linear_logits(dense_input_dict, sparse_input_dict, feature_columns):
    """ 数值特征拼接一起传入全连接层 + 类别特征onehot，flatten，add """
    concat_dense_inputs = Concatenate(axis=1)(list(dense_input_dict.values()))
    dense_logits_output = Dense(1)(concat_dense_inputs)
    
    sparse_feature_columns = list(filter(lambda x: isinstance(x, SparseFeature), feature_columns))
    embedding_layer_dict = build_embedding_layers(sparse_feature_columns, is_linear=True)
    
    # embedding(input)查表操作，返回对应input的嵌入向量
    sparse_1d_embed_list = []
    for f in sparse_feature_columns:
        _input = sparse_input_dict[f.name]
        _embed = Flatten()(embedding_layer_dict[f.name](_input))
        sparse_1d_embed_list.append(_embed)
    
    sparse_logits_output = Add()(sparse_1d_embed_list)
    linear_logits = Add()([dense_logits_output, sparse_logits_output])
    return linear_logits
    
def get_fm_logits(sparse_input_dict, kd_embedding_layer_dict, feature_columns):
    """ 取出input所对应的嵌入向量拼接在一起，计算和的平和-平方的和 """
    sparse_feature_columns = list(filter(lambda x: isinstance(x, SparseFeature), feature_columns))
    sparse_kd_embed_list = []
    for f in sparse_feature_columns:
        _input = sparse_input_dict[f.name]
        _embed = kd_embedding_layer_dict[f.name](_input)
        sparse_kd_embed_list.append(_embed)
    
    concat_sparse_kd_embed_list = Concatenate(axis=1)(sparse_kd_embed_list)
    fm_logits = FM_Layer()(concat_sparse_kd_embed_list)
    return fm_logits
    
def get_dnn_logits(sparse_input_dict, kd_embedding_layer_dict, feature_columns):
    sparse_feature_columns = list(filter(lambda x: isinstance(x, SparseFeature), feature_columns))
    sparse_kd_embed_list = []
    for f in sparse_feature_columns:
        _input = sparse_input_dict[f.name]
        _embed = kd_embedding_layer_dict[f.name](_input)
        flatten_embed = Flatten()(_embed)
        sparse_kd_embed_list.append(flatten_embed)
    concat_sparse_kd_embed = Concatenate(axis=1)(sparse_kd_embed_list)
    #print('concat_sparse_kd_embed.shape: ', concat_sparse_kd_embed.shape)
    
    # DNN
    dnn_out = Dropout(0.2)(Activation(activation='relu')(BatchNormalization()(Dense(4)(concat_sparse_kd_embed))))
    #dnn_out = Dropout(0.2)(Activation(activation='relu')(BatchNormalization()(Dense(4)(dnn_out))))
    dnn_logits = Dense(1)(dnn_out)
    return dnn_logits

def DeepFM(feature_columns):
    """ Instantiates FM architecture
    :param feature_columns 
    :return: A kears model instance
    """
    dense_input_dict, sparse_input_dict, _ = build_input_layers(feature_columns)
    input_list = list(dense_input_dict.values()) + list(sparse_input_dict.values())
    
    # linear w * x + b
    linear_logits = get_linear_logits(dense_input_dict, sparse_input_dict, feature_columns)
    
    # fm 0.5 * [sum(vixi)**2 - sum(vixi*vixi)
    kd_embedding_layer_dict = build_embedding_layers(feature_columns, is_linear=False)
    fm_logits = get_fm_logits(sparse_input_dict, kd_embedding_layer_dict, feature_columns)
    
    # dnn next_a = σ(w * a + b)
    dnn_logits = get_dnn_logits(sparse_input_dict, kd_embedding_layer_dict, feature_columns)
    
    output_logits = Add()([linear_logits, fm_logits, dnn_logits])
    output_layer = Activation("sigmoid")(output_logits)
    model = Model(input_list, output_layer)
    return model

model = DeepFM(feature_columns)
model.summary()

concat_embed_values.shape:  (None, 5, 4)
sum_square.shape:  (None, 1, 4)
square_sum.shape:  (None, 1, 4)
output.shape:  (None, 1)
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_id (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
gender (InputLayer)             [(None, 1)]          0                                            
__________________________________________________________________________________________________
age (InputLayer)                [(None, 1)]          0                                            
__________________________________________________________________________________________________
item_id (InputLayer)            [(None, 1)]          0         

In [6]:
def padding_mask(seq):
    """ 为避免padding对句子语义产生影响，需要将padding位mask掉，该函数作为所有为0的位置mask后为1。 """
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32) # (None, seq_len)
    # 扩充维度用于attention矩阵
    return seq[:, np.newaxis, np.newaxis, :] # (None, 1, 1, seq_len)

def scaled_dot_product_attention(q, k, v, mask):
    """ Q、K向量做点积操作，然后通过softmax归一化得到权重，对V加权求和得到atention向量 """
    # q (None, head_num, seq_len, depth) (None, 2, 50, 2)
    # mask (None, 1, 1, seq_len) (None, 1, 1, 50)
    matmul_qk = tf.matmul(q, k, transpose_b=True) # (None, 2, 50, 50)
    #print('matmul_qk.shape: ', matmul_qk.shape) 
    
    k_dim = tf.cast(tf.shape(k)[-1], tf.float32) # 分头后emb_size
    #print('k_dim: ', k_dim)
    scaled_attention = matmul_qk / tf.math.sqrt(k_dim) # (None, 2, 50, 50)
    if mask is not None:
        scaled_attention += (mask * -1e9)
    #print('scaled_attention.shape: ', scaled_attention.shape)
    attention_weight = tf.nn.softmax(scaled_attention, axis=-1) # (None, 2, 50, 50)
    #print('attention_weight.shape: ', attention_weight.shape)
    
    output = tf.matmul(attention_weight, v) # (None, 2, 50, 50) * (None, 2, 50, 2) 
    #print('output.shape: ', output.shape) 
    #print('******')
    return output # (None, 2, 50, 2)
    
class MultiHeadAttention(Layer):
    """ 多头attention
        q, k, v input => Dense => 分头 => scaled_dot_product_attention => concat, dense => attention embedding 
    """
    def __init__(self, emb_size, head_num):
        super(MultiHeadAttention, self).__init__()
        self.emb_size = emb_size
        self.head_num = head_num
        
        # emb的维度可以分为多个部分
        assert emb_size % head_num == 0
        
        self.depth = emb_size // head_num
        self.wq = Dense(emb_size)
        self.wk = Dense(emb_size)
        self.wv = Dense(emb_size)
        self.dense = Dense(emb_size)
        
    def split_head(self, x, batch_size):
        """ 分头操作，将嵌入矩阵的维度分成多个部分 """
        x = tf.reshape(x, (batch_size, -1, self.head_num, self.depth)) # (None, 50, 4) => (None, 50, 2, 2) 
        return tf.transpose(x, perm=[0, 2, 1, 3]) # (None, 2, 50, 2) => (None, head_num, seq_len, depth)
        
    def call(self, inputs):
        q, k, v, mask = inputs
        batch_size = tf.shape(q)[0]
        
        # 分头前的前向网络
        q = self.wq(q) # (None, 50, 4)
        k = self.wk(k)
        v = self.wv(v)
        
        # 分头操作, 将嵌入矩阵的最后一个维度emb，拆分为两个维度，head_num和depth(emb)
        q = self.split_head(q, batch_size) # (None, head_num, seq_len, depth)
        k = self.split_head(k, batch_size)
        v = self.split_head(v, batch_size)
    
        # 缩放点积注意力层(PS:这里可以切换为其他注意力方式)
        scaled_attention = scaled_dot_product_attention(q, k, v, mask) # (None, head_num, seq_len, depth)
        #print('scaled_attention.shape: ', scaled_attention.shape)
        
        # 合并多头的维度
        scaled_attention = tf.transpose(scaled_attention, [0, 2, 1, 3]) # (None, seq_len, head_num, depth)
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.emb_size)) # (None, max_seq_len, emb_size)
        
        # 全连接
        output = self.dense(concat_attention)
        #print('output.shape: ', output.shape) # (None, 50, 4)
        return output # (None, 50, 4)
        
def feed_forward_network(emb_size, middle_unit_num):
    return Sequential([
        Dense(middle_unit_num, activation='relu'),
        Dense(emb_size, activation='relu')
    ])
         
class LayerNormalization(Layer):
    """ 层标准化 """
    def __init__(self, epsilon=1e-6):
        super(LayerNormalization, self).__init__()
        self.eps = epsilon
    def build(self, input_shape):
        super(LayerNormalization, self).build(input_shape)
        self.gamma = self.add_weight(name='gamma',
                                     shape=input_shape[-1:],
                                     initializer=tf.ones_initializer(),
                                     trainable=True)
        self.beta = self.add_weight(name='beta',
                                    shape=input_shape[-1:],
                                    initializer=tf.zeros_initializer(),
                                    trainable=True)
    def call(self, x):
        mean = tf.keras.backend.mean(x, axis=-1, keepdims=True)
        std = tf.keras.backend.std(x, axis=-1, keepdims=True)
        return self.gamma * (x - mean) / (std + self.eps) + self.beta

class EncoderLayer(Layer):
    """ Encoder层，输入序列所对应的嵌入矩阵 """
    def __init__(self,
                 emb_size, 
                 head_num, 
                 middle_unit_num,
                 max_seq_len,
                 dropout_rate=0.2, dropout_training=False):
        super(EncoderLayer, self).__init__()
        self.mha = MultiHeadAttention(emb_size, head_num)
        self.ffn = feed_forward_network(emb_size, middle_unit_num)
        
        self.norm1 = LayerNormalization()
        self.norm2 = LayerNormalization()
        
        self.dropout1 = Dropout(dropout_rate)
        self.dropout2 = Dropout(dropout_rate)
        
        self.dropout_training = dropout_training
        
    def call(self, inputs, mask):
        # inputs Embedding (None, 50, 4)
        # 多头Attention => add & norm => feed forward => add & norm, 返回包含上下文信息的嵌入矩阵
        # 多层注意力网络
        mha_out = self.mha([inputs, inputs, inputs, mask]) # 嵌入矩阵(None, 50, 4)
        #print('mha_out.shape: ', mha_out.shape) #  (None, 50, 4)
        # add & norm
        mha_out = self.dropout1(mha_out, training=self.dropout_training)
        #print('mha_out.shape: ', mha_out.shape)
        norm_mha_out = self.norm1(inputs + mha_out) # (None, 50, 4)
        #print('norm_mha_out.shape: ', norm_mha_out.shape)
        # 前馈神经网络
        ffn_out = self.ffn(norm_mha_out) # (None, 50, 4)
        #print('ffn_out.shape: ', ffn_out.shape)
        ffn_out = self.dropout2(ffn_out, training=self.dropout_training)
        norm_ffn_out = self.norm2(norm_mha_out + ffn_out)
        #print('norm_ffn_out.shape: ', norm_ffn_out.shape)
        return norm_ffn_out # (None, 50, 4)

class PositionalEncoding(Layer):
    def __init__(self, sequence_len=None, emb_size=None):
        super(PositionalEncoding, self).__init__()
        self.sequence_len = sequence_len
        self.emb_size = emb_size
    
    def call(self, inputs):
        if self.emb_size == None:
            self.emb_size = int(inputs.shape[-1])
        
        position_emb = np.array([
            [pos/np.power(10000, 2.0*i/self.emb_size) for i in range(self.emb_size)] for pos in range(self.sequence_len)
        ])
        
        position_emb[:, 0::2] = np.sin(position_emb[:, 0::2]) # 2i
        position_emb[:, 1::2] = np.cos(position_emb[:, 1::2]) # 2i+1
        
        #print('position_emb.shape: ', position_emb.shape)
        #print('inputs.shape: ', inputs.shape)
        
        position_emb = tf.cast(position_emb, dtype=tf.float32) # 位置编码和原始embedding相加 (50, 4)
        #print('position_emb+inputs.shape: ', (position_emb + inputs).shape)
        return position_emb + inputs # (None, 50, 4)
     
class Encoder(Layer):
    def __init__(self, 
                 layer_num, # Encoder层数
                 emb_size, # 嵌入向量维度
                 head_num, # 多头attention的头数
                 middle_unit_num, # 前馈神经网络隐层神经元数量
                 max_seq_len, # 序列长度
                 dropout_rate=0.2, 
                 dropout_training=False):
        super(Encoder, self).__init__()
        self.layer_num = layer_num
        self.pos_embedding = PositionalEncoding(sequence_len=max_seq_len, emb_size=emb_size)
        self.encoder_layers = [EncoderLayer(emb_size, 
                                            head_num, 
                                            middle_unit_num,
                                            max_seq_len,
                                            dropout_rate=dropout_rate, 
                                            dropout_training=dropout_training) for _ in range(layer_num)]

    def call(self, inputs):
        emb, mask = inputs # (None, 50, 4) (None, 1, 1, 50)
        
        emb = self.pos_embedding(emb)
        
        for i in range(self.layer_num):
            emb = self.encoder_layers[i](emb, mask)        
        return emb

In [None]:
def embedding_lookup(columns, input_dict, embedding_layer_dict, flatten=False):
    """ 根据feature_columns或column_names查表，得到对应embedding向量列表 """
    embedding_list = []
    for f in columns:
        if type(f) == str:
            column_name = f
        else:
            column_name = f.name
        _input = input_dict[column_name]
        _embed = embedding_layer_dict[column_name]
        embed_layer = _embed(_input)
        if flatten:
            embed_layer = Flatten()(embed_layer)
        embedding_list.append(embed_layer)
    return embedding_list

def get_dnn_transformer_logits(sparse_input_dict,
                               varlen_sparse_input_dict,
                               kd_embedding_layer_dict, 
                               feature_columns,
                               behavior_column_names, 
                               behavior_seq_column_names):
    """ DNN侧: concate(Attention pooling embedding + flatten embedding) =》DNN """
    sparse_feature_columns = list(filter(lambda x: isinstance(x, SparseFeature), feature_columns))
    sparse_kd_embed_list = []
    for f in sparse_feature_columns:
        _input = sparse_input_dict[f.name]
        _embed = kd_embedding_layer_dict[f.name](_input)
        flatten_embed = Flatten()(_embed)
        sparse_kd_embed_list.append(flatten_embed)
    # attention pooling
    # 当前物品的embedding列表
    query_embed_list = embedding_lookup(behavior_column_names, sparse_input_dict, kd_embedding_layer_dict, flatten=False) # [(None, 1, 4)]
    # 当前行为序列的embedding列表
    keys_embed_list = embedding_lookup(behavior_seq_column_names, varlen_sparse_input_dict, kd_embedding_layer_dict, flatten=False) # [(None, 50, 4)]
    
    # Transformer
    layer_num = 3
    emb_size = 4 
    head_num = 2
    middle_unit_num = 8
    max_seq_len = 50 # 序列长度
    
    seq_embed_list = []
    for i in range(len(query_embed_list)):
        padding_mask_list = padding_mask(varlen_sparse_input_dict[behavior_seq_column_names[i]])
        print('padding_mask_list: ', padding_mask_list)
        encoder_out = Encoder(layer_num,
                              emb_size,
                              head_num,
                              middle_unit_num, 
                              max_seq_len)([keys_embed_list[i], padding_mask_list]) # (None, 50, 4)
        
        flatten_embed = Flatten()(encoder_out)
        seq_embed_list.append(flatten_embed)
    
    # Attention pooling
#     seq_embed_list = []
#     # 使用注意力机制将历史行为序列对应的embedding进行池化(这里可能有多个ID及ID对应的历史序列，eg：点击物品与点击行为序列，搜索物品和搜索行为序列)
#     for i in range(len(query_embed_list)):
#         seq_embed = AttentionPoolingLayer()([query_embed_list[i], keys_embed_list[i]])
#         seq_embed_list.append(seq_embed)
    print('sparse_kd_embed_list.shape: ', sparse_kd_embed_list[0].shape)
    print('seq_embed_list.shape: ', seq_embed_list[0].shape)
    concat_sparse_kd_embed = Concatenate(axis=1)(sparse_kd_embed_list+seq_embed_list)
    #print('concat_sparse_kd_embed.shape: ', concat_sparse_kd_embed.shape)
    
    # DNN
    dnn_out = Dropout(0.2)(Activation(activation='relu')(BatchNormalization()(Dense(4)(concat_sparse_kd_embed))))
    #dnn_out = Dropout(0.2)(Activation(activation='relu')(BatchNormalization()(Dense(4)(dnn_out))))
    dnn_logits = Dense(1)(dnn_out)
    return dnn_logits

def DeepFM_Transformer(feature_columns, behavior_column_names, behavior_seq_column_names):
    dense_input_dict, sparse_input_dict, varlen_sparse_input_dict = build_input_layers(feature_columns)
    input_list = list(dense_input_dict.values()) + list(sparse_input_dict.values()) + list(varlen_sparse_input_dict.values())
    
    # linear w * x + b
    linear_logits = get_linear_logits(dense_input_dict, sparse_input_dict, feature_columns)
    
    # fm 0.5 * [sum(vixi)**2 - sum(vixi*vixi)
    kd_embedding_layer_dict = build_embedding_layers(feature_columns, is_linear=False)
    fm_logits = get_fm_logits(sparse_input_dict, kd_embedding_layer_dict, feature_columns)
    
    # transformer dnn next_a = σ(w * a + b)
    dnn_logits = get_dnn_transformer_logits(sparse_input_dict,
                                            varlen_sparse_input_dict,
                                            kd_embedding_layer_dict, 
                                            feature_columns,
                                            behavior_column_names, 
                                            behavior_seq_column_names)
    
    output_logits = Add()([linear_logits, fm_logits, dnn_logits])
    output_layer = Activation("sigmoid")(output_logits)
    model = Model(input_list, output_layer)
    return model

behavior_column_names, behavior_seq_column_names = ['item_id'], ['hist_item_id']
model = DeepFM_Transformer(feature_columns, behavior_column_names, behavior_seq_column_names)
model.summary()

In [None]:
%%time
train_input = {
    'user_id': np.array(train_df['user_id']),
    'gender': np.array(train_df['gender']),
    'age': np.array(train_df['age']),
    'item_id': np.array(train_df['item_id']),
    'item_cate_id': np.array(train_df['item_cate_id']),
    'hist_item_id': np.array([[int(i) for i in s.split(',')] for s in train_df['hist_item_id']]),
    'hist_len': np.array(train_df['hist_len']),
}
test_input = {
    'user_id': np.array(test_df['user_id']),
    'gender': np.array(test_df['gender']),
    'age': np.array(test_df['age']),
    'item_id': np.array(test_df['item_id']),
    'item_cate_id': np.array(test_df['item_cate_id']),
    'hist_item_id': np.array([[int(i) for i in s.split(',')] for s in test_df['hist_item_id']]),
    'hist_len': np.array(test_df['hist_len']),
}

# 模型训练
my_callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, verbose=2, mode='auto')
]
model.compile('adam',
              loss='binary_crossentropy',
              metrics=["binary_crossentropy", tf.keras.metrics.AUC(name='auc')])
model.fit(train_input,
          train_df['label'].values,
          batch_size=1024,
          epochs=100,
          validation_split=0.2,
          callbacks=my_callbacks)

In [None]:
# 模型预测与评估
result = model.predict(test_input)
model_metric(np.array([i[0] for i in result]), test_df['label'].values)