## DIN

In [10]:
import os
import numpy as np
import pandas as pd
from collections import namedtuple

import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.python.keras.callbacks import EarlyStopping
from sklearn.preprocessing import  MinMaxScaler, LabelEncoder

from sklearn import metrics
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report, roc_curve

##### 获取数据

In [11]:
# Load the pre-split train / test frames from the ml-1m data directory.
train_df = pd.read_csv('../data/ml-1m/train_df.csv')
test_df = pd.read_csv('../data/ml-1m/test_df.csv')
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0.
# `pd.concat` stacks the two frames the same way (original row labels are kept).
# `data` is only used to derive vocabulary sizes over train + test together.
data = pd.concat([train_df, test_df])
print('train_df.shape: {}, test_df.shape: {}'.format(train_df.shape, test_df.shape))
data.head()

train_df.shape: (1956191, 14), test_df.shape: (12078, 14)


Unnamed: 0,user_id,hist_item_id,hist_s1,hist_s2,item_id,label,rating,click_timestamp,hist_len,gender,age,item_date,item_title,item_cate_id
1181738,4791,"728,3021,1888,298,2549,391,1210,45,385,3358,24...","1062,534,1576,2504,2222,3746,3207,1225,2971,10...","1179,762,1707,1202,1268,660,1405,1895,1212,374...",1621,0,0,0,50,1,2,78,265326540000000000000,14
1269480,2572,"1337,1373,1074,161,548,3197,1637,1105,3350,472...","150,624,1469,2693,1603,2285,258,2729,1082,901,...","2287,1681,3106,296,1697,3199,16,25,3091,270,0,...",1187,1,5,973909898,50,1,4,43,2091112092000000000000,2
1270329,710,"2995,1602,3766,2734,2053,3640,2625,2591,2232,2...","957,1914,1327,1935,3771,1886,3039,3206,1766,42...","1952,3456,2401,909,1275,1200,958,642,2185,712,...",1147,0,0,0,50,1,3,53,2028202926000000000000,5
827379,3556,"1352,1540,3298,346,2338,2572,1674,2205,2047,85...","163,699,892,2694,2106,3411,2032,2560,648,2093,...","32,2848,1082,1376,1544,3107,2917,3367,2919,169...",1347,0,0,0,50,1,3,77,231723180000000000000,4
224295,1404,"771,2998,463,353,1371,1852,2334,1546,899,2790,...","1220,186,3683,1246,3356,233,2316,3293,1,1250,0...","1206,3617,2281,3436,34,2905,2140,1996,3475,39,...",2289,0,0,0,50,1,6,79,14234113412341334140000000000,8


##### 模型构建

In [12]:
# Feature descriptors:
#   SparseFeature       - a single categorical id looked up in an embedding table
#   DenseFeature        - a numeric input of width `dimension`
#   VarLenSparseFeature - a fixed-length (`maxlen`) sequence of categorical ids
SparseFeature = namedtuple('SparseFeature', 'name vocabulary_size embedding_size')
DenseFeature = namedtuple('DenseFeature', 'name dimension')
VarLenSparseFeature = namedtuple('VarLenSparseFeature', 'name vocabulary_size embedding_size maxlen')

# Every categorical column gets a 4-d embedding; the vocabulary is sized as max id + 1.
categorical_names = ['user_id', 'gender', 'age', 'item_id', 'item_cate_id']
feature_columns = [
    SparseFeature(name=col, vocabulary_size=data[col].max() + 1, embedding_size=4)
    for col in categorical_names
]
feature_columns.append(DenseFeature(name='hist_len', dimension=1))
# NOTE(review): the history vocabulary is sized from the `item_id` column, which assumes
# no id inside `hist_item_id` exceeds the largest target item id - confirm.
feature_columns.append(
    VarLenSparseFeature(name='hist_item_id', vocabulary_size=data['item_id'].max() + 1,
                        embedding_size=4, maxlen=50)
)

feature_columns

[SparseFeature(name='user_id', vocabulary_size=6041, embedding_size=4),
 SparseFeature(name='gender', vocabulary_size=3, embedding_size=4),
 SparseFeature(name='age', vocabulary_size=8, embedding_size=4),
 SparseFeature(name='item_id', vocabulary_size=3884, embedding_size=4),
 SparseFeature(name='item_cate_id', vocabulary_size=19, embedding_size=4),
 DenseFeature(name='hist_len', dimension=1),
 VarLenSparseFeature(name='hist_item_id', vocabulary_size=3884, embedding_size=4, maxlen=50)]

In [13]:
class LocalActivationUnit(Layer):
    """Score every item of a behaviour sequence against the candidate item.

    The candidate (query) embedding is tiled along the sequence axis and combined
    with each key embedding as ``[query, keys, query - keys, query * keys]``.
    That vector goes through a small MLP ending in one linear unit, giving one
    un-normalised attention score per sequence position.

    Args:
        hidden_units: widths of the hidden Dense layers of the scoring MLP.
        activation: ``'prelu'`` (default) gives each hidden layer its own PReLU.
            Any other value is handed to ``Dense`` unchanged, so it must be an
            activation Keras can resolve (e.g. ``'relu'``) or a callable. The
            previous fallback referenced ``Dice``, which is not defined in this
            notebook and raised NameError.
        **kwargs: forwarded to ``Layer`` (e.g. ``name``).
    """
    def __init__(self, hidden_units=(128, 64), activation='prelu', **kwargs):
        super(LocalActivationUnit, self).__init__(**kwargs)
        self.hidden_units = hidden_units
        self.linear = Dense(1)
        self.dnn = [
            Dense(unit, activation=PReLU() if activation == 'prelu' else activation)
            for unit in hidden_units
        ]

    def call(self, inputs):
        """Return attention scores of shape (batch, key_len) for ``[query, keys]``."""
        query, keys = inputs
        # Static sequence length; the keys' second dimension must be known at build time.
        key_len = keys.get_shape()[1]

        # Repeat the query along the sequence axis so it lines up with the keys:
        # (batch, 1, emb) -> (batch, key_len, emb)
        query = tf.tile(query, multiples=[1, key_len, 1])

        # Interaction features: raw vectors, their difference and element-wise product
        # -> (batch, key_len, 4 * emb)
        concat_attention_input = tf.concat([query, keys, query - keys, query * keys], axis=-1)

        # Scoring MLP, applied independently at every sequence position.
        attention_out = concat_attention_input
        for fc in self.dnn:
            attention_out = fc(attention_out)        # (batch, key_len, hidden_units[i])
        attention_out = self.linear(attention_out)   # (batch, key_len, 1)
        attention_out = tf.squeeze(attention_out, -1)  # (batch, key_len)
        return attention_out
    
class AttentionPoolingLayer(Layer):
    """Pool a behaviour sequence into one interest vector via candidate-aware attention.

    Inputs are ``[query, keys]`` with query of shape (batch, 1, emb) and keys of
    shape (batch, seq_len, emb). Each key gets a score from a
    ``LocalActivationUnit``; scores at padded positions are forced to 0 and the
    keys are summed with those weights (no softmax normalisation is applied).

    Padding detection: ``Embedding(mask_zero=True)`` does NOT zero the vector of
    index 0. It only attaches a boolean mask to its output. Testing
    ``keys[:, :, 0] != 0`` therefore almost never identifies padding. The layer
    now uses the Keras mask passed for ``keys`` and only falls back to the
    first-component test when no mask is available.

    Args:
        attention_hidden_units: hidden layer widths of the scoring MLP.
        **kwargs: forwarded to ``Layer`` (e.g. ``name``).
    """
    def __init__(self, attention_hidden_units=(128, 64), **kwargs):
        super(AttentionPoolingLayer, self).__init__(**kwargs)
        self.attention_hidden_units = attention_hidden_units
        self.activation_unit = LocalActivationUnit(self.attention_hidden_units)

    def call(self, inputs, mask=None):
        """Return the attention-weighted sum of ``keys``, shape (batch, emb)."""
        query, keys = inputs  # (batch, 1, emb), (batch, seq_len, emb)

        # One attention weight per item of the behaviour sequence.
        attention_score = self.activation_unit([query, keys])  # (batch, seq_len)

        # 1. Zero the weights of padded positions.
        # For list inputs Keras passes `mask` as a list aligned with `inputs`;
        # entry 1 is the boolean (batch, seq_len) mask of the keys, if any.
        keys_mask = None
        if isinstance(mask, (list, tuple)) and len(mask) == 2:
            keys_mask = mask[1]
        if keys_mask is None:
            # Fallback (previous behaviour): treat a key as padding when its first
            # embedding component is exactly 0.
            keys_mask = tf.not_equal(keys[:, :, 0], 0)  # (batch, seq_len)
        paddings = tf.zeros_like(attention_score)  # (batch, seq_len)
        outputs = tf.where(keys_mask, attention_score, paddings)  # (batch, seq_len)

        # 2. Weighted sum of the keys expressed as a batched matmul:
        # (batch, 1, seq_len) @ (batch, seq_len, emb) -> (batch, 1, emb)
        outputs = tf.expand_dims(outputs, axis=1)
        outputs = tf.matmul(outputs, keys)
        outputs = tf.squeeze(outputs, axis=1)  # (batch, emb)
        return outputs

    def compute_mask(self, inputs, mask=None):
        """The sequence axis is pooled away, so no mask is propagated downstream."""
        return None
    
def model_metric(prob, label, thr=0.5):
    """Print accuracy, ROC-AUC, log-loss and a classification report.

    Args:
        prob: predicted positive-class probabilities, one per sample.
        label: ground-truth binary labels aligned with ``prob``.
        thr: probabilities strictly above this value are predicted positive.
    """
    predicted = prob > thr

    # Area under the ROC curve (the thresholds returned by roc_curve are not needed).
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(label, prob)
    auc_value = metrics.auc(false_pos_rate, true_pos_rate)

    accuracy = metrics.accuracy_score(label, predicted)
    cross_entropy = log_loss(label, prob)

    print('模型准确率:{}, AUC得分:{}, LogLoss:{}'.format(accuracy, auc_value, cross_entropy))
    print(classification_report(label, predicted, digits=2))
    print('==========================================================')

def build_input_layers(feature_columns):
    """Create one Keras ``Input`` per feature column, grouped by feature kind.

    Returns:
        Three ``{feature name: Input}`` dicts, in order: dense, sparse and
        variable-length sparse. Columns of any other type are ignored.
    """
    dense_inputs, sparse_inputs, varlen_inputs = {}, {}, {}
    for column in feature_columns:
        # Pick the destination dict and the input width for this kind of feature.
        if isinstance(column, DenseFeature):
            bucket, width = dense_inputs, column.dimension
        elif isinstance(column, SparseFeature):
            bucket, width = sparse_inputs, 1
        elif isinstance(column, VarLenSparseFeature):
            bucket, width = varlen_inputs, column.maxlen
        else:
            continue
        bucket[column.name] = Input(shape=(width, ), name=column.name)
    return dense_inputs, sparse_inputs, varlen_inputs
    
def concat_input_list(input_list):
    """Merge a list of tensors along axis 1.

    Returns ``None`` for an empty list, the sole element for a one-item list,
    and a ``Concatenate(axis=1)`` of all elements otherwise.
    """
    count = len(input_list)
    if count == 0:
        return None
    if count == 1:
        return input_list[0]
    return Concatenate(axis=1)(input_list)

def build_embedding_layers(feature_columns):
    """Create one ``Embedding`` layer per sparse / variable-length sparse column.

    Returns:
        ``{feature name: Embedding}``. Sparse columns get a layer named
        ``emb_<name>``; sequence columns get ``var_emb_<name>`` with
        ``mask_zero=True`` so index 0 is treated as padding. Each table has
        ``vocabulary_size + 1`` rows. Dense columns are skipped.
    """
    # NOTE(review): a target id column and its history column (e.g. item_id /
    # hist_item_id) get separate embedding tables here - confirm that is intended.
    layers_by_name = {}
    for column in feature_columns:
        if isinstance(column, SparseFeature):
            prefix, extra = 'emb_', {}
        elif isinstance(column, VarLenSparseFeature):
            prefix, extra = 'var_emb_', {'mask_zero': True}
        else:
            continue
        layers_by_name[column.name] = Embedding(
            column.vocabulary_size + 1,
            column.embedding_size,
            name=prefix + column.name,
            **extra
        )
    return layers_by_name

def embedding_lookup(columns, input_dict, embedding_layer_dict, flatten=False):
    """Apply each column's embedding layer to its input tensor.

    Args:
        columns: feature descriptors (anything with a ``name`` attribute) or plain
            column-name strings; both forms may be mixed.
        input_dict: ``{column name: input tensor}``.
        embedding_layer_dict: ``{column name: embedding layer}``.
        flatten: when True, pass every embedded tensor through ``Flatten()``.

    Returns:
        The embedded tensors, in the order of ``columns``.

    Raises:
        KeyError: if a column name is missing from either dict.
    """
    embedding_list = []
    for f in columns:
        # isinstance (rather than `type(f) == str`) also accepts str subclasses.
        column_name = f if isinstance(f, str) else f.name
        _input = input_dict[column_name]
        _embed = embedding_layer_dict[column_name]
        embed_layer = _embed(_input)
        if flatten:
            embed_layer = Flatten()(embed_layer)
        embedding_list.append(embed_layer)
    return embedding_list

def get_dnn_logits(dnn_input, hidden_units=(100, 40), activation='prelu'):
    """Run ``dnn_input`` through an MLP and a final 1-unit sigmoid layer.

    Despite the name, the returned tensor is a sigmoid output (a probability),
    not a raw logit.

    Args:
        dnn_input: input tensor of the MLP.
        hidden_units: widths of the hidden Dense layers.
        activation: ``'prelu'`` (default) gives each hidden layer its own PReLU.
            Any other value is handed to ``Dense`` unchanged, so it must be an
            activation Keras can resolve (e.g. ``'relu'``) or a callable. The
            previous fallback referenced ``Dice``, which is not defined in this
            notebook and raised NameError.

    Returns:
        Tensor with a last dimension of 1.
    """
    dnn_list = [
        Dense(unit, activation=PReLU() if activation == 'prelu' else activation)
        for unit in hidden_units
    ]
    dnn_out = dnn_input
    for dnn in dnn_list:
        dnn_out = dnn(dnn_out)
    dnn_logits = Dense(1, activation='sigmoid')(dnn_out)
    return dnn_logits

def DIN(feature_columns, behavior_column_names, behavior_seq_column_names):
    """Build a Deep Interest Network as a Keras functional ``Model``.

    Args:
        feature_columns: list of SparseFeature / DenseFeature / VarLenSparseFeature.
        behavior_column_names: names of the sparse candidate features used as
            attention queries (e.g. ``['item_id']``).
        behavior_seq_column_names: names of the history-sequence features used as
            attention keys, paired by position with ``behavior_column_names``.

    Returns:
        An uncompiled ``Model`` mapping all feature inputs to a sigmoid output.

    Raises:
        ValueError: if the two name lists differ in length, or if
            ``feature_columns`` yields no input for the MLP at all.
    """
    if len(behavior_column_names) != len(behavior_seq_column_names):
        raise ValueError(
            'behavior_column_names and behavior_seq_column_names must have the same length, '
            'got {} and {}'.format(len(behavior_column_names), len(behavior_seq_column_names)))

    dense_input_dict, sparse_input_dict, varlen_sparse_input_dict = build_input_layers(feature_columns)
    # Model inputs: dense, then sparse, then sequence features.
    input_list = (list(dense_input_dict.values())
                  + list(sparse_input_dict.values())
                  + list(varlen_sparse_input_dict.values()))

    # Dense features: input -> concat
    concat_dense_input_list = concat_input_list(list(dense_input_dict.values()))

    # Sparse features: input -> embedding -> flatten -> concat
    embedding_layer_dict = build_embedding_layers(feature_columns)
    sparse_feature_columns = [f for f in feature_columns if isinstance(f, SparseFeature)]
    flatten_sparse_embed_list = embedding_lookup(
        sparse_feature_columns, sparse_input_dict, embedding_layer_dict, flatten=True)
    concat_flatten_sparse_embed_list = concat_input_list(flatten_sparse_embed_list)

    # Candidate-item embeddings (queries), each (batch, 1, emb).
    query_embed_list = embedding_lookup(
        behavior_column_names, sparse_input_dict, embedding_layer_dict, flatten=False)
    # Behaviour-sequence embeddings (keys), each (batch, maxlen, emb).
    keys_embed_list = embedding_lookup(
        behavior_seq_column_names, varlen_sparse_input_dict, embedding_layer_dict, flatten=False)

    # Attention-pool every (query, sequence) pair into one interest vector. There may
    # be several pairs, e.g. clicked item / click history, searched item / search history.
    seq_embed_list = [
        AttentionPoolingLayer()([query_embed, keys_embed])
        for query_embed, keys_embed in zip(query_embed_list, keys_embed_list)
    ]
    concat_seq_embed_list = concat_input_list(seq_embed_list)

    # MLP input = dense features + sparse embeddings + pooled interest vectors.
    # A family that is absent from `feature_columns` comes back as None and is skipped,
    # instead of being passed to Concatenate where it would fail.
    # (Leaving out the pooled interest vectors gives the plain Embedding + MLP baseline.)
    dnn_input_parts = [
        part
        for part in (concat_dense_input_list, concat_flatten_sparse_embed_list, concat_seq_embed_list)
        if part is not None
    ]
    if not dnn_input_parts:
        raise ValueError('feature_columns produced no model inputs')
    dnn_input = concat_input_list(dnn_input_parts)

    dnn_logits = get_dnn_logits(dnn_input, hidden_units=(64, 32), activation='prelu')
    model = Model(input_list, dnn_logits)
    return model

# The candidate item is the attention query; the user's item history supplies the keys.
behavior_column_names = ['item_id']
behavior_seq_column_names = ['hist_item_id']
model = DIN(
    feature_columns,
    behavior_column_names,
    behavior_seq_column_names,
)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_id (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
gender (InputLayer)             [(None, 1)]          0                                            
__________________________________________________________________________________________________
age (InputLayer)                [(None, 1)]          0                                            
__________________________________________________________________________________________________
item_id (InputLayer)            [(None, 1)]          0                                            
____________________________________________________________________________________________

In [None]:
def _build_model_input(df):
    """Convert a feature frame into the ``{input name: ndarray}`` dict fed to the model.

    Scalar columns are copied as 1-D arrays. ``hist_item_id`` holds comma-separated
    id strings and is parsed into a 2-D integer array, one row per sample.
    """
    # NOTE(review): assumes every history string has the same number of ids (already
    # padded to the model's maxlen); ragged rows would not form a 2-D array - confirm.
    return {
        'user_id': np.array(df['user_id']),
        'gender': np.array(df['gender']),
        'age': np.array(df['age']),
        'item_id': np.array(df['item_id']),
        'item_cate_id': np.array(df['item_cate_id']),
        'hist_item_id': np.array([[int(i) for i in s.split(',')] for s in df['hist_item_id']]),
        'hist_len': np.array(df['hist_len']),
    }


# Train and test inputs are built by the same helper so they cannot drift apart.
train_input = _build_model_input(train_df)
test_input = _build_model_input(test_df)

# Model training
# Use the public `tf.keras` callback so it comes from the same Keras implementation as
# the model (the private `tensorflow.python.keras` module is not a supported API).
# `restore_best_weights=True` makes the model evaluated afterwards the one with the
# lowest val_loss, rather than the weights from `patience` epochs after it.
my_callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, verbose=2, mode='auto',
                                     restore_best_weights=True)
]
model.compile('adam',
              loss='binary_crossentropy',
              metrics=["binary_crossentropy", tf.keras.metrics.AUC(name='auc')])
# NOTE(review): `validation_split` takes the LAST 20% of the rows before shuffling;
# confirm train_df is not ordered in a way that biases that slice.
model.fit(train_input,
          train_df['label'].values,
          batch_size=1024,
          epochs=100,
          validation_split=0.2,
          callbacks=my_callbacks)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100

In [None]:
# Predict on the test split and report the evaluation metrics
result = model.predict(test_input)
# The model emits one probability per row, shape (n, 1); take that single column as a 1-D vector.
test_prob = result[:, 0]
model_metric(test_prob, test_df['label'].values)