In [3]:
import os
import faiss
import numpy as np
import pandas as pd
from collections import namedtuple
from tqdm import tqdm_notebook as tqdm

import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.python.keras.callbacks import EarlyStopping
from sklearn.preprocessing import  MinMaxScaler, LabelEncoder

from sklearn import metrics
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report, roc_curve

from tensorflow.python.keras import backend as K
from tensorflow.python.keras.initializers import Zeros, glorot_normal

##### 获取数据

In [4]:
train_df = pd.read_csv('../data/ml-1m/train_df.csv')
test_df = pd.read_csv('../data/ml-1m/test_df.csv')
train_df = train_df.sample(frac=1.0)
data = train_df.append(test_df)
print('train.shape: {}, test.shape: {}'.format(train_df.shape, test_df.shape))
data.head()

train.shape: (1956191, 14), test.shape: (12078, 14)


Unnamed: 0,user_id,hist_item_id,hist_s1,hist_s2,item_id,label,rating,click_timestamp,hist_len,gender,age,item_date,item_title,item_cate_id
32134,5982,"1814,1518,353,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...","1814,1518,353,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...","0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",2993,0,0,0,3,1,4,23,124632390000000000000,5
856016,3824,"2974,2383,2392,2547,2667,3326,2191,2405,3364,2...","2382,2051,3808,1910,1912,2722,2882,2397,2409,3...","1913,3774,2312,2331,2680,3698,2810,2305,1369,2...",1361,1,2,966541021,50,1,3,63,233811930000000000000,5
492110,182,"3683,2046,504,524,1673,3510,643,3685,3687,3416...","1047,3184,1544,3121,3233,2368,1,3046,2328,353,...","1657,316,334,508,1196,1960,1727,109,160,3187,0...",39,1,4,977085349,47,1,2,76,8500000000000000,5
1622647,4269,"2301,2345,3382,2077,2471,1120,1357,2030,465,23...","2461,2503,1221,1255,2596,1282,957,1354,3635,12...","2919,3638,1182,741,258,32,913,1197,1189,2900,0...",2436,0,0,0,50,1,4,80,357135720000000000000,5
80387,1087,"3459,2339,2589,1336,1376,1608,2572,1114,1540,1...","1059,587,589,1112,2048,2047,2968,1899,2346,135...","1240,1268,958,2803,1180,2298,3393,3036,1272,28...",497,0,0,0,50,1,3,74,7489600000000000000,5


##### 模型构建

用户侧

- 用户ID
- 性别
- 年龄
- 序列长度
- 序列物品ID

物品侧

- 物品ID
- 物品类别ID

提取预训练embedding向量，例如提取user侧embedding向量。

1. 构建一个完整的模型结构。
2. 将用户/物品侧输入和输出张量作为模型属性。
3. 实例化一个新Model，选取用户输入输出，输入为用户Input层特征，输出为用户embedding向量。

In [5]:
SparseFeature = namedtuple('SparseFeature', ['name', 'vocabulary_size', 'embedding_size'])
DenseFeature = namedtuple('DenseFeature', ['name', 'dimension'])
VarLenSparseFeature = namedtuple('VarLenSparseFeature', ['name', 'vocabulary_size', 'embedding_size', 'maxlen'])

user_feature_columns = [
    SparseFeature('user_id', data.user_id.max()+1, embedding_size=4),
    SparseFeature('gender', data.gender.max()+1, embedding_size=4),
    SparseFeature('age', data.age.max()+1, embedding_size=4),
    DenseFeature('hist_len', 1),
    VarLenSparseFeature('hist_item_id', data.item_id.max()+1, embedding_size=4, maxlen=50)
]
item_feature_columns = [
    SparseFeature('item_id', data.item_id.max()+1, embedding_size=4),
    SparseFeature('item_cate_id', data.item_cate_id.max()+1, embedding_size=4),
]
user_feature_columns, item_feature_columns

([SparseFeature(name='user_id', vocabulary_size=6041, embedding_size=4),
  SparseFeature(name='gender', vocabulary_size=3, embedding_size=4),
  SparseFeature(name='age', vocabulary_size=8, embedding_size=4),
  DenseFeature(name='hist_len', dimension=1),
  VarLenSparseFeature(name='hist_item_id', vocabulary_size=3884, embedding_size=4, maxlen=50)],
 [SparseFeature(name='item_id', vocabulary_size=3884, embedding_size=4),
  SparseFeature(name='item_cate_id', vocabulary_size=19, embedding_size=4)])

In [18]:
def model_metric(prob, label, thr=0.5):
    """ 模型评估 """
    # AUC
    fpr, tpr, threshold = metrics.roc_curve(label, prob)
    auc = metrics.auc(fpr, tpr)
    score = metrics.accuracy_score(label, prob > thr)
    # LogLoss
    logloss = log_loss(label, prob)
    print('模型准确率:{}, AUC得分:{}, LogLoss:{}'.format(score, auc, logloss))
    print(classification_report(label, prob > thr, digits=2))
    print('==========================================================')

def build_input_layers(feature_columns):
    """ 构建输入层 """
    dense_input_dict, sparse_input_dict, varlen_sparse_input_dict = {}, {}, {}
    for f in feature_columns:
        if isinstance(f, DenseFeature):
            dense_input_dict[f.name] = Input(shape=(f.dimension, ), name=f.name)
        elif isinstance(f, SparseFeature):
            sparse_input_dict[f.name] = Input(shape=(1, ), name=f.name)
        elif isinstance(f, VarLenSparseFeature):
            varlen_sparse_input_dict[f.name] = Input(shape=(f.maxlen, ), name=f.name)
    return dense_input_dict, sparse_input_dict, varlen_sparse_input_dict

def build_embedding_layers(feature_columns):
    """ 构建embedding层 """
    embedding_layer_dict = {}
    for f in feature_columns:
        if isinstance(f, SparseFeature):
            embedding_layer_dict[f.name] = Embedding(f.vocabulary_size+1, f.embedding_size, name='emb_' + f.name)
        elif isinstance(f, VarLenSparseFeature):
            embedding_layer_dict[f.name] = Embedding(f.vocabulary_size+1, f.embedding_size, name='var_emb_' + f.name, mask_zero=True)
    return embedding_layer_dict

def embedding_lookup(columns, input_dict, embedding_layer_dict, flatten=False):
    """ 根据feature_columns或column_names查表，得到对应embedding向量列表 """
    embedding_list = []
    for f in columns:
        if type(f) == str:
            column_name = f
        else:
            column_name = f.name
        _input = input_dict[column_name]
        _embed = embedding_layer_dict[column_name]
        embed_layer = _embed(_input)
        if flatten:
            embed_layer = Flatten()(embed_layer)
        embedding_list.append(embed_layer)
    return embedding_list

def concat_input_list(input_list):
    """ 合并input列表 """
    _num = len(input_list)
    if _num > 1:
        return Concatenate(axis=1)(input_list)
    elif len(input_list) == 1:
        return input_list[0]
    else:
        return None

class SequencePoolingLayer(Layer):
    """ 根据变长序列embeddings和序列长度input，求pooling embedding """
    def __init__(self, mode='mean'):
        super(SequencePoolingLayer, self).__init__()
        self.mode = mode
        self.epsilon = tf.constant(1e-8, tf.float32)
    
    def build(self, input_shape):
        self.max_seq_len = int(input_shape[0][1])
        self.embedding_size = int(input_shape[0][2])
    
    def call(self, inputs):
        seq_embed, seq_len = inputs # (None, 50, 4) (None, 1)
        mask = tf.sequence_mask(seq_len, self.max_seq_len, dtype=tf.float32) # 构造mask张量 (None, 1, 50)
        mask = tf.transpose(mask, (0, 2, 1)) # (None, 50, 1)
        mask = tf.tile(mask, [1, 1, self.embedding_size]) # (None, 50, 4)
        masked_seq_embed = seq_embed * mask # (None, 50, 4) 序列中补0部分的embedding为0
        
        # max pooling
        if self.mode == 'max':
            return tf.reduce_max(masked_seq_embed, 1, keepdims=True) # (None, 1, 4)
        elif self.mode == 'mean':
            _sum = tf.reduce_sum(masked_seq_embed, 1, keepdims=False)
            _mean = tf.divide(_sum, tf.cast(seq_len, tf.float32) + self.epsilon)
            return tf.expand_dims(_mean, axis=1) # (None, 1, 4)
        elif self.mode == 'sum':
            return tf.reduce_sum(masked_seq_embed, 1, keepdims=True) #(None, 1, 4)
        else:
            raise Exception("seq pooling mode error")

class EmbeddingIndex(Layer):
    def __init__(self, index):
        super(EmbeddingIndex, self).__init__()
        self.index = index
        
    def build(self, input_shape):
        super(EmbeddingIndex, self).build(input_shape)
    
    def call(self, x):
        return tf.constant(self.index)

class SampleSoftmaxLayer(Layer):
    def __init__(self, num_sampled=5):
        super(SampleSoftmaxLayer, self).__init__()
        self.num_sampled = num_sampled
    
    def build(self, input_shape):
        super(SampleSoftmaxLayer, self).build(input_shape)
        self.item_size = input_shape[0][0] # item个数
        self.zero_bias = self.add_weight(shape=[self.item_size],
                                         initializer=Zeros(),
                                         dtype=tf.float32,
                                         trainable=False,
                                         name='bias')
    
    def call(self, inputs):
        item_embedding_matrix, user_out, item_input = inputs
        loss = tf.nn.sampled_softmax_loss(weights=item_embedding_matrix,
                                          biases=self.zero_bias,
                                          labels=item_input,
                                          inputs=user_out,
                                          num_sampled=self.num_sampled,
                                          num_classes=self.item_size,
                                          num_true=1,
                                         )
        loss = tf.expand_dims(loss, axis=1)
        return loss
    
    def compute_output_shape(self, input_shape):
        return (None, 1)

    def get_config(self):
        config = {'num_sampled': self.num_sampled}
        base_config = super(SampleSoftmaxLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

class GetItemEmbeddingLayer(Layer):
    """ 根据input从embedding矩阵中获取相应的embedding向量 """
    def __init__(self):
        super(GetItemEmbeddingLayer, self).__init__()
    
    def call(self, inputs):
        item_embedding_matrix, item_input = inputs
        item_input = tf.cast(item_input, dtype=tf.int32)
        _gather = tf.gather(item_embedding_matrix, item_input)
        return tf.squeeze(_gather, axis=1)
    
def get_dnn(dnn_input, hidden_units=[64, 32], activation='relu', l2=0.01):
    dnn_list = [Dense(unit, activation=activation, kernel_regularizer=tf.keras.regularizers.l2(l2=l2)) for unit in hidden_units]
    dnn_out = dnn_input
    for dnn in dnn_list:
        dnn_out = Dropout(0.5)(dnn(dnn_out)) # dnn(dnn_out) 
    return dnn_out
    
def YouTubeNet(user_feature_columns,
               item_feature_columns,
               num_sampled=5,
               user_dnn_hidden_units=[64, 32]):
    user_dense_input_dict, user_sparse_input_dict, user_varlen_sparse_input_dict = build_input_layers(user_feature_columns)
    item_dense_input_dict, item_sparse_input_dict, item_varlen_sparse_input_dict = build_input_layers(item_feature_columns)
    
    # user/item Input
    user_input_list = list(user_dense_input_dict.values()) + list(user_sparse_input_dict.values()) + list(user_varlen_sparse_input_dict.values())
    item_input_list = list(item_dense_input_dict.values()) + list(item_sparse_input_dict.values()) + list(item_varlen_sparse_input_dict.values())
    
    # 用户侧 concat(dense feature: input + sparse feature: input->flatten embed->concat + varlen sparse feature:input->pooling embed)
    user_dense_input_list = list(user_dense_input_dict.values())
    user_embedding_layer_dict = build_embedding_layers(user_feature_columns)
    
    user_sparse_feature_columns = list(filter(lambda x: isinstance(x, SparseFeature), user_feature_columns))
    user_varlen_sparse_feature_columns = list(filter(lambda x: isinstance(x, VarLenSparseFeature), user_feature_columns))

    flatten_user_sparse_embed_list = embedding_lookup(user_sparse_feature_columns, user_sparse_input_dict, user_embedding_layer_dict, flatten=True)
    user_varlen_sparse_embed_list = embedding_lookup(user_varlen_sparse_feature_columns, user_varlen_sparse_input_dict, user_embedding_layer_dict)
    
    # 行为序列池化
    _varlen_embed = user_varlen_sparse_embed_list[0]
    _varlen_len_input = user_dense_input_dict['hist_len']
    
    pooling_user_varlen_sparse_embed = SequencePoolingLayer(mode='mean')([_varlen_embed, _varlen_len_input])
    flatten_pooling_user_varlen_sparse_embed = Flatten()(pooling_user_varlen_sparse_embed)
    concat_flatten_user_sparse_embed_list = concat_input_list(user_dense_input_list + flatten_user_sparse_embed_list + [flatten_pooling_user_varlen_sparse_embed])
    user_dnn_out = get_dnn(concat_flatten_user_sparse_embed_list, hidden_units=user_dnn_hidden_units) # (None, 4)

    # 物品侧
    item_embedding_layer_dict = build_embedding_layers(item_feature_columns)
    item_vocabulary_size = item_feature_columns[0].vocabulary_size
    item_column_name = item_feature_columns[0].name
    # 获取嵌入矩阵
    item_index = EmbeddingIndex(list(range(item_vocabulary_size)))(item_sparse_input_dict[item_column_name])
    item_embedding_matrix = item_embedding_layer_dict[item_column_name](item_index)
    
    # concat user and item
    #item_emb = item_embedding_layer_dict[item_column_name](item_sparse_input_dict[item_column_name])
    #user_dnn_out = Concatenate(axis=1)([user_dnn_out, Flatten()(item_emb)])
    #output = Dense(1, activation='sigmoid')(user_dnn_out)

    # softmax layer
    output = SampleSoftmaxLayer(num_sampled=num_sampled)([
        item_embedding_matrix, # 物品权重矩阵
        user_dnn_out, # 用户全连接的隐层
        item_sparse_input_dict[item_column_name] # 输出层标签
    ])
    model = Model(inputs=user_input_list + item_input_list, outputs=output)
    
    # 获取embedding向量
    _item_embedding = GetItemEmbeddingLayer()([item_embedding_matrix, item_sparse_input_dict[item_column_name]])
    model.__setattr__('user_input', user_input_list)
    model.__setattr__('user_embedding', user_dnn_out)
    model.__setattr__('item_input', item_input_list)
    model.__setattr__('item_embedding', _item_embedding)
    
    user_embed_model = Model(inputs=model.user_input, outputs=model.user_embedding)
    item_embed_model = Model(inputs=model.item_input, outputs=model.item_embedding)
    
    return model, user_embed_model, item_embed_model

model, user_embed_model, item_embed_model = YouTubeNet(user_feature_columns,
                                                       item_feature_columns,
                                                       num_sampled=2,
                                                       user_dnn_hidden_units=[8, 4])
model.summary()

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
hist_item_id (InputLayer)       [(None, 50)]         0                                            
__________________________________________________________________________________________________
hist_len (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
user_id (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
gender (InputLayer)             [(None, 1)]          0                                            
____________________________________________________________________________________________

In [19]:
train_input = {
    'user_id': np.array(train_df['user_id']),
    'gender': np.array(train_df['gender']),
    'age': np.array(train_df['age']),
    'item_id': np.array(train_df['item_id']),
    'item_cate_id': np.array(train_df['item_cate_id']),
    'item_date': np.array(train_df['item_date']),
    'hist_item_id': np.array([[int(i) for i in s.split(',')] for s in train_df['hist_item_id']]),
    'hist_len': np.array(train_df['hist_len']),
}
test_input = {
    'user_id': np.array(test_df['user_id']),
    'gender': np.array(test_df['gender']),
    'age': np.array(test_df['age']),
    'item_id': np.array(test_df['item_id']),
    'item_cate_id': np.array(test_df['item_cate_id']),
    'item_date': np.array(test_df['item_date']),
    'hist_item_id': np.array([[int(i) for i in s.split(',')] for s in test_df['hist_item_id']]),
    'hist_len': np.array(test_df['hist_len']),
}
# 用户表
user_input = {
    'user_id': np.array(test_df[test_df.label == 1]['user_id']),
    'gender': np.array(test_df[test_df.label == 1]['gender']),
    'age': np.array(test_df[test_df.label == 1]['age']),
    'hist_item_id': np.array([[int(i) for i in s.split(',')] for s in test_df[test_df.label == 1]['hist_item_id']]),
    'hist_len': np.array(test_df[test_df.label == 1]['hist_len']),
}
# 物品表
item_df = data[['item_id', 'item_cate_id', 'item_date']].drop_duplicates(['item_id'])
item_input = {
    'item_id': np.array(item_df['item_id']),
    'item_cate_id': np.array(item_df['item_cate_id']),
    'item_date': np.array(item_df['item_date']),
}

In [26]:
# 模型训练
my_callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, verbose=2, mode='auto')
]

def sampled_softmax_loss(y_true, y_pred):
    return K.mean(y_pred)

model.compile('adam',
              loss=sampled_softmax_loss)

model.fit(train_input,
          train_df['label'].values,
          batch_size=1024,
          epochs=100,
          validation_split=0.2,
          callbacks=my_callbacks)

Epoch 1/100


  [n for n in tensors.keys() if n not in ref_input_names])


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 00020: early stopping


<keras.callbacks.History at 0x257499d04a8>

In [27]:
# 模型预测与评估
result = model.predict(test_input)
model_metric(np.array([i[0] for i in result]).astype(np.float64), test_df['label'].values)

模型准确率:0.4611690677264448, AUC得分:0.4539537125797104, LogLoss:4.179249416828256
              precision    recall  f1-score   support

           0       0.47      0.69      0.56      6038
           1       0.43      0.24      0.30      6040

    accuracy                           0.46     12078
   macro avg       0.45      0.46      0.43     12078
weighted avg       0.45      0.46      0.43     12078



##### Embedding召回

① 提取user和item embedding向量。

② 构建faiss索引求用户TopN相似物品。

③ 评估召回率和hit rate。

④ 保存用户、物品向量到本地。

In [28]:
def get_recall(true_y, pred_y, top_n=50):
    """ 召回率 """
    return len(set(pred_y[:top_n])&set(true_y)) * 1.0 / len(true_y)

# 1. 提取embedding向量。
user_embeddings = user_embed_model.predict(user_input, batch_size=2**12)
item_embeddings = item_embed_model.predict(item_input, batch_size=2**12)
test_user_item_dict = test_df[test_df.label == 1][['user_id', 'item_id']].set_index('user_id').item_id.to_dict()

# 2. faiss求TopN相似物品。
embedding_size = 4
index = faiss.IndexFlatIP(embedding_size)
index.add(item_embeddings)
D, I = index.search(np.ascontiguousarray(user_embeddings), 50)

# 3. 评估召回率和hit rate。
hit = 0
recall_list = []
for i, uid in tqdm(enumerate(user_input['user_id'])):
    preds = [item_df['item_id'].values[j] for j in I[i]]
    recall = get_recall([test_user_item_dict[uid]], preds, top_n=50)
    recall_list.append(recall)
    if test_user_item_dict[uid] in preds:
        hit += 1

print('recall: ', np.mean(recall_list))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


0it [00:00, ?it/s]

recall:  0.075
