In [1]:
import os
import faiss
import numpy as np
import pandas as pd
from collections import namedtuple
from tqdm import tqdm_notebook as tqdm

import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.python.keras.callbacks import EarlyStopping
from sklearn.preprocessing import  MinMaxScaler, LabelEncoder

from sklearn import metrics
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report, roc_curve

import warnings
warnings.filterwarnings("ignore")

##### 获取数据

In [2]:
train_df = pd.read_csv('../data/ml-1m/train_df.csv')
test_df = pd.read_csv('../data/ml-1m/test_df.csv')
train_df = train_df.sample(frac=1.0)
data = train_df.append(test_df)
print('train.shape: {}, test.shape: {}'.format(train_df.shape, test_df.shape))
data.head()

train.shape: (1956191, 14), test.shape: (12078, 14)


Unnamed: 0,user_id,hist_item_id,hist_s1,hist_s2,item_id,label,rating,click_timestamp,hist_len,gender,age,item_date,item_title,item_cate_id
597744,3547,"2289,1029,791,1172,1034,605,1516,2531,2694,117...","2234,1961,124,757,1252,2817,2056,1898,3187,226...","45,3431,535,263,1570,2087,1577,1783,2268,1583,...",506,1,4,966836312,50,1,4,74,971260000000000000,8
1903300,4551,"901,1210,1010,258,3100,1576,3530,3730,3503,351...","2624,438,1844,3178,161,498,1389,344,2632,3358,...","605,2262,1671,189,304,25,3660,1041,2371,316,0,...",1776,1,4,964559469,35,1,2,78,6718770000000000000,8
1084615,3311,"1272,3725,225,2126,589,1569,2844,3510,1205,111...","350,1035,2917,1406,3232,1547,10,1933,1798,2335...","2695,374,3696,2338,1350,1702,3636,1036,1733,34...",2341,1,2,967955234,50,1,3,60,2952150000000000000,1
1398942,5782,"1246,1999,1016,505,1109,1283,2284,1020,2729,28...","353,150,583,280,1177,2069,1059,3353,2459,1945,...","1929,584,2450,1893,1227,316,34,62,294,3184,0,0...",412,0,0,0,50,2,4,74,8268270000000000000,5
788932,4444,"2615,2631,2639,2937,2642,2702,2703,2948,2365,2...","2695,2919,258,1179,1181,1240,587,1180,2803,119...","2513,2434,3151,2530,1479,2624,2327,2328,2694,2...",346,1,2,965090434,50,2,3,75,7111971271300000000000,1


##### 模型构建

用户侧

- 用户ID
- 性别
- 年龄
- 序列长度
- 序列物品ID

物品侧

- 物品ID
- 物品类别ID

提取预训练embedding向量，例如提取user侧embedding向量。

1. 构建一个完整的模型结构。
2. 将用户/物品侧输入和输出张量作为模型属性。
3. 实例化一个新Model，选取用户输入输出，输入为用户Input层特征，输出为用户embedding向量。

tf.clip_by_value(t, min, max) 将张量t的元素范围限制在[min, max]之间。

In [3]:
SparseFeature = namedtuple('SparseFeature', ['name', 'vocabulary_size', 'embedding_size'])
DenseFeature = namedtuple('DenseFeature', ['name', 'dimension'])
VarLenSparseFeature = namedtuple('VarLenSparseFeature', ['name', 'vocabulary_size', 'embedding_size', 'maxlen'])

user_feature_columns = [
    SparseFeature('user_id', data.user_id.max()+1, embedding_size=4),
    SparseFeature('gender', data.gender.max()+1, embedding_size=4),
    SparseFeature('age', data.age.max()+1, embedding_size=4),
    DenseFeature('hist_len', 1),
    VarLenSparseFeature('hist_item_id', data.item_id.max()+1, embedding_size=4, maxlen=50)
]
item_feature_columns = [
    SparseFeature('item_id', data.item_id.max()+1, embedding_size=4),
    SparseFeature('item_cate_id', data.item_cate_id.max()+1, embedding_size=4),
    #DenseFeature('item_date', 1),
]
user_feature_columns, item_feature_columns

([SparseFeature(name='user_id', vocabulary_size=6041, embedding_size=4),
  SparseFeature(name='gender', vocabulary_size=3, embedding_size=4),
  SparseFeature(name='age', vocabulary_size=8, embedding_size=4),
  DenseFeature(name='hist_len', dimension=1),
  VarLenSparseFeature(name='hist_item_id', vocabulary_size=3884, embedding_size=4, maxlen=50)],
 [SparseFeature(name='item_id', vocabulary_size=3884, embedding_size=4),
  SparseFeature(name='item_cate_id', vocabulary_size=19, embedding_size=4)])

In [4]:
def model_metric(prob, label, thr=0.5):
    """ 模型评估 """
    # AUC
    fpr, tpr, threshold = metrics.roc_curve(label, prob)
    auc = metrics.auc(fpr, tpr)
    score = metrics.accuracy_score(label, prob > thr)
    # LogLoss
    logloss = log_loss(label, prob)
    print('模型准确率:{}, AUC得分:{}, LogLoss:{}'.format(score, auc, logloss))
    print(classification_report(label, prob > thr, digits=2))
    print('==========================================================')

def build_input_layers(feature_columns):
    """ 构建输入层 """
    dense_input_dict, sparse_input_dict, varlen_sparse_input_dict = {}, {}, {}
    for f in feature_columns:
        if isinstance(f, DenseFeature):
            dense_input_dict[f.name] = Input(shape=(f.dimension, ), name=f.name)
        elif isinstance(f, SparseFeature):
            sparse_input_dict[f.name] = Input(shape=(1, ), name=f.name)
        elif isinstance(f, VarLenSparseFeature):
            varlen_sparse_input_dict[f.name] = Input(shape=(f.maxlen, ), name=f.name)
    return dense_input_dict, sparse_input_dict, varlen_sparse_input_dict

def concat_input_list(input_list):
    """ 合并input列表 """
    _num = len(input_list)
    if _num > 1:
        return Concatenate(axis=1)(input_list)
    elif len(input_list) == 1:
        return input_list[0]
    else:
        return None

def build_embedding_layers(feature_columns):
    """ 构建embedding层 """
    embedding_layer_dict = {}
    for f in feature_columns:
        if isinstance(f, SparseFeature):
            embedding_layer_dict[f.name] = Embedding(f.vocabulary_size+1, f.embedding_size, name='emb_' + f.name)
        elif isinstance(f, VarLenSparseFeature):
            embedding_layer_dict[f.name] = Embedding(f.vocabulary_size+1, f.embedding_size, name='var_emb_' + f.name, mask_zero=True)
    return embedding_layer_dict
    
def embedding_lookup(columns, input_dict, embedding_layer_dict, flatten=False):
    """ 根据feature_columns或column_names查表，得到对应embedding向量列表 """
    embedding_list = []
    for f in columns:
        if type(f) == str:
            column_name = f
        else:
            column_name = f.name
        _input = input_dict[column_name]
        _embed = embedding_layer_dict[column_name]
        embed_layer = _embed(_input)
        if flatten:
            embed_layer = Flatten()(embed_layer)
        embedding_list.append(embed_layer)
    return embedding_list
    
def get_dnn(dnn_input, hidden_units=[64, 32], activation='relu', l2=0.01):
    print('hidden_units: ', hidden_units)
    dnn_list = [Dense(unit, activation=activation, kernel_regularizer=tf.keras.regularizers.l2(l2=l2)) for unit in hidden_units]
    dnn_out = dnn_input
    for dnn in dnn_list:
        dnn_out = Dropout(0.5)(dnn(dnn_out)) # dnn(dnn_out)
    return dnn_out

class Similarity(Layer):
    def __init__(self, gamma=1, similarity_type='cosine'):
        super(Similarity, self).__init__()
        self.gamma = 1
        self.similarity_type = similarity_type
        
    def call(self, inputs):
        query, candidate = inputs
        if self.similarity_type == 'cosine':
            score = tf.reduce_sum(tf.multiply(query, candidate), -1, keepdims=True) # 点积
            norm_query = tf.norm(query, axis=-1, keepdims=True) 
            norm_candidate = tf.norm(candidate, axis=-1, keepdims=True)
            score = tf.divide(score, norm_query * norm_candidate + 1e-8)
        elif self.similarity_type == 'inner_product':
            score = tf.reduce_sum(tf.multiply(query, candidate), -1, keepdims=True) # 点积
        else:
            raise Exception("similarity_type error")
        return score
    
    def compute_output_shape(self, input_shape):
        return (None, 1)
    
def DSSM(user_feature_columns,
         item_feature_columns,
         user_dnn_hidden_units=[8, 4],
         item_dnn_hidden_units=[8, 4],
         dnn_activation='relu',
         l2=0.01, dnn_dropout=0.5):
    """ Deep Structured Semantic Model """
    user_dense_input_dict, user_sparse_input_dict, _ = build_input_layers(user_feature_columns)
    item_dense_input_dict, item_sparse_input_dict, _ = build_input_layers(item_feature_columns)
    
    # user/item Input
    user_input_list = list(user_dense_input_dict.values()) + list(user_sparse_input_dict.values())
    item_input_list = list(item_dense_input_dict.values()) + list(item_sparse_input_dict.values())

    # 用户侧 concat(dense feature: input + sparse feature: input->flatten embed->concat) =》DNN =》user_embedding
    user_dense_input_list = list(user_dense_input_dict.values())
    user_embedding_layer_dict = build_embedding_layers(user_feature_columns)
    user_sparse_feature_columns = list(filter(lambda x: isinstance(x, SparseFeature), user_feature_columns))
    flatten_user_sparse_embed_list = embedding_lookup(user_sparse_feature_columns, user_sparse_input_dict, user_embedding_layer_dict, flatten=True)
    user_dnn_input = concat_input_list(user_dense_input_list + flatten_user_sparse_embed_list)
    user_dnn_out = get_dnn(user_dnn_input, hidden_units=user_dnn_hidden_units)
    
    # 物品侧 concat(dense feature: input + sparse feature: input->flatten embed->concat) =》DNN =》item_embedding
    item_dense_input_list = list(item_dense_input_dict.values())
    item_embedding_layer_dict = build_embedding_layers(item_feature_columns)
    item_sparse_feature_columns = list(filter(lambda x: isinstance(x, SparseFeature), item_feature_columns))
    flatten_item_sparse_embed_list = embedding_lookup(item_sparse_feature_columns, item_sparse_input_dict, item_embedding_layer_dict, flatten=True)
    item_dnn_input = concat_input_list(item_dense_input_list + flatten_item_sparse_embed_list)
    item_dnn_out = get_dnn(item_dnn_input, hidden_units=item_dnn_hidden_units)
    
    # cosine inner_product
    score = Similarity()([user_dnn_out, item_dnn_out]) # (None, 1)
    
    output_layer = Dense(1, activation='sigmoid')(score)
    model = Model(user_input_list+item_input_list, output_layer)
    
    model.__setattr__("user_input", user_input_list)
    model.__setattr__("item_input", item_input_list)
    model.__setattr__("user_embedding", user_dnn_out)
    model.__setattr__("item_embedding", item_dnn_out)
    
    user_embed_model = Model(inputs=model.user_input, outputs=model.user_embedding)
    item_embed_model = Model(inputs=model.item_input, outputs=model.item_embedding)
    return model, user_embed_model, item_embed_model

model, user_embed_model, item_embed_model = DSSM(user_feature_columns,
                                                 item_feature_columns, 
                                                 user_dnn_hidden_units=[4],
                                                 item_dnn_hidden_units=[4])
model.summary()

hidden_units:  [4]
hidden_units:  [4]
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_id (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
gender (InputLayer)             [(None, 1)]          0                                            
__________________________________________________________________________________________________
age (InputLayer)                [(None, 1)]          0                                            
__________________________________________________________________________________________________
item_id (InputLayer)            [(None, 1)]          0                                            
________________________________________________________

##### 模型训练

In [15]:
train_input = {
    'user_id': np.array(train_df['user_id']),
    'gender': np.array(train_df['gender']),
    'age': np.array(train_df['age']),
    'item_id': np.array(train_df['item_id']),
    'item_cate_id': np.array(train_df['item_cate_id']),
    'item_date': np.array(train_df['item_date']),
    'hist_item_id': np.array([[int(i) for i in s.split(',')] for s in train_df['hist_item_id']]),
    'hist_len': np.array(train_df['hist_len']),
}
test_input = {
    'user_id': np.array(test_df['user_id']),
    'gender': np.array(test_df['gender']),
    'age': np.array(test_df['age']),
    'item_id': np.array(test_df['item_id']),
    'item_cate_id': np.array(test_df['item_cate_id']),
    'item_date': np.array(test_df['item_date']),
    'hist_item_id': np.array([[int(i) for i in s.split(',')] for s in test_df['hist_item_id']]),
    'hist_len': np.array(test_df['hist_len']),
}
# 用户表
user_input = {
    'user_id': np.array(test_df[test_df.label == 1]['user_id']),
    'gender': np.array(test_df[test_df.label == 1]['gender']),
    'age': np.array(test_df[test_df.label == 1]['age']),
    'hist_item_id': np.array([[int(i) for i in s.split(',')] for s in test_df[test_df.label == 1]['hist_item_id']]),
    'hist_len': np.array(test_df[test_df.label == 1]['hist_len']),
}
# 物品表
item_df = data[['item_id', 'item_cate_id', 'item_date']].drop_duplicates(['item_id'])
item_input = {
    'item_id': np.array(item_df['item_id']),
    'item_cate_id': np.array(item_df['item_cate_id']),
    'item_date': np.array(item_df['item_date']),
}

In [7]:
# 模型训练
my_callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, verbose=2, mode='auto')
]
model.compile('adam',
              loss='binary_crossentropy',
              metrics=["binary_crossentropy", tf.keras.metrics.AUC(name='auc')])
model.fit(train_input,
          train_df['label'].values,
          batch_size=1024,
          epochs=100,
          validation_split=0.2,
          callbacks=my_callbacks)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 00029: early stopping


<keras.callbacks.History at 0x138e1586668>

In [8]:
# 模型预测与评估
result = model.predict(test_input)
model_metric(np.array([i[0] for i in result]), test_df['label'].values)

模型准确率:0.7321576419937076, AUC得分:0.7706611301711677, LogLoss:0.5971096853772957
              precision    recall  f1-score   support

           0       0.72      0.76      0.74      6038
           1       0.74      0.71      0.73      6040

    accuracy                           0.73     12078
   macro avg       0.73      0.73      0.73     12078
weighted avg       0.73      0.73      0.73     12078



##### Embedding召回

① 提取user和item embedding向量。

② 构建faiss索引求用户TopN相似物品。

③ 评估召回率和hit rate。

④ 保存用户、物品向量到本地。

In [16]:
def get_recall(true_y, pred_y, top_n=50):
    """ 召回率 """
    return len(set(pred_y[:top_n])&set(true_y)) * 1.0 / len(true_y)

# 1. 提取embedding向量。
user_embeddings = user_embed_model.predict(user_input, batch_size=2**12)
item_embeddings = item_embed_model.predict(item_input, batch_size=2**12)
test_user_item_dict = test_df[test_df.label == 1][['user_id', 'item_id']].set_index('user_id').item_id.to_dict()

# 2. faiss求TopN相似物品。
embedding_size = 4
index = faiss.IndexFlatIP(embedding_size)
index.add(item_embeddings)
D, I = index.search(np.ascontiguousarray(user_embeddings), 50)

# 3. 评估召回率和hit rate。
hit = 0
recall_list = []
for i, uid in tqdm(enumerate(user_input['user_id'])):
    preds = [item_df['item_id'].values[j] for j in I[i]]
    recall = get_recall([test_user_item_dict[uid]], preds, top_n=50)
    recall_list.append(recall)
    if test_user_item_dict[uid] in preds:
        hit += 1

print('recall: ', np.mean(recall_list))

0it [00:00, ?it/s]

recall:  0.10231788079470198


##### 模型评估
recall:  0.10233482364629906