## NCF

In [1]:
import os
import faiss
import joblib
import numpy as np
import pandas as pd
from collections import namedtuple
from tqdm import tqdm_notebook as tqdm

import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.python.keras.callbacks import EarlyStopping
from sklearn.preprocessing import  MinMaxScaler, LabelEncoder

from sklearn import metrics
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report, roc_curve

##### 获取数据

In [2]:
def get_ml1m_data():
    """ 读取ml1m数据集 """
    train_path = '../data/ml-1m/train_df.csv'
    test_path = '../data/ml-1m/test_df.csv'
    encoder_dict_path = '../data/ml-1m/encoder_dict.pkl'
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    encoder_dict = joblib.load(encoder_dict_path)
    return train_df, test_df, encoder_dict

def parse_strlist(x):
    return [int(i) for i in x.split(',') if i.strip() != '']

train_df, test_df, encoder_dict = get_ml1m_data()
train_df = train_df.sample(frac=1.0)
data = pd.concat([train_df, test_df], axis=0)
print('train_df.shape: {}, test_df.shape: {}'.format(train_df.shape, test_df.shape))

train_df.shape: (1956191, 14), test_df.shape: (12078, 14)


##### 模型构建

In [3]:
def get_embeding_dim(x):
    return int(x**.25) + 1

SparseFeature = namedtuple('SparseFeature', ['name', 'vocabulary_size', 'embedding_size'])
DenseFeature = namedtuple('DenseFeature', ['name', 'dimension'])

user_feature_columns = [
    SparseFeature('user_id', data.user_id.max()+1, embedding_size=8),
]
item_feature_columns = [
    SparseFeature('item_id', data.item_id.max()+1, embedding_size=8),
]
user_feature_columns, item_feature_columns

([SparseFeature(name='user_id', vocabulary_size=6041, embedding_size=8)],
 [SparseFeature(name='item_id', vocabulary_size=3884, embedding_size=8)])

In [4]:
def model_metric(prob, label, thr=0.5):
    """ 模型评估 """
    # AUC
    fpr, tpr, threshold = metrics.roc_curve(label, prob)
    auc = metrics.auc(fpr, tpr)
    score = metrics.accuracy_score(label, prob > thr)
    # LogLoss
    logloss = log_loss(label, prob)
    print('模型准确率:{}, AUC得分:{}, LogLoss:{}'.format(score, auc, logloss))
    print(classification_report(label, prob > thr, digits=2))
    print('==========================================================')
    
def build_input_layers(feature_columns):
    """ 构建输入层 """
    dense_input_dict, sparse_input_dict, varlen_sparse_input_dict = {}, {}, {}
    for f in feature_columns:
        if isinstance(f, DenseFeature):
            dense_input_dict[f.name] = Input(shape=(f.dimension, ), name=f.name)
        elif isinstance(f, SparseFeature):
            sparse_input_dict[f.name] = Input(shape=(1, ), name=f.name)
        elif isinstance(f, VarLenSparseFeature):
            varlen_sparse_input_dict[f.name] = Input(shape=(f.maxlen, ), name=f.name)
    return dense_input_dict, sparse_input_dict, varlen_sparse_input_dict
    
def build_embedding_layers(feature_columns, prefix='sparse_'):
    """ 构建embedding层 """
    embedding_layer_dict = {}
    for f in feature_columns:
        if isinstance(f, SparseFeature):
            embedding_layer_dict[f.name] = Embedding(f.vocabulary_size+1, f.embedding_size, name=prefix + 'emb_' + f.name)
        elif isinstance(f, VarLenSparseFeature):
            embedding_layer_dict[f.name] = Embedding(f.vocabulary_size+1, f.embedding_size, name=prefix + 'var_emb_' + f.name, mask_zero=True)
    return embedding_layer_dict
    
def embedding_lookup(columns, input_dict, embedding_layer_dict, flatten=False):
    """ 根据feature_columns或column_names查表，得到对应embedding向量列表 """
    embedding_list = []
    for f in columns:
        if type(f) == str:
            column_name = f
        else:
            column_name = f.name
        _input = input_dict[column_name]
        _embed = embedding_layer_dict[column_name]
        embed_layer = _embed(_input)
        if flatten:
            embed_layer = Flatten()(embed_layer)
        embedding_list.append(embed_layer)
    return embedding_list
  
def get_dnn(dnn_input, hidden_units=[64, 32], activation='relu', l2=0.01):
    print('hidden_units: ', hidden_units)
    dnn_list = [Dense(unit, activation=activation, kernel_regularizer=tf.keras.regularizers.l2(l2=l2)) for unit in hidden_units]
    dnn_out = dnn_input
    for dnn in dnn_list:
        dnn_out = Dropout(0.5)(dnn(dnn_out)) # dnn(dnn_out)
    return dnn_out
    
def NCF(user_feature_columns,
        item_feature_columns, 
        hidden_units):
    user_dense_input_dict, user_sparse_input_dict, _ = build_input_layers(user_feature_columns)
    item_dense_input_dict, item_sparse_input_dict, _ = build_input_layers(item_feature_columns)
    
    # user/item Input
    user_input_list = list(user_sparse_input_dict.values())#list(gmf_user_sparse_input_dict.values()) + list(mlp_user_sparse_input_dict.values())
    item_input_list = list(item_sparse_input_dict.values())#list(gmf_item_sparse_input_dict.values()) + list(mlp_item_sparse_input_dict.values())

    gmf_user_embedding_layer_dict = build_embedding_layers(user_feature_columns, prefix='gmf')
    gmf_item_embedding_layer_dict = build_embedding_layers(item_feature_columns, prefix='gmf')
    print(user_sparse_input_dict, gmf_user_embedding_layer_dict)
    gmf_user_sparse_embed_list = embedding_lookup(user_feature_columns, user_sparse_input_dict, gmf_user_embedding_layer_dict, flatten=True)
    gmf_item_sparse_embed_list = embedding_lookup(item_feature_columns, item_sparse_input_dict, gmf_item_embedding_layer_dict, flatten=True)
    
    
    mlp_user_embedding_layer_dict = build_embedding_layers(user_feature_columns, prefix='mlp')
    mlp_item_embedding_layer_dict = build_embedding_layers(item_feature_columns, prefix='mlp')
        
    mlp_user_sparse_embed_list = embedding_lookup(user_feature_columns, user_sparse_input_dict, mlp_user_embedding_layer_dict, flatten=True)
    mlp_item_sparse_embed_list = embedding_lookup(item_feature_columns, item_sparse_input_dict, mlp_item_embedding_layer_dict, flatten=True)
    
    # GMF
    gmf_output = gmf_user_sparse_embed_list[0] * gmf_item_sparse_embed_list[0]
    
    # MLP
    dnn_input = Concatenate(axis=1)(mlp_user_sparse_embed_list+mlp_item_sparse_embed_list)
    dnn_output = get_dnn(dnn_input, hidden_units=hidden_units)
    
    print('dnn_output.shape: ', dnn_output.shape)
    print('gmf_output.shape: ', gmf_output.shape)
    concate_mlp_gmf = Concatenate(axis=1)([dnn_output, gmf_output])
    
    output_layer = Dense(1, activation='sigmoid')(concate_mlp_gmf)
    
    model = Model(user_input_list+item_input_list, output_layer)
    
    model.__setattr__("user_input", user_input_list)
    model.__setattr__("item_input", item_input_list)
    model.__setattr__("user_embedding", mlp_user_sparse_embed_list[0])
    model.__setattr__("item_embedding", mlp_item_sparse_embed_list[0])
    return model

model = NCF(user_feature_columns,
            item_feature_columns,  
            hidden_units=[8])
model.summary()

{'user_id': <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'user_id')>} {'user_id': <keras.layers.embeddings.Embedding object at 0x000001CDE2D50BE0>}
hidden_units:  [8]
dnn_output.shape:  (None, 8)
gmf_output.shape:  (None, 8)
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_id (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
item_id (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
mlpemb_user_id (Embedding)      (None, 1, 8)         48336       user_id[0][0]                    
__________________________________________________

In [6]:
train_input = {
    'user_id': np.array(train_df['user_id']),
    'item_id': np.array(train_df['item_id'])
}
test_input = {
    'user_id': np.array(test_df['user_id']),
    'item_id': np.array(test_df['item_id'])
}
# 物品表
item_df = data[['item_id', 'item_cate_id']].drop_duplicates(['item_id'])
item_input = {
    'item_id': np.array(item_df['item_id']),
    'item_cate_id': np.array(item_df['item_cate_id']),
}

# 模型训练
my_callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, verbose=2, mode='auto')
]
model.compile('adam',
              loss='binary_crossentropy',
              metrics=["binary_crossentropy", tf.keras.metrics.AUC(name='auc')])
model.fit(train_input,
          train_df['label'].values,
          batch_size=1024,
          epochs=100,
          validation_split=0.2,
          callbacks=my_callbacks)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 00012: early stopping


<keras.callbacks.History at 0x1cde35f99e8>

In [7]:
# 模型预测与评估
result = model.predict(test_input)
model_metric(np.array([i[0] for i in result]).astype(np.float64), test_df['label'].values)

模型准确率:0.7977314124855108, AUC得分:0.877665595818097, LogLoss:0.4834402552561367
              precision    recall  f1-score   support

           0       0.79      0.82      0.80      6038
           1       0.81      0.78      0.79      6040

    accuracy                           0.80     12078
   macro avg       0.80      0.80      0.80     12078
weighted avg       0.80      0.80      0.80     12078

