## ESMM

In [2]:
import os
import joblib
import numpy as np
import pandas as pd
from collections import namedtuple

import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
import tensorflow.keras.backend as K
from tensorflow.python.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from sklearn.preprocessing import  MinMaxScaler, LabelEncoder

from sklearn import metrics
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report, roc_curve

##### 获取数据

In [3]:
def get_adult_data():
    """ 读取adult数据集 """
    column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation',
                        'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
                        'income_50k']
    train_df = pd.read_csv(
        '../data/adult/adult.data',
        delimiter=',',
        header=None,
        index_col=None,
        names=column_names
    )
    test_df = pd.read_csv(
        '../data/adult/adult.test',
        delimiter=',',
        header=None,
        index_col=None,
        names=column_names
    )
    train_df.dropna(inplace=True)
    test_df.dropna(inplace=True)
    test_df['income_50k'] = test_df['income_50k'].apply(lambda x: x[:-1])
    return train_df, test_df
    
train_df, test_df = get_adult_data()
print('train_df.shape: {}, test_df.shape: {}'.format(train_df.shape, test_df.shape))

train_df.shape: (32561, 15), test_df.shape: (16281, 15)


##### 数据预处理

In [4]:
data = pd.concat([train_df, test_df], axis=0)
data['income_50k'] = data['income_50k'].apply(lambda x: 0 if x == ' <=50K' else 1)
data['marital_status'] = data['marital_status'].apply(lambda x: 0 if x == ' Never-married' else 1)

tasks = ['income_50k', 'marital_status']
dense_column_names =['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
sparse_column_names = ['workclass', 'education', 'occupation', 'relationship', 'race', 'sex', 'native_country']

def data_processing(df, dense_column_names, sparse_column_names, tasks):
    df[dense_column_names] = df[dense_column_names].fillna(0.0)
    for f in dense_column_names:
        df[f] = df[f].apply(lambda x: float(x))
        df[f] = df[f].apply(lambda x: np.log(x+1) if x > -1 else -1)
    
    df[sparse_column_names] = df[sparse_column_names].fillna("-1")
    for f in sparse_column_names:
        lbe = LabelEncoder()
        df[f] = lbe.fit_transform(df[f])
    return df[dense_column_names + sparse_column_names + tasks]

df = data_processing(data, dense_column_names, sparse_column_names, tasks)
train_df, test_df = df.iloc[0:train_df.shape[0]], df.iloc[train_df.shape[0]:]
print('train_df.shape:{}, test_df.shape:{}'.format(train_df.shape, test_df.shape))

train_df.shape:(32561, 15), test_df.shape:(16281, 15)


##### 模型构建

In [5]:
SparseFeature = namedtuple('SparseFeature', ['name', 'vocabulary_size', 'embedding_size'])
DenseFeature = namedtuple('DenseFeature', ['name', 'dimension'])
VarLenSparseFeature = namedtuple('VarLenSparseFeature', ['name', 'vocabulary_size', 'embedding_size', 'maxlen'])

feature_columns = [SparseFeature(f, vocabulary_size=df[f].nunique(), embedding_size=4) for f in sparse_column_names] + \
[DenseFeature(f, 1) for f in dense_column_names]
# 模拟用户侧特征和物品侧特征
user_feature_columns = [feature_columns[i] for i in range(len(feature_columns)) if i % 2 == 0]
item_feature_columns = [feature_columns[i] for i in range(len(feature_columns)) if i % 2 == 1]
user_feature_columns, item_feature_columns

([SparseFeature(name='workclass', vocabulary_size=9, embedding_size=4),
  SparseFeature(name='occupation', vocabulary_size=15, embedding_size=4),
  SparseFeature(name='race', vocabulary_size=5, embedding_size=4),
  SparseFeature(name='native_country', vocabulary_size=42, embedding_size=4),
  DenseFeature(name='fnlwgt', dimension=1),
  DenseFeature(name='capital_gain', dimension=1),
  DenseFeature(name='hours_per_week', dimension=1)],
 [SparseFeature(name='education', vocabulary_size=16, embedding_size=4),
  SparseFeature(name='relationship', vocabulary_size=6, embedding_size=4),
  SparseFeature(name='sex', vocabulary_size=2, embedding_size=4),
  DenseFeature(name='age', dimension=1),
  DenseFeature(name='education_num', dimension=1),
  DenseFeature(name='capital_loss', dimension=1)])

In [6]:
def model_metric(prob, label, thr=0.5):
    """ 模型评估 """
    # AUC
    fpr, tpr, threshold = metrics.roc_curve(label, prob)
    auc = metrics.auc(fpr, tpr)
    score = metrics.accuracy_score(label, prob > thr)
    # LogLoss
    logloss = log_loss(label, prob)
    print('模型准确率:{}, AUC得分:{}, LogLoss:{}'.format(score, auc, logloss))
    print(classification_report(label, prob > thr, digits=2))
    print('==========================================================')

def build_input_layers(feature_columns):
    """ 构建输入层 """
    dense_input_dict, sparse_input_dict, varlen_sparse_input_dict = {}, {}, {}
    for f in feature_columns:
        if isinstance(f, DenseFeature):
            dense_input_dict[f.name] = Input(shape=(f.dimension, ), name=f.name)
        elif isinstance(f, SparseFeature):
            sparse_input_dict[f.name] = Input(shape=(1, ), name=f.name)
        elif isinstance(f, VarLenSparseFeature):
            varlen_sparse_input_dict[f.name] = Input(shape=(f.maxlen, ), name=f.name)
    return dense_input_dict, sparse_input_dict, varlen_sparse_input_dict

def build_embedding_layers(feature_columns):
    """ 构建embedding层 """
    embedding_layer_dict = {}
    for f in feature_columns:
        if isinstance(f, SparseFeature):
            embedding_layer_dict[f.name] = Embedding(f.vocabulary_size+1, f.embedding_size, name='emb_' + f.name)
        elif isinstance(f, VarLenSparseFeature):
            embedding_layer_dict[f.name] = Embedding(f.vocabulary_size+1, f.embedding_size, name='var_emb_' + f.name, mask_zero=True)
    return embedding_layer_dict

def embedding_lookup(columns, input_dict, embedding_layer_dict, flatten=False):
    """ 根据feature_columns或column_names查表，得到对应embedding向量列表 """
    embedding_list = []
    for f in columns:
        if type(f) == str:
            column_name = f
        else:
            column_name = f.name
        _input = input_dict[column_name]
        _embed = embedding_layer_dict[column_name]
        embed_layer = _embed(_input)
        if flatten:
            embed_layer = Flatten()(embed_layer)
        embedding_list.append(embed_layer)
    return embedding_list

def concat_input_list(input_list):
    """ 合并input列表 """
    _num = len(input_list)
    if _num > 1:
        return Concatenate(axis=1)(input_list)
    elif len(input_list) == 1:
        return input_list[0]
    else:
        return None

def get_dnn(dnn_input, hidden_units=[64, 32], activation='relu', l2=0.01, task_name='ctr_output'):
    dnn_out = dnn_input
    dnn_out = Dropout(0.5)(dnn_out)
    dnn_out = BatchNormalization()(dnn_out)
    
    print('hidden_units: ', hidden_units)
    dnn_list = [Dense(unit, activation=activation, kernel_regularizer=tf.keras.regularizers.l2(l2=l2)) for unit in hidden_units]
    for dnn in dnn_list:
        dnn_out = dnn(dnn_out)#Dropout(0.5)(dnn(dnn_out))
    dnn_out = Dense(1, activation='sigmoid', name=task_name)(dnn_out)
    return dnn_out
    
def ESMM(user_feature_columns,
         item_feature_columns,
         tasks,
         hidden_units=[128, 64]):
    """ Entire Space Multi-task Model """
    # Input
    user_dense_input_dict, user_sparse_input_dict, _ = build_input_layers(user_feature_columns)
    item_dense_input_dict, item_sparse_input_dict, _ = build_input_layers(item_feature_columns)

    user_input_list = list(user_dense_input_dict.values()) + list(user_sparse_input_dict.values())
    item_input_list = list(item_dense_input_dict.values()) + list(item_sparse_input_dict.values())
    
    # 用户侧
    user_dense_input_list = list(user_dense_input_dict.values())
    user_embedding_layer_dict = build_embedding_layers(user_feature_columns)
    user_sparse_feature_columns = list(filter(lambda x: isinstance(x, SparseFeature), user_feature_columns))
    flatten_user_sparse_embed_list = embedding_lookup(user_sparse_feature_columns, user_sparse_input_dict, user_embedding_layer_dict, flatten=True)
    user_dnn_input = concat_input_list(user_dense_input_list + flatten_user_sparse_embed_list)
    
    # 物品侧
    item_dense_input_list = list(item_dense_input_dict.values())
    item_embedding_layer_dict = build_embedding_layers(item_feature_columns)
    item_sparse_feature_columns = list(filter(lambda x: isinstance(x, SparseFeature), item_feature_columns))
    flatten_item_sparse_embed_list = embedding_lookup(item_sparse_feature_columns, item_sparse_input_dict, item_embedding_layer_dict, flatten=True)
    item_dnn_input = concat_input_list(item_dense_input_list + flatten_item_sparse_embed_list)
    
    # 合并用户侧物品侧向量
    concat_user_item = Concatenate(axis=1)([user_dnn_input, item_dnn_input])
    
    # Ctr Cvr Tower
    ctr_out = get_dnn(concat_user_item, task_name='ctr_output')
    cvr_out = get_dnn(concat_user_item, task_name='cvr_output')
    ctcvr_out = tf.multiply(ctr_out, cvr_out)
    
    model = Model(user_input_list+item_input_list, outputs=[ctr_out, ctcvr_out])
    return model

model= ESMM(user_feature_columns,
            item_feature_columns,
            tasks,
            hidden_units=[128, 64])
model.summary()

hidden_units:  [64, 32]
hidden_units:  [64, 32]
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
workclass (InputLayer)          [(None, 1)]          0                                            
__________________________________________________________________________________________________
occupation (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
race (InputLayer)               [(None, 1)]          0                                            
__________________________________________________________________________________________________
native_country (InputLayer)     [(None, 1)]          0                                            
______________________________________________

In [7]:
train_input = {f: train_df[f] for f in dense_column_names + sparse_column_names}
test_input = {f: test_df[f] for f in dense_column_names + sparse_column_names}

In [8]:
#filepath = "./checkpoints/esmm_best.h5"
my_callbacks = [
    #ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min'),
    ReduceLROnPlateau(monitor='val_loss', factor=0.8, patience=2, min_lr=0.0001, verbose=1),
    EarlyStopping(monitor='val_loss', patience=10, verbose=2, mode='auto')
]
model.compile('adam',
              loss=['binary_crossentropy', 'binary_crossentropy'],
              loss_weights=[1.0, 1.0],
              metrics=["binary_crossentropy", tf.keras.metrics.AUC(name='auc')])

y_list = [train_df[i].values for i in tasks]
model.fit(train_input,
          y_list,
          batch_size=1024,
          epochs=100,
          validation_split=0.2,
          callbacks=my_callbacks)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100


Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100


Epoch 33/100
Epoch 34/100
Epoch 35/100

Epoch 00035: ReduceLROnPlateau reducing learning rate to 0.000800000037997961.
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100

Epoch 00040: ReduceLROnPlateau reducing learning rate to 0.0006400000303983689.
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100

Epoch 00044: ReduceLROnPlateau reducing learning rate to 0.0005120000336319208.
Epoch 45/100
Epoch 46/100
Epoch 47/100



Epoch 00047: ReduceLROnPlateau reducing learning rate to 0.00040960004553198815.
Epoch 48/100
Epoch 49/100

Epoch 00049: ReduceLROnPlateau reducing learning rate to 0.00032768002711236477.
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100

Epoch 00054: ReduceLROnPlateau reducing learning rate to 0.0002621440216898918.
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100

Epoch 00058: ReduceLROnPlateau reducing learning rate to 0.00020971521735191345.
Epoch 59/100
Epoch 60/100

Epoch 00060: ReduceLROnPlateau reducing learning rate to 0.00016777217388153076.
Epoch 61/100


Epoch 62/100

Epoch 00062: ReduceLROnPlateau reducing learning rate to 0.00013421773910522462.
Epoch 63/100
Epoch 64/100

Epoch 00064: ReduceLROnPlateau reducing learning rate to 0.00010737419361248613.
Epoch 65/100
Epoch 66/100

Epoch 00066: ReduceLROnPlateau reducing learning rate to 0.0001.
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100


Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100


Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x216861ea588>

In [12]:
# 模型预测与评估
result = model.predict(test_input)

for idx, target_name in enumerate(tasks):
    print(idx, target_name)
    model_metric(np.array([i[0] for i in result[idx]]), test_df[target_name].values)

0 income_50k
模型准确率:0.7185676555494134, AUC得分:0.8712877007239518, LogLoss:0.5233858462358333
              precision    recall  f1-score   support

           0       0.94      0.67      0.78     12435
           1       0.45      0.87      0.59      3846

    accuracy                           0.72     16281
   macro avg       0.70      0.77      0.69     16281
weighted avg       0.83      0.72      0.74     16281

1 marital_status
模型准确率:0.7853940175664885, AUC得分:0.9182682785716367, LogLoss:0.5038989764330436
              precision    recall  f1-score   support

           0       0.61      0.99      0.76      5434
           1       0.99      0.68      0.81     10847

    accuracy                           0.79     16281
   macro avg       0.80      0.84      0.78     16281
weighted avg       0.87      0.79      0.79     16281

