## AFM

In [9]:
import os
import itertools
import numpy as np
import pandas as pd
from collections import namedtuple

import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.python.keras.callbacks import EarlyStopping
from sklearn.preprocessing import  MinMaxScaler, LabelEncoder

from sklearn import metrics
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report, roc_curve

##### 获取数据

In [2]:
def get_criteo_data():
    """ 读取criteo数据集 """
    train_path = '../data/criteo/train.txt'
    test_path = '../data/criteo/test.txt'

    dense_column_names = ['I' + str(i) for i in range(1, 14)]
    sparse_column_names = ['C' + str(i) for i in range(14, 40)]
    column_names = ['label'] + dense_column_names + sparse_column_names

    train_df = pd.read_csv(train_path, names=column_names, sep='\t')
    test_df = pd.read_csv(test_path, names=column_names, sep='\t')
    return train_df, test_df, dense_column_names, sparse_column_names

train_df, test_df, dense_column_names, sparse_column_names = get_criteo_data()
print('train_df.shape: {}, test_df.shape: {}'.format(train_df.shape, test_df.shape))
print('dense_column_names:{}\nsparse_column_names:{} '.format(dense_column_names, sparse_column_names))

train_df.shape: (1000000, 40), test_df.shape: (1000000, 40)
dense_column_names:['I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'I10', 'I11', 'I12', 'I13']
sparse_column_names:['C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26', 'C27', 'C28', 'C29', 'C30', 'C31', 'C32', 'C33', 'C34', 'C35', 'C36', 'C37', 'C38', 'C39'] 


##### 数据预处理

In [3]:
data = pd.concat([train_df, test_df], axis=0)

def label_encoder(y, frequency_threshold=100):
    """ 字典编码，eg: label_encoder([2,2,3,3,3,5,6], 2) => array([2, 2, 1, 1, 1, 0, 0], dtype=int8)
    :params y 类别特征
    :params frequency_threshold 阈值，低于阈值的类别编码为0
    :return: 编码后的numpy数组
    """
    value_counts = pd.value_counts(y)
    categories = value_counts[value_counts >= frequency_threshold].index.to_numpy()
    #print('categories', categories)
    return pd.Categorical(y, categories=categories).codes+1

def data_processing(df, dense_column_names, sparse_column_names):
    df[dense_column_names] = df[dense_column_names].fillna(0.0)
    for f in dense_column_names:
        df[f] = df[f].apply(lambda x: np.log(x+1) if x > -1 else -1)
    
    df[sparse_column_names] = df[sparse_column_names].fillna("-1")
    for f in sparse_column_names:
        df[f] = label_encoder(df[f])
#         lbe = LabelEncoder()
#         df[f] = lbe.fit_transform(df[f])
    return df[dense_column_names + sparse_column_names]

df = data_processing(data, dense_column_names, sparse_column_names)
df['label'] = data['label']

train_df, test_df = df.iloc[0:train_df.shape[0]], df.iloc[train_df.shape[0]:]
print('train_df.shape:{}, test_df.shape:{}'.format(train_df.shape, test_df.shape))

train_df.shape:(1000000, 40), test_df.shape:(1000000, 40)


##### 模型构建

In [7]:
SparseFeature = namedtuple('SparseFeature', ['name', 'vocabulary_size', 'embedding_size'])
DenseFeature = namedtuple('DenseFeature', ['name', 'dimension'])

# 定义特征列
linear_feature_columns = [SparseFeature(f, vocabulary_size=df[f].nunique(), embedding_size=4) for f in sparse_column_names] + \
[DenseFeature(f, 1,) for f in dense_column_names]

dnn_feature_columns = [SparseFeature(f, vocabulary_size=df[f].nunique(), embedding_size=4) for f in sparse_column_names] + \
[DenseFeature(f, 1,) for f in dense_column_names]

In [10]:
def build_input_layers(feature_columns):
    """ 构建输入层 """
    dense_input_dict, sparse_input_dict = {}, {}
    for f in feature_columns:
        if isinstance(f, DenseFeature):
            dense_input_dict[f.name] = Input(shape=(f.dimension, ), name=f.name)
        elif isinstance(f, SparseFeature):
            sparse_input_dict[f.name] = Input(shape=(1, ), name=f.name)
    return dense_input_dict, sparse_input_dict

def build_embedding_layers(feature_columns, is_linear):
    """ 构建embedding层 """
    embedding_layer_dict = {}
    sparse_feature_columns = list(filter(lambda x: isinstance(x, SparseFeature), feature_columns))
    if is_linear:
        for f in sparse_feature_columns:
            embedding_layer_dict[f.name] = Embedding(f.vocabulary_size+1, 1, name='1d_emb_' + f.name)
    else:
        for f in sparse_feature_columns:
            embedding_layer_dict[f.name] = Embedding(f.vocabulary_size+1, f.embedding_size, name='kd_emb_' + f.name)
    return embedding_layer_dict

def get_linear_logits(dense_input_dict, sparse_input_dict, sparse_feature_columns):
    """ 数值特征拼接一起传入全连接层 + 类别特征onehot，flatten，add """
    concat_dense_inputs = Concatenate(axis=1)(list(dense_input_dict.values()))
    dense_logits_output = Dense(1)(concat_dense_inputs)
    
    embedding_layer_dict = build_embedding_layers(sparse_feature_columns, is_linear=True)
    
    # embedding(input)查表操作，返回对应input的嵌入向量
    sparse_1d_embed_list = []
    for f in sparse_feature_columns:
        _input = sparse_input_dict[f.name]
        _embed = Flatten()(embedding_layer_dict[f.name](_input))
        sparse_1d_embed_list.append(_embed)
    
    sparse_logits_output = Add()(sparse_1d_embed_list)
    linear_logits = Add()([dense_logits_output, sparse_logits_output])
    return linear_logits

class BiInteractionAttentionPooling(Layer):
    """ embedding向量集合两两点乘(对应位置相乘)后进行Attention Pooling操作 """
    def __init__(self):
        super(BiInteractionAttentionPooling, self).__init__()
    
    def build(self, input_shape):
        super(BiInteractionAttentionPooling, self).build(input_shape)
        self.att_dense1 = Dense(8, activation='relu')
        self.att_dense2 = Dense(1)
        
    def call(self, inputs):
        concat_embed_values = inputs
        
        row, col = [], []
        for r, c in itertools.combinations(range(concat_embed_values.shape[1]), 2):
            row.append(r)
            col.append(c)
        p = tf.gather(concat_embed_values, row, axis=1)
        q = tf.gather(concat_embed_values, col, axis=1)
        bi_interaction = p * q # (None, n*(n-1)/2, dim)
        
        # dnn softmax
        dense_out = self.att_dense1(bi_interaction) # (None, n*(n-1)/2, dim)
        dense_out = self.att_dense2(dense_out) # (None, n*(n-1)/2, 1)
        att_score = tf.nn.softmax(dense_out, axis=1) # (None, n*(n-1)/2, 1)
        output = tf.reduce_sum(bi_interaction*att_score, axis=1) # (None, dim)
        print('att output: ', output.shape)
        return output

def build_bi_interaction_pooling_layers(sparse_input_dict, sparse_feature_columns, embedding_layer_dict):
    sparse_kd_embed_list = []
    for f in sparse_feature_columns:
        _input = sparse_input_dict[f.name] 
        _embed = embedding_layer_dict[f.name]
        embed_layer = _embed(_input)
        sparse_kd_embed_list.append(embed_layer)
    #print('sparse_kd_embed_list: ', sparse_kd_embed_list)
    concat_sparse_kd_embed = Concatenate(axis=1)(sparse_kd_embed_list)
    pooling_out = BiInteractionAttentionPooling()(concat_sparse_kd_embed)
    return pooling_out
        
def model_metric(prob, label, thr=0.5):
    """ 模型评估 """
    # AUC
    fpr, tpr, threshold = metrics.roc_curve(label, prob)
    auc = metrics.auc(fpr, tpr)
    score = metrics.accuracy_score(label, prob > thr)
    # LogLoss
    logloss = log_loss(label, prob)
    print('模型准确率:{}, AUC得分:{}, LogLoss:{}'.format(score, auc, logloss))
    print(classification_report(label, prob > thr, digits=2))
    print('==========================================================')
    
def AFM(linear_feature_columns, dnn_feature_columns):
    """ Neural Factorization Machines """
    # Input
    dense_input_dict, sparse_input_dict = build_input_layers(linear_feature_columns + dnn_feature_columns)
    
    # wide侧(linear)
    linear_sparse_feature_columns = list(filter(lambda x: isinstance(x, SparseFeature), linear_feature_columns))
    input_list = list(dense_input_dict.values()) + list(sparse_input_dict.values())
    linear_logits = get_linear_logits(dense_input_dict, sparse_input_dict, linear_sparse_feature_columns)
    
    # deep侧(embedding bi-interaction pooling bn hidden)
    dnn_embedding_layer_dict = build_embedding_layers(dnn_feature_columns, is_linear=False)
    dnn_sparse_feature_columns = list(filter(lambda x: isinstance(x, SparseFeature), dnn_feature_columns))
    pooling_out = build_bi_interaction_pooling_layers(sparse_input_dict, dnn_sparse_feature_columns, dnn_embedding_layer_dict)
    
    # bn
    #bn_pooling_out = BatchNormalization()(pooling_out)
    # hidden
    dnn_out = Dropout(0.5)(Dense(4, activation='relu')(pooling_out))
    #dnn_out = Dropout(0.5)(Dense(512, activation='relu')(dnn_out))
    #dnn_out = Dropout(0.5)(Dense(256, activation='relu')(dnn_out))
    dnn_logits = Dense(1)(dnn_out)
    
    output_logits = Add()([linear_logits, dnn_logits])
    output_layer = Activation("sigmoid")(output_logits)
    model = Model(input_list, output_layer)
    return model

model = AFM(linear_feature_columns, dnn_feature_columns)
model.summary()

att output:  (None, 4)
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
C14 (InputLayer)                [(None, 1)]          0                                            
__________________________________________________________________________________________________
C15 (InputLayer)                [(None, 1)]          0                                            
__________________________________________________________________________________________________
C16 (InputLayer)                [(None, 1)]          0                                            
__________________________________________________________________________________________________
C17 (InputLayer)                [(None, 1)]          0                                            
_______________________________________________________________________

In [11]:
# 模型训练
train_input = {f: train_df[f] for f in dense_column_names + sparse_column_names}
test_input = {f: test_df[f] for f in dense_column_names + sparse_column_names}

my_callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, verbose=2, mode='auto')
]
model.compile('adam',
              loss='binary_crossentropy',
              metrics=["binary_crossentropy", tf.keras.metrics.AUC(name='auc')])
model.fit(train_input,
          train_df['label'].values,
          batch_size=1024,
          epochs=100,
          validation_split=0.2,
          callbacks=my_callbacks)

Epoch 1/100
att output:  (None, 4)
att output:  (None, 4)
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 00016: early stopping


<keras.callbacks.History at 0x2362f777d68>

In [13]:
# 模型预测与评估
result = model.predict(test_input)
model_metric(np.array([i[0] for i in result]).astype(np.float64), test_df['label'].values)

模型准确率:0.783228, AUC得分:0.7724402185324007, LogLoss:0.4691115944759669
              precision    recall  f1-score   support

           0       0.81      0.92      0.87    751819
           1       0.61      0.36      0.45    248181

    accuracy                           0.78   1000000
   macro avg       0.71      0.64      0.66   1000000
weighted avg       0.76      0.78      0.76   1000000

