In [1]:
import os
import numpy as np
import pandas as pd
from collections import namedtuple

import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.python.keras.callbacks import EarlyStopping
from sklearn.preprocessing import  MinMaxScaler, LabelEncoder

from sklearn import metrics
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report, roc_curve

##### 获取数据

In [2]:
def get_criteo_data():
    """ 读取criteo数据集 """
    train_path = '../data/criteo/train.txt'
    test_path = '../data/criteo/test.txt'

    dense_column_names = ['I' + str(i) for i in range(1, 14)]
    sparse_column_names = ['C' + str(i) for i in range(14, 40)]
    column_names = ['label'] + dense_column_names + sparse_column_names

    train_df = pd.read_csv(train_path, names=column_names, sep='\t')
    test_df = pd.read_csv(test_path, names=column_names, sep='\t')
    return train_df, test_df, dense_column_names, sparse_column_names

train_df, test_df, dense_column_names, sparse_column_names = get_criteo_data()
print('train.shape: {}, test.shape: {}'.format(train_df.shape, test_df.shape))

train.shape: (1000000, 40), test.shape: (1000000, 40)


##### 数据预处理

In [8]:
data = pd.concat([train_df, test_df], axis=0)

def data_processing(df, dense_column_names, sparse_column_names):
    """ 数据预处理 """
    df[dense_column_names] = df[dense_column_names].fillna(0.0)
    for f in dense_column_names:
        df[f] = df[f].apply(lambda x: np.log(x+1) if x > -1 else -1)
    
    df[sparse_column_names] = df[sparse_column_names].fillna("-1")
    for f in sparse_column_names:
        lbe = LabelEncoder()
        df[f] = lbe.fit_transform(df[f])
    return df[dense_column_names + sparse_column_names]

df = data_processing(data, dense_column_names, sparse_column_names)
df['label'] = data['label']

train_df, test_df = df.iloc[0:train_df.shape[0]], df.iloc[train_df.shape[0]:]
print('train.shape:{}, test.shape:{}'.format(train_df.shape, test_df.shape))

train.shape:(1000000, 40), test.shape:(1000000, 40)


##### 模型构建

如何提取FM模型预训练参数？

从原有模型结构中取出部分模型结构作为获取预训练参数的模型。

1. 构建一个完整的FM模型。
2. 选取所需的部分或全部特征对应的Input张量列表作为输入和某一层对应的张量/张量列表作为输出，形如：

- spare_embed_model = Model(inputs=sparse_input_list, outputs=concat_sparse_kd_embed_list)
    
3. 通过构建的新模型进行预测，输入是Input所对应特征，输出是outputs相应层所对应的输出数据，形如：

- spare_embed_model_input = {f: test[f] for f in sparse_column_names}
- spare_embedding = spare_embed_model.predict(spare_embed_model_input, batch_size=2**12)

自定义层（继承Layer父类）

- call函数中可以直接使用低级API，但如果需要使用层的API（例如:Dense、Dropout等），则需要先在build函数中定义然后在build中调用。

##### 第一阶段：训练FM模型，获取稀疏特征对应的embedding向量。

In [9]:
SparseFeature = namedtuple('SparseFeature', ['name', 'vocabulary_size', 'embedding_size'])
DenseFeature = namedtuple('DenseFeature', ['name', 'dimension'])

feature_columns = [SparseFeature(f, vocabulary_size=df[f].nunique(), embedding_size=4) for f in sparse_column_names] + \
[DenseFeature(f, 1) for f in dense_column_names]
feature_columns

[SparseFeature(name='C14', vocabulary_size=1370, embedding_size=4),
 SparseFeature(name='C15', vocabulary_size=541, embedding_size=4),
 SparseFeature(name='C16', vocabulary_size=597026, embedding_size=4),
 SparseFeature(name='C17', vocabulary_size=200785, embedding_size=4),
 SparseFeature(name='C18', vocabulary_size=284, embedding_size=4),
 SparseFeature(name='C19', vocabulary_size=17, embedding_size=4),
 SparseFeature(name='C20', vocabulary_size=11387, embedding_size=4),
 SparseFeature(name='C21', vocabulary_size=600, embedding_size=4),
 SparseFeature(name='C22', vocabulary_size=3, embedding_size=4),
 SparseFeature(name='C23', vocabulary_size=39665, embedding_size=4),
 SparseFeature(name='C24', vocabulary_size=4953, embedding_size=4),
 SparseFeature(name='C25', vocabulary_size=498707, embedding_size=4),
 SparseFeature(name='C26', vocabulary_size=3097, embedding_size=4),
 SparseFeature(name='C27', vocabulary_size=26, embedding_size=4),
 SparseFeature(name='C28', vocabulary_size=10159, 

In [10]:
class FM_Layer(Layer):
    def __init__(self):
        super(FM_Layer, self).__init__()
        
    def call(self, inputs):
        concat_embed_values = inputs
        print('concat_embed_values.shape: ', concat_embed_values.shape) # (None, 26, 4)
        sum_square = tf.square(tf.reduce_sum(concat_embed_values, axis=1, keepdims=True)) # (None, 1, 4)
        print('sum_square.shape: ', sum_square.shape)
        square_sum = tf.reduce_sum(concat_embed_values * concat_embed_values, axis=1, keepdims=True) # (None, 1, 4)
        print('square_sum.shape: ', square_sum.shape)
        output = sum_square - square_sum # 和的平方-平方的和
        output = 0.5 * tf.reduce_sum(output, axis=2, keepdims=False) # (None 1)
        print('output.shape: ', output.shape)
        return output
    
    def compute_output_shape(self, input_shape):
        return (None, 1)
    
def model_metric(prob, label, thr=0.5):
    """ 模型评估 """
    # AUC
    fpr, tpr, threshold = metrics.roc_curve(label, prob)
    auc = metrics.auc(fpr, tpr)
    score = metrics.accuracy_score(label, prob > thr)
    # LogLoss
    logloss = log_loss(label, prob)
    print('模型准确率:{}, AUC得分:{}, LogLoss:{}'.format(score, auc, logloss))
    print(classification_report(label, prob > thr, digits=2))
    print('==========================================================')

def build_input_layers(feature_columns):
    """ 构建输入层 """
    dense_input_dict, sparse_input_dict = {}, {}
    for f in feature_columns:
        if isinstance(f, DenseFeature):
            dense_input_dict[f.name] = Input(shape=(f.dimension, ), name=f.name)
        elif isinstance(f, SparseFeature):
            sparse_input_dict[f.name] = Input(shape=(1, ), name=f.name)
    return dense_input_dict, sparse_input_dict
    
def build_embedding_layers(feature_columns, is_linear):
    """ 构建embedding层 """
    embedding_layer_dict = {}
    sparse_feature_columns = list(filter(lambda x: isinstance(x, SparseFeature), feature_columns))
    if is_linear:
        for f in sparse_feature_columns:
            embedding_layer_dict[f.name] = Embedding(f.vocabulary_size+1, 1, name='1d_emb_' + f.name)
    else:
        for f in sparse_feature_columns:
            embedding_layer_dict[f.name] = Embedding(f.vocabulary_size+1, f.embedding_size, name='kd_emb_' + f.name)
    return embedding_layer_dict
    
def get_linear_logits(dense_input_dict, sparse_input_dict, feature_columns):
    """ 数值特征拼接一起传入全连接层 + 类别特征onehot，flatten，add """
    concat_dense_inputs = Concatenate(axis=1)(list(dense_input_dict.values()))
    dense_logits_output = Dense(1)(concat_dense_inputs)
    
    sparse_feature_columns = list(filter(lambda x: isinstance(x, SparseFeature), feature_columns))
    embedding_layer_dict = build_embedding_layers(feature_columns, is_linear=True)
    
    # embedding(input)查表操作，返回对应input的嵌入向量
    sparse_1d_embed_list = []
    for f in sparse_feature_columns:
        _input = sparse_input_dict[f.name]
        _embed = Flatten()(embedding_layer_dict[f.name](_input))
        sparse_1d_embed_list.append(_embed)
    
    sparse_logits_output = Add()(sparse_1d_embed_list)
    linear_logits = Add()([dense_logits_output, sparse_logits_output])
    return linear_logits
    
def get_fm_logits(sparse_input_dict, kd_embedding_layer_dict, feature_columns):
    """ 取出input所对应的嵌入向量拼接在一起，计算和的平和-平方的和 """
    sparse_feature_columns = list(filter(lambda x: isinstance(x, SparseFeature), feature_columns))
    sparse_kd_embed_list = []
    for f in sparse_feature_columns:
        _input = sparse_input_dict[f.name]
        _embed = kd_embedding_layer_dict[f.name](_input)
        sparse_kd_embed_list.append(_embed)
    
    concat_sparse_kd_embed_list = Concatenate(axis=1)(sparse_kd_embed_list)
    
    fm_logits = FM_Layer()(concat_sparse_kd_embed_list)
    return fm_logits, concat_sparse_kd_embed_list
    
def FM(feature_columns, seed=1024, l2_reg=1e-5, task='binary'):
    """ Instantiates FM architecture
    :param feature_columns 
    :param seed
    :param l2_reg L2regularization
    :return: A kears model instance
    """
    dense_input_dict, sparse_input_dict = build_input_layers(feature_columns)
    input_list = list(dense_input_dict.values()) + list(sparse_input_dict.values())
    
    # w * x
    linear_logits = get_linear_logits(dense_input_dict, sparse_input_dict, feature_columns)
    
    # 0.5 * [sum(vixi)**2 - sum(vixi*vixi)
    kd_embedding_layer_dict = build_embedding_layers(feature_columns, is_linear=False)
    fm_logits, concat_sparse_kd_embed_list = get_fm_logits(sparse_input_dict, kd_embedding_layer_dict, feature_columns)
    
    output_logits = Add()([linear_logits, fm_logits])
    output_layer = Activation("sigmoid")(output_logits)
    model = Model(input_list, output_layer)
    
    # 获取稀疏特征对应的embedding向量
    sparse_input_list = list(sparse_input_dict.values())
    flatten_sparse_kd_embed_list = Flatten()(concat_sparse_kd_embed_list)
    model.__setattr__("sparse_input", sparse_input_list)
    model.__setattr__("spare_embedding", flatten_sparse_kd_embed_list)

    spare_embed_model = Model(inputs=model.sparse_input, outputs=model.spare_embedding)
    #spare_embed_model = Model(inputs=sparse_input_list, outputs=flatten_sparse_kd_embed_list)
    return model, spare_embed_model

model, spare_embed_model = FM(feature_columns)
model.summary()

concat_embed_values.shape:  (None, 26, 4)
sum_square.shape:  (None, 1, 4)
square_sum.shape:  (None, 1, 4)
output.shape:  (None, 1)
Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
C14 (InputLayer)                [(None, 1)]          0                                            
__________________________________________________________________________________________________
C15 (InputLayer)                [(None, 1)]          0                                            
__________________________________________________________________________________________________
C16 (InputLayer)                [(None, 1)]          0                                            
__________________________________________________________________________________________________
C17 (InputLayer)                [(None, 1)]          0      

In [11]:
# 模型训练
train_input = {f: train_df[f] for f in dense_column_names + sparse_column_names}
test_input = {f: test_df[f] for f in dense_column_names + sparse_column_names}

my_callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, verbose=2, mode='auto')
]
model.compile('adam',
              loss='binary_crossentropy',
              metrics=["binary_crossentropy", tf.keras.metrics.AUC(name='auc')])
model.fit(train_input,
          train_df['label'].values,
          batch_size=1024,
          epochs=100,
          validation_split=0.2,
          callbacks=my_callbacks)

Epoch 1/100
concat_embed_values.shape:  (None, 26, 4)
sum_square.shape:  (None, 1, 4)
square_sum.shape:  (None, 1, 4)
output.shape:  (None, 1)
concat_embed_values.shape:  (None, 26, 4)
sum_square.shape:  (None, 1, 4)
square_sum.shape:  (None, 1, 4)
output.shape:  (None, 1)
sum_square.shape:  (None, 1, 4)
square_sum.shape:  (None, 1, 4)
output.shape:  (None, 1)
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 00012: early stopping


<keras.callbacks.History at 0x2183861dfd0>

In [17]:
train_spare_embed_model_input = {f: train_df[f] for f in sparse_column_names}
train_spare_embedding = spare_embed_model.predict(train_spare_embed_model_input, batch_size=2**12)

test_spare_embed_model_input = {f: test_df[f] for f in sparse_column_names}
test_spare_embedding = spare_embed_model.predict(test_spare_embed_model_input, batch_size=2**12)
print(train_spare_embedding.shape, test_spare_embedding.shape)

for i in range(train_spare_embedding.shape[1]):
    train_df['emb{}'.format(i)] = train_spare_embedding[:, i]
    test_df['emb{}'.format(i)] = test_spare_embedding[:, i]

(1000000, 104) (1000000, 104)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


##### 第二阶段：将所有样本的类别特征所提取出embedding向量和数值向量拼接在一起，输入到DNN模型中进行学习。

In [18]:
dnn_feature_columns = [DenseFeature(f, 1) for f in train_df.columns if f.startswith('I') or f.startswith('emb')]
dnn_feature_columns

[DenseFeature(name='I1', dimension=1),
 DenseFeature(name='I2', dimension=1),
 DenseFeature(name='I3', dimension=1),
 DenseFeature(name='I4', dimension=1),
 DenseFeature(name='I5', dimension=1),
 DenseFeature(name='I6', dimension=1),
 DenseFeature(name='I7', dimension=1),
 DenseFeature(name='I8', dimension=1),
 DenseFeature(name='I9', dimension=1),
 DenseFeature(name='I10', dimension=1),
 DenseFeature(name='I11', dimension=1),
 DenseFeature(name='I12', dimension=1),
 DenseFeature(name='I13', dimension=1),
 DenseFeature(name='emb0', dimension=1),
 DenseFeature(name='emb1', dimension=1),
 DenseFeature(name='emb2', dimension=1),
 DenseFeature(name='emb3', dimension=1),
 DenseFeature(name='emb4', dimension=1),
 DenseFeature(name='emb5', dimension=1),
 DenseFeature(name='emb6', dimension=1),
 DenseFeature(name='emb7', dimension=1),
 DenseFeature(name='emb8', dimension=1),
 DenseFeature(name='emb9', dimension=1),
 DenseFeature(name='emb10', dimension=1),
 DenseFeature(name='emb11', dimension

In [19]:
class DNN(Layer):
    """ Model子类化创建自定义模型 """
    def __init__(self):
        super(DNN, self).__init__()
        
    def build(self, input_shape):
        """ 定义权重 """
        self.dropout = Dropout(0.8)
        self.dense16 = Dense(16, activation='tanh')
        self.dense32 = Dense(32, activation='tanh')
        self.dense8 = Dense(8, activation='tanh')
        self.dense1 = Dense(1)
        self.activation = Activation("sigmoid")
        
    def call(self, inputs):
        """ 层的功能逻辑 """
        dnn_input = inputs
        
        dnn_out = self.dropout(self.dense16(dnn_input))
        dnn_out = self.dropout(self.dense32(dnn_out))
        dnn_out = self.dropout(self.dense8(dnn_out))
        dnn_out = self.dense1(dnn_out)
        #dnn_out = dnn_out)
        #dnn_out = self.dense8(inputs)
        #dnn_out = self.dropout(dnn_out)
        
        dnn_logits = self.activation(dnn_out)
        return dnn_logits
    
    def compute_output_shape(self, input_shape):
        return (None, 1)

dense_input_dict, _ = build_input_layers(dnn_feature_columns)
input_list = list(dense_input_dict.values())

concat_inputs = Concatenate(axis=1)(input_list)
output_layer = DNN()(concat_inputs)
dnn_model = Model(input_list, output_layer)
dnn_model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
I1 (InputLayer)                 [(None, 1)]          0                                            
__________________________________________________________________________________________________
I2 (InputLayer)                 [(None, 1)]          0                                            
__________________________________________________________________________________________________
I3 (InputLayer)                 [(None, 1)]          0                                            
__________________________________________________________________________________________________
I4 (InputLayer)                 [(None, 1)]          0                                            
____________________________________________________________________________________________

In [22]:
train_input = {f.name: train_df[f.name] for f in dnn_feature_columns}
test_input = {f.name: test_df[f.name] for f in dnn_feature_columns}

tf.config.run_functions_eagerly(True)
my_callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, verbose=2, mode='auto')
]
dnn_model.compile('adam',
              loss='binary_crossentropy',
              metrics=["binary_crossentropy", tf.keras.metrics.AUC(name='auc')])

dnn_model.fit(train_input,
          train_df['label'].values,
          batch_size=1024,
          epochs=100,
          validation_split=0.2,
          callbacks=my_callbacks)

  "Even though the `tf.config.experimental_run_functions_eagerly` "


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 00011: early stopping


<keras.callbacks.History at 0x2182efe70f0>

In [23]:
# 模型预测与评估
result = dnn_model.predict(test_input)
model_metric(np.array([i[0] for i in result]), test_df['label'].values)

模型准确率:0.741172, AUC得分:0.6937285584957377, LogLoss:0.5628377597864209
              precision    recall  f1-score   support

           0       0.80      0.87      0.84    751819
           1       0.47      0.34      0.39    248181

    accuracy                           0.74   1000000
   macro avg       0.64      0.61      0.62   1000000
weighted avg       0.72      0.74      0.73   1000000

