https://zhuanlan.zhihu.com/p/57162373

https://github.com/jc-LeeHub/Recommend-System-tf2.0/tree/3741b742e81588b6d9259410ed51c964c4778e71/xDeepFM

In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Layer, Dense, Dropout

class Linear(Layer):
    def __init__(self):
        super(Linear, self).__init__()
        self.out_layer = Dense(1, activation=None)

    def call(self, inputs, **kwargs):
        output = self.out_layer(inputs)
        return output

class Dense_layer(Layer):
    def __init__(self, hidden_units, out_dim=1, activation='relu', dropout=0.0):
        '''
        hidden_units = [256, 128, 64]
        '''
        super(Dense_layer, self).__init__()
        self.hidden_layers = [Dense(i, activation=activation) for i in hidden_units]
        self.out_layer = Dense(out_dim, activation=None)
        self.dropout = Dropout(dropout)

    def call(self, inputs, **kwargs):
        # inputs: [None, n*k]
        x = inputs
        for layer in self.hidden_layers:
            x = layer(x)
        x = self.dropout(x)
        output = self.out_layer(x)
        return output

class CIN(Layer):
    '''
    我明白了，CIN的意义何在。
    CIN部分最初的输入，是n个原始特征，每个特征都被embedding到特定维度emb_dim.
    CIN每一层的cin_size意思就是：第一层就是n个特征，到了第二层假如cin_size是128_1，就是说，第二层计算完后“特征”数量从n变成了128_1。
        个中的计算，从数学原理上说，是使用了外积，以及一些矩阵相乘。
        不过这份代码在实现的时候，使用了tf中的1维卷积把这部分给做了。卷积怎么就能实现这个？我尚存疑。
    如果还有后续的层，就是128_1个特征又变成了若干个特征。
    经过了所有层，总共有(n + 128_1 + ......)个特征。把最开始的n个原始特征去掉不要，剩下的特征简单concat起来就好了。
    最后的（128_1 + ....）个特征，每一个都是原来的n个特征的embedding向量各种乱七八糟地交叉乘啊，blabla，得到的。
    所以xDeepFM就达成了特征向量级（vector_wise）交叉的目的。这就是有别于DCN的bit_wise交叉的地方。
    
    https://zhuanlan.zhihu.com/p/57162373 里面的“为啥取名CIN”这一部分介绍的黄色圆点的计算啊，简单来说，可以理解成[H, 1]矩阵乘上[1, m]矩阵得到一个[H, m]的矩阵。
    这种乘，就是所谓的外积，参考：https://www.zhihu.com/question/419909144 
    xDeepFM原文介绍黄色圆点的计算的时候，说的也是外积。所以没错了。
    
    我把原理整个撸了一遍之后，也没看出来Hadamard积在哪里有用到。难道被作者虚晃一枪？
    '''
    def __init__(self, cin_size):
        '''
        cin_size = [128, 128]
        '''
        super(CIN, self).__init__()
        self.cin_size = cin_size  # 每层的矩阵个数

    def build(self, input_shape):
        # input_shape: [None, n, k]
        self.field_num = [input_shape[1]] + self.cin_size # 每层的矩阵个数(包括第0层)；[n【就是特征数】, 128_1, 128_2]

        self.cin_W = [self.add_weight(
                         name='w'+str(i),
                         shape=(1, self.field_num[0]*self.field_num[i], self.field_num[i+1]), ## 第一个是(1, n * n, 128_1), 第二个是(1, n*128_1, 128_2)
                         initializer=tf.initializers.glorot_uniform(),
                         regularizer=tf.keras.regularizers.l1_l2(1e-5),
                         trainable=True)
                      for i in range(len(self.field_num)-1)] ## [self.add_weight( i blabla ) for i in range(2)]

    def call(self, inputs, **kwargs):
        # inputs: [None, n, k] 这里的k就是emb_dim，n是特征数。
        k = inputs.shape[-1] ## k
        res_list = [inputs] ## [inputs(None, n, k)]
        X0 = tf.split(inputs, k, axis=-1) ## 难道是 k * [?, n, 1]          # 最后维切成k份，list: k * [None, field_num[0], 1] ## 
        for i, size in enumerate(self.field_num[1:]): ## [128_1, 128_2]
            Xi = tf.split(res_list[-1], k, axis=-1) # list: k * [None, field_num[i], 1]
            
            ## 下面那一行，transpose_b=True的意思是，tf.matmul(a, b)里面的b进行转置。这样一来，。。。
            ## 就是k * [?,n,1]的变量乘以k * [?, 1, field_num[i]]，乘完后, 自然得到k * [None, field_num[0], field_num[i]]
            x = tf.matmul(X0, Xi, transpose_b=True) # list: k * [None, field_num[0], field_num[i]]
            
            x = tf.reshape(x, shape=[k, -1, self.field_num[0]*self.field_num[i]])
                                                    # [k, None, field_num[0]*field_num[i]]
            x = tf.transpose(x, [1, 0, 2])          # [None, k, field_num[0]*field_num[i]] ## [?, emb_dim, n*n]
            
            # print(self.cin_W[i].shape) ## 打印出来之后啊，我发现我前面总结的self.cin_W的维度是正确的。
            
            
            x = tf.nn.conv1d(input=x, filters=self.cin_W[i], stride=1, padding='VALID') ## self.cin_W[i]：(1, n*n, 128_1), 意思难道是：（扫1层得到一个值 * emb_dim层）* 128_1次？
                                                    # (None, k, field_num[i+1]) ## 输出的x的形状：(?, emb_dim, 128_1)。总的来说不是完全理解里面的原理，姑且接受这个现实吧。
            x = tf.transpose(x, [0, 2, 1])          # (None, field_num[i+1], k) ## (?, 128_1, emb_dim)
            res_list.append(x)

        res_list = res_list[1:]   # 去掉X0
        res = tf.concat(res_list, axis=1)  # (None, field_num[1]+...+field_num[n], k) ## (?, 128_1 + 128_2, emb_dim)
        output = tf.reduce_sum(res, axis=-1)  # (None, field_num[1]+...+field_num[n]) 
        return output

In [2]:
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding

class xDeepFM(Model):
    def __init__(self, feature_columns, cin_size, hidden_units, out_dim=1, activation='relu', dropout=0.0):
        '''
        hidden_units = [256, 128, 64]
        cin_size = [128, 128]
        '''
        super(xDeepFM, self).__init__()
        self.dense_feature_columns, self.sparse_feature_columns = feature_columns
        self.embed_layers = [Embedding(feat['feat_onehot_dim'], feat['embed_dim'])
                                    for feat in self.sparse_feature_columns]
        self.linear = Linear()
        self.dense_layer = Dense_layer(hidden_units, out_dim, activation, dropout)
        self.cin_layer = CIN(cin_size)
        self.out_layer = Dense(1, activation=None)

    def call(self, inputs, training=None, mask=None):
        dense_inputs, sparse_inputs = inputs[:, :13], inputs[:, 13:]

        # linear
        linear_out = self.linear(inputs)
        
        ## 这里n是特征的数量，k是embedding向量的维度。
        emb = [self.embed_layers[i](sparse_inputs[:, i]) for i in range(sparse_inputs.shape[1])] # [n, None, k]
        emb = tf.transpose(tf.convert_to_tensor(emb), [1, 0, 2]) # [None, n, k]

        # CIN
        cin_out = self.cin_layer(emb)

        # Dense
        emb = tf.reshape(emb, shape=(-1, emb.shape[1]*emb.shape[2])) ## [None, n*k] 稀疏矩阵
        emb = tf.concat([dense_inputs, emb], axis=1) ## 稠密矩阵和稀疏矩阵concat一下。
        dense_out = self.dense_layer(emb)

        output = self.out_layer(linear_out + cin_out + dense_out)
        return tf.nn.sigmoid(output)

In [3]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

def sparseFeature(feat, feat_onehot_dim, embed_dim):
    return {'feat': feat, 'feat_onehot_dim': feat_onehot_dim, 'embed_dim': embed_dim}

def denseFeature(feat):
    return {'feat': feat}

def create_criteo_dataset(file_path, embed_dim=8, test_size=0.2):
    data = pd.read_csv(file_path)

    dense_features = ['I' + str(i) for i in range(1, 14)] ## 一系列I开头的特征，是稠密特征。
    sparse_features = ['C' + str(i) for i in range(1, 27)] ## 一系列C开头的特征，是稀疏特征。

    #缺失值填充
    data[dense_features] = data[dense_features].fillna(0)
    data[sparse_features] = data[sparse_features].fillna('-1')

    #归一化
    data[dense_features] = MinMaxScaler().fit_transform(data[dense_features])
    #LabelEncoding编码
    for col in sparse_features:
        data[col] = LabelEncoder().fit_transform(data[col]).astype(int)

    feature_columns = [[denseFeature(feat) for feat in dense_features]] + \
           [[sparseFeature(feat, data[feat].nunique(), embed_dim) for feat in sparse_features]]

    X = data.drop(['label'], axis=1).values
    y = data['label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

    return feature_columns, (X_train, y_train), (X_test, y_test)

In [4]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import losses, optimizers
from sklearn.metrics import accuracy_score

if __name__ == '__main__':
    file = '../originalDataset/Criteo.txt'
    test_size = 0.2
    hidden_units = [256, 128, 64]
    dropout = 0.3
    cin_size = [128, 128]
    
    ## 
    feature_columns, (X_train, y_train), (X_test, y_test) = create_criteo_dataset(file, test_size=test_size) 

    #########################################################################
    model = xDeepFM(feature_columns, cin_size, hidden_units, dropout=dropout)
    #########################################################################
    
    optimizer = optimizers.SGD(0.01)

    train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    train_dataset = train_dataset.batch(32).prefetch(tf.data.experimental.AUTOTUNE)

    summary_writer = tf.summary.create_file_writer('./tensorboard')
    for epoch in range(30):
        loss_summary = []
        for batch, data_batch in enumerate(train_dataset):
            X_train, y_train = data_batch[0], data_batch[1]
            with tf.GradientTape() as tape:
                
                ##############################
                y_pre = model(X_train)
                ##############################
                
                loss = tf.reduce_mean(losses.binary_crossentropy(y_true=y_train, y_pred=y_pre))
                grad = tape.gradient(loss, model.variables)
                optimizer.apply_gradients(grads_and_vars=zip(grad, model.variables))
            if batch%10==0:
                print('epoch: {} batch: {} loss: {}'.format(epoch, batch, loss.numpy()))
            loss_summary.append(loss.numpy())
        with summary_writer.as_default():
            tf.summary.scalar("loss", np.mean(loss_summary), step=epoch)

    pre = model(X_test)
    pre = [1 if x>0.5 else 0 for x in pre]
    print("Accuracy: ", accuracy_score(y_test, pre))

epoch: 0 batch: 0 loss: 2.8921778202056885
epoch: 0 batch: 10 loss: 3.8562371730804443
epoch: 0 batch: 20 loss: 4.338266849517822
epoch: 0 batch: 30 loss: 4.820296287536621
epoch: 0 batch: 40 loss: 3.8562369346618652
epoch: 1 batch: 0 loss: 2.8921778202056885
epoch: 1 batch: 10 loss: 3.8562371730804443
epoch: 1 batch: 20 loss: 4.338266849517822
epoch: 1 batch: 30 loss: 4.820296287536621
epoch: 1 batch: 40 loss: 3.8562369346618652
epoch: 2 batch: 0 loss: 2.8921778202056885
epoch: 2 batch: 10 loss: 3.8562371730804443
epoch: 2 batch: 20 loss: 4.338266849517822
epoch: 2 batch: 30 loss: 4.820296287536621
epoch: 2 batch: 40 loss: 3.8562369346618652
epoch: 3 batch: 0 loss: 2.8921778202056885
epoch: 3 batch: 10 loss: 3.8562371730804443
epoch: 3 batch: 20 loss: 4.338266849517822
epoch: 3 batch: 30 loss: 4.820296287536621
epoch: 3 batch: 40 loss: 3.8562369346618652
epoch: 4 batch: 0 loss: 2.8921778202056885
epoch: 4 batch: 10 loss: 3.8562371730804443
epoch: 4 batch: 20 loss: 4.338266849517822
ep