In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from tensorflow import keras
from tqdm import tqdm

In [2]:
# Restrict TensorFlow's logger to messages at ERROR level.
tf.get_logger().setLevel('ERROR')

## Loading Data

In [3]:
# Load the click dataset from a path relative to the notebook
# (described further down as a random 100k-row sample of Avazu).
data = pd.read_csv('../data/data.csv')

In [4]:
# Target is the `click` column; features are every column from positional
# index 2 onward.
# NOTE(review): assumes the first two columns are an id and the label - confirm.
data_y = data['click'].values
data_X = data.iloc[:, 2:]

In [5]:
# Display the raw (not yet encoded) feature frame.
data_X

Unnamed: 0,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,2,15705,320,50,1722,0,35,-1,79
1,14102100,1005,0,4dd0a958,79cf0c8d,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,0,20352,320,50,2333,0,39,-1,157
2,14102100,1005,0,543a539e,c7ca3108,3e814130,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,0,20352,320,50,2333,0,39,-1,157
3,14102100,1005,0,8cbacf0b,a434fa42,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,0,19772,320,50,2227,0,687,100075,48
4,14102100,1005,0,f282ab5a,61eb5bc4,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,0,18993,320,50,2161,0,35,-1,157
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,14102101,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,0,15705,320,50,1722,0,35,100084,79
99996,14102101,1005,1,d9750ee7,98572c79,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,0,17614,320,50,1993,2,1063,-1,33
99997,14102101,1005,0,85f751fd,c4e18dd6,50e219e0,febd1138,82e27996,0f2161f8,a99f214a,...,1,0,21611,320,50,2480,3,297,100111,61
99998,14102101,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,0,15699,320,50,1722,0,35,100084,79


    可以看到测试的数据全都是类别特征, 其实实际的业务场景中几乎也都是类别型的特征
    这里我们给特征进行Label Encode

In [6]:
# Label-encode each column independently: every column's values are replaced
# by integer codes fitted on that column alone.
data_X = data_X.apply(lambda column: LabelEncoder().fit_transform(column))

In [7]:
# Display the feature frame after label encoding.
data_X

Unnamed: 0,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,0,2,0,110,823,1,712,28,0,5703,...,1,1,128,3,2,36,0,1,0,18
1,0,2,0,303,403,16,712,28,0,5703,...,1,0,303,3,2,103,0,2,0,31
2,0,2,0,334,668,3,712,28,0,5703,...,1,0,303,3,2,103,0,2,0,31
3,0,2,0,543,563,16,712,28,0,5703,...,1,0,234,3,2,76,0,26,53,10
4,0,2,0,924,316,16,712,28,0,5703,...,1,0,210,3,2,71,0,1,0,31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1,2,0,110,823,1,712,28,0,5703,...,1,0,128,3,2,36,0,1,59,18
99996,1,2,1,825,510,16,712,28,0,5703,...,1,0,173,3,2,60,2,30,0,6
99997,1,2,0,519,658,5,767,31,2,5703,...,1,0,407,3,2,130,3,13,77,13
99998,1,2,0,110,823,1,712,28,0,5703,...,1,0,122,3,2,36,0,1,59,18


    每一个特征都独立进行了label 编码， 这种好处是可以直接进行embedding
    当我们embedding共享权值的时候， 可以给每列特征的label加上之前所有特征的类别总和，使所有特征的label在同一张embedding表中互不重叠
    这也是所有模型代码中 offset 的作用
    
    e.g. field_dims = [2, 4, 2], offsets = [0, 2, 6]

    所以，实际look up table中
    0 - 1行 对应 特征 X0, 即 field_dims[0]
    2 - 5行 对应 特征 X1, 即 field_dims[1]
    6 - 7行 对应 特征 X2, 即 field_dims[2]
    但实际特征取值 forward(self, x) 的 x大小 只在自身词表内取值
    比如：X1取值1，对应Embedding内行数就是 offsets[1] + 1 = 2 + 1 = 3


In [8]:
# Per-column vocabulary size: largest encoded value in each column, plus one.
fields = data_X.max().values + 1 # feature_fields passed to the models

In [9]:
# Display the per-column vocabulary sizes.
fields

array([    2,     6,     6,   987,   872,    18,   769,    62,    19,
        8544, 47309,  2606,     4,     4,   448,     5,     6,   141,
           4,    38,   144,    33], dtype=int64)

In [10]:
# Stratified 60/20/20 split: hold out 20% as the test set, then take 25% of
# the remaining 80% (= 20% of the total) as the validation set.
tmp_X, test_X, tmp_y, test_y = train_test_split(
    data_X,
    data_y,
    test_size=0.2,
    stratify=data_y,
    random_state=42,
)
train_X, val_X, train_y, val_y = train_test_split(
    tmp_X,
    tmp_y,
    test_size=0.25,
    stratify=tmp_y,
    random_state=42,
)

## 训练过程
   
    数据是avazu数据的随机10万条
    优化器统一Adam， lr = 0.001
    epoch 为 1, batch_size = 32
    主要的目的是跑通所有的模型
    epoch多几次, 调调参数对稍微复杂的网络有好处
    
    
    tips : 类别特征embedding等价于一层没有bias项的全连接，所以模型中几乎都用embedding来模拟LR线性过程

#### LR

In [11]:
from model import LR

In [12]:
# Logistic-regression baseline, sized by the per-column vocabulary sizes.
model = LR.LogisticRegression(feature_fields=fields)

In [13]:
# Adam (lr=0.001), binary cross-entropy loss, AUC as the reported metric.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=[keras.metrics.AUC()],
)

In [14]:
# Train for one epoch in mini-batches of 32, validating on the validation split.
model.fit(
    train_X.values,
    train_y,
    validation_data=(val_X.values, val_y),
    batch_size=32,
    epochs=1,
)

Train on 60000 samples, validate on 20000 samples


<tensorflow.python.keras.callbacks.History at 0x243c491bdd8>

#### FM

In [15]:
from model import FM

In [16]:
# Factorization Machine with embed_dim=8.
model = FM.FactorizationMachine(
    feature_fields=fields,
    embed_dim=8,
)

In [17]:
# Adam (lr=0.001), binary cross-entropy loss, AUC as the reported metric.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=[keras.metrics.AUC()],
)

In [18]:
# Train for one epoch in mini-batches of 32, validating on the validation split.
model.fit(
    train_X.values,
    train_y,
    validation_data=(val_X.values, val_y),
    batch_size=32,
    epochs=1,
)

Train on 60000 samples, validate on 20000 samples


<tensorflow.python.keras.callbacks.History at 0x243cb3ecc18>

#### FFM

In [19]:
from model import FFM

In [20]:
# Field-aware Factorization Machine with embed_dim=8.
model = FFM.FieldFactorizationMachine(
    feature_fields=fields,
    embed_dim=8,
)

In [21]:
# Adam (lr=0.001), binary cross-entropy loss, AUC as the reported metric.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=[keras.metrics.AUC()],
)

In [22]:
# Train for one epoch in mini-batches of 32, validating on the validation split.
model.fit(
    train_X.values,
    train_y,
    validation_data=(val_X.values, val_y),
    batch_size=32,
    epochs=1,
)

Train on 60000 samples, validate on 20000 samples


<tensorflow.python.keras.callbacks.History at 0x243d1f00908>

#### AFM

In [23]:
from model import AFM

In [24]:
# Attentional Factorization Machine: embed_dim=8, attn_size=8, dropout=0.2.
model = AFM.AttentionalFactorizationMachine(
    feature_fields=fields,
    embed_dim=8,
    attn_size=8,
    dropout=0.2,
)

In [25]:
# Adam (lr=0.001), binary cross-entropy loss, AUC as the reported metric.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=[keras.metrics.AUC()],
)

In [26]:
# Train for one epoch in mini-batches of 32, validating on the validation split.
model.fit(
    train_X.values,
    train_y,
    validation_data=(val_X.values, val_y),
    batch_size=32,
    epochs=1,
)

Train on 60000 samples, validate on 20000 samples


<tensorflow.python.keras.callbacks.History at 0x243ec9d36d8>

#### DeepFM

In [27]:
from model import DeepFM

In [28]:
# DeepFM: embed_dim=8, MLP layer sizes [32, 16], dropout=0.2.
model = DeepFM.DeepFM(
    feature_fields=fields,
    embed_dim=8,
    mlp_dims=[32, 16],
    dropout=0.2,
)

In [29]:
# Adam (lr=0.001), binary cross-entropy loss, AUC as the reported metric.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=[keras.metrics.AUC()],
)

In [30]:
# Train for one epoch in mini-batches of 32, validating on the validation split.
model.fit(
    train_X.values,
    train_y,
    validation_data=(val_X.values, val_y),
    batch_size=32,
    epochs=1,
)

Train on 60000 samples, validate on 20000 samples


<tensorflow.python.keras.callbacks.History at 0x243cc90a438>

#### xDeepFM

In [31]:
from model import xDeepFM

In [32]:
# xDeepFM: embed_dim=8, MLP sizes (32, 16), dropout=0.3, cross_layer_sizes=(16, 16).
model = xDeepFM.xDeepFM(
    feature_fields=fields,
    embed_dim=8,
    mlp_dims=(32, 16),
    dropout=0.3,
    cross_layer_sizes=(16, 16),
)

In [33]:
# Adam (lr=0.001), binary cross-entropy loss, AUC as the reported metric.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=[keras.metrics.AUC()],
)

In [34]:
# Train for one epoch in mini-batches of 32, validating on the validation split.
model.fit(
    train_X.values,
    train_y,
    validation_data=(val_X.values, val_y),
    batch_size=32,
    epochs=1,
)

Train on 60000 samples, validate on 20000 samples


<tensorflow.python.keras.callbacks.History at 0x243f5982128>

#### PNN

In [35]:
from model import PNN

In [36]:
# PNN with method='inner': embed_dim=8, MLP sizes [32, 16], dropout=0.2.
model = PNN.PNN(
    feature_fields=fields,
    embed_dim=8,
    mlp_dims=[32, 16],
    dropout=0.2,
    method='inner',
)

In [37]:
# Adam (lr=0.001), binary cross-entropy loss, AUC as the reported metric.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=[keras.metrics.AUC()],
)

In [38]:
# Train for one epoch in mini-batches of 32, validating on the validation split.
model.fit(
    train_X.values,
    train_y,
    validation_data=(val_X.values, val_y),
    batch_size=32,
    epochs=1,
)

Train on 60000 samples, validate on 20000 samples


<tensorflow.python.keras.callbacks.History at 0x243fb894080>

#### DCN

In [39]:
from model import DCN

In [40]:
# Deep & Cross Network: embed_dim=8, num_layers=3, MLP sizes [32, 16], dropout=0.2.
model = DCN.DeepCrossNet(
    feature_fields=fields,
    embed_dim=8,
    num_layers=3,
    mlp_dims=[32, 16],
    dropout=0.2,
)

In [41]:
# Adam (lr=0.001), binary cross-entropy loss, AUC as the reported metric.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=[keras.metrics.AUC()],
)

In [42]:
# Train for one epoch in mini-batches of 32, validating on the validation split.
model.fit(
    train_X.values,
    train_y,
    validation_data=(val_X.values, val_y),
    batch_size=32,
    epochs=1,
)

Train on 60000 samples, validate on 20000 samples


<tensorflow.python.keras.callbacks.History at 0x243fa8b5b00>

#### AutoInt

In [43]:
from model import AutoInt

In [44]:
# AutoInt: embed_dim=16, head_num=4, attn_layers=3, MLP sizes (32, 16), dropout=0.2.
model = AutoInt.AutoInt(
    feature_fields=fields,
    embed_dim=16,
    head_num=4,
    attn_layers=3,
    mlp_dims=(32, 16),
    dropout=0.2,
)

In [45]:
# Adam (lr=0.001), binary cross-entropy loss, AUC as the reported metric.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=[keras.metrics.AUC()],
)

In [46]:
# Train for one epoch in mini-batches of 32, validating on the validation split.
model.fit(
    train_X.values,
    train_y,
    validation_data=(val_X.values, val_y),
    batch_size=32,
    epochs=1,
)

Train on 60000 samples, validate on 20000 samples


<tensorflow.python.keras.callbacks.History at 0x243858b2be0>

#### FiBiNet

In [47]:
from model import FiBiNET

In [48]:
# FiBiNET: embed_dim=8, reduction_ratio=2, pooling='mean'.
model = FiBiNET.FiBiNET(
    feature_fields=fields,
    embed_dim=8,
    reduction_ratio=2,
    pooling='mean',
)

In [49]:
# Adam (lr=0.001), binary cross-entropy loss, AUC as the reported metric.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=[keras.metrics.AUC()],
)

In [50]:
# Train for one epoch in mini-batches of 32, validating on the validation split.
model.fit(
    train_X.values,
    train_y,
    validation_data=(val_X.values, val_y),
    batch_size=32,
    epochs=1,
)

Train on 60000 samples, validate on 20000 samples


<tensorflow.python.keras.callbacks.History at 0x2439817def0>

#### DCNv2

In [51]:
from model import DCNv2

In [53]:
# DCN v2 with cross_method='Matrix': embed_dim=16, layer_num=2,
# MLP sizes (32, 16), dropout=0.1.
model = DCNv2.DeepCrossNetv2(
    feature_fields=fields,
    embed_dim=16,
    layer_num=2,
    mlp_dims=(32, 16),
    dropout=0.1,
    cross_method='Matrix',
)

In [54]:
# Adam (lr=0.001), binary cross-entropy loss, AUC as the reported metric.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=[keras.metrics.AUC()],
)

In [55]:
# Train for one epoch in mini-batches of 32, validating on the validation split.
model.fit(
    train_X.values,
    train_y,
    validation_data=(val_X.values, val_y),
    batch_size=32,
    epochs=1,
)

Train on 60000 samples, validate on 20000 samples


<tensorflow.python.keras.callbacks.History at 0x24421bdd198>

In [56]:
# DCN v2 again, this time with cross_method='Mix'; other hyper-parameters
# match the 'Matrix' run (embed_dim=16, layer_num=2, MLP (32, 16), dropout=0.1).
model = DCNv2.DeepCrossNetv2(
    feature_fields=fields,
    embed_dim=16,
    layer_num=2,
    mlp_dims=(32, 16),
    dropout=0.1,
    cross_method='Mix',
)

In [57]:
# Adam (lr=0.001), binary cross-entropy loss, AUC as the reported metric.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=[keras.metrics.AUC()],
)

In [58]:
# Train for one epoch in mini-batches of 32, validating on the validation split.
model.fit(
    train_X.values,
    train_y,
    validation_data=(val_X.values, val_y),
    batch_size=32,
    epochs=1,
)

Train on 60000 samples, validate on 20000 samples


<tensorflow.python.keras.callbacks.History at 0x24424dac080>

#### DIFM

In [11]:
from model import DIFM

In [12]:
# DIFM: embed_dim=8, head_num=2, dropout=0.1.
model = DIFM.DIFM(
    feature_fields=fields,
    embed_dim=8,
    head_num=2,
    dropout=0.1,
)

In [13]:
# Adam (lr=0.001), binary cross-entropy loss, AUC as the reported metric.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=[keras.metrics.AUC()],
)

In [14]:
# Train for one epoch in mini-batches of 32, validating on the validation split.
model.fit(
    train_X.values,
    train_y,
    validation_data=(val_X.values, val_y),
    batch_size=32,
    epochs=1,
)



<tensorflow.python.keras.callbacks.History at 0x1d2a7f34e20>

#### AFN

In [11]:
from model import AFN

In [12]:
# AFN: embed_size=8, hidden_size=256, dropout=0.1.
# Note this class takes `embed_size`, not `embed_dim` like the other models.
model = AFN.AFN(
    feature_fields=fields,
    embed_size=8,
    hidden_size=256,
    dropout=0.1,
)

In [13]:
# Adam (lr=0.001), binary cross-entropy loss, AUC as the reported metric.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=[keras.metrics.AUC()],
)

In [14]:
# Train for one epoch in mini-batches of 32, validating on the validation split.
model.fit(
    train_X.values,
    train_y,
    validation_data=(val_X.values, val_y),
    batch_size=32,
    epochs=1,
)



<tensorflow.python.keras.callbacks.History at 0x206acfd5940>

#### ONN

In [11]:
from model import ONN

In [12]:
# ONN: embed_dim=8, MLP sizes [64, 32], dropout=0.1.
model = ONN.ONN(
    feature_fields=fields,
    embed_dim=8,
    mlp_dims=[64, 32],
    dropout=0.1,
)

In [13]:
# Adam (lr=0.001), binary cross-entropy loss, AUC as the reported metric.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=[keras.metrics.AUC()],
)

In [14]:
# Train for one epoch in mini-batches of 32, validating on the validation split.
model.fit(
    train_X.values,
    train_y,
    validation_data=(val_X.values, val_y),
    batch_size=32,
    epochs=1,
)



<tensorflow.python.keras.callbacks.History at 0x24f6a8b6a90>

## 序列模型

####  DIN

Deep Interest Net在预测的时候，对用户不同的行为的注意力是不一样的

在生成User embedding的时候，加入了Activation Unit Layer。这一层为每个用户行为产生一个权重，并乘上相应的物品embedding，从而生成了user interest embedding的表示

实际例子： Amazon Book数据 100K

每条数据记录会有用户的行为数据

只保留了商品特征，以及历史上的商品hist的特征.

In [2]:

# Pre-processed data.
# The preprocessing function lives in AmazonDataPreprocress.py.
# The raw data is a .txt file.

data = pd.read_csv('../data/amazon-books-100k-preprocessed.csv', index_col = 0)

In [3]:
# Display the frame: 40 history-category columns, the candidate cateID, and the label.
data

Unnamed: 0,hist_cate_0,hist_cate_1,hist_cate_2,hist_cate_3,hist_cate_4,hist_cate_5,hist_cate_6,hist_cate_7,hist_cate_8,hist_cate_9,...,hist_cate_32,hist_cate_33,hist_cate_34,hist_cate_35,hist_cate_36,hist_cate_37,hist_cate_38,hist_cate_39,cateID,label
0,142,142,142,142,142,0,0,0,0,0,...,0,0,0,0,0,0,0,0,751,0
1,142,142,142,142,142,0,0,0,0,0,...,0,0,0,0,0,0,0,0,142,1
2,142,142,142,142,97,142,142,0,0,0,...,0,0,0,0,0,0,0,0,1094,0
3,142,142,142,142,97,142,142,0,0,0,...,0,0,0,0,0,0,0,0,142,1
4,142,142,142,142,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,142,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,142,142,142,142,142,751,0,0,0,0,...,0,0,0,0,0,0,0,0,142,1
99996,142,142,142,142,142,142,142,142,142,142,...,0,0,0,0,0,0,0,0,142,0
99997,142,142,142,142,142,142,142,142,142,142,...,0,0,0,0,0,0,0,0,607,1
99998,142,142,142,142,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,142,0


In [4]:
# Target is the `label` column; features are every column except the last one
# (the hist_cate_* history columns plus the candidate cateID).
data_y = data['label'].values
data_X = data.iloc[:, :-1]

In [8]:
# Largest category ID found anywhere in the feature columns (a single scalar,
# since history and candidate columns share one ID space here).
# NOTE(review): unlike the tabular `fields` earlier in the notebook, no +1 is
# added - confirm that DIN sizes its embedding table as feature_dim + 1,
# otherwise the largest ID would fall outside the table.
fields = data_X.max().max()

In [9]:
# Display the maximum category ID.
fields

1347

In [10]:
# Stratified 60/20/20 split: hold out 20% as the test set, then take 25% of
# the remaining 80% (= 20% of the total) as the validation set.
tmp_X, test_X, tmp_y, test_y = train_test_split(
    data_X,
    data_y,
    test_size=0.2,
    stratify=data_y,
    random_state=42,
)
train_X, val_X, train_y, val_y = train_test_split(
    tmp_X,
    tmp_y,
    test_size=0.25,
    stratify=tmp_y,
    random_state=42,
)

In [11]:
from model import DIN

In [13]:
# Deep Interest Network: embed_dim=8, MLP sizes [64, 32], dropout=0.2.
# `feature_dim` receives the scalar maximum category ID computed above.
model = DIN.DeepInterestNet(
    feature_dim=fields,
    embed_dim=8,
    mlp_dims=[64, 32],
    dropout=0.2,
)

In [14]:
# Adam (lr=0.001), binary cross-entropy loss, AUC as the reported metric.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=[keras.metrics.AUC()],
)

In [15]:
# Train for two epochs in mini-batches of 32, validating on the validation split.
model.fit(
    train_X.values,
    train_y,
    validation_data=(val_X.values, val_y),
    batch_size=32,
    epochs=2,
)

Train on 60000 samples, validate on 20000 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x1abfc62fb00>

### DIEN

相比于DIN， DIEN的改动：

1） 关注兴趣的演化过程，提出了兴趣进化网络，用序列模型做的， DIN中用户兴趣之间是相互独立的，但实际上的兴趣是不断进化的

2） 设计了一个兴趣抽取层，加入了一个二分类模型来辅助计算兴趣抽取的准确性

3） 用序列模型表达用户的兴趣动态变化性

实际的数据用例和DIN一样

In [15]:
# Reload the same pre-processed Amazon Books file used in the DIN section,
# using the first CSV column as the index.
data = pd.read_csv('../data/amazon-books-100k-preprocessed.csv', index_col=0)

In [16]:
# Display the frame: 40 history-category columns, the candidate cateID, and the label.
data

Unnamed: 0,hist_cate_0,hist_cate_1,hist_cate_2,hist_cate_3,hist_cate_4,hist_cate_5,hist_cate_6,hist_cate_7,hist_cate_8,hist_cate_9,...,hist_cate_32,hist_cate_33,hist_cate_34,hist_cate_35,hist_cate_36,hist_cate_37,hist_cate_38,hist_cate_39,cateID,label
0,142,142,142,142,142,0,0,0,0,0,...,0,0,0,0,0,0,0,0,751,0
1,142,142,142,142,142,0,0,0,0,0,...,0,0,0,0,0,0,0,0,142,1
2,142,142,142,142,97,142,142,0,0,0,...,0,0,0,0,0,0,0,0,1094,0
3,142,142,142,142,97,142,142,0,0,0,...,0,0,0,0,0,0,0,0,142,1
4,142,142,142,142,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,142,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,142,142,142,142,142,751,0,0,0,0,...,0,0,0,0,0,0,0,0,142,1
99996,142,142,142,142,142,142,142,142,142,142,...,0,0,0,0,0,0,0,0,142,0
99997,142,142,142,142,142,142,142,142,142,142,...,0,0,0,0,0,0,0,0,607,1
99998,142,142,142,142,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,142,0


In [17]:
# Split features and label first, then take the maximum category ID from the
# feature columns only (as the DIN section does), so the label column can
# never influence the value passed to the model as feature_dim.
data_X = data.iloc[:,:-1]
data_y = data.label.values
fields = data_X.max().max()

In [18]:
from model.DIEN import DeepInterestEvolutionNet, auxiliary_sample

In [19]:
# Stratified 60/20/20 split: hold out 20% as the test set, then take 25% of
# the remaining 80% (= 20% of the total) as the validation set.
tmp_X, test_X, tmp_y, test_y = train_test_split(
    data_X,
    data_y,
    test_size=0.2,
    stratify=data_y,
    random_state=42,
)
train_X, val_X, train_y, val_y = train_test_split(
    tmp_X,
    tmp_y,
    test_size=0.25,
    stratify=tmp_y,
    random_state=42,
)

In [20]:
# Build the second model input used during training, from the training
# features while they are still a DataFrame.
# NOTE(review): presumably negative-sampled behaviour sequences for DIEN's
# auxiliary loss - verify against auxiliary_sample in model/DIEN.py.
train_X_neg = auxiliary_sample(train_X)

In [21]:
# Convert the three feature splits to plain NumPy arrays for tf.data.
# np.asarray is used instead of `.values` so the cell can be re-run: it hands
# back an existing ndarray unchanged, whereas `.values` raises AttributeError
# once the names already refer to ndarrays.
train_X = np.asarray(train_X)
val_X = np.asarray(val_X)
test_X = np.asarray(test_X)

In [22]:
# Training pipeline yielding (features, auxiliary input, label) triples,
# shuffled with a buffer as large as the training set and batched by 128.
train_loader = (
    tf.data.Dataset
    .from_tensor_slices((train_X, train_X_neg, train_y))
    .shuffle(len(train_X))
    .batch(128)
)

In [23]:
# Validation pipeline: (features, label) pairs in their original order, batched by 128.
val_loader = (
    tf.data.Dataset
    .from_tensor_slices((val_X, val_y))
    .batch(128)
)

In [24]:
# DIEN with gru_type='GRU': embed_dim=4, MLP sizes [32, 32], dropout=0.2.
# The optimizer is created separately because training uses a hand-written
# loop rather than model.compile / model.fit.
model = DeepInterestEvolutionNet(
    feature_dim=fields,
    embed_dim=4,
    mlp_dims=[32, 32],
    dropout=0.2,
    gru_type='GRU',
)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

In [25]:
# Custom training loop for DIEN. The model returns (prediction, auxiliary
# loss), so the click loss and the auxiliary loss are combined by hand
# instead of going through model.fit.
epoches = 3
for epoch in range(epoches):
    # Running mean of the per-batch training loss for this epoch.
    epoch_train_loss = tf.keras.metrics.Mean()
    for x, neg_x, y in tqdm(train_loader):
        with tf.GradientTape() as tape:
            out, aux_loss = model(x, neg_x)
            # Align the label shape with the prediction before the loss. The
            # labels leave the dataset as a rank-1 batch; if `out` is
            # (batch, 1), passing them unaligned broadcasts to (batch, batch)
            # and each label is no longer compared with its own prediction.
            # This is a no-op when the shapes already match.
            # NOTE(review): the recorded val loss stays at ~0.6931 (ln 2) in
            # every epoch, which is consistent with that broadcast - confirm
            # the model's output shape.
            y = tf.reshape(y, tf.shape(out))
            loss = tf.keras.losses.binary_crossentropy(y, out)
            loss = tf.reduce_mean(loss) + tf.cast(aux_loss, tf.float32)
            # Reduce again in case the auxiliary loss is not a scalar.
            loss = tf.reduce_mean(loss)
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(grads_and_vars = zip(grads, model.trainable_variables))
        epoch_train_loss(loss)
    # Validation: the model is called without the auxiliary input and only the
    # click loss is tracked (the second return value is discarded).
    epoch_val_loss = tf.keras.metrics.Mean()
    for x, y in tqdm(val_loader):
        out,_ = model(x)
        # Same label/prediction shape alignment as in the training loop.
        y = tf.reshape(y, tf.shape(out))
        loss = tf.keras.losses.binary_crossentropy(y, out)
        loss = tf.reduce_mean(loss)
        epoch_val_loss(loss)
    print('EPOCH : %s, train loss : %s, val loss: %s' % (epoch,
                                                         epoch_train_loss.result().numpy(),
                                                         epoch_val_loss.result().numpy()))

469it [01:42,  4.58it/s]
157it [00:11, 14.24it/s]
0it [00:00, ?it/s]

EPOCH : 0, train loss : 1.9061264, val loss: 0.69325197


469it [01:43,  4.55it/s]
157it [00:11, 14.19it/s]
0it [00:00, ?it/s]

EPOCH : 1, train loss : 0.80915856, val loss: 0.6931492


469it [01:42,  4.57it/s]
157it [00:11, 14.26it/s]

EPOCH : 2, train loss : 0.7702951, val loss: 0.693148



