In [19]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

## Loading Data

In [3]:
# Read the raw dataset from a CSV path relative to the notebook's working directory.
data = pd.read_csv('../data/data.csv')

In [4]:
# Features: every column from the third onward. The first two columns are skipped —
# presumably a row id and the `click` label; verify against the CSV header.
data_X = data.iloc[:, 2:]
# Target: the `click` column as a NumPy array.
data_y = data["click"].values

In [5]:
data_X

Unnamed: 0,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,2,15705,320,50,1722,0,35,-1,79
1,14102100,1005,0,4dd0a958,79cf0c8d,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,0,20352,320,50,2333,0,39,-1,157
2,14102100,1005,0,543a539e,c7ca3108,3e814130,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,0,20352,320,50,2333,0,39,-1,157
3,14102100,1005,0,8cbacf0b,a434fa42,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,0,19772,320,50,2227,0,687,100075,48
4,14102100,1005,0,f282ab5a,61eb5bc4,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,0,18993,320,50,2161,0,35,-1,157
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,14102101,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,0,15705,320,50,1722,0,35,100084,79
99996,14102101,1005,1,d9750ee7,98572c79,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,0,17614,320,50,1993,2,1063,-1,33
99997,14102101,1005,0,85f751fd,c4e18dd6,50e219e0,febd1138,82e27996,0f2161f8,a99f214a,...,1,0,21611,320,50,2480,3,297,100111,61
99998,14102101,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,...,1,0,15699,320,50,1722,0,35,100084,79


    可以看到测试的数据全都是类别特征, 其实实际的业务场景中几乎也都是类别型的特征
    这里我们给特征进行Label Encode

In [6]:
# Label-encode each column independently: every column is fitted on its own values
# and replaced by integer codes.
data_X = data_X.apply(lambda column: LabelEncoder().fit_transform(column))

In [7]:
data_X

Unnamed: 0,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,0,2,0,110,823,1,712,28,0,5703,...,1,1,128,3,2,36,0,1,0,18
1,0,2,0,303,403,16,712,28,0,5703,...,1,0,303,3,2,103,0,2,0,31
2,0,2,0,334,668,3,712,28,0,5703,...,1,0,303,3,2,103,0,2,0,31
3,0,2,0,543,563,16,712,28,0,5703,...,1,0,234,3,2,76,0,26,53,10
4,0,2,0,924,316,16,712,28,0,5703,...,1,0,210,3,2,71,0,1,0,31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1,2,0,110,823,1,712,28,0,5703,...,1,0,128,3,2,36,0,1,59,18
99996,1,2,1,825,510,16,712,28,0,5703,...,1,0,173,3,2,60,2,30,0,6
99997,1,2,0,519,658,5,767,31,2,5703,...,1,0,407,3,2,130,3,13,77,13
99998,1,2,0,110,823,1,712,28,0,5703,...,1,0,122,3,2,36,0,1,59,18


    每一个特征都独立进行了label编码，这样做的好处是可以直接进行embedding
    当所有特征共享同一个embedding权值矩阵时，可以给每列特征的label加上它之前所有特征的类别数总和，从而让每个特征的每个类别都有全局唯一的label
    这也是所有模型代码中 offset 的作用

In [9]:
# Per-column category counts handed to the models (as `feature_fields` / `field_dims`).
# LabelEncoder maps each column to codes 0..n-1, so the column maximum is n-1 and the
# number of categories is max + 1; using the bare maximum under-counts every field by one.
# NOTE(review): the model modules are not visible here — confirm they do not already
# compensate for the missing +1 (e.g. by padding the embedding table).
fields = data_X.max().values + 1

In [10]:
# Train / validation / test split. 20% of the rows are held out as the test set, then
# 25% of the remainder (20% of the total) becomes the validation set. Both splits are
# stratified on the label and use a fixed random_state.
tmp_X, test_X, tmp_y, test_y = train_test_split(
    data_X, data_y, test_size=0.2, random_state=42, stratify=data_y
)
train_X, val_X, train_y, val_y = train_test_split(
    tmp_X, tmp_y, test_size=0.25, random_state=42, stratify=tmp_y
)

# The dataset is small, so everything is converted to in-memory int64 tensors up front.
train_X, val_X, test_X = (
    torch.from_numpy(frame.values).long() for frame in (train_X, val_X, test_X)
)
train_y, val_y, test_y = (
    torch.from_numpy(labels).long() for labels in (train_y, val_y, test_y)
)

train_set = Data.TensorDataset(train_X, train_y)
val_set = Data.TensorDataset(val_X, val_y)

# Only the training batches are shuffled; validation batches keep a fixed order.
train_loader = Data.DataLoader(train_set, batch_size=32, shuffle=True)
val_loader = Data.DataLoader(val_set, batch_size=32, shuffle=False)

## 训练过程
   
    数据是avazu数据的随机10万条
    优化器统一Adam， lr = 0.001
    epoch 为 1, batch_size = 32
    主要的目的是跑通所有的模型
    epoch多几次, 调调参数对稍微复杂的网络有好处
    
    
    tips : 类别特征的embedding查表等价于对one-hot输入做一层没有bias项的全连接，所以模型中几乎都用embedding来模拟LR线性过程

In [13]:
# Number of passes over the training data; read as a global by train().
epoches = 1

In [15]:
def train(model):
    """Train `model` on `train_loader` and evaluate it on `val_loader` after each epoch.

    Runs `epoches` passes over the training data using Adam (lr=0.001) and binary
    cross-entropy, printing the mean train loss, mean validation loss and validation
    ROC-AUC once per epoch.

    Args:
        model: an nn.Module called as `model(x)` on each feature batch. Because the
            loss is nn.BCELoss, its output is expected to be probabilities in [0, 1]
            with the same shape as the label batch.

    Returns:
        A tuple `(train_loss, val_loss, val_auc)` for the last epoch that ran:
        the list of per-batch training losses, the list of per-batch validation
        losses, and the validation ROC-AUC. If `epoches` is 0 nothing runs and
        `([], [], None)` is returned.
    """
    # Built once, outside the epoch loop, so the optimizer's internal state carries
    # over between epochs instead of being recreated every pass.
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Defaults returned when `epoches` is 0 and the loop body never executes.
    train_loss, val_loss, val_auc = [], [], None

    for epoch in range(epoches):
        # ---- training pass ----
        model.train()
        train_loss = []
        for x, y in train_loader:
            pred = model(x)
            # Labels are stored as int64; BCELoss needs float targets.
            loss = criterion(pred, y.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())

        # ---- validation pass (no gradients, eval mode) ----
        model.eval()
        val_loss, prediction, y_true = [], [], []
        with torch.no_grad():
            for x, y in val_loader:
                pred = model(x)
                val_loss.append(criterion(pred, y.float()).item())
                prediction.extend(pred.tolist())
                y_true.extend(y.tolist())
        val_auc = roc_auc_score(y_true=y_true, y_score=prediction)
        print ("EPOCH %s train loss : %.5f   validation loss : %.5f   validation auc is %.5f" % (epoch, np.mean(train_loss), np.mean(val_loss), val_auc))

    # Return only after every epoch has run; returning inside the loop would stop
    # training after the first epoch regardless of `epoches`.
    return train_loss, val_loss, val_auc

#### LR

In [22]:
from model import LR

In [23]:
# Logistic-regression baseline from the project-local `model.LR` module, sized by `fields`.
model = LR.LogisticRegression(feature_fields=fields)

In [24]:
# Train and evaluate LR; train() prints the metrics, so its return value is discarded.
_ = train(model)

EPOCH 0 train loss : 0.69772   validation loss : 0.60676   validation auc is 0.61067


#### FM

In [25]:
from model import FM

In [26]:
# Factorization machine from the project-local `model.FM` module, with embed_dim=8.
model = FM.FactorizationMachine(feature_fields=fields, embed_dim=8)

In [27]:
# Train and evaluate FM; train() prints the metrics, so its return value is discarded.
_ = train(model)

EPOCH 0 train loss : 0.57438   validation loss : 0.48955   validation auc is 0.69284


#### FFM

In [29]:
from model import FFM

In [30]:
# Field-aware factorization machine from the project-local `model.FFM` module.
# NOTE: this constructor is called with `field_dims`, unlike the `feature_fields`
# keyword used for the other models in this notebook.
model = FFM.FieldAwareFactorizationMachine(field_dims=fields, embed_dim=8)

In [31]:
# Train and evaluate FFM; train() prints the metrics, so its return value is discarded.
_ = train(model)

EPOCH 0 train loss : 0.53654   validation loss : 0.48017   validation auc is 0.69726


#### AFM

In [32]:
from model import AFM

In [36]:
# Attentional factorization machine from the project-local `model.AFM` module;
# `dropouts` is passed as a pair of rates.
model = AFM.AttentionalFactorizationMachine(feature_fields=fields, embed_dim=8, attn_size=8, dropouts=(0.25, 0.25))

In [37]:
# Train and evaluate AFM; train() prints the metrics, so its return value is discarded.
_ = train(model)

EPOCH 0 train loss : 0.70439   validation loss : 0.55860   validation auc is 0.63911


#### DeepFM

In [38]:
from model import DeepFM

In [39]:
# DeepFM from the project-local `model.DeepFM` module, with two MLP sizes (32, 16).
model = DeepFM.DeepFM(feature_fields=fields, embed_dim=8, mlp_dims=(32,16), dropout=0.2)

In [40]:
# Train and evaluate DeepFM; train() prints the metrics, so its return value is discarded.
_ = train(model)

EPOCH 0 train loss : 0.55712   validation loss : 0.49020   validation auc is 0.68294


#### xDeepFM

In [60]:
from model import xDeepFM

In [68]:
# xDeepFM from the project-local `model.xDeepFM` module, with MLP sizes (32, 16),
# cross-layer sizes (16, 16) and split_half disabled.
model = xDeepFM.xDeepFM(feature_fields=fields, embed_dim=8, mlp_dims=(32,16), 
                        dropout=0.2, cross_layer_sizes=(16,16),split_half=False)

In [69]:
# Train and evaluate xDeepFM; train() prints the metrics, so its return value is discarded.
_ = train(model)

EPOCH 0 train loss : 0.56085   validation loss : 0.49273   validation auc is 0.69788


#### PNN

In [71]:
from model import PNN

In [72]:
# PNN from the project-local `model.PNN` module, with MLP sizes (32, 16).
model = PNN.PNN(feature_fields=fields, embed_dim=8, mlp_dims=(32, 16), dropout=0.2)

In [73]:
# Train and evaluate PNN; train() prints the metrics, so its return value is discarded.
_ = train(model)

EPOCH 0 train loss : 0.42965   validation loss : 0.40480   validation auc is 0.74818


#### DCN

In [74]:
from model import DCN

In [75]:
# Deep & Cross network from the project-local `model.DCN` module, with num_layers=3
# and MLP sizes (16, 16).
model = DCN.DeepCrossNet(feature_fields=fields, embed_dim=8, num_layers=3, mlp_dims=(16,16), dropout=0.2)

In [76]:
# Train and evaluate DCN; train() prints the metrics, so its return value is discarded.
_ = train(model)

EPOCH 0 train loss : 0.41618   validation loss : 0.40377   validation auc is 0.74731
