In [52]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch
from torch import nn, optim
from collections import OrderedDict
import torch.nn.functional as F
from torch.autograd import Variable
from sklearn import preprocessing

In [2]:
"""处理数据，构建数据集部分"""

'处理数据，构建数据集部分'

In [3]:
def load_data(path):
    Data = pd.read_csv(path, nrows=10000)
    return Data
Data = load_data('G:\github项目\Recomendation_system\Data\criteo_sample\criteo_sampled_data.csv')

In [4]:
Data.head()

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,1.0,1,5.0,0.0,1382.0,4.0,15.0,2.0,181.0,...,e5ba7672,f54016b9,21ddcdc9,b1252a9d,07b5194c,,3a171ecb,c5c50484,e8b83407,9727dd16
1,0,2.0,0,44.0,1.0,102.0,8.0,2.0,2.0,4.0,...,07c540c4,b04e4670,21ddcdc9,5840adea,60f6221e,,3a171ecb,43f13e8b,e8b83407,731c3655
2,0,2.0,0,1.0,14.0,767.0,89.0,4.0,2.0,245.0,...,8efede7f,3412118d,,,e587c466,ad3062eb,3a171ecb,3b183c5c,,
3,0,,893,,,4392.0,,0.0,0.0,0.0,...,1e88c74f,74ef3502,,,6b3a5ca6,,3a171ecb,9117a34a,,
4,0,3.0,-1,,0.0,2.0,0.0,3.0,0.0,0.0,...,1e88c74f,26b3c7a7,,,21c9516a,,32c7478e,b34f3128,,


In [5]:
def Nan_process(Data):
    """补充数据中的缺失值，这里重点是构建模型，大家可自行优化数据缺失值的处理方法"""
    columns = Data.columns
    for column in columns:
        Data[column] = Data[column].fillna(method='bfill') 
        Data[column] = Data[column].fillna(method='pad') # 分别是用缺失值后面的值和前面的值进行填充
    return Data
Data = Nan_process(Data)

In [6]:
dense_fea, sparse_fea = Data.columns[1:14], Data.columns[14:]

In [81]:
len(sparse_fea)

26

In [58]:
def process_Data(Data, ratio=0.8):
    """划分训练数据集和验证数据集"""
    Le = preprocessing.LabelEncoder()
    n = len(Data.index) # 数据的长度
    Train_Data = Data[:int(n*ratio)]
    Val_Data = Data[int(n*ratio):]
    Train_Count_Data = Train_Data.iloc[:, :14]
    Val_Count_Data = Val_Data.iloc[:, :14]
    Train_Type_Data = Train_Data.iloc[:, 14:]
    Val_Type_Data = Val_Data.iloc[:, 14:]
    columns = Train_Type_Data.columns
    for column in columns:
        # 将类别型离散特征编码, 使用LabelEncoder编码
        Train_Type_Data[column] = Le.fit_transform(Train_Type_Data[column])
        Val_Type_Data[column] = Le.fit_transform(Val_Type_Data[column])
    Train_Type_Data = torch.from_numpy(Train_Type_Data.values.astype(float))
    Val_Type_Data = torch.from_numpy(Val_Type_Data.values.astype(float))
    Train_Count_Data = torch.from_numpy(Train_Count_Data.values.astype(float))
    Val_Count_Data = torch.from_numpy(Val_Count_Data.values.astype(float))
    return Train_Count_Data, Val_Count_Data, Train_Type_Data, Val_Type_Data
Train_Count_Data, Val_Count_Data, Train_Type_Data, Val_Type_Data = process_Data(Data)

In [138]:
Train_Type_Data[:,25]

tensor([ 972.,  734., 1001.,  ...,  393.,  128., 1181.], dtype=torch.float64)

In [241]:
class MyDataset(Dataset):
    # 构建数据集
    def __init__(self, Count_Data, Type_Data):
        self.label = Count_Data[:, 0]  # 将标签分离出来
        self.countx = Count_Data[:, 1:]
        self.typex = Type_Data[:, ]
    
    def __getitem__(self, index):
        sample = {'typex':self.typex[index], 'label':self.label[index], 'countx':self.countx[index]}
        return sample
        
    def __len__(self):
        return len(self.countx[:, 0])

In [242]:
Train_Dataset = MyDataset(Train_Count_Data, Train_Type_Data)
Val_Dataset = MyDataset(Val_Count_Data, Val_Type_Data)

In [243]:
class Residual_block(nn.Module):
    """定义残差块"""
    def __init__(self, hidden_units, dim_stack):
        super(Residual_block, self).__init__()
        self.L1 = nn.Linear(dim_stack, hidden_units)
        self.L2 = nn.Linear(hidden_units, dim_stack)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        out1 = self.L1(x)
        out1 = self.relu(out1)
        out1 = self.L2(out1)
        out = self.relu(out1 + x)  # 残差连接
        return out

In [258]:
class DeepCrossing(nn.Module):
    def __init__(self, dense_fea, sparse_fea, hidden_units, dropout=0, embed_dim=20, output_dim=1):
        super(DeepCrossing, self).__init__()
        self.dense_fea = dense_fea
        self.sparse_fea = sparse_fea
        self.hidden_units = hidden_units
        
        # Embedding层
        self.embed_layers = nn.ModuleDict({
            'embed'+str(i):nn.Embedding(num_embeddings=10000, embedding_dim=embed_dim)
            for i in range(len(self.sparse_fea))
        })
        embed_dim_sum = len(self.sparse_fea) * embed_dim
        dim_stack = len(self.dense_fea) + embed_dim_sum
        
        # 残差层
        self.res_layers = nn.ModuleList([
            Residual_block(unit, dim_stack) for unit in self.hidden_units
        ])
        
        # dropout层
        self.drop = nn.Dropout(dropout)
        
        # 线性层
        self.L = nn.Linear(dim_stack, output_dim)
        
    def forward(self, typex, countx):
        typex = typex.long()
        sparse_embeds = [self.embed_layers['embed'+str(i)](typex[:, i]) for i in range(typex.shape[1])]
        sparse_embed = torch.cat(sparse_embeds, axis=-1)
        stack = torch.cat([sparse_embed, countx], axis=-1)
        r = stack
        for res in self.res_layers:
            r = res(r)
            
        r = self.drop(r)
        outputs = F.sigmoid(self.L(r))
        outputs = outputs.squeeze(-1)
        return outputs

In [259]:
# 定义优化器、损失函数、学习率、batchsize
hidden_units = [256, 128, 64, 32]
model = DeepCrossing(dense_fea, sparse_fea, hidden_units)

loss_fn = nn.BCELoss()

batchsize = 16
Train_Dataloader = DataLoader(Train_Dataset, batch_size=batchsize)
Val_Dataloader = DataLoader(Val_Dataset, batch_size=batchsize)

lr = 0.001
optimizer = optim.SGD(model.parameters(), lr)

In [262]:
def train(model, Train_Loader, Val_loader, loss_fn, optim, epoches=100):
    model.train()
    for epoch in range(epoches):
        print('第{}轮训练开始，共{}轮'.format(epoch+1, epoches))
        train_loss = 0
        for i, sample in enumerate(Train_Loader):
            typex, countx = Variable(sample['typex'].type(torch.FloatTensor)), Variable(sample['countx'].type(torch.FloatTensor))
            label = Variable(sample['label'])
            output = model(typex, countx)
            loss = loss_fn(output, label.float())
            
            # 优化器优化模型
            optim.zero_grad()
            loss.backward()
            optim.step()
            
            train_loss += loss.item()
        
        model.eval()
        eval_loss = 0
        for i, sample in enumerate(Val_loader):
            val_typex, val_countx = Variable(sample['typex'].type(torch.FloatTensor)), Variable(sample['countx'].type(torch.FloatTensor))
            val_label = Variable(sample['label'])
            val_output = model(val_typex, val_countx)
            val_loss = loss_fn(val_output, val_label.float())
            
            eval_loss += val_loss.item()
        
        print('第{}轮训练完成，训练误差为{}，验证误差为{}'.format(epoch+1, train_loss, eval_loss))
    return model

In [263]:
model = train(model, Train_Dataloader, Val_Dataloader, loss_fn, optimizer)

第1轮训练开始，共100轮
第1轮训练完成，训练误差为8368.354330598995，验证误差为2144.3956311643124
第2轮训练开始，共100轮
第2轮训练完成，训练误差为8666.79010393328，验证误差为2178.4804430492804
第3轮训练开始，共100轮
第3轮训练完成，训练误差为7972.506401168183，验证误差为2087.302914537951
第4轮训练开始，共100轮
第4轮训练完成，训练误差为8359.864625616861，验证误差为2169.6895073736086
第5轮训练开始，共100轮
第5轮训练完成，训练误差为8356.06542723245，验证误差为2202.2295188605785
第6轮训练开始，共100轮
第6轮训练完成，训练误差为8157.926548386501，验证误差为2206.2750504803844
第7轮训练开始，共100轮
第7轮训练完成，训练误差为8034.911266884679，验证误差为2058.862920779735
第8轮训练开始，共100轮
第8轮训练完成，训练误差为7764.120634795525，验证误差为1679.327098201029
第9轮训练开始，共100轮
第9轮训练完成，训练误差为11411.643085372634，验证误差为1826.327412892948
第10轮训练开始，共100轮
第10轮训练完成，训练误差为9004.33558353252，验证误差为1407.100704550743
第11轮训练开始，共100轮
第11轮训练完成，训练误差为8379.00909963506，验证误差为2071.035608457518
第12轮训练开始，共100轮
第12轮训练完成，训练误差为7699.935258271336，验证误差为2077.634226680064
第13轮训练开始，共100轮
第13轮训练完成，训练误差为7556.903384791941，验证误差为2038.0596796125174
第14轮训练开始，共100轮
第14轮训练完成，训练误差为7721.319726506516，验证误差为1191.537139520049
第15轮训练开始，共100轮
第15轮训练完成，训练误差为3293.2