# Pytorch开发深度学习模型一般步骤

## 1.定义DataSet
- 理解数据原始形式
- 理解数据编码方式
- 理解如何进行数据I/O

## 2.定义模型
- 定义各个子模块
- 将子模块合并成最终的模型

## 3.完成Train Pipeline/Valid Pipeline
- Pytorch一般的Train Pipeline/Valid Pipeline书写



In [1]:
import torch
from torch.utils.data import Dataset
import torch.utils.data as D
from torch import nn
import pandas as pd
import numpy as np
import copy
import os
from sklearn.metrics import roc_auc_score,log_loss
from tqdm import tqdm

In [2]:
#参数配置

config = {
    "data_path":'criteo.csv',
    "sparse_cols":[f'C{x}' for x in range(1,27)],
    "dense_cols" : [f'I{x}' for x in range(1,14)],
    "train_ratio" : 0.7,
    "valid_ratio" : 0.2,
    "debug_mode" : True,
    "epoch" : 5,
    "batch" : 10240,
    "lr" : 0.0001,
    "device" : -1,
}

In [3]:
df = pd.read_csv(config['data_path'])

In [4]:
if config['debug_mode']:
    df = df[:1000]
df[config['sparse_cols']] = df[config['sparse_cols']].fillna('-1', )
df[config['dense_cols']] = df[config['dense_cols']].fillna(0, )
train_num = int(len(df)*config['train_ratio'])
valid_num = int(len(df)*config['valid_ratio'])
#切分数据集
train_df = df[:train_num].reset_index(drop=True)
valid_df = df[train_num:train_num+valid_num].reset_index(drop=True)
test_df = df[train_num+valid_num:].reset_index(drop=True)

## 1.定义DataSet
- 理解数据原始形式
- 理解数据编码方式
- 理解如何进行数据I/O




In [5]:
#Dataset构造
class BaseDataset(Dataset):
    def __init__(self,config,df,enc_dict=None):
        self.config = config
        self.df = df
        self.enc_dict = enc_dict
        self.dense_cols = list(set(self.config['dense_cols']))
        self.sparse_cols = list(set(self.config['sparse_cols']))
        self.feature_name = self.dense_cols+self.sparse_cols+['label']

        #数据编码
        if self.enc_dict == None:
            self.get_enc_dict()
        self.enc_data()

    def get_enc_dict(self):
        #计算enc_dict
        self.enc_dict = dict(zip( list(self.dense_cols+self.sparse_cols),[dict() for _ in range(len(self.dense_cols+self.sparse_cols))]))
        for f in self.sparse_cols:
            self.df[f] = self.df[f].astype('str')
            map_dict = dict(zip(self.df[f].unique(), range(1,self.df[f].nunique()+1)))
            self.enc_dict[f] = map_dict
            self.enc_dict[f]['vocab_size'] = self.df[f].nunique()+1
            '''
            eg:C17 特征的map_dict
            {'e5ba7672': 1,
             '07c540c4': 2,
             '8efede7f': 3,
             '1e88c74f': 4,
             '776ce399': 5,
             'd4bb7bd8': 6,
             '3486227d': 7,
             '27c07bd6': 8,
             '2005abd1': 9,
             'vocab_size': 10}
            '''

        for f in self.dense_cols:
            self.enc_dict[f]['min'] = self.df[f].min()
            self.enc_dict[f]['max'] = self.df[f].max()
            '''
            eg:I6 特征
            {'min': 0.0, 'max': 4638.0}
            '''
        return self.enc_dict

    def enc_dense_data(self,col):
        return (self.df[col] - self.enc_dict[col]['min']) / (self.enc_dict[col]['max'] - self.enc_dict[col]['min'])

    def enc_sparse_data(self,col):
        return self.df[col].apply(lambda x : self.enc_dict[col].get(x,0))

    def enc_data(self):
        #使用enc_dict对数据进行编码
        self.enc_df = copy.deepcopy(self.df)
        for col in self.dense_cols:
            self.enc_df[col] = self.enc_dense_data(col)
        for col in self.sparse_cols:
            self.enc_df[col] = self.enc_sparse_data(col)

    def __getitem__(self, index):
        data = dict()
        for col in self.feature_name:
            if col in self.dense_cols:
                data[col] = torch.Tensor([self.enc_df[col].iloc[index]]).squeeze(-1)
            elif col in self.sparse_cols:
                data[col] = torch.Tensor([self.enc_df[col].iloc[index]]).long().squeeze(-1)
        data['label'] = torch.Tensor([self.enc_df['label'].iloc[index]]).squeeze(-1)
        return data

    def __len__(self):
        return len(self.enc_df)

In [6]:
train_dataset = BaseDataset(config,train_df)
enc_dict = train_dataset.get_enc_dict()

valid_dataset = BaseDataset(config,valid_df,enc_dict=enc_dict)
test_dataset = BaseDataset(config,test_df,enc_dict=enc_dict)

In [7]:
train_dataset.__getitem__(5)

{'I8': tensor(0.),
 'I2': tensor(0.),
 'I12': tensor(0.),
 'I5': tensor(0.0144),
 'I7': tensor(0.),
 'I9': tensor(0.0011),
 'I3': tensor(0.),
 'I1': tensor(0.),
 'I6': tensor(0.),
 'I13': tensor(0.),
 'I10': tensor(0.),
 'I11': tensor(0.),
 'I4': tensor(0.),
 'C3': tensor(6),
 'C11': tensor(6),
 'C13': tensor(6),
 'C16': tensor(6),
 'C2': tensor(6),
 'C6': tensor(4),
 'C5': tensor(2),
 'C17': tensor(5),
 'C7': tensor(6),
 'C23': tensor(3),
 'C19': tensor(2),
 'C15': tensor(6),
 'C21': tensor(6),
 'C1': tensor(4),
 'C20': tensor(3),
 'C22': tensor(3),
 'C8': tensor(2),
 'C12': tensor(6),
 'C24': tensor(6),
 'C14': tensor(3),
 'C26': tensor(3),
 'C9': tensor(1),
 'C10': tensor(3),
 'C18': tensor(6),
 'C4': tensor(6),
 'C25': tensor(2),
 'label': tensor(0.)}

## 2.定义模型
- 定义各个子模块
- 将子模块合并成最终的模型

![image-2.png](attachment:image-2.png)

In [50]:
#基本网络模块

#通用Emb
class EmbeddingLayer(nn.Module):
    def __init__(self,
                 enc_dict = None,
                 embedding_dim = None):
        super(EmbeddingLayer, self).__init__()
        self.enc_dict = enc_dict
        self.embedding_dim = embedding_dim
        self.embedding_layer = nn.ModuleDict()

        self.emb_feature = []

        for col in self.enc_dict.keys():
            if 'vocab_size' in self.enc_dict[col].keys():
                self.emb_feature.append(col)
                self.embedding_layer.update({col : nn.Embedding(
                    self.enc_dict[col]['vocab_size'],
                    self.embedding_dim,
                )})

    def forward(self, X):
        #对所有的sparse特征挨个进行embedding
        feature_emb_list = []
        for col in self.emb_feature:
            inp = X[col].long().view(-1, 1)
            feature_emb_list.append(self.embedding_layer[col](inp))
        return feature_emb_list
    
#一阶交叉
class LR_Layer(nn.Module):
    def __init__(self,enc_dict):
        super(LR_Layer, self).__init__()
        self.enc_dict = enc_dict
        self.emb_layer = EmbeddingLayer(enc_dict=self.enc_dict,embedding_dim=1)
        self.dnn_input_dim = get_dnn_input_dim(self.enc_dict, 1)
        self.fc = nn.Linear(self.dnn_input_dim,1)
        
    def forward(self,data):
        sparse_emb = self.emb_layer(data)
        sparse_emb = torch.stack(sparse_emb,dim=1).flatten(1) #[batch,num_sparse*emb]
        dense_input = get_linear_input(self.enc_dict, data)  #[batch,num_dense]
        dnn_input = torch.cat((sparse_emb, dense_input), dim=1) # [batch,num_sparse*emb + num_dense]
        out = self.fc(dnn_input)
        return out
    

## CIN
![image-4.png](attachment:image-4.png)

In [84]:
# Z_{k+1}的按batch计算逻辑

Xk = torch.rand(16,12,32) # 16:batch_size, 12:Hk, 32:emb_dim
X0 = torch.rand(16,8,32) # 16:batch_size, 8:m, 32:emb_dim
# 对于Pytorch中特别复杂的矩阵运算(3，4，5维的矩阵运行)，一般不使用 bmm,mm,matmul，一般使用einsum
# Xk.unsqueeze(-2): (16,12,1,32) X0.unsqueeze(1):(16,1,8,32)
print(torch.einsum('bhce,bcde->bhde',Xk.unsqueeze(-2),X0.unsqueeze(1)).shape)
print(torch.einsum('bhe,bde->bhde',Xk,X0).shape)

torch.Size([16, 12, 8, 32])
torch.Size([16, 12, 8, 32])


In [85]:
# 1D 卷积的计算逻辑

conv_1d = nn.Conv1d(in_channels=8,out_channels=64,kernel_size=1) # batch, featu_num, emb_dim(卷积滑动的维度)
conv_1d(X0).shape # batch, out_channels, emb_dim(卷积滑动的维度)

torch.Size([16, 64, 32])

In [78]:
# CIN
class CIN(nn.Module):
    def __init__(self,
                sparse_num,
                cin_hidden_units=[16,16,16], #H1,H2,...,HK
                ):
        super(CIN, self).__init__()
        self.sparse_num = sparse_num
        self.cin_hidden_units = cin_hidden_units
        self.fc = nn.Linear(sum(self.cin_hidden_units), 1)
        
        self.cin_layer = nn.ModuleList()
        for i, unit in enumerate(self.cin_hidden_units):
            in_channels = self.sparse_num * self.cin_hidden_units[i - 1] if i > 0 else self.sparse_num ** 2
            out_channels = unit
            self.cin_layer.append(nn.Conv1d(in_channels,
                                              out_channels,  
                                              kernel_size=1)) 
    def forward(self,sparse_embedding):
        batch_size = sparse_embedding.shape[0]
        embedding_dim = sparse_embedding.shape[-1]
        
        cin_output_list = []
        
        X0 = sparse_embedding   # batch,m,emb
        Xk = X0                 # batch,Hk,emb
        for idx,cin in enumerate(self.cin_layer):
            
            Zk = torch.einsum('bhce,bcde->bhde',Xk.unsqueeze(-2),X0.unsqueeze(1)) # batch,Hk,m,emb
            Zk = Zk.view(batch_size,-1,embedding_dim) #batch,Hk*m,emb
            Xk = self.cin_layer[idx](Zk) #batch,Hk+1,emb
            
            cin_output_list.append(Xk.sum(dim=-1))#batch,Hk+1
            
        cin_output = torch.cat(cin_output_list, dim=-1) # batch,sum(self.cin_hidden_units)
        output = self.fc(cin_output)
        return output
    
#DNN
class MLP_Layer(nn.Module):
    def __init__(self,
                 input_dim,
                 output_dim=None,
                 hidden_units=[],
                 hidden_activations="ReLU",
                 final_activation=None,
                 dropout_rates=0,
                 batch_norm=False,
                 use_bias=True):
        super(MLP_Layer, self).__init__()
        dense_layers = []
        if not isinstance(dropout_rates, list):
            dropout_rates = [dropout_rates] * len(hidden_units)
        if not isinstance(hidden_activations, list):
            hidden_activations = [hidden_activations] * len(hidden_units)
        hidden_activations = [set_activation(x) for x in hidden_activations]
        hidden_units = [input_dim] + hidden_units
        for idx in range(len(hidden_units) - 1):
            dense_layers.append(nn.Linear(hidden_units[idx], hidden_units[idx + 1], bias=use_bias))
            if batch_norm:
                dense_layers.append(nn.BatchNorm1d(hidden_units[idx + 1]))
            if hidden_activations[idx]:
                dense_layers.append(hidden_activations[idx])
            if dropout_rates[idx] > 0:
                dense_layers.append(nn.Dropout(p=dropout_rates[idx]))
        if output_dim is not None:
            dense_layers.append(nn.Linear(hidden_units[-1], output_dim, bias=use_bias))
        if final_activation is not None:
            dense_layers.append(set_activation(final_activation))
        self.dnn = nn.Sequential(*dense_layers)  # * used to unpack list

    def forward(self, inputs):
        return self.dnn(inputs)

def set_device(gpu=-1):
    if gpu >= 0 and torch.cuda.is_available():
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)
        device = torch.device(f"cuda:{gpu}")
    else:
        device = torch.device("cpu")
    return device
    
def set_activation(activation):
    if isinstance(activation, str):
        if activation.lower() == "relu":
            return nn.ReLU()
        elif activation.lower() == "sigmoid":
            return nn.Sigmoid()
        elif activation.lower() == "tanh":
            return nn.Tanh()
        else:
            return getattr(nn, activation)()
    else:
        return activation
    
def get_dnn_input_dim(enc_dict,embedding_dim):
    num_sparse = 0
    num_dense = 0
    for col in enc_dict.keys():
        if 'min' in enc_dict[col].keys():
            num_dense+=1
        elif 'vocab_size' in enc_dict[col].keys():
            num_sparse+=1
    return num_sparse*embedding_dim+num_dense

def get_feature_num(enc_dict):
    num_sparse = 0
    num_dense = 0
    for col in enc_dict.keys():
        if 'min' in enc_dict[col].keys():
            num_dense+=1
        elif 'vocab_size' in enc_dict[col].keys():
            num_sparse+=1
    return num_sparse,num_dense   

def get_linear_input(enc_dict,data):
    res_data = []
    for col in enc_dict.keys():
        if 'min' in enc_dict[col].keys():
            res_data.append(data[col])
    res_data = torch.stack(res_data,axis=1)
    return res_data

In [79]:
#xDeepFM模型
class xDeepFM(nn.Module):
    def __init__(self,
                 embedding_dim=10,
                 hidden_units=[64, 64, 64],
                 cin_hidden_units = [8,8,8,8,8],
                 loss_fun = 'torch.nn.BCELoss()',
                 enc_dict=None):
        super(xDeepFM, self).__init__()

        self.embedding_dim = embedding_dim
        self.hidden_units = hidden_units
        self.cin_hidden_units = cin_hidden_units
        self.loss_fun = eval(loss_fun)
        self.enc_dict = enc_dict

        self.embedding_layer = EmbeddingLayer(enc_dict=self.enc_dict, embedding_dim=self.embedding_dim)
        
        self.sparse_num,_ = get_feature_num(self.enc_dict) # 论文中的“m“
        
        self.cin = CIN(self.sparse_num,self.cin_hidden_units) #CIN
        self.lr = LR_Layer(enc_dict=enc_dict) #一阶

        self.dnn_input_dim = get_dnn_input_dim(self.enc_dict, self.embedding_dim)
        #sparse_num * emb_dim + dense_num

        self.dnn = MLP_Layer(input_dim=self.dnn_input_dim, output_dim=1, hidden_units=self.hidden_units,
                                 hidden_activations='relu', dropout_rates=0)

    def forward(self, data):
        sparse_embedding = self.embedding_layer(data)
        sparse_embedding = torch.stack(sparse_embedding, dim=1).squeeze(-2) # batch,sparse_num(m),emb_dim
        
        #一阶交叉
        lr_logit = self.lr(data)
        # CIN
        cin_logit = self.cin(sparse_embedding) 
        # DNN
        emb_flatten = sparse_embedding.flatten(start_dim=1)
        dense_input = get_linear_input(self.enc_dict, data)
        
        dnn_input = torch.cat((emb_flatten, dense_input), dim=1)
        
        dnn_logit = self.dnn(dnn_input)
        
        #输出
        y_pred = torch.sigmoid(lr_logit + cin_logit + dnn_logit)
        loss = self.loss_fun(y_pred.squeeze(-1),data['label'])
        output_dict = {'pred':y_pred,'loss':loss}
        return output_dict

## 3.完成Train Pipeline/Valid Pipeline
- Pytorch一般的Train Pipeline/Valid Pipeline书写

In [86]:
#训练模型，验证模型
def train_model(model, train_loader, optimizer, device, metric_list=['roc_auc_score','log_loss']):
    model.train()
    pred_list = []
    label_list = []
    pbar = tqdm(train_loader)
    for data in pbar:

        for key in data.keys():
            data[key] = data[key].to(device)

        output = model(data)
        pred = output['pred']
        loss = output['loss']

        loss.backward()
        optimizer.step()
        model.zero_grad()

        pred_list.extend(pred.squeeze(-1).cpu().detach().numpy())
        label_list.extend(data['label'].squeeze(-1).cpu().detach().numpy())
        pbar.set_description("Loss {}".format(loss))

    res_dict = dict()
    for metric in metric_list:
        if metric =='log_loss':
            res_dict[metric] = log_loss(label_list,pred_list, eps=1e-7)
        else:
            res_dict[metric] = eval(metric)(label_list,pred_list)

    return res_dict

def valid_model(model, valid_loader, device, metric_list=['roc_auc_score','log_loss']):
    model.eval()
    pred_list = []
    label_list = []

    for data in (valid_loader):

        for key in data.keys():
            data[key] = data[key].to(device)

        output = model(data)
        pred = output['pred']

        pred_list.extend(pred.squeeze(-1).cpu().detach().numpy())
        label_list.extend(data['label'].squeeze(-1).cpu().detach().numpy())

    res_dict = dict()
    for metric in metric_list:
        if metric =='log_loss':
            res_dict[metric] = log_loss(label_list,pred_list, eps=1e-7)
        else:
            res_dict[metric] = eval(metric)(label_list,pred_list)

    return res_dict

def test_model(model, test_loader, device, metric_list=['roc_auc_score','log_loss']):
    model.eval()
    pred_list = []
    label_list = []

    for data in (test_loader):

        for key in data.keys():
            data[key] = data[key].to(device)

        output = model(data)
        pred = output['pred']

        pred_list.extend(pred.squeeze().cpu().detach().numpy())
        label_list.extend(data['label'].squeeze().cpu().detach().numpy())

    res_dict = dict()
    for metric in metric_list:
        if metric =='log_loss':
            res_dict[metric] = log_loss(label_list,pred_list, eps=1e-7)
        else:
            res_dict[metric] = eval(metric)(label_list,pred_list)

    return res_dict

In [81]:
#dataloader
train_loader = D.DataLoader(train_dataset,batch_size=config['batch'],shuffle=True,num_workers=0)
valid_loader = D.DataLoader(valid_dataset,batch_size=config['batch'],shuffle=False,num_workers=0)
test_loader = D.DataLoader(test_dataset,batch_size=config['batch'],shuffle=False,num_workers=0)

In [82]:
model = xDeepFM(enc_dict=enc_dict)

In [83]:
device = set_device(config['device'])
optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
model = model.to(device)
#模型训练流程
for i in range(config['epoch']):
    #模型训练
    train_metirc = train_model(model,train_loader,optimizer=optimizer,device=device)
    #模型验证
    valid_metric = valid_model(model,valid_loader,device)

    print("Train Metric:")
    print(train_metirc)
    print("Valid Metric:")
    print(valid_metric)
#测试模型
test_metric = test_model(model,test_loader,device)
print('Test Metric:')
print(test_metric)

Loss 0.7798061966896057: 100%|████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.00it/s]


Train Metric:
{'roc_auc_score': 0.45016383975091345, 'log_loss': 0.779806188972933}
Valid Metric:
{'roc_auc_score': 0.5548902195608783, 'log_loss': 0.6003923718631268}


Loss 0.7603656649589539: 100%|████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.40it/s]


Train Metric:
{'roc_auc_score': 0.45257435562642023, 'log_loss': 0.7603657453720059}
Valid Metric:
{'roc_auc_score': 0.5548902195608783, 'log_loss': 0.5883885800838471}


Loss 0.7418236136436462: 100%|████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.01it/s]


Train Metric:
{'roc_auc_score': 0.45571304817265323, 'log_loss': 0.7418236697145871}
Valid Metric:
{'roc_auc_score': 0.5539829432045001, 'log_loss': 0.5770594095811248}


Loss 0.7241957187652588: 100%|████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.01it/s]


Train Metric:
{'roc_auc_score': 0.45829933083074914, 'log_loss': 0.7241957553476095}
Valid Metric:
{'roc_auc_score': 0.5516240246779169, 'log_loss': 0.5664044605568052}


Loss 0.7074797749519348: 100%|████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.14it/s]

Train Metric:
{'roc_auc_score': 0.4610739350416191, 'log_loss': 0.7074797973143203}
Valid Metric:
{'roc_auc_score': 0.5508982035928144, 'log_loss': 0.5564197567850351}
Test Metric:
{'roc_auc_score': 0.41767848470131136, 'log_loss': 0.7104476961493492}



