# Pytorch开发深度学习模型一般步骤

## 1.定义DataSet
- 理解数据原始形式
- 理解数据编码方式
- 理解如何进行数据I/O

## 2.定义模型
- 定义各个子模块
- 将子模块合并成最终的模型

## 3.完成Train Pipeline/Valid Pipeline
- Pytorch一般的Train Pipeline/Valid Pipeline书写



In [66]:
import torch
from torch.utils.data import Dataset
import torch.utils.data as D
from torch import nn
import pandas as pd
import numpy as np
import copy
import os
from sklearn.metrics import roc_auc_score,log_loss
from tqdm import tqdm

In [67]:
#参数配置

config = {
    "data_path":'criteo.csv',
    "sparse_cols":[f'C{x}' for x in range(1,27)],
    "dense_cols" : [f'I{x}' for x in range(1,14)],
    "train_ratio" : 0.7,
    "valid_ratio" : 0.2,
    "debug_mode" : True,
    "epoch" : 5,
    "batch" : 10240,
    "lr" : 0.0001,
    "device" : -1,
}

In [68]:
df = pd.read_csv(config['data_path'])

In [69]:
if config['debug_mode']:
    df = df[:1000]
df[config['sparse_cols']] = df[config['sparse_cols']].fillna('-1', )
df[config['dense_cols']] = df[config['dense_cols']].fillna(0, )
train_num = int(len(df)*config['train_ratio'])
valid_num = int(len(df)*config['valid_ratio'])
#切分数据集
train_df = df[:train_num].reset_index(drop=True)
valid_df = df[train_num:train_num+valid_num].reset_index(drop=True)
test_df = df[train_num+valid_num:].reset_index(drop=True)

## 1.定义DataSet
- 理解数据原始形式
- 理解数据编码方式
- 理解如何进行数据I/O




In [70]:
#Dataset构造
class BaseDataset(Dataset):
    def __init__(self,config,df,enc_dict=None):
        self.config = config
        self.df = df
        self.enc_dict = enc_dict
        self.dense_cols = list(set(self.config['dense_cols']))
        self.sparse_cols = list(set(self.config['sparse_cols']))
        self.feature_name = self.dense_cols+self.sparse_cols+['label']

        #数据编码
        if self.enc_dict == None:
            self.get_enc_dict()
        self.enc_data()

    def get_enc_dict(self):
        #计算enc_dict
        self.enc_dict = dict(zip( list(self.dense_cols+self.sparse_cols),[dict() for _ in range(len(self.dense_cols+self.sparse_cols))]))
        for f in self.sparse_cols:
            self.df[f] = self.df[f].astype('str')
            map_dict = dict(zip(self.df[f].unique(), range(1,self.df[f].nunique()+1)))
            self.enc_dict[f] = map_dict
            self.enc_dict[f]['vocab_size'] = self.df[f].nunique()+1
            '''
            eg:C17 特征的map_dict
            {'e5ba7672': 1,
             '07c540c4': 2,
             '8efede7f': 3,
             '1e88c74f': 4,
             '776ce399': 5,
             'd4bb7bd8': 6,
             '3486227d': 7,
             '27c07bd6': 8,
             '2005abd1': 9,
             'vocab_size': 10}
            '''

        for f in self.dense_cols:
            self.enc_dict[f]['min'] = self.df[f].min()
            self.enc_dict[f]['max'] = self.df[f].max()
            '''
            eg:I6 特征
            {'min': 0.0, 'max': 4638.0}
            '''
        return self.enc_dict

    def enc_dense_data(self,col):
        return (self.df[col] - self.enc_dict[col]['min']) / (self.enc_dict[col]['max'] - self.enc_dict[col]['min'])

    def enc_sparse_data(self,col):
        return self.df[col].apply(lambda x : self.enc_dict[col].get(x,0))

    def enc_data(self):
        #使用enc_dict对数据进行编码
        self.enc_df = copy.deepcopy(self.df)
        for col in self.dense_cols:
            self.enc_df[col] = self.enc_dense_data(col)
        for col in self.sparse_cols:
            self.enc_df[col] = self.enc_sparse_data(col)

    def __getitem__(self, index):
        data = dict()
        for col in self.feature_name:
            if col in self.dense_cols:
                data[col] = torch.Tensor([self.enc_df[col].iloc[index]]).squeeze(-1)
            elif col in self.sparse_cols:
                data[col] = torch.Tensor([self.enc_df[col].iloc[index]]).long().squeeze(-1)
        data['label'] = torch.Tensor([self.enc_df['label'].iloc[index]]).squeeze(-1)
        return data

    def __len__(self):
        return len(self.enc_df)

In [71]:
train_dataset = BaseDataset(config,train_df)
enc_dict = train_dataset.get_enc_dict()

valid_dataset = BaseDataset(config,valid_df,enc_dict=enc_dict)
test_dataset = BaseDataset(config,test_df,enc_dict=enc_dict)

In [72]:
train_dataset.__getitem__(5)

{'I3': tensor(0.),
 'I10': tensor(0.),
 'I11': tensor(0.),
 'I2': tensor(0.),
 'I6': tensor(0.),
 'I8': tensor(0.),
 'I9': tensor(0.0011),
 'I13': tensor(0.),
 'I1': tensor(0.),
 'I7': tensor(0.),
 'I5': tensor(0.0144),
 'I12': tensor(0.),
 'I4': tensor(0.),
 'C23': tensor(3),
 'C4': tensor(6),
 'C21': tensor(6),
 'C18': tensor(6),
 'C24': tensor(6),
 'C17': tensor(5),
 'C20': tensor(3),
 'C2': tensor(6),
 'C19': tensor(2),
 'C22': tensor(3),
 'C13': tensor(6),
 'C7': tensor(6),
 'C3': tensor(6),
 'C8': tensor(2),
 'C5': tensor(2),
 'C10': tensor(3),
 'C1': tensor(4),
 'C25': tensor(2),
 'C26': tensor(3),
 'C14': tensor(3),
 'C11': tensor(6),
 'C9': tensor(1),
 'C16': tensor(6),
 'C12': tensor(6),
 'C6': tensor(4),
 'C15': tensor(6),
 'label': tensor(0.)}

## 2.定义模型
- 定义各个子模块
- 将子模块合并成最终的模型

![image-2.png](attachment:image-2.png)

In [73]:
#基本网络模块

#通用Emb
class EmbeddingLayer(nn.Module):
    def __init__(self,
                 enc_dict = None,
                 embedding_dim = None):
        super(EmbeddingLayer, self).__init__()
        self.enc_dict = enc_dict
        self.embedding_dim = embedding_dim
        self.embedding_layer = nn.ModuleDict()

        self.emb_feature = []

        for col in self.enc_dict.keys():
            if 'vocab_size' in self.enc_dict[col].keys():
                self.emb_feature.append(col)
                self.embedding_layer.update({col : nn.Embedding(
                    self.enc_dict[col]['vocab_size'],
                    self.embedding_dim,
                )})

    def forward(self, X):
        #对所有的sparse特征挨个进行embedding
        feature_emb_list = []
        for col in self.emb_feature:
            inp = X[col].long().view(-1, 1)
            feature_emb_list.append(self.embedding_layer[col](inp))
        return feature_emb_list
    
#一阶交叉
class LR_Layer(nn.Module):
    def __init__(self,enc_dict):
        super(LR_Layer, self).__init__()
        self.enc_dict = enc_dict
        self.emb_layer = EmbeddingLayer(enc_dict=self.enc_dict,embedding_dim=1)
        self.dnn_input_dim = get_dnn_input_dim(self.enc_dict, 1)
        self.fc = nn.Linear(self.dnn_input_dim,1)
        
    def forward(self,data):
        sparse_emb = self.emb_layer(data)
        sparse_emb = torch.stack(sparse_emb,dim=1).flatten(1) #[batch,num_sparse*emb]
        dense_input = get_linear_input(self.enc_dict, data)  #[batch,num_dense]
        dnn_input = torch.cat((sparse_emb, dense_input), dim=1) # [batch,num_sparse*emb + num_dense]
        out = self.fc(dnn_input)
        return out
    

## Corss Network
![image-2.png](attachment:image-2.png)
![image-3.png](attachment:image-3.png)

In [74]:
#Cross Layer层
class CrossLayer(nn.Module):
    def __init__(self,input_dim):
        super(CrossLayer, self).__init__()
        self.input_dim = input_dim
        self.W = nn.Parameter(torch.rand(self.input_dim))
        self.b = nn.Parameter(torch.rand(self.input_dim))
        
    def forward(self,x0,xl):
        '''
        x0: [batch,input_dim]
        xl: [batch,input_dim]
        
        x0.unsqueeze(-1): [batch,input_dim,1]
        xl.unsqueeze(1) : [batch,1,input_dim]
        '''
        out = torch.bmm(x0.unsqueeze(-1),xl.unsqueeze(1)) 
        out = torch.matmul(out, self.W)
        out = out + self.b + xl
        return out
    
class CrossNet(nn.Module):
    def __init__(self,input_dim,num_layers):
        super(CrossNet, self).__init__()
        self.input_dim = input_dim
        self.num_layers = num_layers
        self.cross_net_list = nn.ModuleList()
        for i in range(self.num_layers):
            self.cross_net_list.append(CrossLayer(input_dim))
    
    def forward(self,x0):
        xl = x0
        for l in range(self.num_layers):
            xl = self.cross_net_list[l](x0,xl)
        return xl
    
    
#DNN
class MLP_Layer(nn.Module):
    def __init__(self,
                 input_dim,
                 output_dim=None,
                 hidden_units=[],
                 hidden_activations="ReLU",
                 final_activation=None,
                 dropout_rates=0,
                 batch_norm=False,
                 use_bias=True):
        super(MLP_Layer, self).__init__()
        dense_layers = []
        if not isinstance(dropout_rates, list):
            dropout_rates = [dropout_rates] * len(hidden_units)
        if not isinstance(hidden_activations, list):
            hidden_activations = [hidden_activations] * len(hidden_units)
        hidden_activations = [set_activation(x) for x in hidden_activations]
        hidden_units = [input_dim] + hidden_units
        for idx in range(len(hidden_units) - 1):
            dense_layers.append(nn.Linear(hidden_units[idx], hidden_units[idx + 1], bias=use_bias))
            if batch_norm:
                dense_layers.append(nn.BatchNorm1d(hidden_units[idx + 1]))
            if hidden_activations[idx]:
                dense_layers.append(hidden_activations[idx])
            if dropout_rates[idx] > 0:
                dense_layers.append(nn.Dropout(p=dropout_rates[idx]))
        if output_dim is not None:
            dense_layers.append(nn.Linear(hidden_units[-1], output_dim, bias=use_bias))
        if final_activation is not None:
            dense_layers.append(set_activation(final_activation))
        self.dnn = nn.Sequential(*dense_layers)  # * used to unpack list

    def forward(self, inputs):
        return self.dnn(inputs)

def set_device(gpu=-1):
    if gpu >= 0 and torch.cuda.is_available():
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)
        device = torch.device(f"cuda:{gpu}")
    else:
        device = torch.device("cpu")
    return device
    
def set_activation(activation):
    if isinstance(activation, str):
        if activation.lower() == "relu":
            return nn.ReLU()
        elif activation.lower() == "sigmoid":
            return nn.Sigmoid()
        elif activation.lower() == "tanh":
            return nn.Tanh()
        else:
            return getattr(nn, activation)()
    else:
        return activation
    
def get_dnn_input_dim(enc_dict,embedding_dim):
    num_sparse = 0
    num_dense = 0
    for col in enc_dict.keys():
        if 'min' in enc_dict[col].keys():
            num_dense+=1
        elif 'vocab_size' in enc_dict[col].keys():
            num_sparse+=1
    return num_sparse*embedding_dim+num_dense

def get_linear_input(enc_dict,data):
    res_data = []
    for col in enc_dict.keys():
        if 'min' in enc_dict[col].keys():
            res_data.append(data[col])
    res_data = torch.stack(res_data,axis=1)
    return res_data

In [75]:
#DCN
class DCN(nn.Module):
    def __init__(self,
                 embedding_dim=10,
                 hidden_units=[64, 64, 64],
                 num_layers = 4,
                 loss_fun = 'torch.nn.BCELoss()',
                 enc_dict=None):
        super(DCN, self).__init__()

        self.embedding_dim = embedding_dim
        self.hidden_units = hidden_units
        self.num_layers = num_layers
        self.loss_fun = eval(loss_fun)
        self.enc_dict = enc_dict

        self.embedding_layer = EmbeddingLayer(enc_dict=self.enc_dict, embedding_dim=self.embedding_dim)
        
        self.lr = LR_Layer(enc_dict=enc_dict) #一阶

        self.dnn_input_dim = get_dnn_input_dim(self.enc_dict, self.embedding_dim) #sparse_num * emb_dim + dense_num

        self.dnn = MLP_Layer(input_dim=self.dnn_input_dim, hidden_units=self.hidden_units,
                                 hidden_activations='relu', dropout_rates=0)
        self.cross_net = CrossNet(input_dim=self.dnn_input_dim, num_layers=self.num_layers) 
        
        self.fc = nn.Linear(self.dnn_input_dim + self.hidden_units[-1], 1)

    def forward(self, data):
        sparse_embedding = self.embedding_layer(data)
        emb_flatten = torch.stack(sparse_embedding, dim=1).flatten(start_dim=1)
        dense_input = get_linear_input(self.enc_dict, data)
        
        lr_logit = self.lr(data) #一阶交叉
        
        x0 = torch.cat((emb_flatten, dense_input), dim=1) # x0
        # CrossNet
        cross_out = self.cross_net(x0)
        # DNN
        dnn_out = self.dnn(x0)
        # fc
        nn_logit = self.fc(torch.cat([cross_out, dnn_out], axis=-1))
        
        #输出
        y_pred = torch.sigmoid(lr_logit + nn_logit)
        loss = self.loss_fun(y_pred.squeeze(-1),data['label'])
        output_dict = {'pred':y_pred,'loss':loss}
        return output_dict

## 3.完成Train Pipeline/Valid Pipeline
- Pytorch一般的Train Pipeline/Valid Pipeline书写

In [76]:
#训练模型，验证模型
def train_model(model, train_loader, optimizer, device, metric_list=['roc_auc_score','log_loss']):
    model.train()
    pred_list = []
    label_list = []
    pbar = tqdm(train_loader)
    for data in pbar:

        for key in data.keys():
            data[key] = data[key].to(device)

        output = model(data)
        pred = output['pred']
        loss = output['loss']

        loss.backward()
        optimizer.step()
        model.zero_grad()

        pred_list.extend(pred.squeeze(-1).cpu().detach().numpy())
        label_list.extend(data['label'].squeeze(-1).cpu().detach().numpy())
        pbar.set_description("Loss {}".format(loss))

    res_dict = dict()
    for metric in metric_list:
        if metric =='log_loss':
            res_dict[metric] = log_loss(label_list,pred_list, eps=1e-7)
        else:
            res_dict[metric] = eval(metric)(label_list,pred_list)

    return res_dict

def valid_model(model, valid_loader, device, metric_list=['roc_auc_score','log_loss']):
    model.eval()
    pred_list = []
    label_list = []

    for data in (valid_loader):

        for key in data.keys():
            data[key] = data[key].to(device)

        output = model(data)
        pred = output['pred']

        pred_list.extend(pred.squeeze(-1).cpu().detach().numpy())
        label_list.extend(data['label'].squeeze(-1).cpu().detach().numpy())

    res_dict = dict()
    for metric in metric_list:
        if metric =='log_loss':
            res_dict[metric] = log_loss(label_list,pred_list, eps=1e-7)
        else:
            res_dict[metric] = eval(metric)(label_list,pred_list)

    return res_dict

def test_model(model, test_loader, device, metric_list=['roc_auc_score','log_loss']):
    model.eval()
    pred_list = []
    label_list = []

    for data in (test_loader):

        for key in data.keys():
            data[key] = data[key].to(device)

        output = model(data)
        pred = output['pred']

        pred_list.extend(pred.squeeze().cpu().detach().numpy())
        label_list.extend(data['label'].squeeze().cpu().detach().numpy())

    res_dict = dict()
    for metric in metric_list:
        if metric =='log_loss':
            res_dict[metric] = log_loss(label_list,pred_list, eps=1e-7)
        else:
            res_dict[metric] = eval(metric)(label_list,pred_list)

    return res_dict

In [77]:
#dataloader
train_loader = D.DataLoader(train_dataset,batch_size=config['batch'],shuffle=True,num_workers=0)
valid_loader = D.DataLoader(valid_dataset,batch_size=config['batch'],shuffle=False,num_workers=0)
test_loader = D.DataLoader(test_dataset,batch_size=config['batch'],shuffle=False,num_workers=0)

In [78]:
model = DCN(enc_dict=enc_dict)

In [79]:
device = set_device(config['device'])
optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
model = model.to(device)
#模型训练流程
for i in range(config['epoch']):
    #模型训练
    train_metirc = train_model(model,train_loader,optimizer=optimizer,device=device)
    #模型验证
    valid_metric = valid_model(model,valid_loader,device)

    print("Train Metric:")
    print(train_metirc)
    print("Valid Metric:")
    print(valid_metric)
#测试模型
test_metric = test_model(model,test_loader,device)
print('Test Metric:')
print(test_metric)

Loss 44.31789779663086: 100%|█████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.27it/s]


Train Metric:
{'roc_auc_score': 0.49741999472699655, 'log_loss': 7.265298901233681}
Valid Metric:
{'roc_auc_score': 0.5401923425875521, 'log_loss': 6.694122369587429}


Loss 44.455039978027344: 100%|████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.31it/s]


Train Metric:
{'roc_auc_score': 0.4928751679200512, 'log_loss': 7.281196023518505}
Valid Metric:
{'roc_auc_score': 0.5420068953003084, 'log_loss': 6.606515622362503}


Loss 44.309661865234375: 100%|████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.11it/s]


Train Metric:
{'roc_auc_score': 0.49433152126150326, 'log_loss': 7.25866074960434}
Valid Metric:
{'roc_auc_score': 0.5434585374705135, 'log_loss': 6.602691560164102}


Loss 44.27421951293945: 100%|█████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.30it/s]


Train Metric:
{'roc_auc_score': 0.4962837880252602, 'log_loss': 7.2395694149187495}
Valid Metric:
{'roc_auc_score': 0.5434585374705135, 'log_loss': 6.599448491409429}


Loss 44.25690841674805: 100%|█████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.33it/s]


Train Metric:
{'roc_auc_score': 0.4959761961557293, 'log_loss': 7.222243108673467}
Valid Metric:
{'roc_auc_score': 0.5434585374705135, 'log_loss': 6.597551818490036}
Test Metric:
{'roc_auc_score': 0.4652744050509956, 'log_loss': 7.464281368800446}
