In [1]:
from torch import nn
import tensorflow as tf
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.initializers import TruncatedNormal
from tensorflow.python.keras.layers import LSTM, Lambda, Layer

In [2]:
class Transformer(nn.Module):
    """  Simplified version of Transformer  proposed in 《Attention is all you need》

      Input shape
        - a list of two 3D tensor with shape ``(batch_size, timesteps, input_dim)`` if ``supports_masking=True`` .
        - a list of two 4 tensors, first two tensors with shape ``(batch_size, timesteps, input_dim)``,last two tensors with shape ``(batch_size, 1)`` if ``supports_masking=False`` .


      Output shape
        - 3D tensor with shape: ``(batch_size, 1, input_dim)``  if ``output_type='mean'`` or ``output_type='sum'`` , else  ``(batch_size, timesteps, input_dim)`` .


      Arguments
            - **att_embedding_size**: int.The embedding size in multi-head self-attention network.
            - **head_num**: int.The head number in multi-head  self-attention network.
            - **dropout_rate**: float between 0 and 1. Fraction of the units to drop.
            - **use_positional_encoding**: bool. Whether or not use positional_encoding
            - **use_res**: bool. Whether or not use standard residual connections before output.
            - **use_feed_forward**: bool. Whether or not use pointwise feed foward network.
            - **use_layer_norm**: bool. Whether or not use Layer Normalization.
            - **blinding**: bool. Whether or not use blinding.
            - **seed**: A Python integer to use as random seed.
            - **supports_masking**:bool. Whether or not support masking.
            - **attention_type**: str, Type of attention, the value must be one of { ``'scaled_dot_product'`` , ``'additive'`` }.
            - **output_type**: ``'mean'`` , ``'sum'`` or `None`. Whether or not use average/sum pooling for output.

      References
            - [Vaswani, Ashish, et al. "Attention is all you need." Advances in Neural Information Processing Systems. 2017.](https://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf)
    """

    def __init__(self, att_embedding_size=1, head_num=8, dropout_rate=0.0, use_positional_encoding=True, use_res=True,
                 use_feed_forward=True, use_layer_norm=False, blinding=True, seed=1024, supports_masking=False,
                 attention_type="scaled_dot_product", output_type="mean", **kwargs):
        if head_num <= 0:
            raise ValueError('head_num must be a int > 0')
        super(Transformer, self).__init__(**kwargs)
        self.att_embedding_size = att_embedding_size
        self.head_num = head_num
        self.num_units = att_embedding_size * head_num
        self.use_res = use_res
        self.use_feed_forward = use_feed_forward
        self.seed = seed
        self.use_positional_encoding = use_positional_encoding
        self.dropout_rate = dropout_rate
        self.use_layer_norm = use_layer_norm
        self.blinding = blinding
        self.attention_type = attention_type
        self.output_type = output_type
        
        self.supports_masking = supports_masking

    def build(self, input_shape):
        embedding_size = int(input_shape[0][-1])
        if self.num_units != embedding_size:
            raise ValueError(
                "att_embedding_size * head_num must equal the last dimension size of inputs,got %d * %d != %d" % (
                    self.att_embedding_size, self.head_num, embedding_size))
        self.seq_len_max = int(input_shape[0][-2])
        self.W_Query = self.add_weight(name='query', shape=[embedding_size, self.att_embedding_size * self.head_num],
                                       dtype=tf.float32,
                                       initializer=tf.keras.initializers.TruncatedNormal(seed=self.seed))
        self.W_key = self.add_weight(name='key', shape=[embedding_size, self.att_embedding_size * self.head_num],
                                     dtype=tf.float32,
                                     initializer=tf.keras.initializers.TruncatedNormal(seed=self.seed + 1))
        self.W_Value = self.add_weight(name='value', shape=[embedding_size, self.att_embedding_size * self.head_num],
                                       dtype=tf.float32,
                                       initializer=tf.keras.initializers.TruncatedNormal(seed=self.seed + 2))
        if self.attention_type == "additive":
            self.b = self.add_weight('b', shape=[self.att_embedding_size], dtype=tf.float32,
                                     initializer=tf.keras.initializers.glorot_uniform(seed=self.seed))
            self.v = self.add_weight('v', shape=[self.att_embedding_size], dtype=tf.float32,
                                     initializer=tf.keras.initializers.glorot_uniform(seed=self.seed))
        # if self.use_res:
        #     self.W_Res = self.add_weight(name='res', shape=[embedding_size, self.att_embedding_size * self.head_num], dtype=tf.float32,
        #                                  initializer=tf.keras.initializers.TruncatedNormal(seed=self.seed))
        if self.use_feed_forward:
            self.fw1 = self.add_weight('fw1', shape=[self.num_units, 4 * self.num_units], dtype=tf.float32,
                                       initializer=tf.keras.initializers.glorot_uniform(seed=self.seed))
            self.fw2 = self.add_weight('fw2', shape=[4 * self.num_units, self.num_units], dtype=tf.float32,
                                       initializer=tf.keras.initializers.glorot_uniform(seed=self.seed))

        self.dropout = tf.keras.layers.Dropout(
            self.dropout_rate, seed=self.seed)
        self.ln = LayerNormalization()
        if self.use_positional_encoding:
            self.query_pe = PositionEncoding()
            self.key_pe = PositionEncoding()
        # Be sure to call this somewhere!
        super(Transformer, self).build(input_shape)


    def call(self, inputs, mask=None, training=None, **kwargs):

        if self.supports_masking:
            queries, keys = inputs
            query_masks, key_masks = mask
            query_masks = tf.cast(query_masks, tf.float32)
            key_masks = tf.cast(key_masks, tf.float32)
        else:
            queries, keys, query_masks, key_masks = inputs

            query_masks = tf.sequence_mask(
                query_masks, self.seq_len_max, dtype=tf.float32)
            key_masks = tf.sequence_mask(
                key_masks, self.seq_len_max, dtype=tf.float32)
            query_masks = tf.squeeze(query_masks, axis=1)
            key_masks = tf.squeeze(key_masks, axis=1)

        if self.use_positional_encoding:
            queries = self.query_pe(queries)
            keys = self.key_pe(queries)

        querys = tf.tensordot(queries, self.W_Query,
                              axes=(-1, 0))  # None T_q D*head_num
        keys = tf.tensordot(keys, self.W_key, axes=(-1, 0))
        values = tf.tensordot(keys, self.W_Value, axes=(-1, 0))

        # head_num*None T_q D
        querys = tf.concat(tf.split(querys, self.head_num, axis=2), axis=0)
        keys = tf.concat(tf.split(keys, self.head_num, axis=2), axis=0)
        values = tf.concat(tf.split(values, self.head_num, axis=2), axis=0)

        if self.attention_type == "scaled_dot_product":
            # head_num*None T_q T_k
            outputs = tf.matmul(querys, keys, transpose_b=True)

            outputs = outputs / (keys.get_shape().as_list()[-1] ** 0.5)
        elif self.attention_type == "additive":
            querys_reshaped = tf.expand_dims(querys, axis=-2)
            keys_reshaped = tf.expand_dims(keys, axis=-3)
            outputs = tf.tanh(tf.nn.bias_add(querys_reshaped + keys_reshaped, self.b))
            outputs = tf.squeeze(tf.tensordot(outputs, tf.expand_dims(self.v, axis=-1), axes=[-1, 0]), axis=-1)
        else:
            raise ValueError("attention_type must be scaled_dot_product or additive")

        key_masks = tf.tile(key_masks, [self.head_num, 1])

        # (h*N, T_q, T_k)
        key_masks = tf.tile(tf.expand_dims(key_masks, 1),
                            [1, tf.shape(queries)[1], 1])

        paddings = tf.ones_like(outputs) * (-2 ** 32 + 1)

        # (h*N, T_q, T_k)

        outputs = tf.where(tf.equal(key_masks, 1), outputs, paddings, )
        if self.blinding:
            try:
                outputs = tf.matrix_set_diag(outputs, tf.ones_like(outputs)[
                    :, :, 0] * (-2 ** 32 + 1))
            except AttributeError:
                outputs = tf.compat.v1.matrix_set_diag(outputs, tf.ones_like(outputs)[
                    :, :, 0] * (-2 ** 32 + 1))

        outputs -= reduce_max(outputs, axis=-1, keep_dims=True)
        outputs = softmax(outputs)
        query_masks = tf.tile(query_masks, [self.head_num, 1])  # (h*N, T_q)
        # (h*N, T_q, T_k)
        query_masks = tf.tile(tf.expand_dims(
            query_masks, -1), [1, 1, tf.shape(keys)[1]])

        outputs *= query_masks

        outputs = self.dropout(outputs, training=training)
        # Weighted sum
        # ( h*N, T_q, C/h)
        result = tf.matmul(outputs, values)
        result = tf.concat(tf.split(result, self.head_num, axis=0), axis=2)

        if self.use_res:
            # tf.tensordot(queries, self.W_Res, axes=(-1, 0))
            result += queries
        if self.use_layer_norm:
            result = self.ln(result)

        if self.use_feed_forward:
            fw1 = tf.nn.relu(tf.tensordot(result, self.fw1, axes=[-1, 0]))
            fw1 = self.dropout(fw1, training=training)
            fw2 = tf.tensordot(fw1, self.fw2, axes=[-1, 0])
            if self.use_res:
                result += fw2
            if self.use_layer_norm:
                result = self.ln(result)

        if self.output_type == "mean":
            return reduce_mean(result, axis=1, keep_dims=True)
        elif self.output_type == "sum":
            return reduce_sum(result, axis=1, keep_dims=True)
        else:
            return result

In [1]:
from torch import nn
import pandas as pd
import numpy as np
import pickle
import torchtext
from torch.optim import Optimizer
import torch
from deepctr_torch.models.deepfm import FM,DNN
from deepctr_torch.layers  import CIN,InteractingLayer,CrossNet,CrossNetMix,AttentionSequencePoolingLayer
from deepctr_torch.models.basemodel import *
from tqdm import tqdm
import random
import gc
from collections import defaultdict
from sklearn.metrics import auc,roc_auc_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import pickle

In [5]:
encoder_layer = nn.TransformerEncoderLayer(d_model=128*3, nhead=8)
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=4)
src = torch.rand(10, 32, 128*3)
out = transformer_encoder(src)

In [2]:
ROOT_PATH='../../../data/'
user_data=pickle.load(open(ROOT_PATH+'tmp/user_data.pkl','rb'))

feed_data=pickle.load(open(ROOT_PATH+'tmp/feed_data.pkl','rb'))

item_texts=pickle.load(open(ROOT_PATH+'tmp/item_texts.pkl','rb'))
feed_emb=np.load(ROOT_PATH+'tmp/feed_emb.npy')
userid2nid=pickle.load(open(ROOT_PATH+'tmp/userid2nid.pkl','rb'))
feedid2nid=pickle.load(open(ROOT_PATH+'tmp/feedid2nid.pkl','rb'))
graph_emb=np.concatenate([np.load(ROOT_PATH+'tmp/grap_allembedding32_sg2.npy'),np.load(ROOT_PATH+'tmp/grap_allembedding32_hs2.npy')],axis=1)
ratings=pd.read_csv(ROOT_PATH+'wedata/wechat_algo_data2/user_action.csv')

In [3]:
def disable_grad(module):
    for param in module.parameters():
        param.requires_grad=False
class BagOfWordsPretrained(nn.Module):
    def __init__(self, field, hidden_dims):
        super().__init__()
        self.att_emb=Attn(hidden_dims)
        input_dims = field.vocab.vectors.shape[1]
        self.emb = nn.Embedding(
            len(field.vocab.itos), input_dims,
            padding_idx=field.vocab.stoi[field.pad_token])
        self.emb.weight.data.copy_(torch.from_numpy(field.vocab.vectors).float())
        self.emb.weight.requires_grad = False
#         self.proj = nn.Linear(input_dims, hidden_dims)
#         nn.init.xavier_uniform_(self.proj.weight)
#         nn.init.constant_(self.proj.bias, 0)

#         disable_grad(self.emb) # 词向量不可训练

    def forward(self, x):
        """
        x: (batch_size, max_length) LongTensor
        length: (batch_size,) LongTensor
        """
#         x = self.emb(x).sum(1)# / length.unsqueeze(1).float() # 归一化
        return  self.att_emb(self.emb(x))#self.proj(x)

class BagOfWords(nn.Module):
    def __init__(self, field, hidden_dims):
        super().__init__()
        self.att_emb=Attn(hidden_dims)
        self.emb = nn.Embedding(
            len(field.vocab.itos), hidden_dims,
            padding_idx=field.vocab.stoi[field.pad_token])
        nn.init.xavier_uniform_(self.emb.weight)

    def forward(self, x):
        return self.att_emb(self.emb(x))#.mean(1)#/ length.unsqueeze(1).float() # 归一化
tokenize = lambda x: x.split(' ')
fields = {}
examples = []
for key, texts in item_texts.items():
    if  key in ['ocr','asr','description']:
        fields[key] = torchtext.data.Field(include_lengths=True, lower=True,tokenize=tokenize, batch_first=True, fix_length=64)
    else:
        fields[key] = torchtext.data.Field(include_lengths=True, lower=True,tokenize=tokenize, batch_first=True, fix_length=5)
    
for i in range(len(feedid2nid)):
    example = torchtext.data.Example.fromlist(
        [item_texts[key][i] for key in item_texts.keys()],
        [(key, fields[key]) for key in item_texts.keys()])  #( [feat1,feat2], [(key1,field1),(key2,field2)] )
    examples.append(example)
textset = torchtext.data.Dataset(examples, fields)
for key, field in fields.items():
    field.build_vocab(getattr(textset, key))
for field_name, field in textset.fields.items():
    examples = [getattr(textset[i], field_name) for i in range(len(feedid2nid))]

    tokens, lengths = field.process(examples)

    if not field.batch_first:
        tokens = tokens.t()
    # 给feed +上文本向量
    feed_data[field_name] = tokens




In [4]:
class Transmodel(nn.Module):
    def __init__(self,user_data,feed_data,textset,feed_embed,graph_emb,device):
        super().__init__()
        self.feed_data=feed_data
        self.user_data=user_data
        user_dict={'device':2,'userid':128}
        feed_dict={'bgm_song_id':16, 'bgm_singer_id':16,'authorid':16,'dense':32,'hash_dense':32
       ,'manual_keyword_id1':16,'manual_tag_id1':16,'machine_keyword_id1':16
            ,'machine_tag_id1':16,'knn_feed':16,
           'manual_tag_list':32,'manual_keyword_list':32,'machine_keyword_list':32,'asr':32,'description':32,'ocr':32
                  }
        self.model_dict=_init_input_modules(user_data,feed_data,textset, user_dict,feed_dict)
        self.spare_liner=nn.Linear(8*16,128)
        self.dense_liner=nn.Linear(32*2,128)
        self.text_liner=nn.Linear(32*6+512+64,128)
        self.feed_embed= nn.Parameter(torch.from_numpy(feed_embed).float(),requires_grad=False)
        self.graph= nn.Parameter(torch.from_numpy(graph_emb).float(),requires_grad=False)
        self.att_pool1=AttentionSequencePoolingLayer(att_hidden_units=(128,128),embedding_dim=128*3, weight_normalization=True,
                                                supports_masking=False)
        self.att_pool2=AttentionSequencePoolingLayer(att_hidden_units=(128,128),embedding_dim=128*3, weight_normalization=True,
                                                supports_masking=False)
        self.att_pool3=AttentionSequencePoolingLayer(att_hidden_units=(128,128),embedding_dim=128*3, weight_normalization=True,
                                                supports_masking=False)

        self.mmoe=MMOELayer(sum(user_dict.values())+128*12, mmoe_hidden_dim=128,num_task=7,n_expert=5,expert_activation=None,device=device)
        
        self.liner1=nn.Linear(128,1)
        self.liner2=nn.Linear(128,1)
        self.liner3=nn.Linear(128,1)
        self.liner4=nn.Linear(128,1)
        self.liner5=nn.Linear(128,1)
        self.liner6=nn.Linear(128,1)
        self.liner7=nn.Linear(128,1)
    def forward(self,userid,feedid,hist,mask_leng,is_train=True):
        # hist=[B,T]  #T是padding的序列
        # mask_leng=[B,1] # 每个batch中的长度
        user_projections=[]
        dense_embedding=[]
        sparse_embedding=[]
        text_embedding=[]
        for feature, data in self.user_data.items():
            module = self.model_dict[feature]
            result = module(data)
            user_projections.append(result)
        for feature, data in self.feed_data.items():
#             print(feature)
            module = self.model_dict[feature]
            result = module(data)
            if result.shape[-1]==16:
                sparse_embedding.append(result)
            elif 'dense' in feature:
                dense_embedding.append(result)
            else:
                text_embedding.append(result)
        user_feat=torch.cat(user_projections,-1)
        spare_emb=self.spare_liner(torch.cat(sparse_embedding,-1))
        dense_emb=self.dense_liner(torch.cat(dense_embedding,-1))
        text_emb=self.text_liner(torch.cat(text_embedding+[self.feed_embed,self.graph],-1))  
        feed_feat=torch.cat([spare_emb,dense_emb,text_emb],-1) #128*3
        
        hist_feat=feed_feat[hist]
        query=torch.unsqueeze(feed_feat[feedid],1)
#         print(query,hist_feat.shape,mask_leng.shape)
        
        att_output1=self.att_pool1(query,hist_feat,mask_leng)
        att_output1=att_output1.squeeze()
        att_output2=self.att_pool2(query,hist_feat,mask_leng)
        att_output2=att_output2.squeeze()
        att_output3=self.att_pool3(query,hist_feat,mask_leng)
        att_output3=att_output3.squeeze()
        combine=torch.cat([user_feat[userid],feed_feat[feedid],att_output1,att_output2,att_output3],axis=-1)
        outs=self.mmoe(combine)

        logit_gnn1=self.liner1(outs[0])#+ffm1#128+1+128*2
        logit_gnn2=self.liner2(outs[1])
        
        logit_gnn3=self.liner3(outs[2])
        logit_gnn4=self.liner4(outs[3])
        logit_gnn5=self.liner5(outs[4])
        logit_gnn6=self.liner6(outs[5])
        logit_gnn7=self.liner7(outs[6])

        return logit_gnn1,logit_gnn2,logit_gnn3,logit_gnn4,logit_gnn5,logit_gnn6,logit_gnn7
    
def _init_input_modules(user_data,feed_data,textset, user_dict,feed_dict):
    # We initialize the linear projections of each input feature ``x`` as
    # follows:
    # * If ``x`` is a scalar integral feature, we assume that ``x`` is a categorical
    #   feature, and assume the range of ``x`` is 0..max(x).
    # * If ``x`` is a float one-dimensional feature, we assume that ``x`` is a
    #   numeric vector.
    # * If ``x`` is a field of a textset, we process it as bag of words.
    module_dict = nn.ModuleDict()
    for column, data in user_data.items():
#         if column in user_texts.keys():
#             continue
        if data.dtype == torch.float32: # 数值类型的特征
            assert data.ndim == 2
            m = nn.Linear(data.shape[1],user_dict[column]) # 数值特征 做个线性变换
            nn.init.xavier_uniform_(m.weight)
            nn.init.constant_(m.bias, 0)
        elif data.dtype == torch.int64:
            assert data.ndim == 1  # 整形的单值特征做个embedding
            m = nn.Embedding(data.max() + 2, user_dict[column], padding_idx=-1)
            nn.init.xavier_uniform_(m.weight)
        module_dict[column] = m  # 不同的特征名字对应不同的处理moderl 这里或许可以加FM进去
    
    for column, data in feed_data.items():
        if column in textset.fields.keys():
            continue
        if column =='manuual_tag_list_emb':
            continue
        if data.dtype == torch.float32: # 数值类型的特征
            assert data.ndim == 2
            m = nn.Linear(data.shape[1],feed_dict[column]) # 数值特征 做个线性变换
            nn.init.xavier_uniform_(m.weight)
            nn.init.constant_(m.bias, 0)
        elif data.dtype == torch.int64:
            assert data.ndim == 1  # 整形的单值特征做个embedding
            m = nn.Embedding(data.max() + 2, feed_dict[column], padding_idx=-1)
            nn.init.xavier_uniform_(m.weight)
        module_dict[column] = m  # 不同的特征名字对应不同的处理moderl 这里或许可以加FM进去
        
    if textset is not None:
        for column, field in textset.fields.items():
            if field.vocab.vectors:
                module_dict[column] = BagOfWordsPretrained(field,feed_dict[column])
            else:
                module_dict[column] = BagOfWords(field,feed_dict[column])
    return module_dict

class MMOELayer(nn.Module):
    def __init__(self, hidden_size,device, mmoe_hidden_dim=128,num_task=4,n_expert=3,expert_activation=None,):
        super(MMOELayer, self).__init__()
         # experts
        self.num_task=num_task
        self.expert_activation = expert_activation
        self.experts = torch.nn.Parameter(torch.rand(hidden_size, mmoe_hidden_dim, n_expert).to(device), requires_grad=True)
        self.experts.data.normal_(0, 1)
        self.experts_bias = torch.nn.Parameter(torch.rand(mmoe_hidden_dim, n_expert).to(device), requires_grad=True)
        # gates
        self.gates = [torch.nn.Parameter(torch.rand(hidden_size, n_expert), requires_grad=True).to(device) for _ in range(num_task)]
        for gate in self.gates:
            gate.data.normal_(0, 1)
        self.gates_bias = [torch.nn.Parameter(torch.rand(n_expert), requires_grad=True).to(device) for _ in range(num_task)]
        for i in range(num_task):
            setattr(self, 'task_{}_dnn'.format(i+1),DNN(mmoe_hidden_dim,(128,128),dropout_rate=0.2,l2_reg=5e-5,use_bn=True))
    def forward(self,x):
         # mmoe
        experts_out = torch.einsum('ij, jkl -> ikl', x, self.experts) # batch * mmoe_hidden_size * num_experts
        experts_out += self.experts_bias
        if self.expert_activation is not None:
            experts_out = self.expert_activation(experts_out)
        
        gates_out = list()
        for idx, gate in enumerate(self.gates):
            gate_out = torch.einsum('ab, bc -> ac',x, gate) # batch * num_experts
            if self.gates_bias:
                gate_out += self.gates_bias[idx]
            gate_out = nn.Softmax(dim=-1)(gate_out)
            gates_out.append(gate_out)
        
        
        
        outs = list()
        for gate_output in gates_out:
            expanded_gate_output = torch.unsqueeze(gate_output, 1) # batch * 1 * num_experts
            weighted_expert_output = experts_out * expanded_gate_output.expand_as(experts_out) # batch * mmoe_hidden_size * num_experts
            outs.append(torch.sum(weighted_expert_output, 2)) # batch * mmoe_hidden_size
          # task tower
        task_outputs = list()
        for i in range(self.num_task):
            oo = outs[i]
            mod=getattr(self, 'task_{}_dnn'.format(i+1))
            oo = mod(oo)
            task_outputs.append(oo)
        
        return task_outputs
class Lookahead(Optimizer):
    def __init__(self, optimizer, k=5, alpha=0.5):
        self.optimizer = optimizer
        self.k = k
        self.alpha = alpha
        self.param_groups = self.optimizer.param_groups
        self.state = defaultdict(dict)
        self.fast_state = self.optimizer.state
        for group in self.param_groups:
            group["counter"] = 0

    def update(self, group):
        for fast in group["params"]:
            param_state = self.state[fast]
            if "slow_param" not in param_state:
                param_state["slow_param"] = torch.zeros_like(fast.data)
                param_state["slow_param"].copy_(fast.data)
            slow = param_state["slow_param"]
            slow += (fast.data - slow) * self.alpha
            fast.data.copy_(slow)

    def update_lookahead(self):
        for group in self.param_groups:
            self.update(group)

    def step(self, closure=None):
        loss = self.optimizer.step(closure)
        for group in self.param_groups:
            if group["counter"] == 0:
                self.update(group)
            group["counter"] += 1
            if group["counter"] >= self.k:
                group["counter"] = 0
        return loss

    def state_dict(self):
        fast_state_dict = self.optimizer.state_dict()
        slow_state = {(id(k) if isinstance(k, torch.Tensor) else k): v
            for k, v in self.state.items()
        }
        fast_state = fast_state_dict["state"]
        param_groups = fast_state_dict["param_groups"]
        return {
            "fast_state": fast_state,
            "slow_state": slow_state,
            "param_groups": param_groups,
        }

    def load_state_dict(self, state_dict):
        slow_state_dict = {
            "state": state_dict["slow_state"],
            "param_groups": state_dict["param_groups"],
        }
        fast_state_dict = {
            "state": state_dict["fast_state"],
            "param_groups": state_dict["param_groups"],
        }
        super(Lookahead, self).load_state_dict(slow_state_dict)
        self.optimizer.load_state_dict(fast_state_dict)
        self.fast_state = self.optimizer.state

    def add_param_group(self, param_group):
        param_group["counter"] = 0
        self.optimizer.add_param_group(param_group)

from torch.optim.lr_scheduler import LambdaLR
class WarmupLinearSchedule(LambdaLR):
    """ Linear warmup and then linear decay.
        Multiplies the learning rate defined in the optimizer by a dynamic variable determined by the current step.
        Linearly increases the multiplicative variable from 0. to 1. over `warmup_steps` training steps.
        Linearly decreases the multiplicative variable from 1. to 0. over remaining `t_total - warmup_steps` steps.
    """
    def __init__(self, optimizer, warmup_steps, t_total, last_epoch=-1):
        self.warmup_steps = warmup_steps
        self.t_total = t_total
        super(WarmupLinearSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)

    def lr_lambda(self, step):
        if step < self.warmup_steps:
            return float(step) / float(max(1, self.warmup_steps))
        return max(0.0, float(self.t_total - step) / float(max(1.0, self.t_total - self.warmup_steps)))
class Attn(nn.Module):
    def __init__(self,hidden_size):
        super(Attn, self).__init__()
        self.attn = nn.Linear(hidden_size,1)
    def forward(self, x):
        '''
        :param hidden: 
            previous hidden state of the decoder, in shape (layers*directions,B,H)
        :param encoder_outputs:
            encoder outputs from Encoder, in shape (T,B,H)
        :param src_len:
            used for masking. NoneType or tensor in shape (B) indicating sequence length
        :return
            attention energies in shape (B,T)
        '''   
        att=self.attn(x)
        att=F.tanh(att)
        att=F.softmax(att,1)
        att_x=att*x
        return att_x.sum(1)   
class AdamW(Optimizer):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.0, correct_bias=True):
        if lr < 0.0:
            raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1]))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps))
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
                        correct_bias=correct_bias)
        super(AdamW, self).__init__(params, defaults)

    def step(self, closure=None):
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
                state = self.state[p]
                if len(state) == 0:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p.data)
                    state['exp_avg_sq'] = torch.zeros_like(p.data)
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']
                state['step'] += 1
                exp_avg.mul_(beta1).add_(1.0 - beta1, grad)
                exp_avg_sq.mul_(beta2).addcmul_(1.0 - beta2, grad, grad)
                denom = exp_avg_sq.sqrt().add_(group['eps'])
                step_size = group['lr']
                if group['correct_bias']:  # No bias correction for Bert
                    bias_correction1 = 1.0 - beta1 ** state['step']
                    bias_correction2 = 1.0 - beta2 ** state['step']
                    step_size = step_size * math.sqrt(bias_correction2) / bias_correction1
                p.data.addcdiv_(-step_size, exp_avg, denom)
                if group['weight_decay'] > 0.0:
                    p.data.add_(-group['lr'] * group['weight_decay'], p.data)
        return loss
    
def build_optimizer(model, train_steps, learning_rate):
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, correct_bias=False, eps=1e-8)
    optimizer = Lookahead(optimizer, 5, 1)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=train_steps * 0.1, t_total=train_steps)
    return optimizer, scheduler

def n_evaluate_nn(val_df,action_list,device,batch_size=512):
    model.eval()
    leng=len(val_df)
    val_src=val_df['userid'].apply(lambda x:userid2nid[x]).values
    val_dst=val_df['feedid'].apply(lambda x:feedid2nid[x]).values
    val_hist_id=torch.from_numpy((val_df['date_'].values-1)*len(userid2nid)+val_src).long()
    val_pred=[]
    all_aucs=[]
    weights=[0.30769231, 0.23076923, 0.15384615, 0.07692308, 0.07692308,0.07692308, 0.07692308]
    with torch.no_grad():
        for i in tqdm(range(0,leng//batch_size+1)):
            #         print(i*batch_size,(i+1)*batch_size)
            batch_src=val_src[i*batch_size:(i+1)*batch_size]
            batch_dst=val_dst[i*batch_size:(i+1)*batch_size]
            batch_hist=hist_seq[val_hist_id[i*batch_size:(i+1)*batch_size]]
            pred=model(batch_src,batch_dst,batch_hist[:,:-1],batch_hist[:,-1:].to(device))
            val_pred.append(torch.cat(pred,axis=-1).sigmoid().cpu().numpy())
        val_pred=np.concatenate(val_pred,axis=0)
        for i,action in enumerate(action_list):
            val_df['pred_'+action]=val_pred[:,i]
            label_nunique = val_df.groupby(by='userid')[action].transform('nunique')
            tmp_df = val_df[label_nunique == 2]
            aucs = tmp_df.groupby(by='userid').apply(
                lambda x: roc_auc_score(x[action].values, x['pred_'+action].values))
            all_aucs.append(np.mean(aucs))
            print('val %s uauc:'%action,np.mean(aucs))
            print('val %s auc:'%action,roc_auc_score(val_df[action].values,val_pred[:,i]))
        print('score uauc:',sum([all_aucs[i]*weights[i] for i in range(len(action_list))]))
def evaluate_nn(val_df,action,batch_size=512):
    model.eval()
    leng=len(val_df)
    val_src=val_df['userid'].apply(lambda x:userid2nid[x]).tolist()
    val_dst=val_df['feedid'].apply(lambda x:feedid2nid[x]).tolist()
    val_pred=[]
    with torch.no_grad():
        for i in tqdm(range(0,leng//batch_size+1)):
            #         print(i*batch_size,(i+1)*batch_size)
            batch_src=val_src[i*batch_size:(i+1)*batch_size]
            batch_dst=val_dst[i*batch_size:(i+1)*batch_size]

            pred=model(batch_src,batch_dst)

            val_pred.append(pred.sigmoid().view(-1).cpu().numpy())
        val_pred=np.concatenate(val_pred,axis=-1)
        val_df['pred_'+action]=val_pred
        label_nunique = val_df.groupby(by='userid')[action].transform('nunique')
        tmp_df = val_df[label_nunique == 2]
        
        aucs = tmp_df.groupby(by='userid').apply(
            lambda x: roc_auc_score(x[action].values, x['pred_'+action].values))
        print('val uauc:',np.mean(aucs))
        print('val auc:',roc_auc_score(val_df[action].values,val_pred))

In [5]:
class Transmodelv2(nn.Module):
    def __init__(self,user_data,feed_data,textset,feed_embed,graph_emb,device):
        super().__init__()
        self.feed_data=feed_data
        self.user_data=user_data
        user_dict={'device':2,'userid':128}
        feed_dict={'bgm_song_id':16, 'bgm_singer_id':16,'authorid':16,'dense':32,'hash_dense':32
       ,'manual_keyword_id1':16,'manual_tag_id1':16,'machine_keyword_id1':16
            ,'machine_tag_id1':16,'knn_feed':16,
           'manual_tag_list':32,'manual_keyword_list':32,'machine_keyword_list':32,'asr':32,'description':32,'ocr':32
                  }
        self.model_dict=_init_input_modules(user_data,feed_data,textset, user_dict,feed_dict)
        self.spare_liner=nn.Linear(8*16,128)
        self.dense_liner=nn.Linear(32*2,128)
        self.text_liner=nn.Linear(32*6+512+64,128)
        self.feed_embed= nn.Parameter(torch.from_numpy(feed_embed).float(),requires_grad=False)
        self.graph= nn.Parameter(torch.from_numpy(graph_emb).float(),requires_grad=False)
        self.att_pool1=AttentionSequencePoolingLayer(att_hidden_units=(128,128),embedding_dim=128*3, weight_normalization=True,
                                                supports_masking=False)
        self.att_pool2=AttentionSequencePoolingLayer(att_hidden_units=(128,128),embedding_dim=128*3, weight_normalization=True,
                                                supports_masking=False)
        self.att_pool3=AttentionSequencePoolingLayer(att_hidden_units=(128,128),embedding_dim=128*3, weight_normalization=True,
                                                supports_masking=False)
        self.att_pool4=AttentionSequencePoolingLayer(att_hidden_units=(128,128),embedding_dim=128*3, weight_normalization=True,
                                                supports_masking=False)
        self.att_pool5=AttentionSequencePoolingLayer(att_hidden_units=(128,128),embedding_dim=128*3, weight_normalization=True,
                                                supports_masking=False)
        self.att_pool6=AttentionSequencePoolingLayer(att_hidden_units=(128,128),embedding_dim=128*3, weight_normalization=True,
                                                supports_masking=False)
        self.att_pool7=AttentionSequencePoolingLayer(att_hidden_units=(128,128),embedding_dim=128*3, weight_normalization=True,
                                                supports_masking=False)

        self.mmoe1=MMOELayer(sum(user_dict.values())+128*3, mmoe_hidden_dim=128,num_task=1,n_expert=5,expert_activation=None,device=device)
        self.mmoe2=MMOELayer(sum(user_dict.values())+128*3, mmoe_hidden_dim=128,num_task=1,n_expert=5,expert_activation=None,device=device)
        self.mmoe3=MMOELayer(sum(user_dict.values())+128*3, mmoe_hidden_dim=128,num_task=1,n_expert=5,expert_activation=None,device=device)
        self.mmoe4=MMOELayer(sum(user_dict.values())+128*3, mmoe_hidden_dim=128,num_task=1,n_expert=5,expert_activation=None,device=device)
        self.mmoe5=MMOELayer(sum(user_dict.values())+128*3, mmoe_hidden_dim=128,num_task=1,n_expert=5,expert_activation=None,device=device)
        self.mmoe6=MMOELayer(sum(user_dict.values())+128*3, mmoe_hidden_dim=128,num_task=1,n_expert=5,expert_activation=None,device=device)
        self.mmoe7=MMOELayer(sum(user_dict.values())+128*3, mmoe_hidden_dim=128,num_task=1,n_expert=5,expert_activation=None,device=device)
        
        self.liner1=nn.Linear(128,1)
        self.liner2=nn.Linear(128,1)
        self.liner3=nn.Linear(128,1)
        self.liner4=nn.Linear(128,1)
        self.liner5=nn.Linear(128,1)
        self.liner6=nn.Linear(128,1)
        self.liner7=nn.Linear(128,1)
    def forward(self,userid,feedid,hist,mask_leng,is_train=True):
        # hist=[B,T]  #T是padding的序列
        # mask_leng=[B,1] # 每个batch中的长度
        user_projections=[]
        dense_embedding=[]
        sparse_embedding=[]
        text_embedding=[]
        for feature, data in self.user_data.items():
            module = self.model_dict[feature]
            result = module(data)
            user_projections.append(result)
        for feature, data in self.feed_data.items():
#             print(feature)
            module = self.model_dict[feature]
            result = module(data)
            if result.shape[-1]==16:
                sparse_embedding.append(result)
            elif 'dense' in feature:
                dense_embedding.append(result)
            else:
                text_embedding.append(result)
        user_feat=torch.cat(user_projections,-1)
        spare_emb=self.spare_liner(torch.cat(sparse_embedding,-1))
        dense_emb=self.dense_liner(torch.cat(dense_embedding,-1))
        text_emb=self.text_liner(torch.cat(text_embedding+[self.feed_embed,self.graph],-1))  
        feed_feat=torch.cat([spare_emb,dense_emb,text_emb],-1) #128*3
        
        hist_feat=feed_feat[hist]
        query=torch.unsqueeze(feed_feat[feedid],1)
#        
        att_output1=self.att_pool1(query,hist_feat,mask_leng)
        att_output1=att_output1.squeeze()
        att_output2=self.att_pool2(query,hist_feat,mask_leng)
        att_output2=att_output2.squeeze()
        att_output3=self.att_pool3(query,hist_feat,mask_leng)
        att_output3=att_output3.squeeze()
        att_output4=self.att_pool4(query,hist_feat,mask_leng)
        att_output4=att_output4.squeeze()
        att_output5=self.att_pool5(query,hist_feat,mask_leng)
        att_output5=att_output5.squeeze()
        att_output6=self.att_pool6(query,hist_feat,mask_leng)
        att_output6=att_output6.squeeze()
        att_output7=self.att_pool7(query,hist_feat,mask_leng)
        att_output7=att_output7.squeeze()
        
        combine1=torch.cat([user_feat[userid],feed_feat[feedid],att_output1],axis=-1)
        combine2=torch.cat([user_feat[userid],feed_feat[feedid],att_output2],axis=-1)
        combine3=torch.cat([user_feat[userid],feed_feat[feedid],att_output3],axis=-1)
        combine4=torch.cat([user_feat[userid],feed_feat[feedid],att_output4],axis=-1)
        combine5=torch.cat([user_feat[userid],feed_feat[feedid],att_output5],axis=-1)
        combine6=torch.cat([user_feat[userid],feed_feat[feedid],att_output6],axis=-1)
        combine7=torch.cat([user_feat[userid],feed_feat[feedid],att_output7],axis=-1)
        outs1=self.mmoe(combine1)
        outs2=self.mmoe(combine2)
        outs3=self.mmoe(combine3)
        outs4=self.mmoe(combine4)
        outs5=self.mmoe(combine5)
        outs6=self.mmoe(combine6)
        outs7=self.mmoe(combine7)

        logit_gnn1=self.liner1(outs1)#+ffm1#128+1+128*2
        logit_gnn2=self.liner2(outs2)
        logit_gnn3=self.liner3(outs3)
        logit_gnn4=self.liner4(outs4)
        logit_gnn5=self.liner5(outs5)
        logit_gnn6=self.liner6(outs6)
        logit_gnn7=self.liner7(outs7)

        return logit_gnn1,logit_gnn2,logit_gnn3,logit_gnn4,logit_gnn5,logit_gnn6,logit_gnn7

In [6]:
PREDICT_LIST=["read_comment","like", "click_avatar", "forward",'comment','follow','favorite']
max_day=14
train_ratings=ratings[(ratings.date_<max_day)]
val_ratings=ratings[ratings.date_==max_day]
del ratings
gc.collect()

0

In [7]:
src=torch.from_numpy(train_ratings['userid'].apply(lambda x: userid2nid[x]).values).long()
dst=torch.from_numpy(train_ratings['feedid'].apply(lambda x: feedid2nid[x]).values).long()
hist_id=torch.from_numpy((train_ratings['date_'].values-1)*len(userid2nid)).long()+src
labels=torch.from_numpy(train_ratings[PREDICT_LIST].values).float()
hist_seq=torch.from_numpy(np.load(ROOT_PATH+'tmp/hist_list1.npy')).long()

Please check the latest version manually on https://pypi.org/project/deepctr-torch/#history


In [8]:
import threading

In [9]:
class myThread (threading.Thread):
    def __init__(self, func, args):
        threading.Thread.__init__(self)
        self.func=func
        self.args= args
    def run(self):
        print('----start--------')
        self.func(*self.args)
        print('finish')

In [10]:
batch_size=4096*2
epochs=2
def train1(Model,user_data,feed_data,textset,feed_emb,graph_emb,device):
    for f,d in user_data.items():
        user_data[f]=d.to(device)
    for f,d in feed_data.items():
        feed_data[f]=d.to(device)
        
    model = Model(user_data,feed_data,textset=textset
                 ,feed_embed=feed_emb,graph_emb=graph_emb,device=device)
    model=model.to(device)
    train_steps = int(len(train_ratings) * epochs / batch_size) + 1
    optimizer, scheduler = build_optimizer(model, train_steps, learning_rate=2e-2)
    all_pred=[]

    criti=nn.BCEWithLogitsLoss()
    reg_criti=nn.MSELoss()
    n_pos=len(train_ratings)
    batch_index=np.arange(n_pos) # 生成正样本的index
    for epoch in range(epochs):
        print('epoch: ----%d--'%epoch)
        random.shuffle(batch_index) 
        epoch_loss=0
        model.train()
        for ind in tqdm(range(0,n_pos//batch_size+1)):
            batch=batch_index[ind*batch_size:(ind+1)*batch_size]
            batch_src=src[batch]
            batch_dst=dst[batch]
            batch_hist=hist_seq[hist_id[batch]]
    #         print(batch_src)
            logits =model(batch_src,batch_dst,batch_hist[:,:-1],batch_hist[:,-1:].to(device))
            batch_label=labels[batch].to(device)
            loss=criti(logits[0][:,0],batch_label[:,0])*0.8+criti(logits[1][:,0],batch_label[:,1])*0.8+\
            criti(logits[2][:,0],batch_label[:,2])*0.4+criti(logits[3][:,0],batch_label[:,3])*0.4+\
            criti(logits[4][:,0],batch_label[:,4])*0.3+criti(logits[5][:,0],batch_label[:,5])*0.3+criti(logits[6][:,0],batch_label[:,6])*0.3
            epoch_loss+=loss.item()
            optimizer.zero_grad()
            loss.backward()

            optimizer.step()
            scheduler.step()

            if ind%1000==0:
                print('binary loss:',loss.item())
                batch_label=batch_label.cpu().numpy()
                pred=torch.cat(logits,axis=-1).sigmoid().detach().cpu().numpy()
    #             pred=logits.sigmoid().detach().cpu().numpy()
                for ii,aa in enumerate(PREDICT_LIST):
                    try:
                        print('train %s auc:'%aa,roc_auc_score(batch_label[:,ii],pred[:,ii]))
                    except:
                        continue
        print('epoch %d  loss: %f '%(epoch,epoch_loss/(len(batch_index)//batch_size+1)))
        n_evaluate_nn(val_df=val_ratings,action_list=PREDICT_LIST,batch_size=2048,device=device)



In [11]:
p1=myThread (train1,(Transmodel,user_data,feed_data,textset,feed_emb,graph_emb,torch.device('cuda:0')))
p1.start()
p2=myThread (train1,(Transmodelv2,user_data,feed_data,textset,feed_emb,graph_emb,torch.device('cuda:2')))
p2.start()


----start--------
----start--------
epoch: ----0--
epoch: ----0--


  0%|          | 0/8188 [00:00<?, ?it/s]
Exception in thread Thread-6:
Traceback (most recent call last):
  File "/home/tione/notebook/envs/tf1/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "<ipython-input-9-a3a1f21677ed>", line 8, in run
    self.func(*self.args)
  File "<ipython-input-10-b49b27e49325>", line 31, in train1
    logits =model(batch_src,batch_dst,batch_hist[:,:-1],batch_hist[:,-1:].to(device))
  File "/home/tione/notebook/envs/tf1/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "<ipython-input-5-8e12b449b36c>", line 57, in forward
    result = module(data)
  File "/home/tione/notebook/envs/tf1/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/home/tione/notebook/envs/tf1/lib/python3.6/site-packages/torch/nn/modules/sparse.py", line 126, in forward
    self.norm_type, self

binary loss: 2.429819107055664
train read_comment auc: 0.518138553952238
train like auc: 0.49945733594732783
train click_avatar auc: 0.4394251816443968
train forward auc: 0.6503630563933104
train follow auc: 0.3678553981436248
train favorite auc: 0.42283672811725614


 12%|█▏        | 1001/8188 [09:32<1:09:20,  1.73it/s]

binary loss: 0.18441987037658691
train read_comment auc: 0.9183549422916294
train like auc: 0.849339492224249
train click_avatar auc: 0.7992943584100765
train forward auc: 0.8480566454144188
train comment auc: 0.7920605838524489
train follow auc: 0.7557553015097304
train favorite auc: 0.8992325097847358


 24%|██▍       | 2001/8188 [19:02<59:43,  1.73it/s]  

binary loss: 0.17367444932460785
train read_comment auc: 0.9249567623099635
train like auc: 0.863652145899703
train click_avatar auc: 0.8201610701610702
train forward auc: 0.9231487503783318
train comment auc: 0.5955294979846097
train follow auc: 0.8225832722534409
train favorite auc: 0.8548913517758966


 37%|███▋      | 3001/8188 [28:34<50:08,  1.72it/s]  

binary loss: 0.1758095622062683
train read_comment auc: 0.9223037185601655
train like auc: 0.87444414246321
train click_avatar auc: 0.8335224029422602
train forward auc: 0.9245605606737128
train comment auc: 0.8711371686820569
train follow auc: 0.917002688172043
train favorite auc: 0.8941116544417277


 49%|████▉     | 4001/8188 [38:06<40:27,  1.72it/s]

binary loss: 0.16956983506679535
train read_comment auc: 0.9494719800312449
train like auc: 0.8721608369640931
train click_avatar auc: 0.8633813806414521
train forward auc: 0.9154308125995833
train comment auc: 0.6921480034192209
train follow auc: 0.8957673545132291
train favorite auc: 0.9229790876849701


 61%|██████    | 5001/8188 [47:38<30:48,  1.72it/s]

binary loss: 0.16249528527259827
train read_comment auc: 0.9412121438799718
train like auc: 0.8769372083624204
train click_avatar auc: 0.8927299920029528
train forward auc: 0.9087614252341747
train comment auc: 0.8753358573522227
train follow auc: 0.8109875336103641
train favorite auc: 0.9651918846247862


 73%|███████▎  | 6001/8188 [57:10<21:10,  1.72it/s]

binary loss: 0.15573710203170776
train read_comment auc: 0.9470169838105815
train like auc: 0.8724221665762862
train click_avatar auc: 0.8490058504938165
train forward auc: 0.9032969552276563
train comment auc: 0.9597427443318272
train follow auc: 0.8803737176355643
train favorite auc: 0.9583907845361724


 86%|████████▌ | 7001/8188 [1:06:39<11:27,  1.73it/s]

binary loss: 0.15671928226947784
train read_comment auc: 0.9502131987316095
train like auc: 0.8670612648184433
train click_avatar auc: 0.8856964256435481
train forward auc: 0.9213729528918977
train comment auc: 0.9883473799926713
train follow auc: 0.9360938072554049
train favorite auc: 0.9966489222445238


 98%|█████████▊| 8001/8188 [1:16:08<01:47,  1.74it/s]

binary loss: 0.1740330010652542
train read_comment auc: 0.9475752536410732
train like auc: 0.8837377518989603
train click_avatar auc: 0.8680706403684073
train forward auc: 0.890609737527252
train comment auc: 0.9256657708282433
train follow auc: 0.7516395779868833
train favorite auc: 0.8351509042033234


100%|██████████| 8188/8188 [1:17:54<00:00,  1.75it/s]
Exception in thread Thread-5:
Traceback (most recent call last):
  File "/home/tione/notebook/envs/tf1/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "<ipython-input-9-a3a1f21677ed>", line 8, in run
    self.func(*self.args)
  File "<ipython-input-10-b49b27e49325>", line 54, in train1
    n_evaluate_nn(val_df=val_ratings,action_list=PREDICT_LIST,batch_size=2048,device=device)
  File "<ipython-input-4-ff5325d1bfba>", line 340, in n_evaluate_nn
    model.eval()
NameError: name 'model' is not defined



epoch 0  loss: 0.178519 


In [None]:
for f,d in user_data.items():
    user_data[f]=d.to(device)
for f,d in feed_data.items():
    feed_data[f]=d.to(device)

model = Model(user_data,feed_data,textset=textset
             ,feed_embed=feed_emb,graph_emb=graph_emb,device=device)
model=model.to(device)
train_steps = int(len(train_ratings) * epochs / batch_size) + 1
optimizer, scheduler = build_optimizer(model, train_steps, learning_rate=2e-2)
all_pred=[]

criti=nn.BCEWithLogitsLoss()
reg_criti=nn.MSELoss()
n_pos=len(train_ratings)
batch_index=np.arange(n_pos) # 生成正样本的index
for epoch in range(epochs):
    print('epoch: ----%d--'%epoch)
    random.shuffle(batch_index) 
    epoch_loss=0
    model.train()
    for ind in tqdm(range(0,n_pos//batch_size+1)):
        batch=batch_index[ind*batch_size:(ind+1)*batch_size]
        batch_src=src[batch]
        batch_dst=dst[batch]
        batch_hist=hist_seq[hist_id[batch]]
#         print(batch_src)
        logits =model(batch_src,batch_dst,batch_hist[:,:-1],batch_hist[:,-1:].to(device))
        batch_label=labels[batch].to(device)
        loss=criti(logits[0][:,0],batch_label[:,0])*0.8+criti(logits[1][:,0],batch_label[:,1])*0.8+\
        criti(logits[2][:,0],batch_label[:,2])*0.4+criti(logits[3][:,0],batch_label[:,3])*0.4+\
        criti(logits[4][:,0],batch_label[:,4])*0.3+criti(logits[5][:,0],batch_label[:,5])*0.3+criti(logits[6][:,0],batch_label[:,6])*0.3
        epoch_loss+=loss.item()
        optimizer.zero_grad()
        loss.backward()

        optimizer.step()
        scheduler.step()

        if ind%1000==0:
            print('binary loss:',loss.item())
            batch_label=batch_label.cpu().numpy()
            pred=torch.cat(logits,axis=-1).sigmoid().detach().cpu().numpy()
#             pred=logits.sigmoid().detach().cpu().numpy()
            for ii,aa in enumerate(PREDICT_LIST):
                try:
                    print('train %s auc:'%aa,roc_auc_score(batch_label[:,ii],pred[:,ii]))
                except:
                    continue
    print('epoch %d  loss: %f '%(epoch,epoch_loss/(len(batch_index)//batch_size+1)))