## Rewrite

In [1]:
import os
import joblib
import numpy as np
import pandas as pd
from collections import namedtuple

import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
import tensorflow.keras.backend as K
from tensorflow.python.keras.callbacks import EarlyStopping
from sklearn.preprocessing import  MinMaxScaler, LabelEncoder

from sklearn import metrics
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report, roc_curve

##### 获取数据

In [8]:
def get_wechat_data():
    """ 读取wechat数据集 """
    train_path = '../data/wechat/train_df.pkl'
    test_path = '../data/wechat/test_df.pkl'
    encoder_dict_path = '../data/wechat/encoder_dict.pkl'
    train_df = pd.read_pickle(train_path)
    test_df = pd.read_pickle(test_path)
    encoder_dict = joblib.load(encoder_dict_path)
    return train_df, test_df, encoder_dict

train_df, test_df, encoder_dict = get_wechat_data()
train_df = train_df.sample(frac=1.0)

# 按行为比例加权
data = pd.concat([train_df, test_df], axis=0)
print('各行为比例: ', data.read_comment.sum() / data.shape[0], data.like.sum() / data.shape[0], data.click_avatar.sum() / data.shape[0], data.forward.sum() / data.shape[0])
data['sample_weight'] = data['read_comment']*1.0+data['like']*1.1+data['click_avatar']*1.2+data['forward']*2.0

train_df, test_df = data.iloc[:train_df.shape[0]], data.iloc[train_df.shape[0]:]
print('train_df.shape: {}, test_df.shape: {}'.format(train_df.shape, test_df.shape))
train_df.head()

各行为比例:  0.03501586934580252 0.02580487086290815 0.007533327266004016 0.0038211876059220415
train_df.shape: (6708846, 13), test_df.shape: (609036, 13)


Unnamed: 0,videoplayseconds,userid,feedid,authorid,bgm_song_id,bgm_singer_id,manual_keyword_list,manual_tag_list,read_comment,like,click_avatar,forward,sample_weight
5143704,1.946289,5832,6529,263,2935,2451,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,0,0,0,0.0
5793422,4.078125,10483,408,276,1,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0,0,0,0,0.0
3867148,4.109375,1117,1954,857,784,781,"[276, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[88, 55, 9, 48, 2, 0, 0, 0, 0, 0, 0]",0,0,0,0,0.0
3133071,3.712891,13508,54,270,1,1,"[62, 241, 1220, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[34, 35, 9, 2, 0, 0, 0, 0, 0, 0, 0]",1,0,0,0,1.0
4400240,3.611328,4425,2309,26,1,1,"[276, 1702, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[88, 55, 9, 48, 2, 0, 0, 0, 0, 0, 0]",0,0,0,0,0.0


In [9]:
target = ['read_comment', 'like', 'click_avatar', 'forward']

# 稠密特征、稀疏特征、变长稀疏特征
dense_column_names = ['videoplayseconds']
sparse_column_names = ['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id']
varlen_sparse_column_names = ['manual_keyword_list', 'manual_tag_list']

SparseFeature = namedtuple('SparseFeature', ['name', 'vocabulary_size', 'embedding_size'])
DenseFeature = namedtuple('DenseFeature', ['name', 'dimension'])
VarLenSparseFeature = namedtuple('VarLenSparseFeature', ['name', 'vocabulary_size', 'embedding_size', 'maxlen'])

varlen_sparse_column_maxlen_dict = {
    'manual_keyword_list': 18,
    'manual_tag_list': 11
}

feature_columns = [SparseFeature(f, vocabulary_size=data[f].nunique(), embedding_size=4) for f in sparse_column_names] + \
[DenseFeature(f, 1) for f in dense_column_names] + \
[VarLenSparseFeature(f, len(encoder_dict[f])+1, embedding_size=4, maxlen=varlen_sparse_column_maxlen_dict[f]) for f in varlen_sparse_column_names]
feature_columns

[SparseFeature(name='userid', vocabulary_size=17390, embedding_size=4),
 SparseFeature(name='feedid', vocabulary_size=16448, embedding_size=4),
 SparseFeature(name='authorid', vocabulary_size=6966, embedding_size=4),
 SparseFeature(name='bgm_song_id', vocabulary_size=5773, embedding_size=4),
 SparseFeature(name='bgm_singer_id', vocabulary_size=4573, embedding_size=4),
 DenseFeature(name='videoplayseconds', dimension=1),
 VarLenSparseFeature(name='manual_keyword_list', vocabulary_size=21576, embedding_size=4, maxlen=18),
 VarLenSparseFeature(name='manual_tag_list', vocabulary_size=349, embedding_size=4, maxlen=11)]

In [10]:
class FM_Layer(Layer):
    def __init__(self):
        super(FM_Layer, self).__init__()
        
    def call(self, inputs):
        concat_embed_values = inputs
        #print('concat_embed_values.shape: ', concat_embed_values.shape) # (None, 26, 4)
        sum_square = tf.square(tf.reduce_sum(concat_embed_values, axis=1, keepdims=True)) # (None, 1, 4)
        #print('sum_square.shape: ', sum_square.shape)
        square_sum = tf.reduce_sum(concat_embed_values * concat_embed_values, axis=1, keepdims=True) # (None, 1, 4)
        #print('square_sum.shape: ', square_sum.shape)
        output = sum_square - square_sum # 和的平方-平方的和
        output = 0.5 * tf.reduce_sum(output, axis=2, keepdims=False) # (None 1)
        #print('output.shape: ', output.shape)
        return output
    
    def compute_output_shape(self, input_shape):
        return (None, 1)
    
def model_metric(prob, label, thr=0.5):
    """ 模型评估 """
    # AUC
    fpr, tpr, threshold = metrics.roc_curve(label, prob)
    auc = metrics.auc(fpr, tpr)
    score = metrics.accuracy_score(label, prob > thr)
    # LogLoss
    logloss = log_loss(label, prob)
    print('模型准确率:{}, AUC得分:{}, LogLoss:{}'.format(score, auc, logloss))
    print(classification_report(label, prob > thr, digits=2))
    print('==========================================================')

def build_input_layers(feature_columns):
    """ 构建输入层 """
    dense_input_dict, sparse_input_dict, varlen_sparse_input_dict = {}, {}, {}
    for f in feature_columns:
        if isinstance(f, DenseFeature):
            dense_input_dict[f.name] = Input(shape=(f.dimension, ), name=f.name)
        elif isinstance(f, SparseFeature):
            sparse_input_dict[f.name] = Input(shape=(1, ), name=f.name)
        elif isinstance(f, VarLenSparseFeature):
            varlen_sparse_input_dict[f.name] = Input(shape=(f.maxlen, ), name=f.name)
    return dense_input_dict, sparse_input_dict, varlen_sparse_input_dict
    
def build_embedding_layers(feature_columns, is_linear):
    """ 构建embedding层 """
    embedding_layer_dict = {}
    if is_linear:
        sparse_feature_columns = list(filter(lambda x: isinstance(x, SparseFeature), feature_columns))
        for f in sparse_feature_columns:
            embedding_layer_dict[f.name] = Embedding(f.vocabulary_size+1, 1, name='1d_emb_' + f.name)
    else:
        for f in feature_columns:
            if isinstance(f, SparseFeature):
                embedding_layer_dict[f.name] = Embedding(f.vocabulary_size+1, f.embedding_size, name='kd_emb_' + f.name)
            elif isinstance(f, VarLenSparseFeature):
                embedding_layer_dict[f.name] = Embedding(f.vocabulary_size+1, f.embedding_size, name='var_emb_' + f.name, mask_zero=True)
    return embedding_layer_dict
    
def get_linear_logits(dense_input_dict, sparse_input_dict, feature_columns):
    """ 数值特征拼接一起传入全连接层 + 类别特征onehot，flatten，add """
    concat_dense_inputs = Concatenate(axis=1)(list(dense_input_dict.values()))
    dense_logits_output = Dense(1)(concat_dense_inputs)
    
    # embedding(input)查表操作，返回对应input的嵌入向量
    sparse_1d_embed_list = []
    sparse_embedding_layer_dict = build_embedding_layers(feature_columns, is_linear=True)
    sparse_feature_columns = list(filter(lambda x: isinstance(x, SparseFeature), feature_columns))
    for f in sparse_feature_columns:
        _input = sparse_input_dict[f.name]
        _embed = Flatten()(sparse_embedding_layer_dict[f.name](_input))
        sparse_1d_embed_list.append(_embed)
    
    sparse_logits_output = Add()(sparse_1d_embed_list)
    linear_logits = Add()([dense_logits_output, sparse_logits_output])
    return linear_logits
    
def get_fm_logits(sparse_input_dict, embedding_layer_dict, feature_columns):
    """ 取出input所对应的嵌入向量拼接在一起，计算和的平和-平方的和 """
    sparse_feature_columns = list(filter(lambda x: isinstance(x, SparseFeature), feature_columns))
    sparse_kd_embed_list = []
    for f in sparse_feature_columns:
        _input = sparse_input_dict[f.name]
        _embed = embedding_layer_dict[f.name](_input)
        sparse_kd_embed_list.append(_embed)
    
    concat_sparse_kd_embed_list = Concatenate(axis=1)(sparse_kd_embed_list)
    fm_logits = FM_Layer()(concat_sparse_kd_embed_list)
    return fm_logits

class MeanPoolingLayer(Layer):
    def __init__(self, axis, **kwargs):
        super(MeanPoolingLayer, self).__init__(**kwargs)
        self.axis = axis
        self.supports_masking = True
    
    def compute_mask(self, input, input_mask=None):
        # need not to pass the mask to next layers
        return None
    
    def call(self, x, mask=None):
        if x is not None:
            mask = K.cast(mask, K.floatx()) # (None, 18)
            mask = K.repeat(mask, x.shape[-1]) # (None, 4, 18)
            mask = tf.transpose(mask, [0, 2, 1]) # (None, 18, 4)
            x = x * mask # # (None, 18, 4) * (None, 18, 4)
            return K.sum(x, axis=self.axis) / K.sum(mask, axis=self.axis)
        else:
            return K.mean(x, axis=self.axis)

def get_dnn_logits(sparse_input_dict, varlen_sparse_input_dict, embedding_layer_dict, feature_columns):
    sparse_feature_columns = list(filter(lambda x: isinstance(x, SparseFeature), feature_columns))
    varlen_sparse_feature_columns = list(filter(lambda x: isinstance(x, VarLenSparseFeature), feature_columns))
    
    # kd embed
    sparse_kd_embed_list = []
    for f in sparse_feature_columns:
        _input = sparse_input_dict[f.name]
        _embed = embedding_layer_dict[f.name](_input)
        flatten_embed = Flatten()(_embed)
        sparse_kd_embed_list.append(flatten_embed)
    
    # seq embed
    varlen_sparse_embed_list = []
    for f in varlen_sparse_feature_columns:
        _input = varlen_sparse_input_dict[f.name] #  (None, 18)
        _embed = embedding_layer_dict[f.name]
        embed_layer = _embed(_input) # (None, 18, 4)
        mask = Masking()(embed_layer) # (None, 18, 4)
        mean_pooling_embed = MeanPoolingLayer(axis=1)(mask) # (None, 4)
        varlen_sparse_embed_list.append(mean_pooling_embed)
    
    concat_embed = Concatenate(axis=1)(sparse_kd_embed_list + varlen_sparse_embed_list)
    print('concat_embed.shape: ', concat_embed.shape)
    
    # DNN
    dnn_out = Dropout(0.2)(Activation(activation='relu')(BatchNormalization()(Dense(4)(concat_embed))))
    #dnn_out = Dropout(0.2)(Activation(activation='relu')(BatchNormalization()(Dense(4)(dnn_out))))
    #dnn_out = Dropout(0.5)(Activation(activation='relu')(Dense(4, activation='relu')(concat_sparse_kd_embed))
    #dnn_out = Dropout(0.5)(Dense(32, activation='relu')(dnn_out))
    #dnn_out = Dropout(0.5)(Dense(8, activation='relu')(dnn_out))
    dnn_logits = Dense(1)(dnn_out)
    return dnn_logits
    
class ReweightModel(tf.keras.Model):
    def train_step(self, data):
        """ fit()传递的数据 """
        print('data: ', data)
        if len(data) == 3:
            y_prop, y_true, sample_weight = data
        else:
            raise "This is a reweight model, your inputs should contain features, label, sample_weight"
        
        with tf.GradientTape() as tape:
            y_pred = self(y_prop, training=True)
            y_true_1d = tf.cast(tf.reshape(y_true, [-1, 1]), tf.float32) # (batch, 1)
            sample_weight_1d = tf.reshape(sample_weight, [-1, 1]) # (batch, 1)
            
            # compute binary_crossentropy loss
            loss = tf.losses.binary_crossentropy(y_true_1d, y_pred, from_logits=False)[:, None]
            # multiply sample_weight
            loss_weight = (loss * (1 - y_true_1d) + loss * (y_true_1d) * sample_weight_1d)
            # sum loss with weight
            loss_sum = tf.reduce_sum(loss_weight, name="weight_loss_sum")
            # sum sample_weight
            sample_weight_sum = tf.reduce_sum(sample_weight, name="sample_weight_sum")
            # mean batch
            loss = tf.divide(loss_sum, sample_weight_sum, name="weight_logloss")
    
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # update weight
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        self.compiled_metrics.update_state(y_true, y_pred)

        loss_tracker = {'logloss': loss}
        metrics_tracker = {m.name: m.result() for m in self.metrics}
        return {**loss_tracker, **metrics_tracker}
    
def DeepFM(feature_columns, seed=1024, l2_reg=1e-5, task='binary'):
    """ Instantiates FM architecture
    :param feature_columns 
    :param seed
    :param l2_reg L2regularization
    :return: A kears model instance
    """
    dense_input_dict, sparse_input_dict, varlen_sparse_input_dict = build_input_layers(feature_columns)
    input_list = list(dense_input_dict.values()) + list(sparse_input_dict.values()) + list(varlen_sparse_input_dict.values())
    
    # linear w * x + b
    linear_logits = get_linear_logits(dense_input_dict, sparse_input_dict, feature_columns)
    
    # fm 0.5 * [sum(vixi)**2 - sum(vixi*vixi)
    embedding_layer_dict = build_embedding_layers(feature_columns, is_linear=False)
    fm_logits = get_fm_logits(sparse_input_dict, embedding_layer_dict, feature_columns)
    
    # dnn next_a = σ(w * a + b)
    dnn_logits = get_dnn_logits(sparse_input_dict, varlen_sparse_input_dict, embedding_layer_dict, feature_columns)
    
    #output_logits = Add()([linear_logits])
    #output_logits = Add()([linear_logits, fm_logits])
    output_logits = Add()([linear_logits, fm_logits, dnn_logits])
    output_layer = Activation("sigmoid")(output_logits)
    model = ReweightModel(input_list, output_layer)
    return model

model = DeepFM(feature_columns)
model.summary()

concat_embed.shape:  (None, 28)
Model: "reweight_model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
manual_keyword_list (InputLayer [(None, 18)]         0                                            
__________________________________________________________________________________________________
manual_tag_list (InputLayer)    [(None, 11)]         0                                            
__________________________________________________________________________________________________
userid (InputLayer)             [(None, 1)]          0                                            
__________________________________________________________________________________________________
feedid (InputLayer)             [(None, 1)]          0                                            
___________________________________________________

In [11]:
# 模型训练
train_input = {f: np.array([row for row in train_df[f]]) for f in dense_column_names + sparse_column_names + varlen_sparse_column_names}
test_input = {f: np.array([row for row in test_df[f]]) for f in dense_column_names + sparse_column_names + varlen_sparse_column_names}

my_callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, verbose=2, mode='auto')
]

loss = tf.keras.losses.binary_crossentropy
model.compile('adam',
              loss=loss,
              metrics=tf.keras.metrics.AUC(name='auc'))

# 单个任务
y_list = [train_df[i].values for i in target[:1]]
model.fit(x=train_input,
          y=y_list,
          sample_weight=train_df['sample_weight'].values,
          batch_size=1024,
          epochs=100,
          validation_split=0.2,
          callbacks=my_callbacks)

Epoch 1/100
data:  ({'videoplayseconds': <tf.Tensor 'IteratorGetNext:7' shape=(None,) dtype=float32>, 'userid': <tf.Tensor 'IteratorGetNext:6' shape=(None,) dtype=int32>, 'feedid': <tf.Tensor 'IteratorGetNext:3' shape=(None,) dtype=int32>, 'authorid': <tf.Tensor 'IteratorGetNext:0' shape=(None,) dtype=int32>, 'bgm_song_id': <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=int32>, 'bgm_singer_id': <tf.Tensor 'IteratorGetNext:1' shape=(None,) dtype=int32>, 'manual_keyword_list': <tf.Tensor 'IteratorGetNext:4' shape=(None, 18) dtype=int32>, 'manual_tag_list': <tf.Tensor 'IteratorGetNext:5' shape=(None, 11) dtype=int32>}, (<tf.Tensor 'IteratorGetNext:8' shape=(None,) dtype=int8>,), <tf.Tensor 'IteratorGetNext:9' shape=(None,) dtype=float32>)
data:  ({'videoplayseconds': <tf.Tensor 'IteratorGetNext:7' shape=(None,) dtype=float32>, 'userid': <tf.Tensor 'IteratorGetNext:6' shape=(None,) dtype=int32>, 'feedid': <tf.Tensor 'IteratorGetNext:3' shape=(None,) dtype=int32>, 'authorid': <tf.Tensor

<keras.callbacks.History at 0x15ac78e0160>

In [12]:
# 模型预测与评估
result = model.predict(test_input)

for idx, target_name in enumerate(target):
    print(idx, target_name)
    model_metric(np.array([i[0] for i in result]), test_df[target_name].values)

0 read_comment
模型准确率:0.9586806034454449, AUC得分:0.9090740237023934, LogLoss:0.11337686608195455
              precision    recall  f1-score   support

           0       0.97      0.98      0.98    588439
           1       0.35      0.26      0.30     20597

    accuracy                           0.96    609036
   macro avg       0.66      0.62      0.64    609036
weighted avg       0.95      0.96      0.96    609036

1 like
模型准确率:0.9522228571053271, AUC得分:0.561270940380196, LogLoss:0.22748518754859673
              precision    recall  f1-score   support

           0       0.98      0.97      0.98    594422
           1       0.02      0.03      0.03     14614

    accuracy                           0.95    609036
   macro avg       0.50      0.50      0.50    609036
weighted avg       0.95      0.95      0.95    609036

2 click_avatar
模型准确率:0.9679756204887724, AUC得分:0.5432700738530556, LogLoss:0.11786122998572322
              precision    recall  f1-score   support

           0   

##### 不同行为加权gAUC分数

In [13]:
from scipy.stats import rankdata
from collections import defaultdict

def fast_auc(actual, predicted):
    # https://www.kaggle.com/c/riiid-test-answer-prediction/discussion/208031    
    pred_ranks = rankdata(predicted)
    n_pos = np.sum(actual)
    n_neg = len(actual) - n_pos
    return (np.sum(pred_ranks[actual == 1]) - n_pos*(n_pos+1)/2) / (n_pos*n_neg)

def uAUC(labels, preds, users):
    """ 计算uAUC """
    label_dict, pred_dict, user_flag_dict = defaultdict(lambda: []), defaultdict(lambda: []), defaultdict(lambda: False)
    for idx, label in enumerate(labels):
        user = users[idx]
        pred = preds[idx]
        label = labels[idx]
        label_dict[user].append(label)
        pred_dict[user].append(pred)
    
    # 当前用户是否全为正/负样本
    for user in set(users):
        _labels = label_dict[user]
        flag = False
        for i in range(len(_labels)-1):
            if _labels[i] != _labels[i+1]:
                flag = True
                break
        user_flag_dict[user] = flag
    
    auc_sum = 0.0
    auc_cnt = 0.0
    for user in user_flag_dict:
        if user_flag_dict[user]:
            auc = fast_auc(np.asarray(label_dict[user]), np.asarray(pred_dict[user]))
            auc_sum += auc
            auc_cnt += 1.0
    return auc_sum * 1.0 / auc_cnt

def score(result_df, action_list):
    """ 计算多个行为的加权gAUC分数 """
    weight_dict = {
        "read_comment": 4.0,  # 是否查看评论
        "like": 3.0,  # 是否点赞
        "click_avatar": 2.0,  # 是否点击头像
        "forward": 1.0,  # 是否转发
        "favorite": 1.0,  # 是否收藏
        "comment": 1.0,  # 是否发表评论
        "follow": 1.0  # 是否关注
    }
    
    score = 0.0
    score_dict = {}
    weight_sum = 0.0
    for action in action_list:
        print('action: ', action)
        labels = result_df[action].values
        preds = result_df['p'+action].values
        users = result_df['userid'].values
        weight = weight_dict[action]
        gauc = uAUC(labels, preds, users)
        score_dict[action] = gauc
        score += weight * gauc
        weight_sum += weight
    
    score /= weight_sum
    score = round(score, 4)
    return {
        'score': score,
        'score_detail': score_dict
    } 

result_df = test_df[['userid', 'feedid'] + target]
for idx, target_name in enumerate(target):
    result_df['p'+target_name] = [i[0] for i in result]
    
score(result_df, target)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


action:  read_comment
action:  like
action:  click_avatar
action:  forward


{'score': 0.5509,
 'score_detail': {'read_comment': 0.6048038200546656,
  'like': 0.5085832840871161,
  'click_avatar': 0.5314385168541587,
  'forward': 0.5011312346147583}}