#### Related CTR Papers:

1. [DCN] Deep & Cross Network for Ad Click Predictions (Stanford 2017)
2. [DIEN] Deep Interest Evolution Network for Click-Through Rate Prediction (Alibaba 2019)
3. BPR Bayesian Personalized Ranking from Implicit Feedback
4. DCN V2 Improved Deep & Cross Network and Practical Lessons for Web-scale Learning to Rank Systems
5. Deep Interest Network for Click-Through Rate Prediction
6. Deep Neural Networks for YouTube Recommendations
7. deepfm
8. maximum margin matrix factorization
9. practical-lessons-from-predicting-clicks-on-ads-at-facebook
10. Recommender-Systems-[Netflix]
11. Rendle2010FM
12. Wide_Deep Learning for Recommender Systems

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#### Libraries

In [2]:
# coding: utf-8
import os
import sys
import time
from datetime import datetime
import math
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from drive.MyDrive.jupyter_notebook.WeChat_Big_Data_Challenge.comm import ACTION_LIST, STAGE_END_DAY, FEA_COLUMN_LIST
from drive.MyDrive.jupyter_notebook.WeChat_Big_Data_Challenge.evaluation import uAUC, compute_weighted_score

#### Data Sources

##### Prepare DataFrame

In [3]:
feed_embedding_df = pd.read_csv("drive/MyDrive/jupyter_notebook/data/wechat_algo_data1/feed_embeddings.csv")
read_comment_offline_df = pd.read_csv("drive/MyDrive/jupyter_notebook/data/offline_train/offline_train_read_comment_12_concate_sample.csv")
read_comment_offline_df = read_comment_offline_df.merge(feed_embedding_df, on="feedid")
evaluate_df = pd.read_csv("drive/MyDrive/jupyter_notebook/data/evaluate/evaluate_all_13_concate_sample.csv")
evaluate_df = evaluate_df.merge(feed_embedding_df, on="feedid")
del feed_embedding_df

##### Prepare Dataset

In [4]:
SEED = 2021
batch_size = 128
read_comment_offline_df = read_comment_offline_df.sample(frac=1, random_state=SEED).reset_index(drop=True)
categorical_features = ["feedid", "authorid", "bgm_song_id", "bgm_singer_id", "userid", "device", "feed_embedding"]
continuous_features = [
    "videoplayseconds", "read_commentsum", "likesum", 
    "click_avatarsum", "forwardsum", "commentsum", "followsum", 
    "favoritesum", "read_commentsum_user", "likesum_user", "click_avatarsum_user",
    "forwardsum_user", "commentsum_user", "followsum_user", "favoritesum_user"
]
feed_continuous_features = [
    "videoplayseconds", "read_commentsum", "likesum", 
    "click_avatarsum", "forwardsum", "commentsum", "followsum", 
    "favoritesum"                        
]
user_continuous_features = [
    "read_commentsum_user", "likesum_user", "click_avatarsum_user",
    "forwardsum_user", "commentsum_user", "followsum_user", "favoritesum_user"                            
]
def get_dict_for_dataset(df):
    my_dict = {}
    for name, values in df.items():
        my_dict[name] = values.values
    return my_dict

In [5]:
def preprocess(x, test=False, model=None):
    _continuous_features_ = []
    for feature in continuous_features:
        _continuous_features_.append(x[feature])
    for feature in categorical_features:
        if feature == "feed_embedding":
            feed_embedding_str = tf.strings.split(x[feature], " ")[:-1]
            feed_embedding = tf.strings.to_number(feed_embedding_str)
            feed_embedding.set_shape([512])
        elif feature == "device":
            device = tf.cast(x["device"], dtype=tf.int32)
            device = tf.one_hot(device, 2) 
        elif feature == "feedid":
            feedid = x[feature]
        elif feature == "authorid":
            authorid = x[feature]
        elif feature == "bgm_song_id":
            bgm_song_id = x[feature]
        elif feature == "bgm_singer_id":
            bgm_singer_id = x[feature]
        elif feature == "userid":
            userid = x[feature]
    if model == "fm":
        feed_continuous_features = _continuous_features_[0:8]
        user_continuous_features = _continuous_features_[8:]
        predictors = (feed_continuous_features, user_continuous_features, feedid, authorid, bgm_song_id, bgm_singer_id, userid, device, feed_embedding)
    else:
        predictors = (_continuous_features_, feedid, authorid, bgm_song_id, bgm_singer_id, userid, device, feed_embedding)
    if test:
        return predictors
    else:
        return (predictors, x["read_comment"])
def get_dataset(which, df, batch_size=batch_size, cache=False, test=False, model=None):
    try:
        dict_df = get_dict_for_dataset(df)
        dataset = tf.data.Dataset.from_tensor_slices(
            dict_df
        )
    except:
        del dict_df, dataset
    if which == "train":
        buffer_size = 1000
        dataset = dataset.shuffle(buffer_size, seed=SEED)
    dataset = dataset.map(lambda x: preprocess(x, test=test, model=model), num_parallel_calls=tf.data.AUTOTUNE, deterministic=(which == "test"))
    dataset = dataset.batch(batch_size, drop_remainder=(not test))
    dataset = dataset.cache() if cache else dataset
    dataset = dataset.prefetch(1)
    return dataset

#### Models

##### Model Components

In [6]:
def get_metrics(metrics):
    if not metrics:
        return None
    _metrics_ = []
    if "precision" in metrics:
        _metrics_.append(tf.keras.metrics.Precision())
    if "recall" in metrics:
        _metrics_.append(tf.keras.metrics.Recall())
    return _metrics_

In [7]:
def get_callbacks(callbacks):
    callback_set = []
    if "earlystopping" in callbacks:
        C = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss', mode='auto', restore_best_weights=True,
            patience=2
        )
        callback_set.append(C)
    return callback_set

In [8]:
def print_status_bar(iteration, total, loss, metrics=None):
    metrics = " - ".join(["{}: {:.4f}".format(m.name, m.result())
                         for m in [loss] + (metrics or [])])
    end = "" if iteration < total else "\n"
    print("\r{}/{} - ".format(iteration, total) + metrics,
          end=end)

In [9]:
class vocab_lookup_layer(keras.layers.Layer):
    # https://stackoverflow.com/questions/58507400/how-to-use-tf-lookup-tables-with-tensorflow-2-0-keras-and-mlflow
    def __init__(self, vocab, num_oov_buckets, **kwargs):
      self.vocab = vocab
      self.num_oov_buckets = num_oov_buckets
      super(vocab_lookup_layer, self).__init__(**kwargs)
    def build(self, input_shape):
      vocab_initializer = tf.lookup.KeyValueTensorInitializer(
        self.vocab, tf.range(len(self.vocab), dtype=tf.int64)
      )
      self.table = tf.lookup.StaticVocabularyTable(vocab_initializer, self.num_oov_buckets)
      self.built = True
    def call(self, inputs):
      return self.table.lookup(inputs)
    def get_config(self):
      return {'vocab': self.vocab, 'num_oov_buckets': self.num_oov_buckets}
def dense_layer(n_units, previous_output, BN=False, DR=False,
                KR=None, name=None, activation="relu",
                KI='he_normal'):
    layer = keras.layers.Dense(
        n_units, 
        activation=activation, 
        kernel_initializer=KI,
        kernel_regularizer=KR
    )
    output = layer(previous_output)
    if name:
        layer._name=name
    if DR:
        output = keras.layers.Dropout(DR)(output)
    if BN:
        output = keras.layers.BatchNormalization(momentum=0.999, trainable=True)(output)
    return output
def embedding_layer(vocab, output_dim=10, name=None, num_oov_buckets=None):
    table = vocab_lookup_layer(vocab, num_oov_buckets)
    categorical_feature_input = keras.layers.Input(shape=[], dtype=tf.int64, name=name)
    indexes = keras.layers.Lambda(lambda c: table(c))(categorical_feature_input)
    embeddings = keras.layers.Embedding(input_dim=len(vocab)+num_oov_buckets, output_dim=output_dim)(indexes)
    return categorical_feature_input, embeddings

##### Wide and Deep Model

In [10]:
feedid_vocab = read_comment_offline_df["feedid"].unique()
authorid_vocab = read_comment_offline_df["authorid"].unique()
bgm_song_id_vocab = read_comment_offline_df["bgm_song_id"].unique()
bgm_singer_id_vocab = read_comment_offline_df["bgm_singer_id"].unique()
userid_vocab = read_comment_offline_df["userid"].unique()

In [11]:
def deep_model():
    keras.backend.clear_session()
    np.random.seed(SEED)
    tf.random.set_seed(SEED)
    feed_embedding_input = keras.layers.Input(shape=[512], name="feed_embedding_input")
    feed_embedding_input_h1 = dense_layer(256, feed_embedding_input, name="feed_embedding_input_h1")
    feed_embedding_input_h2 = dense_layer(128, feed_embedding_input_h1, name="feed_embedding_input_h2")
    feed_embedding_input_h3 = dense_layer(64, feed_embedding_input_h2, name="feed_embedding_input_h3")
    feedid_input, feedid_embedding = embedding_layer(feedid_vocab, num_oov_buckets=240000)
    authorid_input, authorid_embedding = embedding_layer(authorid_vocab, num_oov_buckets=40000)
    bgm_song_id_input, bgm_song_id_embedding = embedding_layer(bgm_song_id_vocab, num_oov_buckets=60000)
    bgm_singer_id_input, bgm_singer_id_embedding = embedding_layer(bgm_singer_id_vocab, num_oov_buckets=40000)
    userid_input, userid_embedding = embedding_layer(userid_vocab, num_oov_buckets=40000)
    device_input = keras.layers.Input(
        shape=(2), 
        name="device_input"
    )
    c1 = keras.layers.concatenate(
        [
            feedid_embedding, 
            authorid_embedding, 
            bgm_song_id_embedding, 
            bgm_singer_id_embedding, 
            userid_embedding,
            device_input,
            feed_embedding_input_h3
        ]
    )
    h1 = dense_layer(128, c1, name="deep_h1")
    h2 = dense_layer(64, h1, name="deep_h2")
    h3 = dense_layer(32, h2, name="deep_h3")
    output = keras.layers.Dense(1, name="deep_model_output")(h3)
    inputs=[
          feedid_input,
          authorid_input,
          bgm_song_id_input,
          bgm_singer_id_input,
          userid_input,
          device_input,
          feed_embedding_input
    ]
    model = keras.models.Model(
      inputs=inputs,
      outputs=output,
      name="deep_model"
    )
    return inputs, model
def wide_model():
    keras.backend.clear_session()
    np.random.seed(SEED)
    tf.random.set_seed(SEED)
    wide_features_input = keras.layers.Input(
        shape=(len(continuous_features)), 
        name="wide_model_input"
    )
    output = keras.layers.Dense(1, name="wide_model_output")(wide_features_input)
    model = keras.models.Model(
      inputs=wide_features_input,
      outputs=output,
      name="wide_model"
    )
    return [wide_features_input], model
def wide_and_deep_model():
    keras.backend.clear_session()
    np.random.seed(SEED)
    tf.random.set_seed(SEED)
    deep_inputs, deep_model_layer = deep_model()
    wide_inputs, wide_model_layer = wide_model()
    deep_output = deep_model_layer(deep_inputs)
    wide_output = wide_model_layer(wide_inputs)
    output = tf.nn.sigmoid(wide_output+deep_output)
    inputs = wide_inputs + deep_inputs 
    model = keras.models.Model(
      inputs=inputs,
      outputs=output
    )
    return model

##### FM Model

In [32]:
def fm_model():
    keras.backend.clear_session()
    np.random.seed(SEED)
    tf.random.set_seed(SEED)
    feed_embedding_input = keras.layers.Input(shape=[512], name="feed_embedding_input")
    feed_embedding_input_h1 = dense_layer(256, feed_embedding_input, name="feed_embedding_input_h1")
    feed_embedding_input_h2 = dense_layer(128, feed_embedding_input_h1, name="feed_embedding_input_h2")
    feed_embedding_input_h3 = dense_layer(64, feed_embedding_input_h2, name="feed_embedding_input_h3")
    feedid_input, feedid_embedding = embedding_layer(feedid_vocab, num_oov_buckets=240000)
    authorid_input, authorid_embedding = embedding_layer(authorid_vocab, num_oov_buckets=40000)
    bgm_song_id_input, bgm_song_id_embedding = embedding_layer(bgm_song_id_vocab, num_oov_buckets=60000)
    bgm_singer_id_input, bgm_singer_id_embedding = embedding_layer(bgm_singer_id_vocab, num_oov_buckets=40000)
    userid_input, userid_embedding = embedding_layer(userid_vocab, num_oov_buckets=40000, output_dim=64)
    feed_continuous_features_input = keras.layers.Input(
        shape=(len(feed_continuous_features)), 
        dtype=tf.float32, 
        name="feed_continuous_features_input"
    )
    user_continuous_features_input = keras.layers.Input(
        shape=(len(user_continuous_features)), 
        dtype=tf.float32, 
        name="user_continuous_features_input"
    )
    device_input = keras.layers.Input(
        shape=(2), 
        name="device_input"
    )
    feed = keras.layers.concatenate(
        [
            feed_embedding_input_h3,
            feedid_embedding, 
            authorid_embedding, 
            bgm_song_id_embedding, 
            bgm_singer_id_embedding, 
            feed_continuous_features_input,
        ]
    )
    user = keras.layers.concatenate(
        [
            device_input,
            userid_embedding, 
            user_continuous_features_input
        ]
    )
    feed_output = dense_layer(256, feed, name="feed_h1", KR="l2")
    user_output = dense_layer(256, user, name="user_h1", KR="l2")
    output = tf.keras.layers.Dot(axes=1, name="dot")([feed_output, user_output])
    inputs=[
          feed_continuous_features_input,
          user_continuous_features_input,
          feedid_input,
          authorid_input,
          bgm_song_id_input,
          bgm_singer_id_input,
          userid_input,
          device_input,
          feed_embedding_input
    ]
    model = keras.models.Model(
      inputs=inputs,
      outputs=output
    )
    return model

##### Deep Cross Network Model

In [13]:
%pip install -q tensorflow-recommenders
import tensorflow_recommenders as tfrs

[?25l[K     |███▉                            | 10 kB 15.7 MB/s eta 0:00:01[K     |███████▊                        | 20 kB 10.3 MB/s eta 0:00:01[K     |███████████▌                    | 30 kB 6.9 MB/s eta 0:00:01[K     |███████████████▍                | 40 kB 3.6 MB/s eta 0:00:01[K     |███████████████████▏            | 51 kB 4.1 MB/s eta 0:00:01[K     |███████████████████████         | 61 kB 4.6 MB/s eta 0:00:01[K     |███████████████████████████     | 71 kB 4.6 MB/s eta 0:00:01[K     |██████████████████████████████▊ | 81 kB 5.2 MB/s eta 0:00:01[K     |████████████████████████████████| 85 kB 2.2 MB/s 
[?25h

In [44]:
def dcn_model():
    keras.backend.clear_session()
    np.random.seed(SEED)
    tf.random.set_seed(SEED)
    feed_embedding_input = keras.layers.Input(shape=[512])
    feedid_input, feedid_embedding = embedding_layer(feedid_vocab, num_oov_buckets=240000)
    authorid_input, authorid_embedding = embedding_layer(authorid_vocab, num_oov_buckets=40000)
    bgm_song_id_input, bgm_song_id_embedding = embedding_layer(bgm_song_id_vocab, num_oov_buckets=60000)
    bgm_singer_id_input, bgm_singer_id_embedding = embedding_layer(bgm_singer_id_vocab, num_oov_buckets=40000)
    userid_input, userid_embedding = embedding_layer(userid_vocab, num_oov_buckets=40000, output_dim=64)
    continuous_features_input = keras.layers.Input(
        shape=(len(continuous_features))
    )
    device_input = keras.layers.Input(
        shape=(2)
    )
    c1 = keras.layers.concatenate(
        [
            continuous_features_input,
            feedid_embedding, 
            authorid_embedding, 
            bgm_song_id_embedding, 
            bgm_singer_id_embedding, 
            userid_embedding, 
            device_input,
            feed_embedding_input
        ]
    )
    x1 = tfrs.layers.dcn.Cross()(c1, c1)
    x2 = tfrs.layers.dcn.Cross()(c1, x1)
    x3 = tfrs.layers.dcn.Cross()(c1, x2)
    x4 = tfrs.layers.dcn.Cross()(c1, x3)
    x5 = tfrs.layers.dcn.Cross()(c1, x4)
    x6 = tfrs.layers.dcn.Cross()(c1, x5)
    h1 = dense_layer(1024, c1)
    h2 = dense_layer(1024, h1, DR=0.50)
    c2 = keras.layers.concatenate(
        [
            x6,
            h2
        ]
    )
    output = dense_layer(1, c2, activation="sigmoid")
    inputs=[
          continuous_features_input,
          feedid_input,
          authorid_input,
          bgm_song_id_input,
          bgm_singer_id_input,
          userid_input,
          device_input,
          feed_embedding_input
    ]
    model = keras.models.Model(
      inputs=inputs,
      outputs=output
    )
    return model

##### Deep Cross Network Version 2 Model

In [15]:
class crossV2(keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
    def build(self, batch_input_shape):
        self.kernel = self.add_weight(
            name="kernel", shape=[int(batch_input_shape[-1]), int(batch_input_shape[-1])],
            initializer="he_normal")
        self.bias = self.add_weight(
            name="bias", shape=[int(batch_input_shape[-1])], initializer="zeros")
        super().build(batch_input_shape) 
    def call(self, X0, X):
        return X0*(tf.transpose(self.kernel @ tf.transpose(X)) + self.bias) + X
    def compute_output_shape(self, batch_input_shape):
        return tf.TensorShape(batch_input_shape.as_list()[:-1] + [batch_input_shape[-1]])

In [45]:
def dcn_v2_model():
    keras.backend.clear_session()
    np.random.seed(SEED)
    tf.random.set_seed(SEED)
    feed_embedding_input = keras.layers.Input(shape=[512])
    feedid_input, feedid_embedding = embedding_layer(feedid_vocab, num_oov_buckets=240000)
    authorid_input, authorid_embedding = embedding_layer(authorid_vocab, num_oov_buckets=40000)
    bgm_song_id_input, bgm_song_id_embedding = embedding_layer(bgm_song_id_vocab, num_oov_buckets=60000)
    bgm_singer_id_input, bgm_singer_id_embedding = embedding_layer(bgm_singer_id_vocab, num_oov_buckets=40000)
    userid_input, userid_embedding = embedding_layer(userid_vocab, num_oov_buckets=40000, output_dim=64)
    continuous_features_input = keras.layers.Input(
        shape=(len(continuous_features))
    )
    device_input = keras.layers.Input(
        shape=(2)
    )
    c1 = keras.layers.concatenate(
        [
            continuous_features_input,
            feedid_embedding, 
            authorid_embedding, 
            bgm_song_id_embedding, 
            bgm_singer_id_embedding, 
            userid_embedding, 
            device_input,
            feed_embedding_input
        ]
    )
    x1 = crossV2()(c1, c1)
    x2 = crossV2()(c1, x1)
    x3 = crossV2()(c1, x2)
    x4 = crossV2()(c1, x3)
    x5 = crossV2()(c1, x4)
    x6 = crossV2()(c1, x5)
    h1 = dense_layer(1024, c1)
    h2 = dense_layer(1024, h1, DR=0.50)
    c2 = keras.layers.concatenate(
        [
            x6,
            h2
        ]
    )
    output = dense_layer(1, c2, activation="sigmoid", KR="l2")
    inputs=[
          continuous_features_input,
          feedid_input,
          authorid_input,
          bgm_song_id_input,
          bgm_singer_id_input,
          userid_input,
          device_input,
          feed_embedding_input
    ]
    model = keras.models.Model(
      inputs=inputs,
      outputs=output
    )
    return model

##### Light Gradient Boosting Tree From Others (When ensembled with DNN, the ensembled model shows a significant improvement )

In [None]:
# https://developers.weixin.qq.com/community/minihome/article/doc/0006467d05427892b94c341aa56813
from sklearn.metrics import roc_auc_score
from lightgbm.sklearn import LGBMClassifier
from collections import defaultdict
import gc
import time
from tqdm import tqdm
pd.set_option('display.max_columns', None)
def reduce_mem(df, cols):
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in tqdm(cols):
        col_type = df[col].dtypes
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    gc.collect()
    return df
## 从官方baseline里面抽出来的评测函数
def uAUC(labels, preds, user_id_list):
    """Calculate user AUC"""
    user_pred = defaultdict(lambda: [])
    user_truth = defaultdict(lambda: [])
    for idx, truth in enumerate(labels):
        user_id = user_id_list[idx]
        pred = preds[idx]
        truth = labels[idx]
        user_pred[user_id].append(pred)
        user_truth[user_id].append(truth)
    user_flag = defaultdict(lambda: False)
    for user_id in set(user_id_list):
        truths = user_truth[user_id]
        flag = False
        # 若全是正样本或全是负样本，则flag为False
        for i in range(len(truths) - 1):
            if truths[i] != truths[i + 1]:
                flag = True
                break
        user_flag[user_id] = flag
    total_auc = 0.0
    size = 0.0
    for user_id in user_flag:
        if user_flag[user_id]:
            auc = roc_auc_score(np.asarray(user_truth[user_id]), np.asarray(user_pred[user_id]))
            total_auc += auc 
            size += 1.0
    user_auc = float(total_auc)/size
    return user_auc
y_list = ['read_comment', 'like', 'click_avatar', 'forward', 'favorite', 'comment', 'follow']
max_day = 15
## 读取训练集
train = pd.read_csv('drive/MyDrive/jupyter_notebook/data/wechat_algo_data1/user_action.csv')
print(train.shape)
for y in y_list:
    print(y, train[y].mean())
## 读取测试集
test = pd.read_csv('drive/MyDrive/jupyter_notebook/data/wechat_algo_data1/test_a.csv')
test['date_'] = max_day
print(test.shape)
## 合并处理
df = pd.concat([train, test], axis=0, ignore_index=True)
print(df.head(3))
## 读取视频信息表
feed_info = pd.read_csv("drive/MyDrive/jupyter_notebook/data/wechat_algo_data1/feed_info.csv")
## 此份baseline只保留这三列
feed_info = feed_info[[
    'feedid', 'authorid', 'videoplayseconds', 'bgm_song_id', 'bgm_singer_id',
]]
df = df.merge(feed_info, on='feedid', how='left')
## 视频时长是秒，转换成毫秒，才能与play、stay做运算
df['videoplayseconds'] *= 1000
## 是否观看完视频（其实不用严格按大于关系，也可以按比例，比如观看比例超过0.9就算看完）
df['is_finish'] = (df['play'] >= df['videoplayseconds']).astype('int8')
df['play_times'] = df['play'] / df['videoplayseconds']
play_cols = [
    'is_finish', 'play_times', 'play', 'stay'
]
## 统计历史5天的曝光、转化、视频观看等情况（此处的转化率统计其实就是target encoding）
n_day = 5
for stat_cols in tqdm([
    ['userid'],
    ['feedid'],
    ['authorid'],
    ['userid', 'authorid']
]):
    f = '_'.join(stat_cols)
    stat_df = pd.DataFrame()
    for target_day in range(2, max_day + 1):
        left, right = max(target_day - n_day, 1), target_day - 1
        tmp = df[((df['date_'] >= left) & (df['date_'] <= right))].reset_index(drop=True)
        tmp['date_'] = target_day
        tmp['{}_{}day_count'.format(f, n_day)] = tmp.groupby(stat_cols)['date_'].transform('count')
        g = tmp.groupby(stat_cols)
        tmp['{}_{}day_finish_rate'.format(f, n_day)] = g[play_cols[0]].transform('mean')
        feats = ['{}_{}day_count'.format(f, n_day), '{}_{}day_finish_rate'.format(f, n_day)]
        for x in play_cols[1:]:
            for stat in ['max', 'mean']:
                tmp['{}_{}day_{}_{}'.format(f, n_day, x, stat)] = g[x].transform(stat)
                feats.append('{}_{}day_{}_{}'.format(f, n_day, x, stat))
        for y in y_list[:4]:
            tmp['{}_{}day_{}_sum'.format(f, n_day, y)] = g[y].transform('sum')
            tmp['{}_{}day_{}_mean'.format(f, n_day, y)] = g[y].transform('mean')
            feats.extend(['{}_{}day_{}_sum'.format(f, n_day, y), '{}_{}day_{}_mean'.format(f, n_day, y)])
        tmp = tmp[stat_cols + feats + ['date_']].drop_duplicates(stat_cols + ['date_']).reset_index(drop=True)
        stat_df = pd.concat([stat_df, tmp], axis=0, ignore_index=True)
        del g, tmp
    df = df.merge(stat_df, on=stat_cols + ['date_'], how='left')
    del stat_df
    gc.collect()
## 全局信息统计，包括曝光、偏好等，略有穿越，但问题不大，可以上分，只要注意不要对userid-feedid做组合统计就行
for f in tqdm(['userid', 'feedid', 'authorid']):
    df[f + '_count'] = df[f].map(df[f].value_counts())
for f1, f2 in tqdm([
    ['userid', 'feedid'],
    ['userid', 'authorid']
]):
    df['{}_in_{}_nunique'.format(f1, f2)] = df.groupby(f2)[f1].transform('nunique')
    df['{}_in_{}_nunique'.format(f2, f1)] = df.groupby(f1)[f2].transform('nunique')
for f1, f2 in tqdm([
    ['userid', 'authorid']
]):
    df['{}_{}_count'.format(f1, f2)] = df.groupby([f1, f2])['date_'].transform('count')
    df['{}_in_{}_count_prop'.format(f1, f2)] = df['{}_{}_count'.format(f1, f2)] / (df[f2 + '_count'] + 1)
    df['{}_in_{}_count_prop'.format(f2, f1)] = df['{}_{}_count'.format(f1, f2)] / (df[f1 + '_count'] + 1)
df['videoplayseconds_in_userid_mean'] = df.groupby('userid')['videoplayseconds'].transform('mean')
df['videoplayseconds_in_authorid_mean'] = df.groupby('authorid')['videoplayseconds'].transform('mean')
df['feedid_in_authorid_nunique'] = df.groupby('authorid')['feedid'].transform('nunique')
## 内存够用的不需要做这一步
df = reduce_mem(df, [f for f in df.columns if f not in ['date_'] + play_cols + y_list])
train = df[~df['read_comment'].isna()].reset_index(drop=True)
test = df[df['read_comment'].isna()].reset_index(drop=True)
cols = [f for f in df.columns if f not in ['date_'] + play_cols + y_list]
print(train[cols].shape)
trn_x = train[train['date_'] < 14].reset_index(drop=True)
val_x = train[train['date_'] == 14].reset_index(drop=True)
##################### 线下验证 #####################
uauc_list = []
r_list = []
for y in y_list[:4]:
    print('=========', y, '=========')
    t = time.time()
    clf = LGBMClassifier(
        learning_rate=0.05,
        n_estimators=5000,
        num_leaves=63,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=2021,
        metric='None'
    )
    clf.fit(
        trn_x[cols], trn_x[y],
        eval_set=[(val_x[cols], val_x[y])],
        eval_metric='auc',
        early_stopping_rounds=100,
        verbose=50
    )
    val_x[y + '_score'] = clf.predict_proba(val_x[cols])[:, 1]
    val_uauc = uAUC(val_x[y], val_x[y + '_score'], val_x['userid'])
    uauc_list.append(val_uauc)
    print(val_uauc)
    r_list.append(clf.best_iteration_)
    print('runtime: {}\n'.format(time.time() - t))
weighted_uauc = 0.4 * uauc_list[0] + 0.3 * uauc_list[1] + 0.2 * uauc_list[2] + 0.1 * uauc_list[3]
print(uauc_list)
print(weighted_uauc)
##################### 全量训练 #####################
r_dict = dict(zip(y_list[:4], r_list))
for y in y_list[:4]:
    print('=========', y, '=========')
    t = time.time()
    clf = LGBMClassifier(
        learning_rate=0.05,
        n_estimators=r_dict[y],
        num_leaves=63,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=2021
    )
    clf.fit(
        train[cols], train[y],
        eval_set=[(train[cols], train[y])],
        early_stopping_rounds=r_dict[y],
        verbose=100

    )
    test[y] = clf.predict_proba(test[cols])[:, 1]
    print('runtime: {}\n'.format(time.time() - t))
test[['userid', 'feedid'] + y_list[:4]].to_csv(
    'sub_%.6f_%.6f_%.6f_%.6f_%.6f.csv' % (weighted_uauc, uauc_list[0], uauc_list[1], uauc_list[2], uauc_list[3]),
    index=False
)

#### Train Model

In [20]:
negative = sum(read_comment_offline_df["read_comment"]==0)
positive = sum(read_comment_offline_df["read_comment"]==1)
total = len(read_comment_offline_df)
negative_weight = (1 / negative) * (total / 2.0)
positive_weight = (1 / positive) * (total / 2.0)
weight = {0: negative_weight, 1: positive_weight}

In [27]:
def train_model(n_epochs=4, dataset=None, valid_dataset=None, batch_size=128, model=None, model_name=None, 
                deep_optimizer=keras.optimizers.Adam(learning_rate=0.01), loss_fn = keras.losses.BinaryCrossentropy(),
                metrics = get_metrics(["precision", "recall"])):
    deep_optimizer = deep_optimizer
    wide_optimizer = keras.optimizers.Ftrl(learning_rate=0.01)
    loss_fn = loss_fn
    metrics = metrics
    callbacks = get_callbacks(["earlystopping"])
    if model_name == "wide_and_deep":
        deep_layers = model.get_layer("deep_model")
        wide_layers = model.get_layer("wide_model")
        mean_loss = keras.metrics.Mean()
        for epoch in range(1, n_epochs + 1):
            print("Epoch {}/{}".format(epoch, n_epochs))
            step = 1
            for step_dataset in dataset:
                X, y = step_dataset[0], step_dataset[1]
                with tf.GradientTape(persistent=True) as tape:
                    y_pred = model(X, training=True)
                    positive_instances_loss = loss_fn(y, y_pred) * positive_weight * tf.cast(y == 1, dtype=tf.float32)
                    negative_instances_loss = loss_fn(y, y_pred) * negative_weight * tf.cast(y == 0, dtype=tf.float32)
                    instances_loss = positive_instances_loss + negative_instances_loss
                    main_loss = tf.reduce_mean(instances_loss)
                    loss = tf.add_n([main_loss] + model.losses)
                for layers, optimizer in ((deep_layers, deep_optimizer),
                                              (wide_layers, wide_optimizer)):
                    gradients = tape.gradient(loss, layers.trainable_variables)
                    optimizer.apply_gradients(zip(gradients, layers.trainable_variables))
                del tape
                mean_loss(loss)
                for metric in metrics:
                    metric(y, y_pred)
                print_status_bar(step * batch_size, len(dataset)*batch_size, mean_loss, metrics)
                step += 1
            print_status_bar(len(dataset)*batch_size, len(dataset)*batch_size, mean_loss, metrics)
            for metric in [mean_loss] + metrics:
                metric.reset_states()
            for step_dataset in valid_dataset:
                X, y = step_dataset[0], step_dataset[1]
                y_pred = model(X, training=True)
                instances_loss = loss_fn(y, y_pred)
                loss = tf.reduce_mean(instances_loss)               
                mean_loss(loss)
                for metric in metrics:
                    metric(y, y_pred)
            print_status_bar("validation", "validation", mean_loss, metrics)
            for metric in [mean_loss] + metrics:
                metric.reset_states()
    else:
        model.compile(optimizer=deep_optimizer, loss=loss_fn, metrics=metrics)
        model.fit(dataset, validation_data=valid_dataset, callbacks=callbacks, epochs=n_epochs, class_weight=weight) 

In [46]:
my_wide_and_deep_model = wide_and_deep_model()
my_fm_model = fm_model()
my_dcn_model = dcn_model()
my_dcn_v2_model = dcn_v2_model()

In [25]:
dataset = get_dataset("train", read_comment_offline_df)
valid_dataset = get_dataset("valid", evaluate_df)
fm_dataset = get_dataset("train", read_comment_offline_df, model="fm")
fm_valid_dataset = get_dataset("valid", read_comment_offline_df, model="fm")
test_dataset = get_dataset("test", evaluate_df, test=True)
fm_test_dataset = get_dataset("test", evaluate_df, test=True, model="fm")

In [28]:
train_model(n_epochs=4, dataset=dataset, valid_dataset=valid_dataset, batch_size=128, model=my_wide_and_deep_model, model_name="wide_and_deep")

Epoch 1/4
619776/619776 - mean: 0.2354 - precision: 0.6979 - recall: 0.6084
619776/619776 - mean: 0.2354 - precision: 0.6979 - recall: 0.6084
validation/validation - mean: 0.1446 - precision: 0.2881 - recall: 0.6076
Epoch 2/4
619776/619776 - mean: 0.1995 - precision: 0.7194 - recall: 0.6892
619776/619776 - mean: 0.1995 - precision: 0.7194 - recall: 0.6892
validation/validation - mean: 0.1540 - precision: 0.2781 - recall: 0.6250
Epoch 3/4
619776/619776 - mean: 0.1724 - precision: 0.7321 - recall: 0.7484
619776/619776 - mean: 0.1724 - precision: 0.7321 - recall: 0.7484
validation/validation - mean: 0.1744 - precision: 0.2451 - recall: 0.6806
Epoch 4/4
619776/619776 - mean: 0.1543 - precision: 0.7414 - recall: 0.7893
619776/619776 - mean: 0.1543 - precision: 0.7414 - recall: 0.7893
validation/validation - mean: 0.1898 - precision: 0.2274 - recall: 0.7017


In [37]:
train_model(n_epochs=4, dataset=fm_dataset, valid_dataset=fm_valid_dataset, model=my_fm_model, deep_optimizer=keras.optimizers.Adam(learning_rate=0.001), loss_fn=keras.losses.mse, metrics=[]) 

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [47]:
train_model(n_epochs=8, dataset=dataset, valid_dataset=valid_dataset, model=my_dcn_model, deep_optimizer=keras.optimizers.Adam(learning_rate=0.001))

Epoch 1/8
Epoch 2/8
Epoch 3/8


In [48]:
train_model(n_epochs=8, dataset=dataset, valid_dataset=valid_dataset, model=my_dcn_v2_model, deep_optimizer=keras.optimizers.Adam(learning_rate=0.001))

Epoch 1/8
Epoch 2/8
Epoch 3/8


#### Evaluate Model

In [29]:
userid_list = evaluate_df["userid"].astype(str).tolist()
labels = evaluate_df["read_comment"].tolist()
def uauc_evaluate(model, model_name, test_dataset):
    def predict(model, test_dataset):
        all_predictions = None
        all_predictions_flag = False
        for x in test_dataset:
            batch_predictions = model(x)
            if not all_predictions_flag:
                all_predictions = batch_predictions
                all_predictions_flag = True
            else:
                all_predictions = np.row_stack([all_predictions, batch_predictions])
        return all_predictions
    predictions = predict(model, test_dataset)
    print(f"{model_name} uAUC:{uAUC(labels, predictions, userid_list)}")

In [30]:
print(uauc_evaluate(my_wide_and_deep_model, "my_wide_and_deep_model", test_dataset))

my_wide_and_deep_model uAUC:0.5802043553435963
None


In [41]:
print(uauc_evaluate(my_fm_model, "my_fm_model", fm_test_dataset))

my_fm_model uAUC:0.5986379389857975
None


In [49]:
print(uauc_evaluate(my_dcn_model, "my_dcn_model", test_dataset))

my_dcn_model uAUC:0.6144456315323304
None


In [50]:
print(uauc_evaluate(my_dcn_v2_model, "my_dcn_v2_model", test_dataset))

my_dcn_v2_model uAUC:0.616542769765119
None
