# MMOE
## dfn ctr

In [2]:
import datetime
import numpy as np
import pandas as pd
import joblib
import warnings
import logging
import os
import gc
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import collections
import re
import copy
import tensorflow as tf

import utils_

from tqdm import tqdm
from dateutil.relativedelta import relativedelta
from joblib import Parallel, delayed
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.inspection import permutation_importance
# from deepctr.feature_column import SparseFeat, DenseFeat
# from deepctr.models import MMOE
from tensorflow import keras

from utils.feature_column import SparseFeat, DenseFeat
from model.mmoe import MMOE

# pd.set_option('display.max_columns', None)
# pd.set_option('max_row', 500)
warnings.filterwarnings('ignore')
tqdm.pandas(desc='pandas bar')

# 数据处理

## 数据

In [None]:
df_20211219 = pd.read_csv('../data/sample_dfn/union/zfbx_click_resource_ctr_sample_label_feature_union_new_dfn_ctr1_20211219.txt', sep='\t', encoding='utf-8')
print(df_20211219.shape)
df_20211219.head()

In [None]:
df_20211219.groupby(by=['obs_dt', 'dt'])['uid'].count()

In [None]:
df_20211219['label_click'].value_counts()

In [None]:
347870 / 2408693

In [None]:
df_20211219['label_click_apply'].value_counts()

In [None]:
117459 / 347870

In [None]:
117459 / 2408693

In [None]:
# 特征
list_feats_x = utils_.load_pickle('../data/sample_dfn/union/list_feats/list_feats_x_new_adj.pickle')
print(len(list_feats_x))
list_feats_x[:10]

In [None]:
df_20211219['obs_dt'] = pd.to_datetime(df_20211219['obs_dt'])
df_20211219['dt'] = pd.to_datetime(df_20211219['dt'])
df_20211219.head()

In [None]:
utils_.save_pickle(df_20211219[['uid', 'obs_dt', 'dt', 'label_click', 'label_click_apply']+list_feats_x], 
                  '../data/sample_dfn/union/df_new_20211219.pickle')

## 划分训练（包含验证）、测试集

In [None]:
df_20211219 = utils_.load_pickle('../data/sample_dfn/union/df_new_20211219.pickle')
print(df_20211219.shape)
df_20211219.head()

In [None]:
# Split on de-duplicated (uid, obs_dt) pairs.
# The data set covers one week, but features are refreshed weekly, so all rows of
# the same uid within those 7 days carry identical features.
# The same uid under a different obs_dt stays a separate key and is kept.
df_id = df_20211219[['uid', 'obs_dt']].drop_duplicates()
print(df_id.shape)
df_id.head()

In [None]:
# 比例：8:2
df_id_train = df_id.sample(frac=0.8, random_state=2021).reset_index(drop=True)
print(df_id_train.shape)
df_id_train.head()

In [None]:
# Test ids = the (uid, obs_dt) pairs that were not sampled into df_id_train.
# Stacking both frames and dropping with keep=False removes every pair that occurs
# more than once, i.e. every pair present in both frames.
# Assumes df_id has no duplicate rows and df_id_train is a subset of it.
df_id_test = pd.concat([df_id, df_id_train], axis=0)
df_id_test.drop_duplicates(keep=False, inplace=True)
df_id_test.reset_index(drop=True, inplace=True)
print(df_id_test.shape)
df_id_test.head()

In [None]:
%%time
df_train = df_id_train.merge(df_20211219, on=['uid', 'obs_dt'], how='left')
print(df_train.shape)
df_train.head()

In [None]:
utils_.save_pickle(df_train, '../data/sample_dfn/union/df_train_new_20211219.pickle')

In [None]:
%%time
df_test = df_id_test.merge(df_20211219, on=['uid', 'obs_dt'], how='left')
print(df_test.shape)
df_test.head()

In [None]:
utils_.save_pickle(df_test, '../data/sample_dfn/union/df_test_new_20211219.pickle')

In [None]:
# Training-set processing: features have weekly granularity while samples are daily,
# so rows of the same sample (uid, obs_date) are merged into one
# (label = whether a click happened within the period).
df_train = utils_.load_pickle('../data/sample_dfn/union/df_train_new_20211219.pickle')
print(df_train.shape)
df_train.head()

In [None]:
# Collapse the two labels to one row per (uid, obs_dt) by taking the group maximum,
# so a label is positive if any row of the group is positive (presumably 0/1 labels).
df_train_unique_label = df_train[['uid', 'obs_dt', 'label_click', 'label_click_apply']].\
    groupby(by=['uid', 'obs_dt']).max()
df_train_unique_label.reset_index(inplace=True)
print(df_train_unique_label.shape)
df_train_unique_label.head()

In [None]:
df_train_unique_label.rename(columns={'label_click': 'label_click_combine', 
                                      'label_click_apply': 'label_click_apply_combine'}, 
                             inplace=True)
df_train_unique_label.head()

In [None]:
%%time
df_train_unique = df_train.\
    merge(df_train_unique_label, on=['uid', 'obs_dt'], how='left')
df_train_unique.drop(columns=['dt', 'label_click', 'label_click_apply'], inplace=True)
print(df_train_unique.shape)
df_train_unique.head()

In [None]:
df_train_unique.rename(columns={'label_click_combine': 'label_click', 
                                'label_click_apply_combine': 'label_click_apply'}, 
                       inplace=True)
df_train_unique.head()

In [None]:
df_train_unique.drop_duplicates(inplace=True)
df_train_unique.reset_index(drop=True, inplace=True)
print(df_train_unique.shape)
df_train_unique.head()

In [None]:
utils_.save_pickle(df_train_unique, '../data/sample_dfn/union/df_train_unique_new_20211219.pickle')

In [None]:
# id，X，y
df_train_unique = utils_.load_pickle('../data/sample_dfn/union/df_train_unique_new_20211219.pickle')

df_train_unique_uid_obsDate = df_train_unique[['uid', 'obs_dt']]
df_train_unique_y = df_train_unique[['label_click', 'label_click_apply']]
df_train_unique_X = df_train_unique[[x for x in df_train_unique.columns 
                                     if x not in ['uid', 'obs_dt', 'label_click', 'label_click_apply']]]

utils_.save_pickle(df_train_unique_uid_obsDate, 
                  '../data/sample_dfn/union/df_train_unique_uid_obsDate_new_20211219.pickle')
utils_.save_pickle(df_train_unique_y, '../data/sample_dfn/union/df_train_unique_y_new_20211219.pickle')
utils_.save_pickle(df_train_unique_X, '../data/sample_dfn/union/df_train_unique_X_new_20211219.pickle')

In [None]:
df_test = utils_.load_pickle('../data/sample_dfn/union/df_test_new_20211219.pickle')

df_test_uid_obsDate_dt = df_test[['uid', 'obs_dt', 'dt']]
df_test_y = df_test[['label_click', 'label_click_apply']]
df_test_X = df_test[[x for x in df_test.columns 
                     if x not in ['uid', 'obs_dt', 'dt', 'label_click', 'label_click_apply']]]

utils_.save_pickle(df_test_uid_obsDate_dt, 
                  '../data/sample_dfn/union/df_test_uid_obsDate_dt_new_20211219.pickle')
utils_.save_pickle(df_test_y, '../data/sample_dfn/union/df_test_y_new_20211219.pickle')
utils_.save_pickle(df_test_X, '../data/sample_dfn/union/df_test_X_new_20211219.pickle')

# 入参处理

In [None]:
df_train_unique_uid_obsDate = utils_.load_pickle('../data/sample_dfn/union/df_train_unique_uid_obsDate_new_20211219.pickle')
df_train_unique_y = utils_.load_pickle('../data/sample_dfn/union/df_train_unique_y_new_20211219.pickle')
df_train_unique_X = utils_.load_pickle('../data/sample_dfn/union/df_train_unique_X_new_20211219.pickle')

print(df_train_unique_uid_obsDate.shape)
print(df_train_unique_y.shape)
print(df_train_unique_X.shape)

In [None]:
list_feats_x = utils_.load_pickle('../data/sample_dfn/union/list_feats/list_feats_x_new_adj.pickle')
print(len(list_feats_x))
list_feats_x[:10]

In [None]:
sparse_features = [
    'xxx'
]
print(len(sparse_features))

In [None]:
utils_.save_pickle(sparse_features, '../data/sample_dfn/union/list_feats/list_sparse_features.pickle')

In [None]:
df_train_unique_X[sparse_features].head()

In [None]:
dense_features = [x for x in list_feats_x if x not in sparse_features]
print(len(dense_features))
dense_features[:10]

In [None]:
utils_.save_pickle(dense_features, '../data/sample_dfn/union/list_feats/list_dense_features.pickle')

In [None]:
mms = MinMaxScaler()
train_unique_X_mms = mms.fit_transform(df_train_unique_X[dense_features])
joblib.dump(mms, '../data/sample_dfn/union/scaler/mms.pickle')

In [None]:
mms = joblib.load('../data/sample_dfn/union/scaler/mms.pickle')
train_unique_X_mms = mms.transform(df_train_unique_X[dense_features])
print(train_unique_X_mms.shape)

In [None]:
mms = joblib.load('../data/sample_dfn/union/scaler/mms.pickle')

In [None]:
mms.data_min_

In [None]:
mms.data_max_

In [None]:
df_train_unique_X[dense_features].max()

In [None]:
df_train_unique_X[dense_features].min()

In [None]:
(39+99)/(117+99)*(1-0)+0

In [None]:
df_train_unique_X_dense = pd.DataFrame(train_unique_X_mms, columns=dense_features)
print(df_train_unique_X_dense.shape)
df_train_unique_X_dense.head()

In [None]:
df_train_unique_X[dense_features].head()

In [None]:
# Persist the min-max-scaled dense training features.
# The helper module is imported as `utils_`; the bare name `utils` is never bound in this notebook.
utils_.save_pickle(df_train_unique_X_dense, '../data/sample_dfn/union/df_train_unique_X_dense_20211219.pickle')

In [None]:
# Fit one LabelEncoder per sparse feature on the training data, keep the fitted
# encoders by feature name, and collect the integer-encoded columns.
dict_lbe = {}
list_df_train_unique_X_sparse = []

# The `with` block already closes the progress bar on every exit path (including
# Ctrl-C), so no extra try/except or trailing close() is needed.
with tqdm(sparse_features) as t:
    for x in t:
        lbe = LabelEncoder()
        encoded = lbe.fit_transform(df_train_unique_X[x])
        dict_lbe[x] = lbe
        list_df_train_unique_X_sparse.append(pd.DataFrame(encoded, columns=[x]))

# The encoders are saved so later cells can apply the same mapping to other data sets.
joblib.dump(dict_lbe, '../data/sample_dfn/union/scaler/dict_lbe.pickle')
df_train_unique_X_sparse = pd.concat(list_df_train_unique_X_sparse, axis=1)
print(df_train_unique_X_sparse.shape)
df_train_unique_X_sparse.head()

In [None]:
dict_lbe = joblib.load('../data/sample_dfn/union/scaler/dict_lbe.pickle')
dict_lbe

In [None]:
df_train_unique_X[sparse_features].head()

In [None]:
for k, v in dict_lbe.items():
    print('key:{ke}, collect_set:{cs}'.format(ke=k, cs=v.classes_))

In [None]:
df_train_unique_X['b_edu_type'].value_counts()

In [None]:
# Persist the label-encoded sparse training features (helper module is `utils_`, not `utils`).
utils_.save_pickle(df_train_unique_X_sparse, '../data/sample_dfn/union/df_train_unique_X_sparse_20211219.pickle')

In [None]:
# Reload the encoded sparse and scaled dense training features (helper module is `utils_`, not `utils`).
df_train_unique_X_sparse = utils_.load_pickle('../data/sample_dfn/union/df_train_unique_X_sparse_20211219.pickle')
df_train_unique_X_dense = utils_.load_pickle('../data/sample_dfn/union/df_train_unique_X_dense_20211219.pickle')

print(df_train_unique_X_sparse.shape)
print(df_train_unique_X_dense.shape)

In [None]:
df_train_unique_X_transform = pd.concat([df_train_unique_X_sparse, df_train_unique_X_dense], axis=1)
print(df_train_unique_X_transform.shape)
df_train_unique_X_transform.head()

In [None]:
# Persist the combined (sparse columns first, then dense) model input (helper module is `utils_`, not `utils`).
utils_.save_pickle(df_train_unique_X_transform, '../data/sample_dfn/union/df_train_unique_X_transform_20211219.pickle')

# 模型

* 结构

In [None]:
df_train_unique_X_transform = utils_.load_pickle('../data/sample_dfn/union/df_train_unique_X_transform_20211219.pickle')
print(df_train_unique_X_transform.shape)
df_train_unique_X_transform.head()

In [None]:
sparse_features = utils_.load_pickle('../data/sample_dfn/union/list_feats/list_sparse_features.pickle')
dense_features = utils_.load_pickle('../data/sample_dfn/union/list_feats/list_dense_features.pickle')

print(len(sparse_features))
print(len(dense_features))

In [None]:
# Describe every model input: sparse features first, then dense features.
# Each sparse feature gets a vocabulary size of (largest encoded id in the training data + 1)
# and a 4-dimensional embedding; each dense feature is declared with dimension 1.
feature_columns = [SparseFeat(x, df_train_unique_X_transform[x].max() + 1, embedding_dim=4) for x in sparse_features] + \
    [DenseFeat(x, 1, ) for x in dense_features]
# Peek at a few of the resulting columns.
feature_columns[15:20]

In [None]:
# utils.save_pickle(feature_columns, '../data/sample_dfn/union/other/feature_columns.pickle')
utils_.save_pickle(feature_columns, '../data/sample_dfn/union/other/feature_columns_self.pickle')

In [None]:
# feature_columns = utils_.load_pickle('../data/sample_dfn/union/other/feature_columns.pickle')
feature_columns = utils_.load_pickle('../data/sample_dfn/union/other/feature_columns_self.pickle')

model = MMOE(feature_columns, 
             num_experts=3, expert_dnn_hidden_units=(256, 128), 
             tower_dnn_hidden_units=(64, ), 
             gate_dnn_hidden_units=(), 
             l2_reg_embedding=0.00001, 
             l2_reg_dnn=0, 
             seed=2022, 
             dnn_dropout=0, 
             dnn_activation='relu', 
             dnn_use_bn=False, 
             task_types=['binary', 'binary'], task_names=['label_click', 'label_click_apply'])

In [None]:
# Both task heads are trained with binary cross-entropy.
# `learning_rate` replaces the deprecated `lr` alias of the Keras Adam optimizer.
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss=['binary_crossentropy', 'binary_crossentropy'], metrics=['binary_crossentropy'])

In [None]:
df_train_unique_y = utils_.load_pickle('../data/sample_dfn/union/df_train_unique_y_new_20211219.pickle')
print(df_train_unique_y.shape)
df_train_unique_y.head()

In [None]:
# train_input = {x: df_train_unique_X_transform[x] for x in sparse_features+dense_features}

In [None]:
# Training callbacks, all driven by the validation loss:
#   - lower the learning rate when val_loss plateaus (never below 1e-6),
#   - write a weights-only checkpoint whenever val_loss improves,
#   - stop after 10 epochs without improvement and restore the best weights.
# NOTE(review): ReduceLROnPlateau is left at its default patience; if that default is
# also 10 epochs, the LR reduction fires on the same epoch EarlyStopping halts
# training and never takes effect - confirm and consider a smaller patience.
callbacks = [
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', min_lr=1e-6), 
#     tf.keras.callbacks.ModelCheckpoint(filepath='../data/model/check_point/mmoe_deepctr_20220208/weights-improvement-{epoch:03d}-{val_loss:.4f}.hdf5', 
#                                        monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=True, mode='min'), 
#     tf.keras.callbacks.ModelCheckpoint(filepath='../data/model/check_point/mmoe_20220214/weights-improvement-{epoch:03d}-{val_loss:.4f}.hdf5', 
#                                        monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=True, mode='min'), 
    tf.keras.callbacks.ModelCheckpoint(filepath='../data/model/check_point/mmoe_20220215/weights-improvement-{epoch:03d}-{val_loss:.4f}.hdf5', 
                                       monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=True, mode='min'), 
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
]

* 训练

In [None]:
# Train both task heads jointly on the full feature matrix.
# The targets are passed as a two-element list: click first, then click-and-apply.
# validation_split=0.2 asks Keras to hold out 20% of the rows for validation;
# epochs=1000 is only an upper bound, since the callbacks list includes early stopping.
history = model.fit(
#     x=train_input, 
    x=df_train_unique_X_transform.values, 
    y=[df_train_unique_y['label_click'].values, df_train_unique_y['label_click_apply'].values], 
    batch_size=1024, 
    epochs=1000, 
    verbose=2, 
    callbacks=callbacks, 
    validation_split=0.2)

In [None]:
# utils.save_pickle(history.history, '../data/model/check_point/history_20220208.pickle')
# utils_.save_pickle(history.history, '../data/model/check_point/mmoe_20220214/history_20220214.pickle')
utils_.save_pickle(history.history, '../data/model/check_point/mmoe_20220215/history_20220215.pickle')

In [None]:
tf.saved_model.save(model, '../data/model/dfn_mutil_task_online/mmoe_train/')

* 训练效果

In [None]:
# dict_fit_history = utils.load_pickle('../data/model/check_point/history_20220208.pickle')
# dict_fit_history = utils_.load_pickle('../data/model/check_point/mmoe_20220214/history_20220214.pickle')
dict_fit_history = utils_.load_pickle('../data/model/check_point/mmoe_20220215/history_20220215.pickle')
df_fit_history = pd.DataFrame(dict_fit_history)
print(df_fit_history.shape)
df_fit_history.head()

In [None]:
sns.lineplot(data=df_fit_history[['loss', 'val_loss']])

In [None]:
# sns.lineplot(data=df_fit_history[['label_click_loss', 'val_label_click_loss']])
sns.lineplot(data=df_fit_history[['output_1_loss', 'val_output_1_loss']])

In [None]:
# sns.lineplot(data=df_fit_history[['label_click_apply_loss', 'val_label_click_apply_loss']])
sns.lineplot(data=df_fit_history[['output_2_loss', 'val_output_2_loss']])

In [None]:
# Re-create the network structure (same arguments as the trained model) so the
# best checkpoint weights can be loaded into it.
best_model = MMOE(feature_columns, 
                  num_experts=3, expert_dnn_hidden_units=(256, 128), 
                  tower_dnn_hidden_units=(64, ), 
                  gate_dnn_hidden_units=(), 
                  l2_reg_embedding=0.00001, 
                  l2_reg_dnn=0, 
                  seed=2022, 
                  dnn_dropout=0, 
                  dnn_activation='relu', 
                  dnn_use_bn=False, 
                  task_types=['binary', 'binary'], task_names=['label_click', 'label_click_apply'])

In [None]:
# best_model.load_weights('../data/model/check_point/weights-improvement-008-0.5136.hdf5')
# `learning_rate` replaces the deprecated `lr` alias of the Keras Adam optimizer.
best_model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss=['binary_crossentropy', 'binary_crossentropy'], metrics=['binary_crossentropy'])
# The variables must exist before weights can be loaded. Only the feature width matters,
# so the batch dimension is left as None instead of being pinned to the number of training rows.
best_model.build(input_shape=(None, df_train_unique_X_transform.shape[1]))
# best_model.load_weights('../data/model/check_point/mmoe_20220214/weights-improvement-006-0.5126.hdf5')
best_model.load_weights('../data/model/check_point/mmoe_20220215/weights-improvement-007-0.5132.hdf5')

In [None]:
tf.saved_model.save(best_model, '../data/model/dfn_mutil_task_online/mmoe_reload/')

In [None]:
# pred_train = best_model.predict(train_input, batch_size=1024)
pred_train = best_model.predict(df_train_unique_X_transform.values, batch_size=1024)

In [None]:
df_y = pd.concat([df_train_unique_y, pd.DataFrame(pred_train[0], columns=['label_click_pred']), pd.DataFrame(pred_train[1], columns=['label_click_apply_pred'])], 
                 axis=1)
print(df_y.shape)
df_y.head()

In [None]:
roc_auc_score(df_y['label_click'], df_y['label_click_pred'])

In [None]:
roc_auc_score(df_y['label_click_apply'], df_y['label_click_apply_pred'])

* 测试

In [None]:
# 数据转换
df_test_uid_obsDate_dt = utils_.load_pickle('../data/sample_dfn/union/df_test_uid_obsDate_dt_new_20211219.pickle')
df_test_y = utils_.load_pickle('../data/sample_dfn/union/df_test_y_new_20211219.pickle')
df_test_X = utils_.load_pickle('../data/sample_dfn/union/df_test_X_new_20211219.pickle')

print(df_test_uid_obsDate_dt.shape)
print(df_test_y.shape)
print(df_test_X.shape)

In [None]:
sparse_features = utils_.load_pickle('../data/sample_dfn/union/list_feats/list_sparse_features.pickle')
dense_features = utils_.load_pickle('../data/sample_dfn/union/list_feats/list_dense_features.pickle')

print(len(sparse_features))
print(len(dense_features))

In [None]:
tmp = pd.concat([df_test_uid_obsDate_dt[['uid']].head(100), df_test_X[sparse_features].head(100), df_test_X[dense_features].head(100)], axis=1)
print(tmp.shape)
tmp

In [None]:
tmp.to_csv('../data/sample_dfn/union/other/test_sample_100.csv', encoding='utf-8', index=False)

In [None]:
mms = joblib.load('../data/sample_dfn/union/scaler/mms.pickle')
test_X_mms = mms.transform(df_test_X[dense_features])
print(test_X_mms.shape)

In [None]:
df_test_X_dense = pd.DataFrame(test_X_mms, columns=dense_features)
print(df_test_X_dense.shape)
df_test_X_dense.head()

In [None]:
df_test_X[dense_features].head()

In [None]:
# Apply the label encoders fitted on the training data to the test sparse features.
dict_lbe = joblib.load('../data/sample_dfn/union/scaler/dict_lbe.pickle')
list_df_test_X_sparse = []

# The `with` block already closes the progress bar on every exit path (including
# Ctrl-C), so no extra try/except or trailing close() is needed.
with tqdm(sparse_features) as t:
    for x in t:
        df_test_X_sparse_each = pd.DataFrame(dict_lbe[x].transform(df_test_X[x]), columns=[x])
        list_df_test_X_sparse.append(df_test_X_sparse_each)

df_test_X_sparse = pd.concat(list_df_test_X_sparse, axis=1)
print(df_test_X_sparse.shape)
df_test_X_sparse.head()

In [None]:
df_test_X_transform = pd.concat([df_test_X_sparse, df_test_X_dense], axis=1)
print(df_test_X_transform.shape)
df_test_X_transform.head()

In [None]:
# Persist the transformed test input (helper module is `utils_`, not `utils`).
utils_.save_pickle(df_test_X_transform, '../data/sample_dfn/union/df_test_X_transform_20211219.pickle')

In [None]:
df_test_X_transform = utils_.load_pickle('../data/sample_dfn/union/df_test_X_transform_20211219.pickle')
print(df_test_X_transform.shape)
df_test_X_transform.head()

In [None]:
# test_input = {x: df_test_X_transform[x] for x in sparse_features+dense_features}

In [None]:
# Load the model
# feature_columns = utils.load_pickle('../data/sample_dfn/union/other/feature_columns.pickle')
feature_columns = utils_.load_pickle('../data/sample_dfn/union/other/feature_columns_self.pickle')

# Same constructor arguments as the trained model so the checkpoint weights line up.
best_model = MMOE(feature_columns,
                  num_experts=3, expert_dnn_hidden_units=(256, 128),
                  tower_dnn_hidden_units=(64, ),
                  gate_dnn_hidden_units=(),
                  l2_reg_embedding=0.00001,
                  l2_reg_dnn=0,
                  seed=2022,
                  dnn_dropout=0,
                  dnn_activation='relu',
                  dnn_use_bn=False,
                  task_types=['binary', 'binary'], task_names=['label_click', 'label_click_apply'])
# best_model.load_weights('../data/model/check_point/weights-improvement-008-0.5136.hdf5')
# `learning_rate` replaces the deprecated `lr` alias of the Keras Adam optimizer.
best_model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss=['binary_crossentropy', 'binary_crossentropy'], metrics=['binary_crossentropy'])
# The variables must exist before weights can be loaded. Only the feature width matters,
# so the batch dimension is left as None instead of being pinned to the number of test rows.
best_model.build(input_shape=(None, df_test_X_transform.shape[1]))
# best_model.load_weights('../data/model/check_point/mmoe_20220214/weights-improvement-006-0.5126.hdf5')
best_model.load_weights('../data/model/check_point/mmoe_20220215/weights-improvement-007-0.5132.hdf5')

In [None]:
# 测试数据
# pred_test = best_model.predict(test_input, batch_size=1024)
pred_test = best_model.predict(df_test_X_transform.values, batch_size=1024)

df_test_y = utils_.load_pickle('../data/sample_dfn/union/df_test_y_new_20211219.pickle')
df_y = pd.concat([df_test_y, pd.DataFrame(pred_test[0], columns=['label_click_pred']), pd.DataFrame(pred_test[1], columns=['label_click_apply_pred'])], 
                 axis=1)
print(df_y.shape)
df_y.head()

In [None]:
df_test_uid_obsDate_dt = utils_.load_pickle('../data/sample_dfn/union/df_test_uid_obsDate_dt_new_20211219.pickle')
df_test_uid_obsDate_dt.head()

In [None]:
df_y_uid = pd.concat([df_test_uid_obsDate_dt[['uid']], df_y], axis=1)
df_y_uid.head()

In [None]:
# The uid literal that used to follow `==` is missing, which made this cell a syntax error.
# Set `uid_to_inspect` to the uid under investigation; it defaults to the first uid so the cell runs.
uid_to_inspect = df_y_uid['uid'].iloc[0]
df_y_uid[df_y_uid['uid'] == uid_to_inspect]

In [None]:
# 测试效果
roc_auc_score(df_y['label_click'], df_y['label_click_pred'])

In [None]:
roc_auc_score(df_y['label_click_apply'], df_y['label_click_apply_pred'])

In [None]:
def cal_recall(df_y, y_true, y_pred, threshold=0.3):
    """Return the recall of the positive class among the top-scored rows.

    Rows are ranked by the score column in descending order and the first
    ``int(len(df_y) * threshold)`` rows are treated as predicted positives.

    Args:
        df_y: DataFrame holding a label column and a score column.
        y_true: Name of the label column; rows equal to 1 are positives
            (works for both int and float labels).
        y_pred: Name of the score column used for ranking.
        threshold: Fraction of rows (0..1) to flag as predicted positive.

    Returns:
        float: flagged positives / all positives, or 0.0 when there are no
        positives at all. ``df_y`` itself is not modified.
    """
    df_y_sort = df_y.sort_values(by=y_pred, ascending=False).reset_index(drop=True)

    n_flagged = int(df_y_sort.shape[0] * threshold)
    is_positive = df_y_sort[y_true] == 1

    n_positive = int(is_positive.sum())
    if n_positive == 0:
        return 0.0

    # Positional slicing is end-exclusive, so exactly n_flagged rows are counted.
    # The previous label-based `.loc[:n_flagged]` slice was end-inclusive and
    # flagged one row too many.
    return float(is_positive.iloc[:n_flagged].sum() / n_positive)

In [None]:
cal_recall(df_y, 'label_click', 'label_click_pred')

In [None]:
cal_recall(df_y, 'label_click_apply', 'label_click_apply_pred')

In [None]:
df_y_copy = df_y.copy()
df_y_copy['label_click'] = df_y_copy['label_click'].astype(int)
df_y_th = utils_.PR_threshold(df_y_copy['label_click'], df_y_copy['label_click_pred'], 
                              threshold_bottom=0.0, threshold_top=0.5, threshold_interval=0.01, dec=3)
df_y_th

In [None]:
df_y_copy['label_click_apply'] = df_y_copy['label_click_apply'].astype(int)
df_y_th = utils_.PR_threshold(df_y_copy['label_click_apply'], df_y_copy['label_click_apply_pred'], 
                              threshold_bottom=0.0, threshold_top=0.5, threshold_interval=0.01, dec=3)
df_y_th

# 与现有模型比较

* 数据

In [None]:
# df_union = pd.read_csv('../data/sample_dfn/union/zfbx_click_resource_ctr_sample_label_feature_union_new_dfn_ctr1_20220116.txt', sep='\t', encoding='utf-8')
df_union = pd.read_csv('../data/sample_dfn/union/zfbx_click_resource_ctr_sample_label_feature_union_new_dfn_ctr1_20220227.txt', sep='\t', encoding='utf-8')
print(df_union.shape)
df_union.head()

In [None]:
df_union.groupby(by=['obs_dt', 'dt'])['uid'].count()

In [None]:
df_union['label_click'].value_counts()

In [None]:
# 160051 / 1162575
115291 / 1261844

In [None]:
df_union['label_click_apply'].value_counts()

In [None]:
# 54848 / 1162575
35410 / 1261844

In [None]:
# 删除旧资源位流量特征
list_feats_x = utils_.load_pickle('../data/sample_dfn/union/list_feats/list_feats_x_new_adj.pickle')
print(len(list_feats_x))
list_feats_x[:10]

In [None]:
df_union['obs_dt'] = pd.to_datetime(df_union['obs_dt'])
df_union['dt'] = pd.to_datetime(df_union['dt'])

df_union_test_uid_obsDate_dt = df_union[['uid', 'obs_dt', 'dt']]
df_union_test_y = df_union[['label_click', 'label_click_apply']]
df_union_test_X = df_union[list_feats_x]

print(df_union_test_uid_obsDate_dt.shape)
print(df_union_test_y.shape)
print(df_union_test_X.shape)

In [None]:
# utils_.save_pickle(df_union_test_uid_obsDate_dt, '../data/sample_dfn/union/df_test_uid_obsDate_dt_new_20220116.pickle')
# utils_.save_pickle(df_union_test_y, '../data/sample_dfn/union/df_test_y_new_20220116.pickle')
utils_.save_pickle(df_union_test_uid_obsDate_dt, '../data/sample_dfn/union/df_test_uid_obsDate_dt_new_20220227.pickle')
utils_.save_pickle(df_union_test_y, '../data/sample_dfn/union/df_test_y_new_20220227.pickle')

In [None]:
# 数据转换
sparse_features = utils_.load_pickle('../data/sample_dfn/union/list_feats/list_sparse_features.pickle')
dense_features = utils_.load_pickle('../data/sample_dfn/union/list_feats/list_dense_features.pickle')

print(len(sparse_features))
print(len(dense_features))

In [None]:
mms = joblib.load('../data/sample_dfn/union/scaler/mms.pickle')
test_X_mms = mms.transform(df_union_test_X[dense_features])
print(test_X_mms.shape)

In [None]:
df_test_X_dense = pd.DataFrame(test_X_mms, columns=dense_features)
print(df_test_X_dense.shape)
df_test_X_dense.head()

In [None]:
df_union_test_X[dense_features].head()

In [None]:
# Apply the label encoders fitted on the training data to the comparison data set.
dict_lbe = joblib.load('../data/sample_dfn/union/scaler/dict_lbe.pickle')
list_df_test_X_sparse = []

# The `with` block already closes the progress bar on every exit path (including
# Ctrl-C), so no extra try/except or trailing close() is needed.
with tqdm(sparse_features) as t:
    for x in t:
        if x == 'xxx':
            # This feature bypasses its fitted encoder and is hard-coded to category id 0.
            df_test_X_sparse_each = df_union_test_X[[x]].copy()
            df_test_X_sparse_each[x] = 0
        else:
            df_test_X_sparse_each = pd.DataFrame(dict_lbe[x].transform(df_union_test_X[x]), columns=[x])
        list_df_test_X_sparse.append(df_test_X_sparse_each)

df_test_X_sparse = pd.concat(list_df_test_X_sparse, axis=1)
print(df_test_X_sparse.shape)
df_test_X_sparse.head()

In [None]:
df_test_X_transform = pd.concat([df_test_X_sparse, df_test_X_dense], axis=1)
print(df_test_X_transform.shape)
df_test_X_transform.head()

In [None]:
# utils_.save_pickle(df_test_X_transform, '../data/sample_dfn/union/df_test_X_transform_20220116.pickle')
utils_.save_pickle(df_test_X_transform, '../data/sample_dfn/union/df_test_X_transform_20220227.pickle')

In [None]:
# df_test_X_transform = utils_.load_pickle('../data/sample_dfn/union/df_test_X_transform_20220116.pickle')
df_test_X_transform = utils_.load_pickle('../data/sample_dfn/union/df_test_X_transform_20220227.pickle')
print(df_test_X_transform.shape)
df_test_X_transform.head()

In [None]:
# test_input = {x: df_test_X_transform[x] for x in sparse_features+dense_features}

In [None]:
# Load the model
# feature_columns = utils_.load_pickle('../data/sample_dfn/union/other/feature_columns.pickle')
feature_columns = utils_.load_pickle('../data/sample_dfn/union/other/feature_columns_self.pickle')

# Same constructor arguments as the trained model so the checkpoint weights line up.
best_model = MMOE(feature_columns,
                  num_experts=3, expert_dnn_hidden_units=(256, 128),
                  tower_dnn_hidden_units=(64, ),
                  gate_dnn_hidden_units=(),
                  l2_reg_embedding=0.00001,
                  l2_reg_dnn=0,
                  seed=2022,
                  dnn_dropout=0,
                  dnn_activation='relu',
                  dnn_use_bn=False,
                  task_types=['binary', 'binary'], task_names=['label_click', 'label_click_apply'])
# best_model.load_weights('../data/model/check_point/weights-improvement-008-0.5136.hdf5')
# `learning_rate` replaces the deprecated `lr` alias of the Keras Adam optimizer.
best_model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss=['binary_crossentropy', 'binary_crossentropy'], metrics=['binary_crossentropy'])
# The variables must exist before weights can be loaded. Only the feature width matters,
# so the batch dimension is left as None instead of being pinned to the number of rows.
best_model.build(input_shape=(None, df_test_X_transform.shape[1]))
# best_model.load_weights('../data/model/check_point/mmoe_20220214/weights-improvement-006-0.5126.hdf5')
best_model.load_weights('../data/model/check_point/mmoe_20220215/weights-improvement-007-0.5132.hdf5')

In [None]:
# 测试数据
# pred_test = best_model.predict(test_input, batch_size=1024)
pred_test = best_model.predict(df_test_X_transform.values, batch_size=1024)

# df_test_y = utils_.load_pickle('../data/sample_dfn/union/df_test_y_new_20220116.pickle')
df_test_y = utils_.load_pickle('../data/sample_dfn/union/df_test_y_new_20220227.pickle')
df_y = pd.concat([df_test_y, pd.DataFrame(pred_test[0], columns=['label_click_pred']), pd.DataFrame(pred_test[1], columns=['label_click_apply_pred'])], 
                 axis=1)
print(df_y.shape)
df_y.head()

In [None]:
# 测试效果
roc_auc_score(df_y['label_click'], df_y['label_click_pred'])

In [None]:
roc_auc_score(df_y['label_click_apply'], df_y['label_click_apply_pred'])

In [None]:
cal_recall(df_y, 'label_click', 'label_click_pred')

In [None]:
cal_recall(df_y, 'label_click_apply', 'label_click_apply_pred')

In [None]:
df_y_copy = df_y.copy()
df_y_copy['label_click'] = df_y_copy['label_click'].astype(int)
df_y_th = utils_.PR_threshold(df_y_copy['label_click'], df_y_copy['label_click_pred'], 
                              threshold_bottom=0.0, threshold_top=0.5, threshold_interval=0.01, dec=3)
df_y_th

In [None]:
df_y_copy['label_click_apply'] = df_y_copy['label_click_apply'].astype(int)
df_y_th = utils_.PR_threshold(df_y_copy['label_click_apply'], df_y_copy['label_click_apply_pred'], 
                              threshold_bottom=0.0, threshold_top=0.5, threshold_interval=0.01, dec=3)
df_y_th

In [None]:
# The helper module is imported as `utils_`; the bare name `utils` is never bound in this notebook.
# NOTE(review): the other cells load 'feature_columns_self.pickle'; confirm that this
# older 'feature_columns.pickle' holds the project's own SparseFeat/DenseFeat objects,
# otherwise the isinstance checks in split_features will reject them.
feature_columns = utils_.load_pickle('../data/sample_dfn/union/other/feature_columns.pickle')
print(feature_columns)
feature_columns

In [None]:
from collections import namedtuple, OrderedDict
def split_features(feature_columns, prefix=''):
    """Partition feature columns into sparse and dense, keyed by column name.

    Args:
        feature_columns: Iterable of SparseFeat / DenseFeat objects.
        prefix: Accepted for interface compatibility; not used here.

    Returns:
        Tuple ``(sparse, dense)`` of OrderedDicts mapping ``fc.name`` to the
        column, each in the order the columns were encountered.

    Raises:
        TypeError: If a column is neither a SparseFeat nor a DenseFeat.
    """
    sparse_by_name, dense_by_name = OrderedDict(), OrderedDict()
    for column in feature_columns:
        # Pick the destination first; the sparse check takes precedence.
        if isinstance(column, SparseFeat):
            bucket = sparse_by_name
        elif isinstance(column, DenseFeat):
            bucket = dense_by_name
        else:
            raise TypeError("Invalid feature column type,got", type(column))
        bucket[column.name] = column

    return sparse_by_name, dense_by_name

In [None]:
# Reload the transformed 20211219 test input (helper module is `utils_`, not `utils`).
df_test_X_transform = utils_.load_pickle('../data/sample_dfn/union/df_test_X_transform_20211219.pickle')
print(df_test_X_transform.shape)
df_test_X_transform.head()

In [None]:
# Dense part of the input: the transformed frame is built as sparse columns followed by
# dense columns, so everything after the first len(sparse_features) columns is dense.
# (`aaa` was an undefined scratch name.)
df_test_X_transform.iloc[:, len(sparse_features):]

In [None]:
tensor_input = tf.convert_to_tensor(df_test_X_transform)
tensor_input

In [None]:
orderDict_features_sparse, orderDict_features_dense = split_features(feature_columns)

In [None]:
inputs_sparse, input_dense = tensor_input[:, :len(orderDict_features_sparse)], tensor_input[:, len(orderDict_features_sparse):]
inputs_sparse

In [None]:
input_dense

In [None]:
from tensorflow.python.keras.layers import Embedding
from tensorflow.python.keras.regularizers import l2


def create_embedding(sparse_feature_columns, l2_reg, prefix='sparse_'):
    """Build one Embedding layer per sparse feature column.

    Args:
        sparse_feature_columns: Mapping of feature name to a column object exposing
            ``vocabulary_size``, ``embedding_dim``, ``embeddings_initializer``,
            ``embedding_name`` and ``trainable``.
        l2_reg: L2 regularisation strength applied to every embedding table.
        prefix: Leading part of each layer name; the name is
            ``prefix + '_emb_' + embedding_name``.

    Returns:
        List of Embedding layers in the iteration order of the mapping.
    """
    def _embedding_for(feat):
        # Build the layer first, then copy the column's trainable flag onto it.
        layer = Embedding(
            feat.vocabulary_size,
            feat.embedding_dim,
            embeddings_initializer=feat.embeddings_initializer,
            embeddings_regularizer=l2(l2_reg),
            name=prefix + '_emb_' + feat.embedding_name,
        )
        layer.trainable = feat.trainable
        return layer

    return [_embedding_for(feat) for _, feat in sparse_feature_columns.items()]

In [None]:
embedding_layers = create_embedding(orderDict_features_sparse, 0.00001)

In [None]:
sparse_embedding = [emb_layer(inputs_sparse[:, i]) for i, emb_layer in enumerate(embedding_layers)]
sparse_embedding

In [None]:
import tensorflow as tf
from tensorflow.python.keras.layers import Flatten

try:
    from tensorflow.python.ops.lookup_ops import StaticHashTable
except ImportError:
    from tensorflow.python.ops.lookup_ops import HashTable as StaticHashTable


class NoMask(tf.keras.layers.Layer):
    """Identity layer that returns its input unchanged and reports no output mask."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        # No weights of its own; just defer to the base class build.
        super().build(input_shape)

    def call(self, x, mask=None, **kwargs):
        """Pass the input straight through; the mask argument is ignored."""
        return x

    def compute_mask(self, inputs, mask):
        """Always report that the output carries no mask."""
        return None

    
def reduce_sum(input_tensor,
               axis=None,
               keep_dims=False,
               name=None,
               reduction_indices=None):
    """Call ``tf.reduce_sum`` with whichever keyword spelling it accepts.

    The ``keep_dims`` / ``reduction_indices`` keywords are tried first. If that
    call raises TypeError, the sum is retried with ``keepdims`` and without
    ``reduction_indices`` (which is then ignored).

    Args:
        input_tensor: Tensor to reduce.
        axis: Axis or axes to sum over; None is forwarded unchanged.
        keep_dims: Forwarded as ``keep_dims`` or ``keepdims``.
        name: Optional op name.
        reduction_indices: Only forwarded in the first attempt.

    Returns:
        Whatever ``tf.reduce_sum`` returns for the accepted call.
    """
    legacy_kwargs = dict(axis=axis,
                         keep_dims=keep_dims,
                         name=name,
                         reduction_indices=reduction_indices)
    try:
        summed = tf.reduce_sum(input_tensor, **legacy_kwargs)
    except TypeError:
        summed = tf.reduce_sum(input_tensor,
                               axis=axis,
                               keepdims=keep_dims,
                               name=name)
    return summed
    

def concat_func(inputs, axis=-1, mask=False):
    """Concatenate ``inputs`` along ``axis``.

    Args:
        inputs: Sequence of tensors.
        axis: Axis handed to ``tf.keras.layers.Concatenate``.
        mask: When falsy, every input is first passed through one shared
            ``NoMask`` layer.

    Returns:
        The only element when there is exactly one input, otherwise the
        output of the ``Concatenate`` layer.
    """
    if not mask:
        strip_mask = NoMask()
        inputs = [strip_mask(tensor) for tensor in inputs]

    # A single tensor needs no Concatenate layer.
    if len(inputs) == 1:
        return inputs[0]
    return tf.keras.layers.Concatenate(axis=axis)(inputs)

In [None]:
# Concatenate all embedding outputs on the last axis (masks dropped), then flatten.
sparse_dnn_input = Flatten()(
    concat_func(sparse_embedding, axis=-1, mask=False)
)
sparse_dnn_input

In [None]:
# Cast the sparse and dense parts to float32 and join them on the last axis.
dnn_input = tf.concat(
    [tf.cast(part, tf.float32) for part in (sparse_dnn_input, input_dense)],
    axis=-1,
)
dnn_input

In [None]:
from layers.core import DNN, PredictionLayer

In [None]:
# Three expert DNNs with hidden units (256, 128), named expert_0 .. expert_2.
expert_layers = [
    DNN((256, 128), 'relu', 0, 0, False, seed=1024, name='expert_' + str(expert_idx))
    for expert_idx in range(3)
]
expert_layers

In [None]:
# Every expert receives the same shared input.
expert_outs = [
    expert(dnn_input)
    for expert in expert_layers
]
expert_outs

In [None]:
# Stack the expert outputs along a new axis 1.
expert_concat = tf.keras.layers.Lambda(
    lambda experts: tf.stack(experts, axis=1)
)(expert_outs)
expert_concat

In [None]:
# One gate-input DNN per task, built with an empty hidden-unit tuple.
gate_layers_inputs = [
    DNN((), 'relu', 0, 0, False, seed=1024, name='gate_' + task)
    for task in ('ctr', 'ctcvr')
]
gate_layers_inputs

In [None]:
# One bias-free softmax layer per task with 3 units
# (presumably one weight per expert — confirm against the expert count).
gate_layers_outputs = [
    tf.keras.layers.Dense(3, use_bias=False, activation='softmax', name='gate_softmax_' + task)
    for task in ('ctr', 'ctcvr')
]
gate_layers_outputs

In [None]:

# For each task: compute the gate's softmax output, add a trailing axis so it
# multiplies against the stacked expert outputs, and sum over axis 1.
mmoe_outputs = []
for i, task in enumerate(('ctr', 'ctcvr')):
    gate_input = gate_layers_inputs[i](dnn_input)

    # The Lambda layer is created before the gate softmax is applied, as in the
    # original expression's evaluation order.
    expand_last_axis = tf.keras.layers.Lambda(lambda x: tf.expand_dims(x, axis=-1))
    gate_output = expand_last_axis(gate_layers_outputs[i](gate_input))

    weighted_sum = tf.keras.layers.Lambda(
        lambda x: reduce_sum(x[0] * x[1], axis=1, keep_dims=False),
        name='gate_mul_expert_' + task,
    )
    gate_mul_expert = weighted_sum([expert_concat, gate_output])
    mmoe_outputs.append(gate_mul_expert)
mmoe_outputs

In [None]:
# One tower DNN per task with a single hidden layer of 64 units.
tower_layers = [
    DNN((64,), 'relu', 0, 0, False, seed=1024, name='tower_' + task)
    for task in ('ctr', 'ctcvr')
]
tower_layers

In [None]:
# One bias-free, activation-free single-unit Dense layer per task.
logits = [
    tf.keras.layers.Dense(1, use_bias=False, activation=None)
    for _ in ('ctr', 'ctcvr')
]
logits

In [None]:
# One PredictionLayer per task; both tasks use the 'binary' task type.
output_layers = [
    PredictionLayer(kind, name=task)
    for task, kind in (('ctr', 'binary'), ('ctcvr', 'binary'))
]

In [None]:
# Per task: tower -> logit -> prediction layer, applied to that task's gated mixture.
task_outputs = []
for i, (tower_layer, logit_layer, output_layer) in enumerate(
        zip(tower_layers, logits, output_layers)):
    tower_output = tower_layer(mmoe_outputs[i])
    logit_output = logit_layer(tower_output)
    task_output = output_layer(logit_output)
    task_outputs.append(task_output)
task_outputs

In [None]:
# Load the pickled list of sparse feature names, then show its size and first entries.
# NOTE(review): utils_.load_pickle presumably unpickles — only load trusted files.
list_sparse_features = utils_.load_pickle(
    '../data/sample_dfn/union/list_feats/list_sparse_features.pickle'
)
print(len(list_sparse_features))
list_sparse_features[:10]

In [None]:
# Load the pickled list of dense feature names, then show its size and first entries.
# NOTE(review): utils_.load_pickle presumably unpickles — only load trusted files.
list_dense_features = utils_.load_pickle(
    '../data/sample_dfn/union/list_feats/list_dense_features.pickle'
)
print(len(list_dense_features))
list_dense_features[:10]

In [None]:
# Load the saved encoders; indexed by sparse feature name further down,
# where each entry's `classes_` is read.
dict_lbe = joblib.load(
    '../data/sample_dfn/union/scaler/dict_lbe.pickle'
)
dict_lbe

In [None]:
# Load the saved scaler; its `data_min_` / `data_max_` arrays are read further down.
mms = joblib.load(
    '../data/sample_dfn/union/scaler/mms.pickle'
)
mms

In [None]:
# Build one config line per feature:
#   "<name> <name>_alias <kind> <comma-separated parameters>\n"
list_features_config = []

# Sparse features: kind 'cate', parameters are the encoder's classes cast to int.
for feat in list_sparse_features:
    classes_csv = ','.join(str(int(cls)) for cls in dict_lbe[feat].classes_)
    fields = (feat, feat + '_alias', 'cate', classes_csv)
    list_features_config.append(' '.join(fields) + '\n')

# Dense features: kind 'minMax', parameters are the scaler's min and max at index i.
# NOTE(review): assumes the scaler's columns follow the order of list_dense_features — confirm.
for i, feat in enumerate(list_dense_features):
    bounds_csv = ','.join((str(mms.data_min_[i]), str(mms.data_max_[i])))
    fields = (feat, feat + '_alias', 'minMax', bounds_csv)
    list_features_config.append(' '.join(fields) + '\n')

list_features_config

In [None]:
# Persist the config lines; each entry already ends with a newline.
with open('../data/sample_dfn/union/other/conf.config', 'w') as f:
    f.write(''.join(list_features_config))

In [None]:
# Alias names in order: uid first, then sparse features, then dense features.
list_features_name = ['uid_alias']
list_features_name.extend(name + '_alias' for name in list_sparse_features)
list_features_name.extend(name + '_alias' for name in list_dense_features)

print(len(list_features_name))
list_features_name[:10]

In [None]:
# Write the space-separated alias names (uid, sparse, dense) to features.txt.
# Fix: this cell used to write `list_features_config`, which duplicated the
# newline-terminated config lines already saved to conf.config and left
# `list_features_name` unused.
with open('../data/sample_dfn/union/other/features.txt', 'w') as f:
    f.write(' '.join(list_features_name))