# AITM

In [2]:
import datetime
import numpy as np
import pandas as pd
import joblib
import warnings
import logging
import os
import gc
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import collections
import re
import copy
import torch
import shap
import utils.utils as util

import utils_

from functools import reduce
from tqdm import tqdm
from dateutil.relativedelta import relativedelta
from joblib import Parallel, delayed
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.inspection import permutation_importance
from model.aitm_condition import AITM
from torch.utils.data import DataLoader
from utils.dataset import DatasetLoader
from utils.warmup_lr import GradualWarmupScheduler


# pd.set_option('display.max_columns', None)
# pd.set_option('max_row', 500)
warnings.filterwarnings('ignore')
tqdm.pandas(desc='pandas bar')

In [None]:
# Display the installed PyTorch version for reproducibility.
torch.__version__

## 数据处理

* fusion & aspiration

In [None]:
# 2023.01.01~2023.03.19 train
# Load the pickled training tables: the fusion table and the two aspiration parts.
# NOTE(review): the aspiration file names contain "tarin" (sic) — presumably
# this matches the files on disk; verify before renaming anything.
df_fusion = utils_.load_pickle('../../data/other/cs/df_fusion_train_20230101_20230319.pickle')
df_aspiration_part1 = utils_.load_pickle('../../data/other/cs/df_aspiration_part1_tarin_20230101_20230319.pickle')
df_aspiration_part2 = utils_.load_pickle('../../data/other/cs/df_aspiration_part2_tarin_20230101_20230319.pickle')

# Print (rows, columns) of each table as a quick sanity check.
print(df_fusion.shape)
print(df_aspiration_part1.shape)
print(df_aspiration_part2.shape)

In [None]:
# 2023.01.01~2023.03.19
# df_fusion = pd.read_csv('../../data/other/cs/sample_label_feature_fusion_obs_dt_20230101_20230319.txt', sep='\t', encoding='utf-8')
# df_aspiration_part1 = pd.read_csv('../../data/other/cs/sample_label_feature_aspiration_part1_obs_dt_20230101_20230319.txt', sep='\t', encoding='utf-8')
# df_aspiration_part2 = pd.read_csv('../../data/other/cs/sample_label_feature_aspiration_part2_obs_dt_20230101_20230319.txt', sep='\t', encoding='utf-8')

# print(df_fusion.shape)
# print(df_aspiration_part1.shape)
# print(df_aspiration_part2.shape)

In [None]:
# Load the stored feature-name lists for the fusion and aspiration tables
# ("x_pi" lists) and print how many features each contains.
list_feats_fusion = utils_.load_pickle('../../data/other/cs/list_feats/list_feats_x_pi_fusion_20230101_20230319.pickle')
list_feats_aspiration = utils_.load_pickle('../../data/other/cs/list_feats/list_feats_x_pi_aspiration_20230101_20230319.pickle')

print(len(list_feats_fusion))
print(len(list_feats_aspiration))

In [None]:
# Load the full feature-name lists of aspiration part 1 and part 2
# and print their lengths.
list_feats_aspiration_part_1 = utils_.load_pickle('../../data/other/cs/list_feats/list_feats_x_aspiration_part1_20230101_20230319.pickle')
list_feats_aspiration_part_2 = utils_.load_pickle('../../data/other/cs/list_feats/list_feats_x_aspiration_part2_20230101_20230319.pickle')

print(len(list_feats_aspiration_part_1))
print(len(list_feats_aspiration_part_2))

In [None]:
# Keep only the part-1 / part-2 features that also appear in the aspiration
# feature list, preserving each part's original order.
# A set gives O(1) membership tests instead of rescanning the list per feature.
set_feats_aspiration = set(list_feats_aspiration)
list_feats_aspiration_part_1 = [x for x in list_feats_aspiration_part_1 if x in set_feats_aspiration]
list_feats_aspiration_part_2 = [x for x in list_feats_aspiration_part_2 if x in set_feats_aspiration]

print(len(list_feats_aspiration_part_1))
print(len(list_feats_aspiration_part_2))

In [None]:
# Convert obs_dt to datetime in all three tables so the join key has the
# same dtype on both sides of each merge.
df_fusion['obs_dt'] = pd.to_datetime(df_fusion['obs_dt'])
df_aspiration_part1['obs_dt'] = pd.to_datetime(df_aspiration_part1['obs_dt'])
df_aspiration_part2['obs_dt'] = pd.to_datetime(df_aspiration_part2['obs_dt'])

# Sample identity + labels: the columns every table is joined on.
list_merge_keys = ['uid', 'obs_dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1']

# Left-join the two aspiration feature parts onto the fusion table
# (the fusion side additionally carries 'dt').
df = (
    df_fusion[['uid', 'obs_dt', 'dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1'] + list_feats_fusion]
    .merge(
        df_aspiration_part1[list_merge_keys + list_feats_aspiration_part_1],
        on=list_merge_keys,
        how='left',
    )
    .merge(
        df_aspiration_part2[list_merge_keys + list_feats_aspiration_part_2],
        on=list_merge_keys,
        how='left',
    )
)
print(df.shape)
df.head()

In [None]:
# Build the per-column description table with the project helper and show
# only the rows whose 'Miss Percent(%)' is above zero.
df_des = utils_.df_des(df)
df_des[df_des['Miss Percent(%)']>0]

In [None]:
# List columns whose name ends with '.1'
# (presumably duplicate column names that got a suffix on load — verify).
[x for x in df.columns if x.endswith('.1')]

In [None]:
# Number of rows per observation date.
df['obs_dt'].value_counts()

In [None]:
# Number of rows per item.
df['item'].value_counts()

In [None]:
# Count of non-null uid values for every (item, label_apply) combination.
df[['item', 'label_apply', 'uid']].groupby(by=['item', 'label_apply']).count()

In [None]:
# Overall distribution of the label_apply values.
df['label_apply'].value_counts()

In [None]:
# Count of non-null uid values for every (item, label_submit) combination.
df[['item', 'label_submit', 'uid']].groupby(by=['item', 'label_submit']).count()

In [None]:
# Overall distribution of the label_submit values.
df['label_submit'].value_counts()

In [None]:
# Count of non-null uid values for every (item, label_pass) combination.
df[['item', 'label_pass', 'uid']].groupby(by=['item', 'label_pass']).count()

In [None]:
# Overall distribution of the label_pass values.
df['label_pass'].value_counts()

In [None]:
# Count of non-null uid values for every (item, label_pass_1) combination.
df[['item', 'label_pass_1', 'uid']].groupby(by=['item', 'label_pass_1']).count()

In [None]:
# Overall distribution of the label_pass_1 values.
df['label_pass_1'].value_counts()

In [None]:
# print("aar={}".format(300000/327506))
# print("aalabel1负样本占比{}".format(300000/1364774))
# print("ccr={}".format(300000/374856))
# print("cclabel1负样本占比{}".format(300000/1211726))

In [None]:
# Encode the item name as an integer id: the four named items get 0-3 and
# every other value falls into the catch-all id 4.
dict_item_to_id = {'aaa': 0, 'bbb': 1, 'ccc': 2, 'ddd': 3}
df['item_id'] = df['item'].apply(lambda name: dict_item_to_id.get(name, 4))
df.head()

In [None]:
# Item feature processing:
# one-hot encode each feature in list_feats_ohe with the project helper.
list_feats_ohe = ['item_id']
list_df_ohe = []
# The `with` block closes the progress bar on every exit path (normal end,
# KeyboardInterrupt, any other error), so the former manual t.close() calls
# and the except/re-raise wrapper are unnecessary — and they could hit an
# unbound `t` if tqdm(...) itself failed.
with tqdm(list_feats_ohe) as t:
    for feat in t:
        df_ohe_feat = utils_.one_hot_encoder(df, feat)
        list_df_ohe.append(df_ohe_feat)

# Put the encoded blocks side by side.
df_ohe_part = pd.concat(list_df_ohe, axis=1)
print(df_ohe_part.shape)
df_ohe_part.head()

In [None]:
# Lookup table: one representative row per item_id showing
# item -> item_id -> one-hot columns.
# Built from df + df_ohe_part directly because `df_ohe` is only created in a
# later cell; referencing it here raises NameError on a top-to-bottom run.
# Only the two id columns of df are concatenated to avoid copying every feature.
# NOTE(review): assumes the item_id_0..item_id_4 columns come from
# df_ohe_part (the output of utils_.one_hot_encoder) — confirm.
list_cols_item = ['item', 'item_id'] + ['item_id_{}'.format(x) for x in range(5)]
df_item_src = pd.concat([df[['item', 'item_id']], df_ohe_part], axis=1)

list_df_item = [
    df_item_src[df_item_src['item_id'] == i][list_cols_item].head(1)
    for i in range(5)
]
df_item = pd.concat(list_df_item, axis=0)
print(df_item.shape)
df_item

In [None]:
# Append the one-hot columns to the merged sample table (column-wise concat,
# aligned on the row index).
df_ohe = pd.concat([df, df_ohe_part], axis=1)
print(df_ohe.shape)
df_ohe.head()

In [None]:
# Persist the one-hot-augmented table under the train_eval_oot directory.
# NOTE(review): other cells in this notebook load
# '../../data/other/cs/df_ohe_train_20230101_20230319.pickle', which is only
# written by the commented-out line below — make sure that file is current.
# utils_.save_pickle(df_ohe, '../../data/other/cs/df_ohe_train_20230101_20230319.pickle')
utils_.save_pickle(df_ohe, '../../data/other/cs/train_eval_oot/df_ohe_20230101_20230319.pickle')

* flow fixation xxx card

In [None]:
# Read the tab-separated "flow fixation xxx card" sample/label/feature file.
df_ffcc = pd.read_csv('../../data/other/cs/sample_label_feature_flow_fixation_xxx_card_train_20230101_20230319.txt', sep='\t', encoding='utf-8')
print(df_ffcc.shape)
df_ffcc.head()

In [None]:
# Describe every ffcc column with the project helper and export the summary to CSV.
df_ffcc_des = utils_.df_des(df_ffcc)
df_ffcc_des.to_csv('../../data/other/cs/df_ffcc_des_20230101_20230319.csv', encoding='utf-8')

In [None]:
# Variance screening, step 1: separate the id/label columns from the
# candidate feature columns (everything that is not an id/label column).
list_feats_ffcc_y = ['uid', 'obs_dt', 'dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1']
list_feats_ffcc_x = [x for x in df_ffcc.columns if x not in list_feats_ffcc_y]

print(len(list_feats_ffcc_x))
list_feats_ffcc_x[:10]

In [None]:
# Persist the full ffcc feature-name list.
utils_.save_pickle(list_feats_ffcc_x, '../../data/other/cs/new_features/list_feats/list_feats_x_ffcc_20230101_20230319.pickle')

In [None]:
# Columns whose 'std' in the description table is exactly 0 (constant columns).
mask_ffcc_std_0 = df_ffcc_des['std'] == 0
list_feats_ffcc_x_std_0 = list(df_ffcc_des[mask_ffcc_std_0].index)
print(len(list_feats_ffcc_x_std_0))
list_feats_ffcc_x_std_0[:10]

In [None]:
# Persist the list of zero-std ffcc columns.
utils_.save_pickle(list_feats_ffcc_x_std_0, '../../data/other/cs/new_features/list_feats/list_feats_x_std_0_ffcc_20230101_20230319.pickle')

In [None]:
# Drop the zero-std columns from the ffcc feature list, keeping the original
# order. A set makes each membership test O(1) instead of a list scan.
set_feats_ffcc_x_std_0 = set(list_feats_ffcc_x_std_0)
list_feats_ffcc_x_std = [x for x in list_feats_ffcc_x if x not in set_feats_ffcc_x_std_0]
print(len(list_feats_ffcc_x_std))
list_feats_ffcc_x_std[:10]

In [None]:
# Persist the ffcc feature list that remains after removing zero-std columns.
utils_.save_pickle(list_feats_ffcc_x_std, '../../data/other/cs/new_features/list_feats/list_feats_x_std_ffcc_20230101_20230319.pickle')

In [None]:
# Keep the id/label columns plus the non-constant ffcc features and fill
# missing feature values with 0.
list_feats_ffcc_y = ['uid', 'obs_dt', 'dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1']
list_feats_ffcc_x_std = utils_.load_pickle('../../data/other/cs/new_features/list_feats/list_feats_x_std_ffcc_20230101_20230319.pickle')
# .copy() makes df_ffcc_ an independent frame; assigning into a bare column
# selection of df_ffcc triggers SettingWithCopyWarning and is not guaranteed
# to behave consistently across pandas versions.
df_ffcc_ = df_ffcc[list_feats_ffcc_y+list_feats_ffcc_x_std].copy()
df_ffcc_[list_feats_ffcc_x_std] = df_ffcc_[list_feats_ffcc_x_std].fillna(0)
print(df_ffcc_.shape)
df_ffcc_.head()

* flow card shelf xxx card

In [None]:
# Read the tab-separated "flow card shelf xxx card" sample/label/feature file.
df_fcscc = pd.read_csv('../../data/other/cs/sample_label_feature_flow_cs_xxx_card_train_20230101_20230319.txt', sep='\t', encoding='utf-8')
print(df_fcscc.shape)
df_fcscc.head()

In [None]:
# Describe every fcscc column with the project helper and export the summary to CSV.
df_fcscc_des = utils_.df_des(df_fcscc)
df_fcscc_des.to_csv('../../data/other/cs/df_fcscc_des_20230101_20230319.csv', encoding='utf-8')

In [None]:
# Variance screening, step 1: separate the id/label columns from the
# candidate feature columns (everything that is not an id/label column).
list_feats_fcscc_y = ['uid', 'obs_dt', 'dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1']
list_feats_fcscc_x = [x for x in df_fcscc.columns if x not in list_feats_fcscc_y]

print(len(list_feats_fcscc_x))
list_feats_fcscc_x[:10]

In [None]:
# Persist the full fcscc feature-name list.
utils_.save_pickle(list_feats_fcscc_x, '../../data/other/cs/new_features/list_feats/list_feats_x_fcscc_20230101_20230319.pickle')

In [None]:
# Columns whose 'std' in the description table is exactly 0 (constant columns).
mask_fcscc_std_0 = df_fcscc_des['std'] == 0
list_feats_fcscc_x_std_0 = list(df_fcscc_des[mask_fcscc_std_0].index)
print(len(list_feats_fcscc_x_std_0))
list_feats_fcscc_x_std_0[:10]

In [None]:
# Reload the fcscc feature list and fill missing feature values with 0.
list_feats_fcscc_x = utils_.load_pickle('../../data/other/cs/new_features/list_feats/list_feats_x_fcscc_20230101_20230319.pickle')
# NOTE(review): plain assignment, not a copy — df_fcscc_ and df_fcscc are the
# same object, so the fillna below also modifies df_fcscc.
df_fcscc_ = df_fcscc
df_fcscc_[list_feats_fcscc_x] = df_fcscc_[list_feats_fcscc_x].fillna(0)
print(df_fcscc_.shape)
df_fcscc_.head()

* xxx card

In [None]:
# Read the tab-separated "xxx card" sample/label/feature file.
df_dc = pd.read_csv('../../data/other/cs/sample_label_feature_xxx_card_train_20230101_20230319.txt', sep='\t', encoding='utf-8')
print(df_dc.shape)
df_dc.head()

In [None]:
# Describe every dc column with the project helper and export the summary to CSV.
df_dc_des = utils_.df_des(df_dc)
df_dc_des.to_csv('../../data/other/cs/df_dc_des_20230101_20230319.csv', encoding='utf-8')

In [None]:
# Variance screening, step 1: separate the id/label columns from the
# candidate feature columns (everything that is not an id/label column).
list_feats_dc_y = ['uid', 'obs_dt', 'dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1']
list_feats_dc_x = [x for x in df_dc.columns if x not in list_feats_dc_y]

print(len(list_feats_dc_x))
list_feats_dc_x[:10]

In [None]:
# Persist the full dc feature-name list.
utils_.save_pickle(list_feats_dc_x, '../../data/other/cs/new_features/list_feats/list_feats_x_dc_20230101_20230319.pickle')

In [None]:
# Columns whose 'std' in the description table is exactly 0 (constant columns).
mask_dc_std_0 = df_dc_des['std'] == 0
list_feats_dc_x_std_0 = list(df_dc_des[mask_dc_std_0].index)
print(len(list_feats_dc_x_std_0))
list_feats_dc_x_std_0[:10]

In [None]:
# Reload the dc feature list and fill missing feature values with 0.
list_feats_dc_x = utils_.load_pickle('../../data/other/cs/new_features/list_feats/list_feats_x_dc_20230101_20230319.pickle')
# NOTE(review): plain assignment, not a copy — df_dc_ and df_dc are the same
# object, so the fillna below also modifies df_dc.
df_dc_ = df_dc
df_dc_[list_feats_dc_x] = df_dc_[list_feats_dc_x].fillna(0)
print(df_dc_.shape)
df_dc_.head()

* ffd

In [None]:
# Re-read the three raw new-feature tables from disk; this rebinds df_ffcc,
# df_fcscc and df_dc to freshly loaded frames.
df_ffcc = pd.read_csv('../../data/other/cs/sample_label_feature_flow_fixation_xxx_card_train_20230101_20230319.txt', sep='\t', encoding='utf-8')
df_fcscc = pd.read_csv('../../data/other/cs/sample_label_feature_flow_cs_xxx_card_train_20230101_20230319.txt', sep='\t', encoding='utf-8')
df_dc = pd.read_csv('../../data/other/cs/sample_label_feature_xxx_card_train_20230101_20230319.txt', sep='\t', encoding='utf-8')

print(df_ffcc.shape)
print(df_fcscc.shape)
print(df_dc.shape)

In [None]:
# Load the stored "ht_ffd" feature-name list (the selected subset of the new
# features used in the cells below) and print its length.
list_feats_x_ffd = utils_.load_pickle('../../data/other/cs/new_features/list_feats/list_feats_x_ht_ffd_20230101_20230319.pickle')

print(len(list_feats_x_ffd))

In [None]:
# Load the full per-table feature-name lists (ffcc, fcscc, dc) and print
# their lengths.
list_feats_x_ffcc = utils_.load_pickle('../../data/other/cs/new_features/list_feats/list_feats_x_ffcc_20230101_20230319.pickle')
list_feats_x_fcscc = utils_.load_pickle('../../data/other/cs/new_features/list_feats/list_feats_x_fcscc_20230101_20230319.pickle')
list_feats_x_dc = utils_.load_pickle('../../data/other/cs/new_features/list_feats/list_feats_x_dc_20230101_20230319.pickle')

print(len(list_feats_x_ffcc))
print(len(list_feats_x_fcscc))
print(len(list_feats_x_dc))

In [None]:
# Restrict each per-table feature list to the features selected in
# list_feats_x_ffd, preserving each table's original order.
# Fix: the dc list used to be filtered against itself (a no-op that kept
# every dc feature); it is now filtered against list_feats_x_ffd like the
# other two lists.
set_feats_x_ffd = set(list_feats_x_ffd)
list_feats_x_ffcc = [x for x in list_feats_x_ffcc if x in set_feats_x_ffd]
list_feats_x_fcscc = [x for x in list_feats_x_fcscc if x in set_feats_x_ffd]
list_feats_x_dc = [x for x in list_feats_x_dc if x in set_feats_x_ffd]

print(len(list_feats_x_ffcc))
print(len(list_feats_x_fcscc))
print(len(list_feats_x_dc))

In [None]:
# Fill missing feature values with 0 in all three new-feature tables.
df_ffcc[list_feats_x_ffcc] = df_ffcc[list_feats_x_ffcc].fillna(0)
df_fcscc[list_feats_x_fcscc] = df_fcscc[list_feats_x_fcscc].fillna(0)
# The dc table was previously left unfilled here, unlike ffcc/fcscc above and
# unlike the dedicated dc cell, which fills its features with 0 as well.
df_dc[list_feats_x_dc] = df_dc[list_feats_x_dc].fillna(0)

# Convert obs_dt to datetime so the join key has the same dtype in every table.
df_ffcc['obs_dt'] = pd.to_datetime(df_ffcc['obs_dt'])
df_fcscc['obs_dt'] = pd.to_datetime(df_fcscc['obs_dt'])
df_dc['obs_dt'] = pd.to_datetime(df_dc['obs_dt'])

# Sample identity + labels: the columns every table is joined on.
list_keys_ffd = ['uid', 'obs_dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1']

# Left-join fcscc and dc features onto the ffcc table (which also carries 'dt').
# Rows without a match on the right side still get NaN in the joined columns.
df_ffd = (
    df_ffcc[['uid', 'obs_dt', 'dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1'] + list_feats_x_ffcc]
    .merge(df_fcscc[list_keys_ffd + list_feats_x_fcscc], on=list_keys_ffd, how='left')
    .merge(df_dc[list_keys_ffd + list_feats_x_dc], on=list_keys_ffd, how='left')
)
print(df_ffd.shape)
df_ffd.head()

* 合并

In [None]:
# Load the previously saved one-hot-augmented training table.
df_ohe = utils_.load_pickle('../../data/other/cs/df_ohe_train_20230101_20230319.pickle')
print(df_ohe.shape)
df_ohe.head()

In [None]:
# Sanity check: True only if the three per-table lists, concatenated in this
# order, equal list_feats_x_ffd element for element (order-sensitive).
list_feats_x_ffd == list_feats_x_ffcc + list_feats_x_fcscc +list_feats_x_dc

In [None]:
%%time
# df_ffcc_['obs_dt'] = pd.to_datetime(df_ffcc_['obs_dt'])
# df_fcscc_['obs_dt'] = pd.to_datetime(df_fcscc_['obs_dt'])
# df_dc_['obs_dt'] = pd.to_datetime(df_dc_['obs_dt'])

# df_ohe_new_feats = df_ohe.\
#     merge(df_ffcc_[['uid', 'obs_dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1']+list_feats_ffcc_x_std], 
#           on=['uid', 'obs_dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1'], how='left').\
#     merge(df_fcscc_[['uid', 'obs_dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1']+list_feats_fcscc_x], 
#           on=['uid', 'obs_dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1'], how='left').\
#     merge(df_dc_[['uid', 'obs_dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1']+list_feats_dc_x], 
#           on=['uid', 'obs_dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1'], how='left')

# Left-join the selected new features (list_feats_x_ffd) from df_ffd onto the
# one-hot table, matching on sample identity and all four labels.
df_ohe_new_feats = df_ohe.\
    merge(df_ffd[['uid', 'obs_dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1']+list_feats_x_ffd], 
          on=['uid', 'obs_dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1'], how='left')

print(df_ohe_new_feats.shape)
df_ohe_new_feats.head()

In [None]:
# Persist the table enriched with the selected new features.
# NOTE(review): another cell in this notebook loads
# '../../data/other/cs/new_features/df_ohe_new_feats_train_20230101_20230319.pickle',
# which is only written by the commented-out line below.
# utils_.save_pickle(df_ohe_new_feats, '../../data/other/cs/new_features/df_ohe_new_feats_train_20230101_20230319.pickle')
utils_.save_pickle(df_ohe_new_feats, '../../data/other/cs/new_features/df_ohe_ffd_train_20230101_20230319.pickle')

* 5物料，aa、bb、cc、dd、其他
* hpyk特征，963
* 2023.01.01~2023.03.19

In [None]:
# Load the one-hot-augmented training table for this experiment variant.
df_train = utils_.load_pickle('../../data/other/cs/df_ohe_train_20230101_20230319.pickle')
print(df_train.shape)
df_train.head()

In [None]:
# Split into training and validation sets, 9:1 (90% training).
# random_state is fixed so the split is reproducible.
df_train_train, df_train_eval = train_test_split(df_train, test_size=0.1, random_state=2023)
print(df_train_train.shape)
print(df_train_eval.shape)

In [None]:
# Column groups shared by the train and eval splits. Each list is built
# once: previously the column lists were duplicated per split and the same
# feature pickles were read from disk twice.
list_cols_id = ['uid', 'obs_dt', 'dt', 'item', 'item_id']
list_cols_y = ['label_apply', 'label_submit', 'label_pass', 'label_pass_1']
list_cols_X = (
    ['item_id_{}'.format(x) for x in range(5)]
    + utils_.load_pickle('../../data/other/cs/list_feats/list_feats_x_pi_fusion_20230101_20230319.pickle')
    + utils_.load_pickle('../../data/other/cs/list_feats/list_feats_x_pi_aspiration_20230101_20230319.pickle')
)

# Training split: ids, labels, model inputs.
df_train_train_id = df_train_train[list_cols_id]
df_train_train_y = df_train_train[list_cols_y]
df_train_train_X = df_train_train[list_cols_X]
print(df_train_train_id.shape)
print(df_train_train_y.shape)
print(df_train_train_X.shape)

# Validation split: same column layout.
df_train_eval_id = df_train_eval[list_cols_id]
df_train_eval_y = df_train_eval[list_cols_y]
df_train_eval_X = df_train_eval[list_cols_X]
print(df_train_eval_id.shape)
print(df_train_eval_y.shape)
print(df_train_eval_X.shape)

In [None]:
# Persist the id / label / feature frames of the training split ...
utils_.save_pickle(df_train_train_id, '../../data/other/cs/df_id_train_train_20230101_20230319.pickle')
utils_.save_pickle(df_train_train_y, '../../data/other/cs/df_y_train_train_20230101_20230319.pickle')
utils_.save_pickle(df_train_train_X, '../../data/other/cs/df_X_train_train_20230101_20230319.pickle')

# ... and of the validation split.
utils_.save_pickle(df_train_eval_id, '../../data/other/cs/df_id_train_eval_20230101_20230319.pickle')
utils_.save_pickle(df_train_eval_y, '../../data/other/cs/df_y_train_eval_20230101_20230319.pickle')
utils_.save_pickle(df_train_eval_X, '../../data/other/cs/df_X_train_eval_20230101_20230319.pickle')

* 5物料，aa、bb、cc、dd、其他
* hpyk特征，963
* 2023.01.01~2023.03.19
* 只划分train & eval，测试直接观察oot

In [None]:
# Load the one-hot-augmented table saved for the train/eval/oot experiment.
# This rebinds the name `df`.
df = utils_.load_pickle('../../data/other/cs/train_eval_oot/df_ohe_20230101_20230319.pickle')
print(df.shape)
df.head()

In [None]:
# Split into training and validation sets, 8:2 (80% training).
# random_state is fixed so the split is reproducible.
df_train, df_eval = train_test_split(df, test_size=0.2, random_state=2023)
print(df_train.shape)
print(df_eval.shape)

In [None]:
# Column groups shared by the train and eval splits. Each list is built
# once: previously the column lists were duplicated per split and the same
# feature pickles were read from disk twice.
list_cols_id = ['uid', 'obs_dt', 'dt', 'item', 'item_id']
list_cols_y = ['label_apply', 'label_submit', 'label_pass', 'label_pass_1']
list_cols_X = (
    ['item_id_{}'.format(x) for x in range(5)]
    + utils_.load_pickle('../../data/other/cs/list_feats/list_feats_x_pi_fusion_20230101_20230319.pickle')
    + utils_.load_pickle('../../data/other/cs/list_feats/list_feats_x_pi_aspiration_20230101_20230319.pickle')
)

# Training split: ids, labels, model inputs.
df_train_id = df_train[list_cols_id]
df_train_y = df_train[list_cols_y]
df_train_X = df_train[list_cols_X]
print(df_train_id.shape)
print(df_train_y.shape)
print(df_train_X.shape)

# Validation split: same column layout.
df_eval_id = df_eval[list_cols_id]
df_eval_y = df_eval[list_cols_y]
df_eval_X = df_eval[list_cols_X]
print(df_eval_id.shape)
print(df_eval_y.shape)
print(df_eval_X.shape)

In [None]:
# Persist the id / label / feature frames of the training split ...
utils_.save_pickle(df_train_id, '../../data/other/cs/train_eval_oot/df_id_train_20230101_20230319.pickle')
utils_.save_pickle(df_train_y, '../../data/other/cs/train_eval_oot/df_y_train_20230101_20230319.pickle')
utils_.save_pickle(df_train_X, '../../data/other/cs/train_eval_oot/df_X_train_20230101_20230319.pickle')

# ... and of the validation split.
utils_.save_pickle(df_eval_id, '../../data/other/cs/train_eval_oot/df_id_eval_20230101_20230319.pickle')
utils_.save_pickle(df_eval_y, '../../data/other/cs/train_eval_oot/df_y_eval_20230101_20230319.pickle')
utils_.save_pickle(df_eval_X, '../../data/other/cs/train_eval_oot/df_X_eval_20230101_20230319.pickle')

* 5物料，aa、bb、cc、dd、其他
* hpyk特征，963 + 固定位80 + 货架页110 + cx卡45
* 2023.01.01~2023.03.19

In [None]:
# Load the training table that includes all new-feature columns.
df_train = utils_.load_pickle('../../data/other/cs/new_features/df_ohe_new_feats_train_20230101_20230319.pickle')
print(df_train.shape)
df_train.head()

In [None]:
# Split into training and validation sets, 9:1 (90% training).
# random_state is fixed so the split is reproducible.
df_train_train, df_train_eval = train_test_split(df_train, test_size=0.1, random_state=2023)
print(df_train_train.shape)
print(df_train_eval.shape)

In [None]:
# Column groups shared by the train and eval splits. Each list is built
# once: previously the column lists were duplicated per split and the same
# five feature pickles were read from disk twice.
list_cols_id = ['uid', 'obs_dt', 'dt', 'item', 'item_id']
list_cols_y = ['label_apply', 'label_submit', 'label_pass', 'label_pass_1']
list_cols_X = (
    ['item_id_{}'.format(x) for x in range(5)]
    + utils_.load_pickle('../../data/other/cs/list_feats/list_feats_x_pi_fusion_20230101_20230319.pickle')
    + utils_.load_pickle('../../data/other/cs/list_feats/list_feats_x_pi_aspiration_20230101_20230319.pickle')
    + utils_.load_pickle('../../data/other/cs/new_features/list_feats/list_feats_x_std_ffcc_20230101_20230319.pickle')
    + utils_.load_pickle('../../data/other/cs/new_features/list_feats/list_feats_x_fcscc_20230101_20230319.pickle')
    + utils_.load_pickle('../../data/other/cs/new_features/list_feats/list_feats_x_dc_20230101_20230319.pickle')
)

# Training split: ids, labels, model inputs.
df_train_train_id = df_train_train[list_cols_id]
df_train_train_y = df_train_train[list_cols_y]
df_train_train_X = df_train_train[list_cols_X]
print(df_train_train_id.shape)
print(df_train_train_y.shape)
print(df_train_train_X.shape)

# Validation split: same column layout.
df_train_eval_id = df_train_eval[list_cols_id]
df_train_eval_y = df_train_eval[list_cols_y]
df_train_eval_X = df_train_eval[list_cols_X]
print(df_train_eval_id.shape)
print(df_train_eval_y.shape)
print(df_train_eval_X.shape)

In [None]:
# Persist the id / label / feature frames of the training split ...
utils_.save_pickle(df_train_train_id, '../../data/other/cs/new_features/df_id_train_train_20230101_20230319.pickle')
utils_.save_pickle(df_train_train_y, '../../data/other/cs/new_features/df_y_train_train_20230101_20230319.pickle')
utils_.save_pickle(df_train_train_X, '../../data/other/cs/new_features/df_X_train_train_20230101_20230319.pickle')

# ... and of the validation split.
utils_.save_pickle(df_train_eval_id, '../../data/other/cs/new_features/df_id_train_eval_20230101_20230319.pickle')
utils_.save_pickle(df_train_eval_y, '../../data/other/cs/new_features/df_y_train_eval_20230101_20230319.pickle')
utils_.save_pickle(df_train_eval_X, '../../data/other/cs/new_features/df_X_train_eval_20230101_20230319.pickle')

* 5物料，aa、bb、cc、dd、其他
* hpyk特征，963 + 新特征筛选，88
* 2023.01.01~2023.03.19

In [None]:
# Load the training table enriched with the selected (ffd) new features.
df_train = utils_.load_pickle('../../data/other/cs/new_features/df_ohe_ffd_train_20230101_20230319.pickle')
print(df_train.shape)
df_train.head()

In [None]:
# Split into training and validation sets, 9:1 (90% training).
# random_state is fixed so the split is reproducible.
df_train_train, df_train_eval = train_test_split(df_train, test_size=0.1, random_state=2023)
print(df_train_train.shape)
print(df_train_eval.shape)

In [None]:
# Column groups shared by the train and eval splits. Each list is built
# once: previously the column lists were duplicated per split and the same
# three feature pickles were read from disk twice.
list_cols_id = ['uid', 'obs_dt', 'dt', 'item', 'item_id']
list_cols_y = ['label_apply', 'label_submit', 'label_pass', 'label_pass_1']
list_cols_X = (
    ['item_id_{}'.format(x) for x in range(5)]
    + utils_.load_pickle('../../data/other/cs/list_feats/list_feats_x_pi_fusion_20230101_20230319.pickle')
    + utils_.load_pickle('../../data/other/cs/list_feats/list_feats_x_pi_aspiration_20230101_20230319.pickle')
    + utils_.load_pickle('../../data/other/cs/new_features/list_feats/list_feats_x_ht_ffd_20230101_20230319.pickle')
)

# Training split: ids, labels, model inputs.
df_train_train_id = df_train_train[list_cols_id]
df_train_train_y = df_train_train[list_cols_y]
df_train_train_X = df_train_train[list_cols_X]
print(df_train_train_id.shape)
print(df_train_train_y.shape)
print(df_train_train_X.shape)

# Validation split: same column layout.
df_train_eval_id = df_train_eval[list_cols_id]
df_train_eval_y = df_train_eval[list_cols_y]
df_train_eval_X = df_train_eval[list_cols_X]
print(df_train_eval_id.shape)
print(df_train_eval_y.shape)
print(df_train_eval_X.shape)

In [None]:
# Persist the id / label / feature frames of the ffd training split ...
utils_.save_pickle(df_train_train_id, '../../data/other/cs/new_features/df_ffd_id_train_train_20230101_20230319.pickle')
utils_.save_pickle(df_train_train_y, '../../data/other/cs/new_features/df_ffd_y_train_train_20230101_20230319.pickle')
utils_.save_pickle(df_train_train_X, '../../data/other/cs/new_features/df_ffd_X_train_train_20230101_20230319.pickle')

# ... and of the ffd validation split.
utils_.save_pickle(df_train_eval_id, '../../data/other/cs/new_features/df_ffd_id_train_eval_20230101_20230319.pickle')
utils_.save_pickle(df_train_eval_y, '../../data/other/cs/new_features/df_ffd_y_train_eval_20230101_20230319.pickle')
utils_.save_pickle(df_train_eval_X, '../../data/other/cs/new_features/df_ffd_X_train_eval_20230101_20230319.pickle')

## 入参处理

* 训练集

In [None]:
# df_train_train_id = utils_.load_pickle('../../data/other/cs/df_id_train_train_20230101_20230319.pickle')
# df_train_train_y = utils_.load_pickle('../../data/other/cs/df_y_train_train_20230101_20230319.pickle')
# df_train_train_X = utils_.load_pickle('../../data/other/cs/df_X_train_train_20230101_20230319.pickle')

# df_train_train_id = utils_.load_pickle('../../data/other/cs/train_eval_oot/df_id_train_20230101_20230319.pickle')
# df_train_train_y = utils_.load_pickle('../../data/other/cs/train_eval_oot/df_y_train_20230101_20230319.pickle')
# df_train_train_X = utils_.load_pickle('../../data/other/cs/train_eval_oot/df_X_train_20230101_20230319.pickle')

# Active variant: load the ffd training split (ids, labels, features).
df_train_train_id = utils_.load_pickle('../../data/other/cs/new_features/df_ffd_id_train_train_20230101_20230319.pickle')
df_train_train_y = utils_.load_pickle('../../data/other/cs/new_features/df_ffd_y_train_train_20230101_20230319.pickle')
df_train_train_X = utils_.load_pickle('../../data/other/cs/new_features/df_ffd_X_train_train_20230101_20230319.pickle')

print(df_train_train_id.shape)
print(df_train_train_y.shape)
print(df_train_train_X.shape)

In [None]:
# Partition the input columns into three groups:
#   condition_feats - the five one-hot item columns,
#   sparse_feats    - candidate categorical columns present in the frame,
#   dense_feats     - everything else.
condition_feats = ['item_id_{}'.format(x) for x in range(5)]
sparse_feats = [
    'xxx', 
]
sparse_feats_new = [
    'xxx', 
]
# sparse_feats = [x for x in df_train_train_X.columns if x in sparse_feats]
# Build each lookup set once; the old comprehension re-concatenated the two
# lists and scanned the result for every column.
set_sparse_candidates = set(sparse_feats + sparse_feats_new)
sparse_feats = [x for x in df_train_train_X.columns if x in set_sparse_candidates]
set_non_dense = set(condition_feats + sparse_feats)
dense_feats = [x for x in df_train_train_X.columns if x not in set_non_dense]

print(len(condition_feats))
print(len(sparse_feats))
print(len(dense_feats))

In [None]:
# Print "<feature>:<number of distinct values>" for each candidate sparse feature.
for x in sparse_feats:
    n_distinct = df_train_train_X[x].nunique()
    print(x, n_distinct, sep=':')

In [None]:
# Keep only sparse features with at most 25 distinct values in the training
# split, and explicitly exclude 'xxx'.
sparse_feats = [x for x in sparse_feats if df_train_train_X[x].nunique()<=25 and x not in ['xxx']]
print(len(sparse_feats))
sparse_feats

In [None]:
# Recompute the dense list after the sparse list was narrowed: every column
# that is neither a condition nor a (remaining) sparse feature is dense.
# The lookup set is built once instead of concatenating the lists per column.
set_non_dense = set(condition_feats + sparse_feats)
dense_feats = [x for x in df_train_train_X.columns if x not in set_non_dense]
print(len(dense_feats))

In [None]:
# utils_.save_pickle(condition_feats, '../../data/other/cs/list_condition_feats.pickle')
# utils_.save_pickle(sparse_feats, '../../data/other/cs/list_sparse_feats.pickle')
# utils_.save_pickle(dense_feats, '../../data/other/cs/list_dense_feats.pickle')

# utils_.save_pickle(condition_feats, '../../data/other/cs/train_eval_oot/list_condition_feats.pickle')
# utils_.save_pickle(sparse_feats, '../../data/other/cs/train_eval_oot/list_sparse_feats.pickle')
# utils_.save_pickle(dense_feats, '../../data/other/cs/train_eval_oot/list_dense_feats.pickle')

# Active variant: persist the three feature-group lists for the ffd experiment
# so the validation and model cells can reload the exact same grouping.
utils_.save_pickle(condition_feats, '../../data/other/cs/new_features/list_condition_feats_ffd.pickle')
utils_.save_pickle(sparse_feats, '../../data/other/cs/new_features/list_sparse_feats_ffd.pickle')
utils_.save_pickle(dense_feats, '../../data/other/cs/new_features/list_dense_feats_ffd.pickle')

In [None]:
# Continuous (dense) features: fit a StandardScaler on the training split
# only, transform it, and persist the fitted scaler for reuse on other splits.
ss = StandardScaler()
X_train_train_dense_ss = ss.fit_transform(df_train_train_X[dense_feats])
# joblib.dump(ss, '../../data/other/cs/ss.pickle')
# joblib.dump(ss, '../../data/other/cs/train_eval_oot/ss.pickle')
joblib.dump(ss, '../../data/other/cs/new_features/ss_ffd.pickle')

In [None]:
# Wrap the scaled values back into a DataFrame with the dense column names.
# NOTE(review): the new frame gets a fresh default index rather than
# df_train_train_X's index — presumably why the condition columns are
# reset_index(drop=True)'d before the final concat; confirm.
df_X_train_train_dense = pd.DataFrame(X_train_train_dense_ss, columns=dense_feats)
print(df_X_train_train_dense.shape)
df_X_train_train_dense.head()

In [None]:
# Categorical (sparse) features: fit one LabelEncoder per feature on the
# training split and keep the fitted encoders for reuse on other splits.
dict_lbe = {}
list_X_train_train_sparse = []

# The `with` block closes the progress bar on every exit path (normal end,
# KeyboardInterrupt, any other error), so the former manual t.close() calls
# and the except/re-raise wrapper are unnecessary.
with tqdm(sparse_feats) as t:
    for x in t:
        lbe = LabelEncoder()
        df_X_sparse_each = pd.DataFrame(lbe.fit_transform(df_train_train_X[x]), columns=[x])
        dict_lbe[x] = lbe
        list_X_train_train_sparse.append(df_X_sparse_each)

# joblib.dump(dict_lbe, '../../data/other/cs/dict_lbe.pickle')
# joblib.dump(dict_lbe, '../../data/other/cs/train_eval_oot/dict_lbe.pickle')
joblib.dump(dict_lbe, '../../data/other/cs/new_features/dict_lbe_ffd.pickle')
if list_X_train_train_sparse:
    df_X_train_train_sparse = pd.concat(list_X_train_train_sparse, axis=1)
else:
    # pd.concat raises on an empty list; when no sparse feature survived the
    # filtering, fall back to a zero-column frame with one row per sample.
    df_X_train_train_sparse = pd.DataFrame(index=pd.RangeIndex(len(df_train_train_X)))
print(df_X_train_train_sparse.shape)
df_X_train_train_sparse.head()

In [None]:
# Combine: condition columns (index reset so rows line up positionally with
# the encoded/scaled frames), then sparse, then dense columns.
df_train_train_X_condition = df_train_train_X[condition_feats].reset_index(drop=True)
df_train_train_X_transform = pd.concat([df_train_train_X_condition, df_X_train_train_sparse, df_X_train_train_dense], axis=1)
print(df_train_train_X_transform.shape)
df_train_train_X_transform.head()

In [None]:
# Preview the untransformed training features for comparison.
df_train_train_X.head()

In [None]:
# utils_.save_pickle(df_train_train_X_transform, '../../data/other/cs/df_X_train_train_transform.pickle')
# utils_.save_pickle(df_train_train_X_transform, '../../data/other/cs/train_eval_oot/df_X_train_transform.pickle')
# Active variant: persist the transformed training features (ffd experiment).
utils_.save_pickle(df_train_train_X_transform, '../../data/other/cs/new_features/df_X_train_train_transform_ffd.pickle')

* 验证集

In [None]:
# df_train_eval_id = utils_.load_pickle('../../data/other/cs/df_id_train_eval_20230101_20230319.pickle')
# df_train_eval_y = utils_.load_pickle('../../data/other/cs/df_y_train_eval_20230101_20230319.pickle')
# df_train_eval_X = utils_.load_pickle('../../data/other/cs/df_X_train_eval_20230101_20230319.pickle')

# df_train_eval_id = utils_.load_pickle('../../data/other/cs/train_eval_oot/df_id_eval_20230101_20230319.pickle')
# df_train_eval_y = utils_.load_pickle('../../data/other/cs/train_eval_oot/df_y_eval_20230101_20230319.pickle')
# df_train_eval_X = utils_.load_pickle('../../data/other/cs/train_eval_oot/df_X_eval_20230101_20230319.pickle')

# Active variant: load the ffd validation split (ids, labels, features).
df_train_eval_id = utils_.load_pickle('../../data/other/cs/new_features/df_ffd_id_train_eval_20230101_20230319.pickle')
df_train_eval_y = utils_.load_pickle('../../data/other/cs/new_features/df_ffd_y_train_eval_20230101_20230319.pickle')
df_train_eval_X = utils_.load_pickle('../../data/other/cs/new_features/df_ffd_X_train_eval_20230101_20230319.pickle')

print(df_train_eval_id.shape)
print(df_train_eval_y.shape)
print(df_train_eval_X.shape)

In [None]:
# condition_feats = utils_.load_pickle('../../data/other/cs/list_condition_feats.pickle')
# sparse_feats = utils_.load_pickle('../../data/other/cs/list_sparse_feats.pickle')
# dense_feats = utils_.load_pickle('../../data/other/cs/list_dense_feats.pickle')

# condition_feats = utils_.load_pickle('../../data/other/cs/train_eval_oot/list_condition_feats.pickle')
# sparse_feats = utils_.load_pickle('../../data/other/cs/train_eval_oot/list_sparse_feats.pickle')
# dense_feats = utils_.load_pickle('../../data/other/cs/train_eval_oot/list_dense_feats.pickle')

# Active variant: reload the feature-group lists saved during training-set
# processing so the validation split uses the identical grouping.
condition_feats = utils_.load_pickle('../../data/other/cs/new_features/list_condition_feats_ffd.pickle')
sparse_feats = utils_.load_pickle('../../data/other/cs/new_features/list_sparse_feats_ffd.pickle')
dense_feats = utils_.load_pickle('../../data/other/cs/new_features/list_dense_feats_ffd.pickle')

print(len(condition_feats))
print(len(sparse_feats))
print(len(dense_feats))

In [None]:
# Continuous (dense) features: reuse the scaler fitted on the training split
# (transform only, no refit) on the validation split.
# ss = joblib.load('../../data/other/cs/ss.pickle')
# ss = joblib.load('../../data/other/cs/train_eval_oot/ss.pickle')
ss = joblib.load('../../data/other/cs/new_features/ss_ffd.pickle')
X_train_eval_dense_ss = ss.transform(df_train_eval_X[dense_feats])
print(X_train_eval_dense_ss.shape)

In [None]:
# Wrap the scaled validation values back into a DataFrame with the dense
# column names.
df_X_train_eval_dense_ss = pd.DataFrame(X_train_eval_dense_ss, columns=dense_feats)
print(df_X_train_eval_dense_ss.shape)
df_X_train_eval_dense_ss.head()

In [None]:
# Categorical (sparse) features: reload the per-feature LabelEncoders fitted
# on the training split and print the classes each encoder knows.
# dict_lbe = joblib.load('../../data/other/cs/dict_lbe.pickle')
# dict_lbe = joblib.load('../../data/other/cs/train_eval_oot/dict_lbe.pickle')
dict_lbe = joblib.load('../../data/other/cs/new_features/dict_lbe_ffd.pickle')
for x in sparse_feats:
    print(x, dict_lbe[x].classes_)

In [None]:
# Encode the validation split's sparse features with the encoders fitted on
# the training split. Values the encoder has never seen are first replaced
# by the sentinel -1.
list_X_sparse = []

# The `with` block closes the progress bar on every exit path, so the former
# manual t.close() calls and the except/re-raise wrapper are unnecessary.
with tqdm(sparse_feats) as t:
    for x in t:
        list_feat_values_unseen = list(set(df_train_eval_X[x].unique())-set(dict_lbe[x].classes_))
        if len(list_feat_values_unseen) > 0:
            print(x)
            # Assign the result back instead of calling
            # df[col].replace(..., inplace=True): the in-place form operates
            # on a column selection and is not guaranteed to update the
            # parent frame (it does not under pandas Copy-on-Write).
            df_train_eval_X[x] = df_train_eval_X[x].replace(list_feat_values_unseen, -1)
        # NOTE(review): -1 must itself be one of the encoder's classes_,
        # otherwise LabelEncoder.transform raises ValueError — confirm that
        # -1 occurs in the training data for every sparse feature.
        df_train_eval_X_sparse_each = pd.DataFrame(dict_lbe[x].transform(df_train_eval_X[x]), columns=[x])
        list_X_sparse.append(df_train_eval_X_sparse_each)

if list_X_sparse:
    df_X_train_eval_sparse = pd.concat(list_X_sparse, axis=1)
else:
    # pd.concat raises on an empty list; with no sparse features fall back
    # to a zero-column frame with one row per validation sample.
    df_X_train_eval_sparse = pd.DataFrame(index=pd.RangeIndex(len(df_train_eval_X)))
print(df_X_train_eval_sparse.shape)
df_X_train_eval_sparse.head()

In [None]:
# Merge: condition columns, encoded sparse columns and scaled dense columns,
# side by side. The condition part gets a fresh 0..n-1 index so that it lines
# up row-by-row with the two frames built above (which were created without
# an explicit index).
df_train_eval_X_condition = df_train_eval_X[condition_feats].reset_index(drop=True)
df_train_eval_X_transform = pd.concat([df_train_eval_X_condition, df_X_train_eval_sparse, df_X_train_eval_dense_ss], axis=1)
print(df_train_eval_X_transform.shape)
df_train_eval_X_transform.head()

In [None]:
df_train_eval_X.head()

In [None]:
# utils_.save_pickle(df_train_eval_X_transform, '../../data/other/cs/df_X_train_eval_transform.pickle')
# utils_.save_pickle(df_train_eval_X_transform, '../../data/other/cs/train_eval_oot/df_X_eval_transform.pickle')
utils_.save_pickle(df_train_eval_X_transform, '../../data/other/cs/new_features/df_X_train_eval_transform_ffd.pickle')

## 模型

In [None]:
# df_X_train_train_transform = utils_.load_pickle('../../data/other/cs/df_X_train_train_transform.pickle')
# df_X_train_train_transform = utils_.load_pickle('../../data/other/cs/train_eval_oot/df_X_train_transform.pickle')
df_X_train_train_transform = utils_.load_pickle('../../data/other/cs/new_features/df_X_train_train_transform_ffd.pickle')
print(df_X_train_train_transform.shape)
df_X_train_train_transform.head()

In [None]:
# condition_feats = utils_.load_pickle('../../data/other/cs/list_condition_feats.pickle')
# sparse_feats = utils_.load_pickle('../../data/other/cs/list_sparse_feats.pickle')
# dense_feats = utils_.load_pickle('../../data/other/cs/list_dense_feats.pickle')

# condition_feats = utils_.load_pickle('../../data/other/cs/train_eval_oot/list_condition_feats.pickle')
# sparse_feats = utils_.load_pickle('../../data/other/cs/train_eval_oot/list_sparse_feats.pickle')
# dense_feats = utils_.load_pickle('../../data/other/cs/train_eval_oot/list_dense_feats.pickle')

condition_feats = utils_.load_pickle('../../data/other/cs/new_features/list_condition_feats_ffd.pickle')
sparse_feats = utils_.load_pickle('../../data/other/cs/new_features/list_sparse_feats_ffd.pickle')
dense_feats = utils_.load_pickle('../../data/other/cs/new_features/list_dense_feats_ffd.pickle')

print(len(condition_feats))
print(len(sparse_feats))
print(len(dense_feats))

In [None]:
# Feature specification handed to the model: three groups in the order
# [condition (dense), sparse, dense]. Each sparse entry is built from the
# column name, max encoded id in the training frame + 1, and the constant 4
# (presumably vocabulary size and embedding dimension -- confirm in
# util.sparseFeature).
# NOTE(review): the size is derived from the training split only; an encoded id
# that appears only in eval/test would exceed it -- confirm this cannot happen
# or use len(dict_lbe[x].classes_) instead.
feats_columns = [[util.denseFeature(feat) for feat in condition_feats]] + \
                 [[util.sparseFeature(x, int(df_X_train_train_transform[x].max()+1), 4) for x in sparse_feats]] + \
                 [[util.denseFeature(feat) for feat in dense_feats]]
feats_columns

In [None]:
# utils_.save_pickle(feats_columns, '../../data/other/cs/feats_columns.pcikle')
# utils_.save_pickle(feats_columns, '../../data/other/cs/train_eval_oot/feats_columns.pcikle')
# Persist the feature specification for the model-building cells.
# NOTE: the extension is spelled '.pcikle' (sic); the cells that load this file
# use the same spelling, so rename both sides together or not at all.
utils_.save_pickle(feats_columns, '../../data/other/cs/new_features/feats_columns_ffd.pcikle')

In [None]:
# df_X_train_train_transform = utils_.load_pickle('../../data/other/cs/df_X_train_train_transform.pickle')
# df_y_train_train = utils_.load_pickle('../../data/other/cs/df_y_train_train_20230101_20230319.pickle')

# df_X_train_eval_transform = utils_.load_pickle('../../data/other/cs/df_X_train_eval_transform.pickle')
# df_y_train_eval = utils_.load_pickle('../../data/other/cs/df_y_train_eval_20230101_20230319.pickle') 

# df_X_train_train_transform = utils_.load_pickle('../../data/other/cs/train_eval_oot/df_X_train_transform.pickle')
# df_y_train_train = utils_.load_pickle('../../data/other/cs/train_eval_oot/df_y_train_20230101_20230319.pickle')

# df_X_train_eval_transform = utils_.load_pickle('../../data/other/cs/train_eval_oot/df_X_eval_transform.pickle')
# df_y_train_eval = utils_.load_pickle('../../data/other/cs/train_eval_oot/df_y_eval_20230101_20230319.pickle') 

df_X_train_train_transform = utils_.load_pickle('../../data/other/cs/new_features/df_X_train_train_transform_ffd.pickle')
df_y_train_train = utils_.load_pickle('../../data/other/cs/new_features/df_ffd_y_train_train_20230101_20230319.pickle')

df_X_train_eval_transform = utils_.load_pickle('../../data/other/cs/new_features/df_X_train_eval_transform_ffd.pickle')
df_y_train_eval = utils_.load_pickle('../../data/other/cs/new_features/df_ffd_y_train_eval_20230101_20230319.pickle') 

print(df_X_train_train_transform.shape)
print(df_y_train_train.shape)

print(df_X_train_eval_transform.shape)
print(df_y_train_eval.shape)

In [None]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
logger = util.get_logger('')
util.seed_everything(2023)

In [None]:
train_loader = DataLoader(DatasetLoader(df_X_train_train_transform.values, df_y_train_train.values), 1024, shuffle=True, num_workers=8)
# train_loader = DataLoader(DatasetLoader(df_X_train_train_transform.values, df_y_train_train.values), 512, shuffle=True, num_workers=8)
# train_loader = DataLoader(DatasetLoader(df_X_train_train_transform.values, df_y_train_train.values), 4096, shuffle=True, num_workers=8)

eval_loader = DataLoader(DatasetLoader(df_X_train_eval_transform.values, df_y_train_eval.values), 1024, shuffle=False, num_workers=8)

In [None]:
config = {
    'Model': {
        'num_experts': 5, 
        'expert_hidden_units': [1024, 512, 256], 
        'units': 128, 
        'num_tasks': 4, 
        'tower_hidden_units': [128, 64], 
        'tower_units': 32, 
        'dropout': 0.2, 
        'use_bn': False
    }
}

# feats_columns = utils_.load_pickle('../../data/other/cs/feats_columns.pcikle')
# feats_columns = utils_.load_pickle('../../data/other/cs/train_eval_oot/feats_columns.pcikle')
feats_columns = utils_.load_pickle('../../data/other/cs/new_features/feats_columns_ffd.pcikle')

model = AITM(config, feats_columns).to(device)
model

In [None]:
# Explicit parameter initialisation, visiting sub-modules in model.modules() order:
#   * Conv2d / Linear  -> Xavier-uniform weights
#   * BatchNorm1d      -> weight = 1, bias = 0
for m in model.modules():
    is_conv_or_linear = isinstance(m, (torch.nn.Conv2d, torch.nn.Linear))
    if is_conv_or_linear:
        torch.nn.init.xavier_uniform_(m.weight)
        # nn.init.kaiming_uniform_(m.weight)
        continue
    if isinstance(m, torch.nn.BatchNorm1d):
        for param, value in ((m.weight, 1), (m.bias, 0)):
            torch.nn.init.constant_(param, value)

In [None]:
def train(train_loader, eval_loader, model, optimizer, scheduler, criterion, epochs, data_eval, file):
    """Train `model` for `epochs` epochs, validating after every epoch.

    Args:
        train_loader: iterable of (x, y) batches used for optimisation.
        eval_loader: loader forwarded to `val` at the end of each epoch.
        model: network exposing `model(x)` and `model.loss(output, y)`; element
            [0] of the latter is the collection of per-task losses.
        optimizer: optimiser stepped once per batch.
        scheduler: LR scheduler stepped once per epoch, after validation.
        criterion: not used for the training loss; only forwarded to `val`.
        epochs: number of passes over `train_loader`.
        data_eval: forwarded to `val`.
        file: checkpoint base name, forwarded to `val`.
    """
    for epoch in range(epochs):
        model.train()

        for step, (features, targets) in enumerate(train_loader):
            features = features.to(device).to(torch.float32)
            targets = targets.to(device).to(torch.float32)

            optimizer.zero_grad()
            logits = model(features)
            task_losses = model.loss(logits, targets)[0]
            # list_loss = [criterion[j](output[:, j], y[:, j]) for j in range(len(criterion))]
            # list_loss = [criterion[j](output[:, j], y[:, j]) for j in range(len(criterion)-2)] + \
            #     [10*criterion[-2](output[:, -2], y[:, -2]), 10*criterion[-1](output[:, -1], y[:, -1])]
            # The optimised objective is the plain sum of the per-task losses.
            total_loss = reduce(lambda acc, term: acc + term, task_losses)
            total_loss.backward()
            optimizer.step()

            # Log only every 100th step.
            if step % 100 != 0:
                continue
            parts = ['loss{}: {:.6f}'.format(j, task_loss.item()) for j, task_loss in enumerate(task_losses)]
            str_loss = ', '.join(parts) + ', loss: {:.6f}'.format(total_loss.item())
            header = 'Epoch: [{}/{}], Step: [{}/{}], Lr: {:.6f}, '.format(
                epoch+1, epochs, step+1, len(train_loader), optimizer.param_groups[0]['lr'])
            logger.info(header+str_loss)

        val(eval_loader, model, criterion, data_eval, file)

        scheduler.step()

In [None]:
def val(eval_loader, model, criterion, data_eval, file):
    """Evaluate `model`, log per-task loss/AUC and checkpoint when the last task improves.

    Args:
        eval_loader: iterable of (x, y) batches supporting len(). It must yield
            rows in the same order as `data_eval[1]`, because predictions are
            stacked in loader order before the AUC is computed.
        model: network whose output is indexed per task as `output[:, j]`.
        criterion: indexable collection with one loss callable per task.
        data_eval: sequence whose element [1] is the 2-D label array
            (one column per task).
        file: base name (no extension) of the checkpoint files under ./save/.

    Side effects:
        Updates the module-level `best_loss`; when the average loss of the last
        task improves, writes `./save/<file>.pth` (state dict) and
        `./save/<file>.pt` (TorchScript).
    """
    global best_loss

    model.eval()

    y_true = data_eval[1]
    num_class = y_true.shape[1]

    list_eval_loss = [0] * num_class
    # Collect the per-batch predictions and concatenate once at the end;
    # concatenating inside the loop re-copies the whole array for every batch.
    list_y_pred = []

    with torch.no_grad():
        for x, y in eval_loader:
            x, y = x.to(device).to(torch.float32), y.to(device).to(torch.float32)
            output = model(x)
            list_eval_loss = [
                total + criterion[j](output[:, j], y[:, j]).item()
                for j, total in enumerate(list_eval_loss)
            ]
            list_y_pred.append(torch.sigmoid(output).cpu().numpy())

    # Per-batch losses are averaged over the number of batches.
    num_batches = len(eval_loader)
    eval_loss = sum(list_eval_loss) / num_batches
    list_eval_loss = [total / num_batches for total in list_eval_loss]
    y_pred = np.concatenate(list_y_pred, axis=0)

    eval_auc = [util.auc(y_true[:, i], y_pred[:, i]) for i in range(num_class)]
    str_auc = ', '.join(['AUC{}: {:.6f}'.format(i, eval_auc[i]) for i in range(num_class)])
    str_loss = 'Eval set: Average loss: {:.6f}, '.format(eval_loss) + ', '.join(['Average loss{}: {:.6f}'.format(j, x) for j, x in enumerate(list_eval_loss)])

    logger.info('{}, {}'.format(str_loss, str_auc))

    # Model selection only looks at the output at the tail of the task chain.
    if list_eval_loss[-1] < best_loss:
        best_loss = list_eval_loss[-1]
        # Make sure the target directory exists before writing the checkpoints.
        os.makedirs('./save', exist_ok=True)
        torch.save(model.state_dict(), './save/{}.pth'.format(file))
        scripted_model = torch.jit.script(model)
        torch.jit.save(scripted_model, './save/{}.pt'.format(file))
        logger.info('Save model with loss: {:.6f}, {}'.format(best_loss, str_auc))

* 训练

In [None]:
# One BCE-with-logits loss per task (4 tasks); indexed by task position in val().
criterion = (torch.nn.BCEWithLogitsLoss().to(device), torch.nn.BCEWithLogitsLoss().to(device), torch.nn.BCEWithLogitsLoss().to(device), torch.nn.BCEWithLogitsLoss().to(device))
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Learning rate is multiplied by 0.1 at epochs 10, 20 and 50.
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[10, 20, 50], gamma=0.1)
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# scheduler_ms = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[4, 14, 24, 44], gamma=0.1)
# scheduler_wu = GradualWarmupScheduler(optimizer, multiplier=1, total_epoch=5, after_scheduler=scheduler_ms)

# Module-level state read and updated by val(): best (lowest) last-task eval loss so far.
best_loss = np.inf
# Initialised here, but val() as written in this notebook never assigns it.
best_auc = 0

# train(train_loader, eval_loader, model, optimizer, scheduler, criterion, 100, (df_X_train_eval_transform.values, df_y_train_eval.values), 'aitm_condition')
# 100 epochs; checkpoints are written by val() under the base name 'aitm_condition_d'.
train(train_loader, eval_loader, model, optimizer, scheduler, criterion, 100, (df_X_train_eval_transform.values, df_y_train_eval.values), 'aitm_condition_d')

In [None]:
def test(test_loader, model, criterion, data_eval):
    """Score `model` on a held-out loader, print per-task loss/AUC and return the predictions.

    Args:
        test_loader: iterable of (x, y) batches supporting len(). It must yield
            rows in the same order as `data_eval[1]`, because predictions are
            stacked in loader order before the AUC is computed.
        model: network whose output is indexed per task as `output[:, j]`.
        criterion: indexable collection with one loss callable per task.
        data_eval: sequence whose element [1] is the 2-D label array
            (one column per task).

    Returns:
        numpy array of sigmoid-transformed model outputs, one row per sample
        and one column per task.
    """
    model.eval()

    y_true = data_eval[1]
    num_class = y_true.shape[1]

    list_eval_loss = [0] * num_class
    # Collect the per-batch predictions and concatenate once at the end;
    # concatenating inside the loop re-copies the whole array for every batch.
    list_y_pred = []

    with torch.no_grad():
        for x, y in test_loader:
            x, y = x.to(device).to(torch.float32), y.to(device).to(torch.float32)
            output = model(x)
            list_eval_loss = [
                total + criterion[j](output[:, j], y[:, j]).item()
                for j, total in enumerate(list_eval_loss)
            ]
            list_y_pred.append(torch.sigmoid(output).cpu().numpy())

    # Per-batch losses are averaged over the number of batches.
    num_batches = len(test_loader)
    eval_loss = sum(list_eval_loss) / num_batches
    list_eval_loss = [total / num_batches for total in list_eval_loss]
    y_pred = np.concatenate(list_y_pred, axis=0)

    eval_auc = [util.auc(y_true[:, i], y_pred[:, i]) for i in range(num_class)]
    str_auc = ', '.join(['AUC{}: {:.6f}'.format(i, eval_auc[i]) for i in range(num_class)])
    str_loss = 'Test set: Average loss: {:.6f}, '.format(eval_loss) + ', '.join(['Average loss{}: {:.6f}'.format(j, x) for j, x in enumerate(list_eval_loss)])

    print('{}, {}'.format(str_loss, str_auc))
    return y_pred

In [None]:
# Rebuild a model from a saved state dict for offline evaluation.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
util.seed_everything(2023)

# NOTE(review): this config differs from the one used in the training cell
# (smaller expert/tower sizes, no 'tower_units' key) -- confirm it matches the
# checkpoint loaded below.
config = {
    'Model': {
        'num_experts': 5, 
        'expert_hidden_units': [256, 128], 
        'units': 64, 
        'num_tasks': 4, 
        'tower_hidden_units': [64, 32], 
        'dropout': 0.2, 
        'use_bn': False
    }
}
# feats_columns = utils_.load_pickle('../../data/other/cs/feats_columns.pcikle')
feats_columns = utils_.load_pickle('../../data/other/cs/train_eval_oot/feats_columns.pcikle')

# NOTE(review): `MMOE` is not among the imports at the top of this notebook
# (only AITM is imported there), and the checkpoint name is an 'mmoe_*' file
# rather than the 'aitm_condition_d' one written by val(). This cell looks
# carried over from an MMOE notebook -- confirm the intended class/checkpoint.
model_rebuild = MMOE(config, feats_columns).to(device)
# model_rebuild.load_state_dict(torch.load('save/mmoe_condition.pth'))
model_rebuild.load_state_dict(torch.load('save/mmoe_condition_train_eval_oot.pth'))
model_rebuild

In [None]:
# X_eval = utils_.load_pickle('../../data/other/cs/df_X_train_eval_transform.pickle')
# y_eval = utils_.load_pickle('../../data/other/cs/df_y_train_eval_20230101_20230319.pickle')
X_eval = utils_.load_pickle('../../data/other/cs/train_eval_oot/df_X_eval_transform.pickle')
y_eval = utils_.load_pickle('../../data/other/cs/train_eval_oot/df_y_eval_20230101_20230319.pickle')

print(X_eval.shape)
print(y_eval.shape)

test_loader = DataLoader(DatasetLoader(X_eval.values, y_eval.values), 1024, shuffle=False, num_workers=8)

criterion = (torch.nn.BCEWithLogitsLoss().to(device), torch.nn.BCEWithLogitsLoss().to(device), torch.nn.BCEWithLogitsLoss().to(device), torch.nn.BCEWithLogitsLoss().to(device))

y_pred = test(test_loader, model_rebuild, criterion, (X_eval.values, y_eval.values))

In [None]:
df_y = pd.concat([y_eval.reset_index(drop=True), pd.DataFrame(data=y_pred, columns=['pred_'+x for x in y_eval.columns])], axis=1)
print(df_y.shape)
df_y.head()

In [None]:
def cal_recall(df_y, y_true, y_pred, threshold=0.3):
    """Recall of the positive class when the top-scored fraction of rows is flagged.

    Rows are ranked by the score column in descending order and the first
    ``int(n_rows * threshold)`` of them are treated as predicted positives.

    Args:
        df_y: DataFrame holding at least the label and score columns; it is
            not modified.
        y_true: name of the label column (positives are rows equal to 1, so
            both integer and float labels work).
        y_pred: name of the score column used for ranking.
        threshold: fraction of rows to flag, e.g. 0.2 for the top 20%.

    Returns:
        float: flagged positives / all positives, or NaN when the label column
        contains no positive row.
    """
    df_y_sort = df_y.sort_values(by=y_pred, ascending=False).reset_index(drop=True)

    # Positional slicing: exactly `num_flagged` rows. (Label slicing with
    # .loc[:k] includes the end point and would flag one row too many.)
    num_flagged = int(df_y_sort.shape[0] * threshold)

    is_positive = df_y_sort[y_true] == 1
    num_positive = int(is_positive.sum())
    if num_positive == 0:
        return float('nan')

    num_hit = int(is_positive.iloc[:num_flagged].sum())
    return num_hit / num_positive

In [None]:
# Recall of every label when the top 20% of rows (by predicted score) are flagged.
# The context manager takes care of closing the progress bar on every exit path.
with tqdm(y_eval.columns) as t:
    for x in t:
        pred_col = 'pred_'+x
        recall_at_top = cal_recall(df_y[[x, pred_col]], x, pred_col, threshold=0.2)
        print('{}: {}'.format(x, recall_at_top))

In [None]:
df_id_train_eval = utils_.load_pickle('../../data/other/cs/df_id_train_eval_20230101_20230319.pickle')

print(df_id_train_eval.shape)
df_id_train_eval.head()

In [None]:
df_id_train_eval_ = df_id_train_eval.reset_index(drop=True)
df_id_train_eval_.head()

In [None]:
# Percentage of top-scored rows flagged when computing recall.
recall_p = 20

# AUC and Recall@recall_p% per item id and per label.
# The `with` block already closes the progress bar on every exit path and lets
# the exception propagate, so the former bare `except:` + close() is dropped.
with tqdm([0, 1, 2, 3, 4]) as t:
    for item_id in t:
        # Row labels of this item; df_y shares the same 0..n-1 index.
        index_item = df_id_train_eval_[df_id_train_eval_['item_id']==item_id].index
        df_y_item = df_y.loc[index_item, :]
        print(df_y_item.shape[0])
        for x in y_eval.columns:
            print('item_id:{bi}, label:{l}, AUC:{ras}, Recall@{p}%:{cr}'.format(
                bi=item_id,
                l=x,
                ras=roc_auc_score(df_y_item[x], df_y_item['pred_'+x]),
                p=recall_p,
                cr=cal_recall(df_y_item, x, 'pred_'+x, threshold=recall_p*0.01)))

* 测试

In [None]:
df_fusion = pd.read_csv('../../data/other/cs/sample_label_feature_fusion_test_20230101_20230319.txt', sep='\t', encoding='utf-8')
df_aspiration_part1 = pd.read_csv('../../data/other/cs/sample_label_feature_aspiration_part1_test_20230101_20230319.txt', sep='\t', encoding='utf-8')
df_aspiration_part2 = pd.read_csv('../../data/other/cs/sample_label_feature_aspiration_part2_test_20230101_20230319.txt', sep='\t', encoding='utf-8')

print(df_fusion.shape)
print(df_aspiration_part1.shape)
print(df_aspiration_part2.shape)

In [None]:
list_feats_fusion = utils_.load_pickle('../../data/other/cs/list_feats/list_feats_x_pi_fusion_20230101_20230319.pickle')
list_feats_aspiration = utils_.load_pickle('../../data/other/cs/list_feats/list_feats_x_pi_aspiration_20230101_20230319.pickle')

print(len(list_feats_fusion))
print(len(list_feats_aspiration))

In [None]:
list_feats_aspiration_part_1 = utils_.load_pickle('../../data/other/cs/list_feats/list_feats_x_aspiration_part1_20230101_20230319.pickle')
list_feats_aspiration_part_2 = utils_.load_pickle('../../data/other/cs/list_feats/list_feats_x_aspiration_part2_20230101_20230319.pickle')

print(len(list_feats_aspiration_part_1))
print(len(list_feats_aspiration_part_2))

In [None]:
list_feats_aspiration_part_1 = [x for x in list_feats_aspiration_part_1 if x in list_feats_aspiration]
list_feats_aspiration_part_2 = [x for x in list_feats_aspiration_part_2 if x in list_feats_aspiration]

print(len(list_feats_aspiration_part_1))
print(len(list_feats_aspiration_part_2))

In [None]:
df_fusion['obs_dt'] = pd.to_datetime(df_fusion['obs_dt'])
df_fusion['dt'] = pd.to_datetime(df_fusion['dt'])
df_aspiration_part1['obs_dt'] = pd.to_datetime(df_aspiration_part1['obs_dt'])
df_aspiration_part1['dt'] = pd.to_datetime(df_aspiration_part1['dt'])
df_aspiration_part2['obs_dt'] = pd.to_datetime(df_aspiration_part2['obs_dt'])
df_aspiration_part2['dt'] = pd.to_datetime(df_aspiration_part2['dt'])

df = df_fusion[['uid', 'obs_dt', 'dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1']+list_feats_fusion].\
    merge(df_aspiration_part1[['uid', 'obs_dt', 'dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1']+list_feats_aspiration_part_1], 
          on=['uid', 'obs_dt', 'dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1'], how='left').\
    merge(df_aspiration_part2[['uid', 'obs_dt', 'dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1']+list_feats_aspiration_part_2], 
          on=['uid', 'obs_dt', 'dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1'], how='left')
print(df.shape)
df.head()

In [None]:
df_des = utils_.df_des(df)
df_des.to_csv('../../data/other/cs/df_des_test_20230101_20230319.csv', encoding='utf-8')

In [None]:
df[['obs_dt', 'dt', 'uid']].groupby(by=['obs_dt', 'dt']).count()

In [None]:
df['item'].value_counts()

In [None]:
df[['item', 'label_apply', 'uid']].groupby(by=['item', 'label_apply']).count()

In [None]:
df['label_apply'].value_counts()

In [None]:
df[['item', 'label_submit', 'uid']].groupby(by=['item', 'label_submit']).count()

In [None]:
df['label_submit'].value_counts()

In [None]:
df[['item', 'label_pass', 'uid']].groupby(by=['item', 'label_pass']).count()

In [None]:
df['label_pass'].value_counts()

In [None]:
df[['item', 'label_pass_1', 'uid']].groupby(by=['item', 'label_pass_1']).count()

In [None]:
df['label_pass_1'].value_counts()

In [None]:
# Integer id per item name; every name outside the four known ones maps to 4.
df['item_id'] = df['item'].apply(
    lambda x: {'aaa': 0, 'bbb': 1, 'ccc': 2, 'ddd': 3}.get(x, 4))
df.head()

In [None]:
# Item feature processing:
# one-hot encode every column listed in list_feats_ohe.
list_feats_ohe = ['item_id']
list_df_ohe = []
# The context manager closes the progress bar on every exit path.
with tqdm(list_feats_ohe) as t:
    for feat in t:
        list_df_ohe.append(utils_.one_hot_encoder(df, feat))

# Put the encoded pieces side by side.
df_ohe_part = pd.concat(list_df_ohe, axis=1)
print(df_ohe_part.shape)
df_ohe_part.head()

In [None]:
df_ohe = pd.concat([df, df_ohe_part], axis=1)
print(df_ohe.shape)
df_ohe.head()

In [None]:
utils_.save_pickle(df_ohe, '../../data/other/cs/df_ohe_test_20230101_20230319.pickle')

In [None]:
df_ohe = utils_.load_pickle('../../data/other/cs/df_ohe_test_20230101_20230319.pickle')
print(df_ohe.shape)
df_ohe.head()

In [None]:
df_ffcc = pd.read_csv('../../data/other/cs/sample_label_feature_flow_fixation_xxx_card_test_20230101_20230319.txt', sep='\t', encoding='utf-8')
print(df_ffcc.shape)
df_ffcc.head()

In [None]:
df_fcscc = pd.read_csv('../../data/other/cs/sample_label_feature_flow_cs_xxx_card_test_20230101_20230319.txt', sep='\t', encoding='utf-8')
print(df_fcscc.shape)
df_fcscc.head()

In [None]:
df_dc = pd.read_csv('../../data/other/cs/sample_label_feature_xxx_card_test_20230101_20230319.txt', sep='\t', encoding='utf-8')
print(df_dc.shape)
df_dc.head()

In [None]:
list_feats_x_ffd = utils_.load_pickle('../../data/other/cs/new_features/list_feats/list_feats_x_ht_ffd_20230101_20230319.pickle')

print(len(list_feats_x_ffd))

In [None]:
list_feats_x_ffcc = utils_.load_pickle('../../data/other/cs/new_features/list_feats/list_feats_x_ffcc_20230101_20230319.pickle')
list_feats_x_fcscc = utils_.load_pickle('../../data/other/cs/new_features/list_feats/list_feats_x_fcscc_20230101_20230319.pickle')
list_feats_x_dc = utils_.load_pickle('../../data/other/cs/new_features/list_feats/list_feats_x_dc_20230101_20230319.pickle')

print(len(list_feats_x_ffcc))
print(len(list_feats_x_fcscc))
print(len(list_feats_x_dc))

In [None]:
# Keep, per source table, only the features that belong to the selected ffd
# feature list. A set makes each membership test O(1).
set_feats_x_ffd = set(list_feats_x_ffd)
list_feats_x_ffcc = [x for x in list_feats_x_ffcc if x in set_feats_x_ffd]
list_feats_x_fcscc = [x for x in list_feats_x_fcscc if x in set_feats_x_ffd]
# Fix: this list used to be filtered against itself, which kept every feature.
list_feats_x_dc = [x for x in list_feats_x_dc if x in set_feats_x_ffd]

print(len(list_feats_x_ffcc))
print(len(list_feats_x_fcscc))
print(len(list_feats_x_dc))

In [None]:
# Missing values in the ffcc / fcscc feature columns are replaced by 0.
# NOTE(review): df_dc is not filled here, whereas the later `%%time` cell fills
# all three tables -- confirm whether that is intentional.
df_ffcc[list_feats_x_ffcc] = df_ffcc[list_feats_x_ffcc].fillna(0)
df_fcscc[list_feats_x_fcscc] = df_fcscc[list_feats_x_fcscc].fillna(0)

# Parse the date columns so the merge keys have the same dtype in every table.
df_ffcc['obs_dt'] = pd.to_datetime(df_ffcc['obs_dt'])
df_ffcc['dt'] = pd.to_datetime(df_ffcc['dt'])
df_fcscc['obs_dt'] = pd.to_datetime(df_fcscc['obs_dt'])
df_fcscc['dt'] = pd.to_datetime(df_fcscc['dt'])
df_dc['obs_dt'] = pd.to_datetime(df_dc['obs_dt'])
df_dc['dt'] = pd.to_datetime(df_dc['dt'])

# Left-join the three feature tables on the sample identifier + label columns;
# df_ffcc is the left-most table, so its rows define the result.
df_ffd = df_ffcc[['uid', 'obs_dt', 'dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1']+list_feats_x_ffcc].\
    merge(df_fcscc[['uid', 'obs_dt', 'dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1']+list_feats_x_fcscc], 
          on=['uid', 'obs_dt', 'dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1'], how='left').\
    merge(df_dc[['uid', 'obs_dt', 'dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1']+list_feats_x_dc], 
          on=['uid', 'obs_dt', 'dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1'], how='left')
print(df_ffd.shape)
df_ffd.head()

In [None]:
df_ohe_new_feats = df_ohe.\
    merge(df_ffd[['uid', 'obs_dt', 'dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1']+list_feats_x_ffd], 
          on=['uid', 'obs_dt', 'dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1'], how='left')

print(df_ohe_new_feats.shape)
df_ohe_new_feats.head()

In [None]:
list_feats_x_ffcc = utils_.load_pickle('../../data/other/cs/new_features/list_feats/list_feats_x_std_ffcc_20230101_20230319.pickle')
list_feats_x_fcscc = utils_.load_pickle('../../data/other/cs/new_features/list_feats/list_feats_x_fcscc_20230101_20230319.pickle')
list_feats_x_dc = utils_.load_pickle('../../data/other/cs/new_features/list_feats/list_feats_x_dc_20230101_20230319.pickle')

print(len(list_feats_x_ffcc))
print(len(list_feats_x_fcscc))
print(len(list_feats_x_dc))

In [None]:
%%time
df_ffcc[list_feats_x_ffcc] = df_ffcc[list_feats_x_ffcc].fillna(0)
df_fcscc[list_feats_x_fcscc] = df_fcscc[list_feats_x_fcscc].fillna(0)
df_dc[list_feats_x_dc] = df_dc[list_feats_x_dc].fillna(0)

df_ffcc['obs_dt'] = pd.to_datetime(df_ffcc['obs_dt'])
df_ffcc['dt'] = pd.to_datetime(df_ffcc['dt'])
df_fcscc['obs_dt'] = pd.to_datetime(df_fcscc['obs_dt'])
df_fcscc['dt'] = pd.to_datetime(df_fcscc['dt'])
df_dc['obs_dt'] = pd.to_datetime(df_dc['obs_dt'])
df_dc['dt'] = pd.to_datetime(df_dc['dt'])

df_ohe_new_feats = df_ohe.\
    merge(df_ffcc[['uid', 'obs_dt', 'dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1']+list_feats_x_ffcc], 
          on=['uid', 'obs_dt', 'dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1'], how='left').\
    merge(df_fcscc[['uid', 'obs_dt', 'dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1']+list_feats_x_fcscc], 
          on=['uid', 'obs_dt', 'dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1'], how='left').\
    merge(df_dc[['uid', 'obs_dt', 'dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1']+list_feats_x_dc], 
          on=['uid', 'obs_dt', 'dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1'], how='left')

print(df_ohe_new_feats.shape)
df_ohe_new_feats.head()

In [None]:
# utils_.save_pickle(df_ohe_new_feats, '../../data/other/cs/new_features/df_ohe_new_feats_test_20230101_20230319.pickle')
utils_.save_pickle(df_ohe_new_feats, '../../data/other/cs/new_features/df_ohe_ffd_test_20230101_20230319.pickle')

In [None]:
# df_id = df_ohe[['uid', 'obs_dt', 'dt', 'item', 'item_id']]
# df_y = df_ohe[['label_apply', 'label_submit', 'label_pass', 'label_pass_1']]
# df_X = df_ohe[
#     ['item_id_{}'.format(x) for x in range(5)]
#     +utils_.load_pickle('../../data/other/cs/list_feats/list_feats_x_pi_fusion_20230101_20230319.pickle')
#     +utils_.load_pickle('../../data/other/cs/list_feats/list_feats_x_pi_aspiration_20230101_20230319.pickle')
# ]

df_id = df_ohe_new_feats[['uid', 'obs_dt', 'dt', 'item', 'item_id']]
df_y = df_ohe_new_feats[['label_apply', 'label_submit', 'label_pass', 'label_pass_1']]
df_X = df_ohe_new_feats[
    ['item_id_{}'.format(x) for x in range(5)]
    +utils_.load_pickle('../../data/other/cs/list_feats/list_feats_x_pi_fusion_20230101_20230319.pickle')
    +utils_.load_pickle('../../data/other/cs/list_feats/list_feats_x_pi_aspiration_20230101_20230319.pickle')
    +utils_.load_pickle('../../data/other/cs/new_features/list_feats/list_feats_x_ht_ffd_20230101_20230319.pickle')
]

print(df_id.shape)
print(df_y.shape)
print(df_X.shape)

In [None]:
# condition_feats = utils_.load_pickle('../../data/other/cs/list_condition_feats.pickle')
# sparse_feats = utils_.load_pickle('../../data/other/cs/list_sparse_feats.pickle')
# dense_feats = utils_.load_pickle('../../data/other/cs/list_dense_feats.pickle')

condition_feats = utils_.load_pickle('../../data/other/cs/new_features/list_condition_feats_ffd.pickle')
sparse_feats = utils_.load_pickle('../../data/other/cs/new_features/list_sparse_feats_ffd.pickle')
dense_feats = utils_.load_pickle('../../data/other/cs/new_features/list_dense_feats_ffd.pickle')

print(len(condition_feats))
print(len(sparse_feats))
print(len(dense_feats))

In [None]:
# 连续
# ss = joblib.load('../../data/other/cs/ss.pickle')
ss = joblib.load('../../data/other/cs/new_features/ss_ffd.pickle')
X_dense_ss = ss.transform(df_X[dense_feats])
print(X_dense_ss.shape)

In [None]:
df_X_dense = pd.DataFrame(X_dense_ss, columns=dense_feats)
print(df_X_dense.shape)
df_X_dense.head()

In [None]:
# 离散
# dict_lbe = joblib.load('../../data/other/cs/dict_lbe.pickle')
dict_lbe = joblib.load('../../data/other/cs/new_features/dict_lbe_ffd.pickle')
for x in sparse_feats:
    print(x, dict_lbe[x].classes_)

In [None]:
# Encode every sparse (categorical) column of the test split with its
# persisted encoder from dict_lbe and collect the encoded columns.
list_X_sparse = []

# The `with` block closes the progress bar on every exit path (including
# KeyboardInterrupt), so no extra try/except + t.close() is required.
with tqdm(sparse_feats) as t:
    for x in t:
        # Values present in this split but unknown to the encoder.
        list_feat_values_unseen = list(set(df_X[x].unique())-set(dict_lbe[x].classes_))
        if len(list_feat_values_unseen) > 0:
            print(x)
            # Assign the result back instead of calling replace(inplace=True) on
            # the df_X[x] intermediate: a chained in-place call is not guaranteed
            # to update the parent frame (and never does under Copy-on-Write).
            # NOTE(review): this assumes -1 is one of dict_lbe[x].classes_;
            # otherwise transform() below will reject it -- confirm.
            df_X[x] = df_X[x].replace(list_feat_values_unseen, -1)
        df_X_sparse_each = pd.DataFrame(dict_lbe[x].transform(df_X[x]), columns=[x])
        list_X_sparse.append(df_X_sparse_each)

df_X_sparse = pd.concat(list_X_sparse, axis=1)
print(df_X_sparse.shape)
df_X_sparse.head()

In [None]:
# Merge: condition columns, encoded sparse columns and scaled dense columns,
# side by side. df_X_sparse and df_X_dense carry a fresh 0..n-1 index, and
# concat(axis=1) aligns on the index, so the condition part is re-indexed the
# same way (as the train/eval version of this cell does) to guarantee a
# row-by-row match.
df_X_condition = df_X[condition_feats].reset_index(drop=True)
df_X_transform = pd.concat([df_X_condition, df_X_sparse, df_X_dense], axis=1)
print(df_X_transform.shape)
df_X_transform.head()

In [None]:
# Rebuild a model from a saved state dict to score the test split.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
util.seed_everything(2023)
# NOTE(review): this config differs from the one used in the training cell
# (smaller expert/tower sizes, no 'tower_units' key) -- confirm it matches the
# checkpoint loaded below.
config = {
    'Model': {
        'num_experts': 5, 
        # 'expert_hidden_units': [256, 128], 
        'expert_hidden_units': [512, 256, 128], 
        'units': 64, 
        'num_tasks': 4, 
        'tower_hidden_units': [64, 32], 
        'dropout': 0.2, 
        'use_bn': False
    }
}
# feats_columns = utils_.load_pickle('../../data/other/cs/feats_columns.pcikle')
feats_columns = utils_.load_pickle('../../data/other/cs/new_features/feats_columns_ffd.pcikle')

# NOTE(review): `MMOE` is not among the imports at the top of this notebook
# (only AITM is imported there), and the checkpoint is an 'mmoe_*' file rather
# than the 'aitm_condition_d' one written by val(). Note also that this rebinds
# the global `model` used by the training cells -- confirm the intended
# class/checkpoint.
model = MMOE(config, feats_columns).to(device)
# model.load_state_dict(torch.load('save/mmoe_condition.pth'))
# model.load_state_dict(torch.load('save/mmoe_condition_0_01.pth'))
model.load_state_dict(torch.load('save/mmoe_condition_ffd_ed.pth'))
model

In [None]:
criterion = (torch.nn.BCEWithLogitsLoss().to(device), torch.nn.BCEWithLogitsLoss().to(device), torch.nn.BCEWithLogitsLoss().to(device), torch.nn.BCEWithLogitsLoss().to(device))
test_loader = DataLoader(DatasetLoader(df_X_transform.values, df_y.values), 1024, shuffle=False, num_workers=8)

y_pred = test(test_loader, model, criterion, (df_X_transform.values, df_y.values))
y_pred

In [None]:
df_y_res = pd.concat([df_y, pd.DataFrame(data=y_pred, columns=['pred_'+x for x in df_y.columns])], axis=1)
print(df_y_res.shape)
df_y_res.head()

In [None]:
# Recall of every label when the top 20% of rows (by predicted score) are flagged.
# The context manager takes care of closing the progress bar on every exit path.
with tqdm(df_y.columns) as t:
    for x in t:
        pred_col = 'pred_'+x
        recall_at_top = cal_recall(df_y_res[[x, pred_col]], x, pred_col, threshold=0.2)
        print('{}: {}'.format(x, recall_at_top))

In [None]:
# Per-item breakdown.
# Percentage of top-scored rows flagged when computing recall.
recall_p = 20

# AUC and Recall@recall_p% per item id and per label.
# The `with` block already closes the progress bar on every exit path and lets
# the exception propagate, so the former bare `except:` + close() is dropped.
with tqdm([0, 1, 2, 3, 4]) as t:
    for item_id in t:
        # Row labels of this item in df_id; df_y_res is looked up by the same labels.
        index_item = df_id[df_id['item_id']==item_id].index
        df_y_res_item = df_y_res.loc[index_item, :]
        print(df_y_res_item.shape[0])
        for x in df_y.columns:
            print('item_id:{bi}, label:{l}, AUC:{ras}, Recall@{p}%:{cr}'.format(
                bi=item_id,
                l=x,
                ras=roc_auc_score(df_y_res_item[x], df_y_res_item['pred_'+x]),
                p=recall_p,
                cr=cal_recall(df_y_res_item, x, 'pred_'+x, threshold=recall_p*0.01)))

* feature importance

In [None]:
# shap
shap_explainer = shap.DeepExplainer(model, [df_X_transform.values])
shap_explainer

In [None]:
shap_values = shap_explainer.shap_values([df_X_transform.values])
shap_values

In [None]:
# Plot against the same frame the SHAP values were computed from
# (df_X_transform), so feature names and shapes line up with shap_values.
shap.summary_plot(shap_values, df_X_transform, plot_type='bar')

## 模型比较，oot

In [None]:
df_fusion = pd.read_csv('../../data/other/cs/sample_label_feature_fusion_20230326.txt', sep='\t', encoding='utf-8')
df_aspiration_part1 = pd.read_csv('../../data/other/cs/sample_label_feature_aspiration_part1_20230326.txt', sep='\t', encoding='utf-8')
df_aspiration_part2 = pd.read_csv('../../data/other/cs/sample_label_feature_aspiration_part2_20230326.txt', sep='\t', encoding='utf-8')

print(df_fusion.shape)
print(df_aspiration_part1.shape)
print(df_aspiration_part2.shape)

In [None]:
df_fusion['obs_dt'] = pd.to_datetime(df_fusion['obs_dt'])
df_fusion['dt'] = pd.to_datetime(df_fusion['dt'])
df_aspiration_part1['obs_dt'] = pd.to_datetime(df_aspiration_part1['obs_dt'])
df_aspiration_part1['dt'] = pd.to_datetime(df_aspiration_part1['dt'])
df_aspiration_part2['obs_dt'] = pd.to_datetime(df_aspiration_part2['obs_dt'])
df_aspiration_part2['dt'] = pd.to_datetime(df_aspiration_part2['dt'])

df = df_fusion.merge(df_aspiration_part1, 
                     on=['uid', 'obs_dt', 'dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1'], how='left').\
    merge(df_aspiration_part2, 
          on=['uid', 'obs_dt', 'dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1'], how='left')
print(df.shape)
df.head()

In [None]:
[x for x in df.columns if x.endswith('.1')]

In [None]:
df_des = utils_.df_des(df)
df_des.to_csv('../../data/other/cs/df_des_oot_20230326.csv', encoding='utf-8')

In [None]:
df[['obs_dt', 'dt', 'uid']].groupby(by=['obs_dt', 'dt']).count()

In [None]:
df['item'].value_counts()

In [None]:
df[['item', 'label_apply', 'uid']].groupby(by=['item', 'label_apply']).count()

In [None]:
df['label_apply'].value_counts()

In [None]:
df[['item', 'label_submit', 'uid']].groupby(by=['item', 'label_submit']).count()

In [None]:
df['label_submit'].value_counts()

In [None]:
# Number of non-null uids for each (item, label_pass) pair.
df.groupby(by=['item', 'label_pass'])[['uid']].count()

In [None]:
# Overall distribution of the label_pass column.
df.loc[:, 'label_pass'].value_counts()

In [None]:
# Number of non-null uids for each (item, label_pass_1) pair.
df.groupby(by=['item', 'label_pass_1'])[['uid']].count()

In [None]:
# Overall distribution of the label_pass_1 column.
df.loc[:, 'label_pass_1'].value_counts()

In [None]:
# Encode the item name as an integer id; any value not in the table maps to 4.
item_to_id = {'aaa': 0, 'bbb': 1, 'ccc': 2, 'ddd': 3}
df['item_id'] = df['item'].apply(lambda name: item_to_id.get(name, 4))
df.head()

In [None]:
# Item feature processing: one-hot encode every column in list_feats_ohe.
list_feats_ohe = ['item_id']
list_df_ohe = []

# The `with` block closes the progress bar on normal exit and on interruption.
with tqdm(list_feats_ohe) as t:
    for feat in t:
        list_df_ohe.append(utils_.one_hot_encoder(df, feat))

# Put the encoded pieces side by side.
df_ohe_part = pd.concat(list_df_ohe, axis=1)
print(df_ohe_part.shape)
df_ohe_part.head()

In [None]:
# Append the one-hot columns to the merged sample frame (column-wise concat).
frames_to_join = [df, df_ohe_part]
df_ohe = pd.concat(frames_to_join, axis=1)
print(df_ohe.shape)
df_ohe.head()

In [None]:
# Checkpoint the one-hot-augmented frame to disk via the project pickle helper.
utils_.save_pickle(df_ohe, '../../data/other/cs/df_ohe_20230326.pickle')

In [None]:
# Reload the checkpointed frame (lets the notebook resume from here without
# re-running the cells above).
df_ohe = utils_.load_pickle('../../data/other/cs/df_ohe_20230326.pickle')
print(df_ohe.shape)
df_ohe.head()

In [None]:
# Read the "flow_fixation_xxx_card" feature table for 2023-03-26 (tab-separated, UTF-8).
path_ffcc = '../../data/other/cs/sample_label_feature_flow_fixation_xxx_card_20230326.txt'
df_ffcc = pd.read_csv(path_ffcc, sep='\t', encoding='utf-8')
print(df_ffcc.shape)
df_ffcc.head()

In [None]:
# Read the "flow_cs_xxx_card" feature table for 2023-03-26 (tab-separated, UTF-8).
path_fcscc = '../../data/other/cs/sample_label_feature_flow_cs_xxx_card_20230326.txt'
df_fcscc = pd.read_csv(path_fcscc, sep='\t', encoding='utf-8')
print(df_fcscc.shape)
df_fcscc.head()

In [None]:
# Read the "xxx_card" feature table for 2023-03-26 (tab-separated, UTF-8).
path_dc = '../../data/other/cs/sample_label_feature_xxx_card_20230326.txt'
df_dc = pd.read_csv(path_dc, sep='\t', encoding='utf-8')
print(df_dc.shape)
df_dc.head()

In [None]:
# Load the saved "ffd" feature list (file produced for 2023-01-01..2023-03-19).
path_feats_ffd = '../../data/other/cs/new_features/list_feats/list_feats_x_ht_ffd_20230101_20230319.pickle'
list_feats_x_ffd = utils_.load_pickle(path_feats_ffd)

print(len(list_feats_x_ffd))

In [None]:
# Load the saved feature lists for the ffcc, fcscc and dc tables, in that order.
list_feats_x_ffcc, list_feats_x_fcscc, list_feats_x_dc = (
    utils_.load_pickle(path)
    for path in (
        '../../data/other/cs/new_features/list_feats/list_feats_x_ffcc_20230101_20230319.pickle',
        '../../data/other/cs/new_features/list_feats/list_feats_x_fcscc_20230101_20230319.pickle',
        '../../data/other/cs/new_features/list_feats/list_feats_x_dc_20230101_20230319.pickle',
    )
)

# Show how many features each list holds.
for feats in (list_feats_x_ffcc, list_feats_x_fcscc, list_feats_x_dc):
    print(len(feats))

In [None]:
# Restrict each per-table feature list to the features that are also in the
# "ffd" list. The dc list was previously filtered against itself, which kept
# every element; it is now filtered against list_feats_x_ffd like the other two.
set_feats_x_ffd = set(list_feats_x_ffd)  # built once for O(1) membership tests
list_feats_x_ffcc = [x for x in list_feats_x_ffcc if x in set_feats_x_ffd]
list_feats_x_fcscc = [x for x in list_feats_x_fcscc if x in set_feats_x_ffd]
list_feats_x_dc = [x for x in list_feats_x_dc if x in set_feats_x_ffd]

print(len(list_feats_x_ffcc))
print(len(list_feats_x_fcscc))
print(len(list_feats_x_dc))

In [None]:
# Fill missing values in the selected feature columns with 0. df_dc is filled
# as well, so all three tables are treated the same way before merging.
df_ffcc[list_feats_x_ffcc] = df_ffcc[list_feats_x_ffcc].fillna(0)
df_fcscc[list_feats_x_fcscc] = df_fcscc[list_feats_x_fcscc].fillna(0)
df_dc[list_feats_x_dc] = df_dc[list_feats_x_dc].fillna(0)

# Parse both date columns so the join keys share a datetime dtype.
for frame in (df_ffcc, df_fcscc, df_dc):
    for date_col in ('obs_dt', 'dt'):
        frame[date_col] = pd.to_datetime(frame[date_col])

# Left-join the fcscc and dc features onto the ffcc table on the id, date,
# item and label columns.
merge_keys = ['uid', 'obs_dt', 'dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1']
df_ffd = df_ffcc[merge_keys+list_feats_x_ffcc].\
    merge(df_fcscc[merge_keys+list_feats_x_fcscc], on=merge_keys, how='left').\
    merge(df_dc[merge_keys+list_feats_x_dc], on=merge_keys, how='left')
print(df_ffd.shape)
df_ffd.head()

In [None]:
# Attach the ffd feature columns to the one-hot frame with a left join on the
# id, date, item and label columns.
merge_keys = ['uid', 'obs_dt', 'dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1']
df_ffd_selected = df_ffd[merge_keys+list_feats_x_ffd]
df_ohe_new_feats = df_ohe.merge(df_ffd_selected, on=merge_keys, how='left')

print(df_ohe_new_feats.shape)
df_ohe_new_feats.head()

In [None]:
# Reload the per-table feature lists; the ffcc list comes from the "std" file this time.
list_feats_x_ffcc, list_feats_x_fcscc, list_feats_x_dc = (
    utils_.load_pickle(path)
    for path in (
        '../../data/other/cs/new_features/list_feats/list_feats_x_std_ffcc_20230101_20230319.pickle',
        '../../data/other/cs/new_features/list_feats/list_feats_x_fcscc_20230101_20230319.pickle',
        '../../data/other/cs/new_features/list_feats/list_feats_x_dc_20230101_20230319.pickle',
    )
)

# Show how many features each list holds.
for feats in (list_feats_x_ffcc, list_feats_x_fcscc, list_feats_x_dc):
    print(len(feats))

In [None]:
%%time
# For each source table: fill missing values in its selected feature columns
# with 0 and parse the two date columns.
for frame, feats in (
    (df_ffcc, list_feats_x_ffcc),
    (df_fcscc, list_feats_x_fcscc),
    (df_dc, list_feats_x_dc),
):
    frame[feats] = frame[feats].fillna(0)
    for date_col in ('obs_dt', 'dt'):
        frame[date_col] = pd.to_datetime(frame[date_col])

# Left-join the three feature tables onto the one-hot frame, one after another,
# on the id, date, item and label columns.
merge_keys = ['uid', 'obs_dt', 'dt', 'item', 'label_apply', 'label_submit', 'label_pass', 'label_pass_1']
df_ohe_new_feats = df_ohe
for frame, feats in (
    (df_ffcc, list_feats_x_ffcc),
    (df_fcscc, list_feats_x_fcscc),
    (df_dc, list_feats_x_dc),
):
    df_ohe_new_feats = df_ohe_new_feats.merge(frame[merge_keys+feats], on=merge_keys, how='left')

print(df_ohe_new_feats.shape)
df_ohe_new_feats.head()

In [None]:
# utils_.save_pickle(df_ohe_new_feats, '../../data/other/cs/new_features/df_ohe_new_feats_20230326.pickle')
# Checkpoint the feature-augmented frame. The file name previously lacked the
# '.' before the 'pickle' extension ('..._20230326pickle').
# NOTE(review): if another notebook loads the old, dot-less file name, update it too.
utils_.save_pickle(df_ohe_new_feats, '../../data/other/cs/new_features/df_ohe_ffd_20230326.pickle')

In [None]:
# df_id = df_ohe[['uid', 'obs_dt', 'dt', 'item', 'item_id']]
# df_y = df_ohe[['label_apply', 'label_submit', 'label_pass', 'label_pass_1']]
# df_X = df_ohe[
#     ['item_id_{}'.format(x) for x in range(5)]
#     +utils_.load_pickle('../../data/other/cs/list_feats/list_feats_x_pi_fusion_20230101_20230319.pickle')
#     +utils_.load_pickle('../../data/other/cs/list_feats/list_feats_x_pi_aspiration_20230101_20230319.pickle')
# ]

# Split the feature-augmented frame into identifier, label and model-input parts.
id_columns = ['uid', 'obs_dt', 'dt', 'item', 'item_id']
label_columns = ['label_apply', 'label_submit', 'label_pass', 'label_pass_1']

# Model inputs: the five one-hot item columns followed by three saved feature lists.
feature_columns = ['item_id_{}'.format(x) for x in range(5)]
for feats_path in (
    '../../data/other/cs/list_feats/list_feats_x_pi_fusion_20230101_20230319.pickle',
    '../../data/other/cs/list_feats/list_feats_x_pi_aspiration_20230101_20230319.pickle',
    '../../data/other/cs/new_features/list_feats/list_feats_x_ht_ffd_20230101_20230319.pickle',
):
    feature_columns = feature_columns + utils_.load_pickle(feats_path)

df_id = df_ohe_new_feats[id_columns]
df_y = df_ohe_new_feats[label_columns]
df_X = df_ohe_new_feats[feature_columns]

for part in (df_id, df_y, df_X):
    print(part.shape)

In [None]:
# condition_feats = utils_.load_pickle('../../data/other/cs/list_condition_feats.pickle')
# sparse_feats = utils_.load_pickle('../../data/other/cs/list_sparse_feats.pickle')
# dense_feats = utils_.load_pickle('../../data/other/cs/list_dense_feats.pickle')

# condition_feats = utils_.load_pickle('../../data/other/cs/train_eval_oot/list_condition_feats.pickle')
# sparse_feats = utils_.load_pickle('../../data/other/cs/train_eval_oot/list_sparse_feats.pickle')
# dense_feats = utils_.load_pickle('../../data/other/cs/train_eval_oot/list_dense_feats.pickle')

# Load the condition / sparse / dense feature-name groups saved for the "ffd" variant.
condition_feats, sparse_feats, dense_feats = (
    utils_.load_pickle(path)
    for path in (
        '../../data/other/cs/new_features/list_condition_feats_ffd.pickle',
        '../../data/other/cs/new_features/list_sparse_feats_ffd.pickle',
        '../../data/other/cs/new_features/list_dense_feats_ffd.pickle',
    )
)

# Show the size of each group.
for group in (condition_feats, sparse_feats, dense_feats):
    print(len(group))

In [None]:
# Continuous (dense) features: apply the previously saved scaler.
# ss = joblib.load('../../data/other/cs/ss.pickle')
# ss = joblib.load('../../data/other/cs/train_eval_oot/ss.pickle')
ss = joblib.load('../../data/other/cs/new_features/ss_ffd.pickle')
df_X_dense_raw = df_X[dense_feats]
X_dense_ss = ss.transform(df_X_dense_raw)
print(X_dense_ss.shape)

In [None]:
# Wrap the scaled matrix in a DataFrame that carries the dense feature names.
df_X_dense = pd.DataFrame(data=X_dense_ss, columns=dense_feats)
print(df_X_dense.shape)
df_X_dense.head()

In [None]:
# Discrete (sparse) features: apply the previously saved per-feature label encoders.
# dict_lbe = joblib.load('../../data/other/cs/dict_lbe.pickle')
# dict_lbe = joblib.load('../../data/other/cs/train_eval_oot/dict_lbe.pickle')
dict_lbe = joblib.load('../../data/other/cs/new_features/dict_lbe_ffd.pickle')
list_X_sparse = []

# The `with` block closes the progress bar on normal exit and on interruption,
# so no extra close handling is needed.
with tqdm(sparse_feats) as t:
    for x in t:
        encoder = dict_lbe[x]
        column = df_X[x]
        # Values present in this data but not among the encoder's known classes.
        list_feat_values_unseen = list(set(column.unique())-set(encoder.classes_))
        if len(list_feat_values_unseen) > 0:
            print(x)
            # Replace unseen values with -1 and assign the result back explicitly.
            # The earlier `df_X[x].replace(..., inplace=True)` mutated an
            # intermediate Series, which does not update df_X under pandas
            # Copy-on-Write.
            # NOTE(review): assumes -1 is one of the encoder's classes - confirm
            # against how dict_lbe was fitted.
            column = column.replace(list_feat_values_unseen, -1)
            df_X[x] = column
        df_X_sparse_each = pd.DataFrame(encoder.transform(column), columns=[x])
        list_X_sparse.append(df_X_sparse_each)

# Put the encoded columns side by side.
df_X_sparse = pd.concat(list_X_sparse, axis=1)
print(df_X_sparse.shape)
df_X_sparse.head()

In [None]:
# Merge: condition columns, encoded sparse columns and scaled dense columns, side by side.
transformed_parts = [df_X[condition_feats], df_X_sparse, df_X_dense]
df_X_transform = pd.concat(transformed_parts, axis=1)
print(df_X_transform.shape)
df_X_transform.head()

In [None]:
# Use the first GPU when available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
util.seed_everything(2023)
# Model hyper-parameters handed to the MMOE constructor.
config = {
    'Model': {
        'num_experts': 5, 
        # 'expert_hidden_units': [256, 128], 
        'expert_hidden_units': [512, 256, 128], 
        'units': 64, 
        'num_tasks': 4, 
        'tower_hidden_units': [64, 32], 
        'dropout': 0.2, 
        'use_bn': False
    }
}
# NOTE(review): the '.pcikle' extension matches the commented alternatives below,
# so it is presumably the real on-disk name - left unchanged.
# feats_columns = utils_.load_pickle('../../data/other/cs/feats_columns.pcikle')
# feats_columns = utils_.load_pickle('../../data/other/cs/train_eval_oot/feats_columns.pcikle')
feats_columns = utils_.load_pickle('../../data/other/cs/new_features/feats_columns_ffd.pcikle')

# NOTE(review): MMOE is not among the imports at the top of the notebook -
# confirm it is imported in an earlier cell.
model = MMOE(config, feats_columns).to(device)
# model.load_state_dict(torch.load('save/mmoe_condition.pth'))
# model.load_state_dict(torch.load('save/mmoe_condition_train_eval_oot.pth'))
# map_location remaps the stored tensors onto the selected device, so a checkpoint
# saved from a GPU can still be loaded when the CPU fallback is in use.
model.load_state_dict(torch.load('save/mmoe_condition_ffd_ed.pth', map_location=device))
model

In [None]:
# One independent BCE-with-logits loss object per label column (four in total).
criterion = tuple(torch.nn.BCEWithLogitsLoss().to(device) for _ in range(4))

# Batches of 1024 rows, in the original order, loaded by 8 worker processes.
dataset = DatasetLoader(df_X_transform.values, df_y.values)
test_loader = DataLoader(dataset, batch_size=1024, shuffle=False, num_workers=8)

# Run the project's `test` routine (defined in an earlier cell) and show its result.
y_pred = test(test_loader, model, criterion, (df_X_transform.values, df_y.values))
y_pred

In [None]:
# Place the predictions next to the true labels; each prediction column is
# named 'pred_<label>'.
pred_columns = ['pred_'+x for x in df_y.columns]
df_pred = pd.DataFrame(data=y_pred, columns=pred_columns)
df_y_res = pd.concat([df_y, df_pred], axis=1)
print(df_y_res.shape)
df_y_res.head()

In [None]:
# Print the result of `cal_recall` (defined in an earlier cell) for every label
# column, using threshold=0.2. The `with` block closes the progress bar on
# normal exit and on interruption.
with tqdm(df_y.columns) as t:
    for x in t:
        pred_col = 'pred_'+x
        recall_value = cal_recall(df_y_res[[x, pred_col]], x, pred_col, threshold=0.2)
        print('{}: {}'.format(x, recall_value))

In [None]:
# Per-item evaluation: AUC and the `cal_recall` result for every label, computed
# separately for each item id.
recall_p = 20
# Reset to a 0..n-1 index so positions found in df_id can be used to select
# rows of df_y_res.
df_id = df_id.reset_index(drop=True)

# The `with` block closes the progress bar on normal exit and on any exception,
# so no bare `except:` is needed.
with tqdm([0, 1, 2, 3, 4]) as t:
    for item_id in t:
        print('{}:'.format(item_id))
        index_item = df_id[df_id['item_id']==item_id].index
        df_y_res_item = df_y_res.loc[index_item, :]
        print(df_y_res_item.shape[0])
        for x in df_y.columns:
            y_true = df_y_res_item[x]
            # roc_auc_score raises ValueError when only one class is present;
            # report NaN for that label instead of aborting the whole loop.
            if y_true.nunique() < 2:
                ras = float('nan')
            else:
                ras = roc_auc_score(y_true, df_y_res_item['pred_'+x])
            print('{l}: AUC:{ras}, Recall@{p}%:{cr}'.format(
                l=x, 
                ras=ras, 
                p=recall_p, 
                cr=cal_recall(df_y_res_item, x, 'pred_'+x, threshold=recall_p*0.01)))