# 多利益点CVR模型
## 增加不同item卡先验知识（特征one hot），预测最终cvr，不同item间打分可比
## 也可看作为多标签分类变种
## 目标item卡&平衡处理

In [1]:
import datetime
import numpy as np
import pandas as pd
import joblib
import warnings
import logging
import os
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import collections
import re
import copy
import xgboost as xgb
import lightgbm as lgb
import shap

import utils

from tqdm import tqdm
from dateutil.relativedelta import relativedelta
from joblib import Parallel, delayed
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

# pd.set_option('display.max_columns', None)
# pd.set_option('max_row', 500)
warnings.filterwarnings('ignore')
tqdm.pandas(desc='pandas bar')

KeyboardInterrupt: 

In [None]:
xgb.__version__

In [None]:
lgb.__version__

## data

* xxpay 特征

In [None]:
df_xxpay = utils.load_pickle('data/other/balance_34/df_xxpay_20240225_20240317.pickle')

print(df_xxpay.shape)
df_xxpay.head()

In [None]:
# Load the xxpay feature-name list from disk.
list_feats_x_xxpay_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_ht_xxpay_20240225_20240317.pickle')
# Report on the variable that was actually assigned above; the previous
# spelling `list_feats_xxpay_x_std` was never defined and raised NameError.
print(len(list_feats_x_xxpay_std))
list_feats_x_xxpay_std[:10]

In [None]:
# Persist the xxpay feature list under xxpay/list_feats/, the path the later
# cells load it from. Uses `list_feats_x_xxpay_std` (the name assigned when the
# list was loaded); the former `list_feats_xxpay_x_std` was undefined.
utils.save_pickle(list_feats_x_xxpay_std, 'data/other/balance_34/xxpay/list_feats/list_feats_x_xxpay_std.pickle')

* flow cashier 特征

In [None]:
df_fc = pd.read_csv('data/other/balance_34/sample_label_feature_fc_20240225_20240317.txt', sep='\t', encoding='utf-8')

print(df_fc.shape)
df_fc.head()

In [None]:
df_fc_des = utils.df_des(df_fc)

df_fc_des.to_csv('data/other/balance_34/df_des_fc_20240225_20240317.csv')

In [None]:
list_feats_fc_std_0 = list(df_fc_des[df_fc_des['std']==0].index)
print(len(list_feats_fc_std_0))
list_feats_fc_std_0[:10]

In [None]:
utils.save_pickle(list_feats_fc_std_0, 'data/other/balance_34/list_feats/list_feats_x_fc_std_0.pickle')

In [None]:
list_feats_x_fc = [x for x in df_fc.columns if x not in ['uid', 'obs_dt', 'card', 'label']]
print(len(list_feats_x_fc))
list_feats_x_fc[:10]

In [None]:
utils.save_pickle(list_feats_x_fc, 'data/other/balance_34/list_feats/list_feats_x_fc.pickle')

In [None]:
list_feats_x_fc_std = [x for x in list_feats_x_fc if x not in list_feats_fc_std_0]
print(len(list_feats_x_fc_std))
list_feats_x_fc_std[:10]

In [None]:
utils.save_pickle(list_feats_x_fc_std, 'data/other/balance_34/list_feats/list_feats_x_fc_std.pickle')

In [None]:
list_feats_x_fc_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_fc_std.pickle')
print(len(list_feats_x_fc_std))
list_feats_x_fc_std[:10]

In [None]:
list_feats_x_fc_std_arti = [
    x for x in list_feats_x_fc_std if x not in 
    [
        'xxx'
    ]
]
print(len(list_feats_x_fc_std_arti))
list_feats_x_fc_std_arti[:10]

In [None]:
utils.save_pickle(list_feats_x_fc_std_arti, 'data/other/balance_34/xxpay/list_feats/list_feats_x_fc_std_arti.pickle')

* debit card 特征

In [None]:
df_debit_card = pd.read_csv('data/other/balance_34/sample_label_feature_dc_20240225_20240317.txt', sep='\t', encoding='utf-8')

print(df_debit_card.shape)
df_debit_card.head()

In [None]:
df_debit_card_des = utils.df_des(df_debit_card)

df_debit_card_des.to_csv('data/other/balance_34/df_des_dc_20240225_20240317.csv')

In [None]:
list_feats_debit_card_std_0 = list(df_debit_card_des[df_debit_card_des['std']==0].index)
print(len(list_feats_debit_card_std_0))
list_feats_debit_card_std_0[:10]

In [None]:
utils.save_pickle(list_feats_debit_card_std_0, 'data/other/balance_34/list_feats/list_feats_x_dc_std_0.pickle')

In [None]:
list_feats_x_debit_card = [x for x in df_debit_card.columns if x not in ['uid', 'obs_dt', 'card', 'label']]
print(len(list_feats_x_debit_card))
list_feats_x_debit_card[:10]

In [None]:
utils.save_pickle(list_feats_x_debit_card, 'data/other/balance_34/list_feats/list_feats_x_dc.pickle')

In [None]:
list_feats_x_debit_card_std = [x for x in list_feats_x_debit_card if x not in list_feats_debit_card_std_0]
print(len(list_feats_x_debit_card_std))
list_feats_x_debit_card_std[:10]

In [None]:
utils.save_pickle(list_feats_x_debit_card_std, 'data/other/balance_34/list_feats/list_feats_x_dc_std.pickle')

In [None]:
list_feats_x_debit_card_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_dc_std.pickle')
print(len(list_feats_x_debit_card_std))
list_feats_x_debit_card_std[:10]

* fusion + aspiration 特征

In [None]:
df_fusion = pd.read_csv('data/other/balance_34/sample_label_feature_fusion_20240225_20240317.txt', sep='\t', encoding='utf-8')
df_asp1 = pd.read_csv('data/other/balance_34/sample_label_feature_asp1_20240225_20240317.txt', sep='\t', encoding='utf-8')
df_asp2 = pd.read_csv('data/other/balance_34/sample_label_feature_asp2_20240225_20240317.txt', sep='\t', encoding='utf-8')

print(df_fusion.shape)
print(df_asp1.shape)
print(df_asp2.shape)

In [None]:
df_fusion_des = utils.df_des(df_fusion)
df_asp1_des = utils.df_des(df_asp1)
df_asp2_des = utils.df_des(df_asp2)

df_fusion_des.to_csv('data/other/balance_34/df_des_fusion_20240225_20240317.csv')
df_asp1_des.to_csv('data/other/balance_34/df_des_asp1_20240225_20240317.csv')
df_asp2_des.to_csv('data/other/balance_34/df_des_asp2_20240225_20240317.csv')

In [None]:
list_feats_x_fusion_std_0 = list(df_fusion_des[df_fusion_des['std']==0].index)
list_feats_x_asp1_std_0 = list(df_asp1_des[df_asp1_des['std']==0].index)
list_feats_x_asp2_std_0 = list(df_asp2_des[df_asp2_des['std']==0].index)

print(len(list_feats_x_fusion_std_0))
print(len(list_feats_x_asp1_std_0))
print(len(list_feats_x_asp2_std_0))

In [None]:
utils.save_pickle(list_feats_x_fusion_std_0, 'data/other/balance_34/list_feats/list_feats_x_fusion_std_0.pickle')
utils.save_pickle(list_feats_x_asp1_std_0, 'data/other/balance_34/list_feats/list_feats_x_asp1_std_0.pickle')
utils.save_pickle(list_feats_x_asp2_std_0, 'data/other/balance_34/list_feats/list_feats_x_asp2_std_0.pickle')

In [None]:
list_feats_x_fusion = [x for x in df_fusion.columns if x not in ['uid', 'obs_dt', 'card', 'label']]
list_feats_x_asp1 = [x for x in df_asp1.columns if x not in ['uid', 'obs_dt', 'card', 'label']]
list_feats_x_asp2 = [x for x in df_asp2.columns if x not in ['uid', 'obs_dt', 'card', 'label']]

print(len(list_feats_x_fusion))
print(len(list_feats_x_asp1))
print(len(list_feats_x_asp2))

In [None]:
utils.save_pickle(list_feats_x_fusion, 'data/other/balance_34/list_feats/list_feats_x_fusion.pickle')
utils.save_pickle(list_feats_x_asp1, 'data/other/balance_34/list_feats/list_feats_x_asp1.pickle')
utils.save_pickle(list_feats_x_asp2, 'data/other/balance_34/list_feats/list_feats_x_asp2.pickle')

In [None]:
# Keep only the features whose standard deviation is non-zero, per source table.
list_feats_x_fusion_std = [x for x in list_feats_x_fusion if x not in list_feats_x_fusion_std_0]
list_feats_x_asp1_std = [x for x in list_feats_x_asp1 if x not in list_feats_x_asp1_std_0]
# Fixed copy-paste bug: asp2 must be filtered with its own constant-feature
# list, not with the asp1 one.
list_feats_x_asp2_std = [x for x in list_feats_x_asp2 if x not in list_feats_x_asp2_std_0]

print(len(list_feats_x_fusion_std))
print(len(list_feats_x_asp1_std))
print(len(list_feats_x_asp2_std))

In [None]:
utils.save_pickle(list_feats_x_fusion_std, 'data/other/balance_34/list_feats/list_feats_x_fusion_std.pickle')
utils.save_pickle(list_feats_x_asp1_std, 'data/other/balance_34/list_feats/list_feats_x_asp1_std.pickle')
utils.save_pickle(list_feats_x_asp2_std, 'data/other/balance_34/list_feats/list_feats_x_asp2_std.pickle')

In [None]:
list_feats_x_fusion_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_fusion_std.pickle')
list_feats_x_asp1_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_asp1_std.pickle')
list_feats_x_asp2_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_asp2_std.pickle')

print(len(list_feats_x_fusion_std))
print(len(list_feats_x_asp1_std))
print(len(list_feats_x_asp2_std))

* 数据合并

In [None]:
list_feats_id_dt_card_y = ['uid', 'obs_dt', 'card', 'label']

In [None]:
# Some negative samples are duplicated, so the tables are not joined on keys:
# every table is sorted by the same key columns and given a fresh 0..n-1 index,
# and the tables are later combined positionally.
for _frame in (df_fusion, df_asp1, df_asp2, df_debit_card, df_fc, df_xxpay):
    _frame.sort_values(by=list_feats_id_dt_card_y, inplace=True)
    _frame.reset_index(drop=True, inplace=True)

In [None]:
# Validate the key columns: after the identical sort applied to every table,
# row i of each feature table is expected to carry the same
# (uid, obs_dt, card, label). For every pair of tables, print how many rows
# agree on each key column.
def _id_frame(frame, suffix):
    """Return the key columns of `frame`, renamed to '<column>_<suffix>'.

    A new frame is returned (no in-place rename on a slice), so the source
    table is never touched and no SettingWithCopyWarning is triggered.
    """
    return frame[list_feats_id_dt_card_y].rename(
        columns={col: col + '_' + suffix for col in list_feats_id_dt_card_y})


df_fusion_id = _id_frame(df_fusion, 'f')
df_asp1_id = _id_frame(df_asp1, 'ap1')
df_asp2_id = _id_frame(df_asp2, 'ap2')
df_dc_id = _id_frame(df_debit_card, 'dc')
df_fc_id = _id_frame(df_fc, 'fc')
# Fixed copy-paste bug: the xxpay keys were previously sliced from df_fc, so
# every "... - xxpay" line below compared the flow-cashier table with itself.
df_xxpay_id = _id_frame(df_xxpay, 'xxpay')

# Positional (index-aligned) side-by-side view of all key columns.
df_id = pd.concat([df_fusion_id, df_asp1_id, df_asp2_id, df_dc_id, df_fc_id, df_xxpay_id], axis=1)
print(df_id.shape)

# (column suffix, display name), in the order the pairs are reported.
_list_id_sources = [
    ('f', 'fusion'),
    ('ap1', 'aspiration part 1'),
    ('ap2', 'aspiration part 2'),
    ('dc', 'debit card'),
    ('fc', 'flow cashier'),
    ('xxpay', 'xxpay'),
]

for _i, (_suffix_a, _name_a) in enumerate(_list_id_sources):
    for _suffix_b, _name_b in _list_id_sources[_i + 1:]:
        # Number of rows on which the two tables agree, per key column.
        print(_name_a + ' - ' + _name_b + ':',
              [int((df_id[x + '_' + _suffix_a] == df_id[x + '_' + _suffix_b]).sum())
               for x in list_feats_id_dt_card_y])

In [None]:
list_feats_x_fusion_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_fusion_std.pickle')
list_feats_x_asp1_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_asp1_std.pickle')
list_feats_x_asp2_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_asp2_std.pickle')
list_feats_x_dc_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_dc_std.pickle')
# list_feats_x_fc_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_fc_std.pickle')

list_feats_x_fc_std = utils.load_pickle('data/other/balance_34/xxpay/list_feats/list_feats_x_fc_std_arti.pickle')
list_feats_x_xxpay_std = utils.load_pickle('data/other/balance_34/xxpay/list_feats/list_feats_x_xxpay_std.pickle')

print(len(list_feats_x_fusion_std))
print(len(list_feats_x_asp1_std))
print(len(list_feats_x_asp2_std))
print(len(list_feats_x_dc_std))
print(len(list_feats_x_fc_std))

print(len(list_feats_x_xxpay_std))

In [None]:
# 331+137+60+45+60
331+137+60+45+59+43

In [None]:
df = pd.concat([df_fusion[list_feats_id_dt_card_y+list_feats_x_fusion_std], 
                df_asp1[list_feats_x_asp1_std], 
                df_asp2[list_feats_x_asp2_std], 
                df_debit_card[list_feats_x_dc_std], 
                df_fc[list_feats_x_fc_std], 
                df_xxpay[list_feats_x_xxpay_std]
               ], axis=1)
print(df.shape)
df.head()

In [None]:
df_des = utils.df_des(df)

# df_des.to_csv('data/other/balance_34/df_des_20240225_20240317.csv')
df_des.to_csv('data/other/balance_34/xxpay/df_des_20240225_20240317.csv')

In [None]:
# card编码
# 目标item卡（202404，34）
# dict_card = {
#     'item1-a': 11, 
#     'item1-b': 12, 
# }
# utils.save_pickle(dict_card, 'data/other/balance_34/dict_card_34.pickle')

dict_card = utils.load_pickle('data/other/balance_34/dict_card_34.pickle')
print(len(dict_card))
dict_card

In [None]:
# Encode card names as integer ids using dict_card. The result is assigned
# back to the column: calling replace(..., inplace=True) on `df['card_id']`
# is a chained in-place update, which pandas warns about and which does not
# modify `df` when Copy-on-Write is enabled. Values missing from dict_card
# are left unchanged, as before.
df['card_id'] = df['card'].replace(dict_card)
df.head()

In [None]:
df[['card', 'card_id', 'uid']].groupby(['card', 'card_id']).count()

In [None]:
df['card_id'].nunique()

In [None]:
# Item feature processing: one-hot encode the card id.
list_feats_ohe = ['card_id']
list_df_ohe_card = []

# The `with` block closes the progress bar on normal exit and on any exception
# (including KeyboardInterrupt), so no explicit t.close() is needed. The old
# `except KeyboardInterrupt: t.close()` handler could itself fail with an
# unbound `t` if the interrupt arrived while the bar was being created.
with tqdm(list_feats_ohe) as t:
    for feat in t:
        list_df_ohe_card.append(utils.one_hot_encoder(df, feat))

df_ohe_card = pd.concat(list_df_ohe_card, axis=1)
print(df_ohe_card.shape)
df_ohe_card.head()

In [None]:
df_ohe = pd.concat([df, df_ohe_card], axis=1)
print(df_ohe.shape)
df_ohe.head()

In [None]:
df_ohe['obs_dt'] = pd.to_datetime(df_ohe['obs_dt'])

In [None]:
# utils.save_pickle(df_ohe, 'data/other/balance_34/df_ohe_20240225_20240317.pickle')
utils.save_pickle(df_ohe, 'data/other/balance_34/xxpay/df_ohe_20240225_20240317.pickle')

* 目标item卡34，平衡处理，1:1生成负样本
* 尾部item上采样（对齐'xxx'，5k）
* fusion + aspiration(replace) + v2 imp + debit card + flow cashier + xxpay + item 特征，709维
* 2024.02.25~2024.03.17

In [None]:
df = utils.load_pickle('data/other/balance_34/xxpay/df_ohe_20240225_20240317.pickle')
print(df.shape)
df.head()

In [None]:
df[['card', 'card_id', 'label', 'uid']].groupby(by=['card', 'card_id', 'label']).count()

In [None]:
# 划分训练集（含验证集）&测试集，8:2，80%
df_train, df_test = train_test_split(df, test_size=0.2, random_state=2024)
print(df_train.shape)
print(df_test.shape)

In [None]:
# 划分训练集&验证集，8:2，80%
df_train_train, df_train_eval = train_test_split(df_train, test_size=0.2, random_state=2024)
print(df_train_train.shape)
print(df_train_eval.shape)

In [None]:
# Feature groups: the one-hot item columns plus the per-source feature lists.
list_feats_item = [col for col in df_train.columns if col.startswith('card_id_')]
list_feats_x_fusion_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_fusion_std.pickle')
list_feats_x_asp1_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_asp1_std.pickle')
list_feats_x_asp2_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_asp2_std.pickle')
list_feats_x_dc_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_dc_std.pickle')
list_feats_x_fc_std = utils.load_pickle('data/other/balance_34/xxpay/list_feats/list_feats_x_fc_std_arti.pickle')
list_feats_x_xxpay_std = utils.load_pickle('data/other/balance_34/xxpay/list_feats/list_feats_x_xxpay_std.pickle')

# Column selections shared by the validation and test splits.
_cols_id = ['uid', 'obs_dt', 'card', 'card_id']
_cols_x = (
    list_feats_item
    + list_feats_x_fusion_std
    + list_feats_x_asp1_std
    + list_feats_x_asp2_std
    + list_feats_x_dc_std
    + list_feats_x_fc_std
    + list_feats_x_xxpay_std
)

# Validation split: ids / label / model inputs.
df_train_eval_id = df_train_eval[_cols_id]
df_train_eval_y = df_train_eval['label']
df_train_eval_X = df_train_eval[_cols_x]
for _part in (df_train_eval_id, df_train_eval_y, df_train_eval_X):
    print(_part.shape)

# Test split: ids / label / model inputs.
df_test_id = df_test[_cols_id]
df_test_y = df_test['label']
df_test_X = df_test[_cols_x]
for _part in (df_test_id, df_test_y, df_test_X):
    print(_part.shape)

In [None]:
utils.save_pickle(df_train_eval_id, 'data/other/balance_34/xxpay/df_id_train_eval_20240225_20240317.pickle')
utils.save_pickle(df_train_eval_y, 'data/other/balance_34/xxpay/df_y_train_eval_20240225_20240317.pickle')
utils.save_pickle(df_train_eval_X, 'data/other/balance_34/xxpay/df_X_train_eval_20240225_20240317.pickle')

utils.save_pickle(df_test_id, 'data/other/balance_34/xxpay/df_id_test_20240225_20240317.pickle')
utils.save_pickle(df_test_y, 'data/other/balance_34/xxpay/df_y_test_20240225_20240317.pickle')
utils.save_pickle(df_test_X, 'data/other/balance_34/xxpay/df_X_test_20240225_20240317.pickle')

In [None]:
# 训练集尾部样本上采样
df_train_train_bc_cnt = df_train_train[['card', 'card_id', 'uid']].groupby(by=['card', 'card_id']).count()
df_train_train_bc_cnt.reset_index(inplace=True)
df_train_train_bc_cnt.sort_values(by=['uid'], ascending=False, inplace=True)
df_train_train_bc_cnt

In [None]:
# Up-sample tail cards in the training split by repeating their rows until
# each has roughly as many rows as the reference card.
card_ref = '北京item-借记卡'
# Loop-invariant: row count of the reference card, computed once.
n_ref = int((df_train_train['card'] == card_ref).sum())

list_df_train_train_tail = []

for x in ['xxx']:
    df_train_train_tail_each = df_train_train[df_train_train['card']==x]
    n_tail = df_train_train_tail_each.shape[0]
    if n_tail == 0:
        # Previously surfaced as an unexplained ZeroDivisionError.
        raise ValueError('no training rows for tail card {!r}; cannot up-sample'.format(x))
    # Keep at least one copy: the tail cards are removed from df_train_train
    # and replaced by this list in the next cell, so a repeat factor of 0
    # would silently drop the card (or make pd.concat fail on an empty list).
    n_repeat = max(1, n_ref // n_tail)
    list_df_train_train_tail += [df_train_train_tail_each] * n_repeat

df_train_train_tail = pd.concat(list_df_train_train_tail, axis=0)
print(df_train_train_tail.shape)
df_train_train_tail.head()

In [None]:
df_train_train_tail[['card', 'card_id', 'uid']].groupby(by=['card', 'card_id']).count()

In [None]:
df_train_train = pd.concat([df_train_train[~df_train_train['card'].isin(
    ['xxx'])], df_train_train_tail], axis=0)
df_train_train.reset_index(drop=True, inplace=True)
print(df_train_train.shape)
df_train_train.head()

In [None]:
df_train_train_bc_cnt_ = df_train_train[['card', 'card_id', 'uid']].groupby(by=['card', 'card_id']).count()
df_train_train_bc_cnt_.reset_index(inplace=True)
df_train_train_bc_cnt_.sort_values(by=['uid'], ascending=False, inplace=True)
df_train_train_bc_cnt_

In [None]:
# Shuffle the up-sampled training rows. A fixed random_state (the same seed
# used by the train/test splits) makes the saved training set reproducible
# across notebook runs; previously the row order changed on every run.
df_train_train = df_train_train.sample(frac=1, random_state=2024)
df_train_train.reset_index(drop=True, inplace=True)

# Split into ids / label / model inputs.
df_train_train_id = df_train_train[['uid', 'obs_dt', 'card', 'card_id']]
df_train_train_y = df_train_train['label']
df_train_train_X = df_train_train[
    list_feats_item
    +list_feats_x_fusion_std
    +list_feats_x_asp1_std
    +list_feats_x_asp2_std
    +list_feats_x_dc_std
    +list_feats_x_fc_std
    +list_feats_x_xxpay_std
]
print(df_train_train_id.shape)
print(df_train_train_y.shape)
print(df_train_train_X.shape)

In [None]:
utils.save_pickle(df_train_train_id, 'data/other/balance_34/xxpay/df_id_train_train_20240225_20240317.pickle')
utils.save_pickle(df_train_train_y, 'data/other/balance_34/xxpay/df_y_train_train_20240225_20240317.pickle')
utils.save_pickle(df_train_train_X, 'data/other/balance_34/xxpay/df_X_train_train_20240225_20240317.pickle')

## model

* train

In [None]:
df_id_train = utils.load_pickle('data/df_id_train_20231217_20240204.pickle')
df_y_train = utils.load_pickle('data/df_y_train_20231217_20240204.pickle')
df_X_train = utils.load_pickle('data/df_X_train_20231217_20240204.pickle')

print(df_id_train.shape)
print(df_y_train.shape)
print(df_X_train.shape)

In [None]:
df_X_y_train_dm = xgb.DMatrix(df_X_train, label=df_y_train)
df_X_y_train_dm

In [None]:
# cv
# 调参：学习率、树深
params = {
    'booster': 'gbtree', 
    'objective': 'binary:logistic', 
    'eta': 0.1, 
    'max_depth': 5, 'min_child_weight': 1, 
    'gamma': 0.1, 
    'subsample': 0.8, 'colsample_bytree': 0.8, 
    'alpha': 0.01, 'lambda': 0.01 
}

In [None]:
# 调参：树的个数（early stopping）
print(params)
xgb_cv_eval = xgb.cv(params, df_X_y_train_dm, 
                     num_boost_round=10000, 
                     nfold=5, stratified=True, 
                     metrics=['auc', 'logloss'], 
                     early_stopping_rounds=50, 
                     verbose_eval=10, 
                     seed=2024)
xgb_cv_eval

In [None]:
xgb_cv_eval.shape[0]

train cv
condition + p123 + rpl + lc + bc 特征，293
2023.12.17~2024.02.04
1.'eta': 0.1,  'max_depth': 5, num_boost_round: 4979, loss: 0.260447±0.001259, AUC: 0.947906±0.000447
2.'eta': 0.05, 'max_depth': 5, num_boost_round: , loss: , AUC: 
3.'eta': 0.01, 'max_depth': 5, num_boost_round: , loss: , AUC: 

In [None]:
# 目标item卡34，平衡处理，1:1生成负样本
# 尾部item上采样（对齐'xxx'，5k）
# fusion + aspiration(replace) + v2 imp + debit card + flow cashier + xxpay + item 特征，709维
# 2024.02.25~2024.03.17
df_id_train_train = utils.load_pickle('data/other/balance_34/xxpay/df_id_train_train_20240225_20240317.pickle')
df_y_train_train = utils.load_pickle('data/other/balance_34/xxpay/df_y_train_train_20240225_20240317.pickle')
df_X_train_train = utils.load_pickle('data/other/balance_34/xxpay/df_X_train_train_20240225_20240317.pickle')

df_id_train_eval = utils.load_pickle('data/other/balance_34/xxpay/df_id_train_eval_20240225_20240317.pickle')
df_y_train_eval = utils.load_pickle('data/other/balance_34/xxpay/df_y_train_eval_20240225_20240317.pickle')
df_X_train_eval = utils.load_pickle('data/other/balance_34/xxpay/df_X_train_eval_20240225_20240317.pickle')

print(df_id_train_train.shape)
print(df_y_train_train.shape)
print(df_X_train_train.shape)

print(df_id_train_eval.shape)
print(df_y_train_eval.shape)
print(df_X_train_eval.shape)

In [None]:
list_feats_item = [x for x in df_X_train_train.columns if x.startswith('card_id_')]
list_feats_x_fusion_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_fusion_std.pickle')
list_feats_x_asp1_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_asp1_std.pickle')
list_feats_x_asp2_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_asp2_std.pickle')
list_feats_x_dc_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_dc_std.pickle')
# list_feats_x_fc_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_fc_std.pickle')

list_feats_x_fc_std = utils.load_pickle('data/other/balance_34/xxpay/list_feats/list_feats_x_fc_std_arti.pickle')
list_feats_x_xxpay_std = utils.load_pickle('data/other/balance_34/xxpay/list_feats/list_feats_x_xxpay_std.pickle')

list_feats_x = list_feats_item \
    + list_feats_x_fusion_std \
    + list_feats_x_asp1_std \
    + list_feats_x_asp2_std \
    + list_feats_x_dc_std \
    + list_feats_x_fc_std \
    + list_feats_x_xxpay_std

print(len(list_feats_x))

In [None]:
df_X_y_train_dm = xgb.DMatrix(df_X_train_train[list_feats_x], label=df_y_train_train)
df_X_y_train_dm

In [None]:
df_X_y_eval_dm = xgb.DMatrix(df_X_train_eval[list_feats_x], label=df_y_train_eval)
df_X_y_eval_dm

In [None]:
# Hyperparameter tuning against a single hold-out validation set.
params = {
    'booster': 'gbtree', 
    'objective': 'binary:logistic', 
    'eta': 0.1, 
    'max_depth':5, 'min_child_weight': 1, 
    'gamma': 0.1, 
    'subsample': 0.8, 'colsample_bytree': 0.8, 
    'alpha': 0.01, 'lambda': 0.01, 
    'eval_metric': ['auc', 'logloss']
}
print(params)

# Both the training and the validation matrices are evaluated every round;
# the per-round metric values are collected into dict_eval.
list_watch = [(df_X_y_train_dm, 'train'), (df_X_y_eval_dm, 'eval')]
dict_eval = {}

# Train with up to 10000 rounds and early stopping (patience 100).
# NOTE(review): per the xgboost docs, early stopping monitors the last entry
# of `evals` and the last entry of `eval_metric` (here: 'eval' / 'logloss')
# - confirm this is the intended stopping criterion.
clf_xgb = xgb.train(params, df_X_y_train_dm, 
                    num_boost_round=10000, 
                    evals=list_watch, 
                    early_stopping_rounds=100, 
                    evals_result=dict_eval)

In [None]:
df_eval_metric = pd.DataFrame(dict_eval['eval'])
df_eval_metric.iloc[:clf_xgb.best_iteration+1, :]

In [None]:
%%time
# Fit the final model with a fixed number of boosting rounds and save it.
print(params)
# NOTE(review): 1188 is hard-coded; presumably it was read off the
# early-stopping run (best_iteration + 1) - re-check it whenever the data or
# params change.
clf_xgb = xgb.train(params, df_X_y_train_dm, num_boost_round=1188)

# clf_xgb.save_model('data/model/zf_cashier_bind_card_balance_tos_ht_v2_995_dc_fc_34_20240225_20240317_xgb.model')
clf_xgb.save_model('data/model/zf_cashier_bind_card_balance_tos_ht_v2_995_dc_fc_xxpay_34_20240225_20240317_xgb.model')

In [None]:
df_X_eval_dm = xgb.DMatrix(df_X_train_eval[list_feats_x])

pred_prob = clf_xgb.predict(df_X_eval_dm)
print(len(pred_prob))
pred_prob

In [None]:
df_y = pd.DataFrame({'y_true': df_y_train_eval, 'y_pred_prob': pred_prob})
print(df_y.shape)
df_y.head()

In [None]:
roc_auc_score(df_y['y_true'], df_y['y_pred_prob'])

* test

In [None]:
# 目标item卡34，平衡处理，1:1生成负样本
# 尾部item上采样（对齐'xxx'，5k）
# fusion + aspiration(replace) + v2 imp + debit card + flow cashier + xxpay + item 特征，709
# 2024.02.25~2024.03.17
df_id_test = utils.load_pickle('data/other/balance_34/xxpay/df_id_test_20240225_20240317.pickle')
df_y_test = utils.load_pickle('data/other/balance_34/xxpay/df_y_test_20240225_20240317.pickle')
df_X_test = utils.load_pickle('data/other/balance_34/xxpay/df_X_test_20240225_20240317.pickle')

print(df_id_test.shape)
print(df_y_test.shape)
print(df_X_test.shape)

In [None]:
list_feats_item = [x for x in df_X_test.columns if x.startswith('card_id_')]
list_feats_x_fusion_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_fusion_std.pickle')
list_feats_x_asp1_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_asp1_std.pickle')
list_feats_x_asp2_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_asp2_std.pickle')
list_feats_x_dc_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_dc_std.pickle')
# list_feats_x_fc_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_fc_std.pickle')

list_feats_x_fc_std = utils.load_pickle('data/other/balance_34/xxpay/list_feats/list_feats_x_fc_std_arti.pickle')
list_feats_x_xxpay_std = utils.load_pickle('data/other/balance_34/xxpay/list_feats/list_feats_x_xxpay_std.pickle')

list_feats_x = list_feats_item \
    + list_feats_x_fusion_std \
    + list_feats_x_asp1_std \
    + list_feats_x_asp2_std \
    + list_feats_x_dc_std \
    + list_feats_x_fc_std \
    + list_feats_x_xxpay_std

print(len(list_feats_x))

In [None]:
df_X_test_dm = xgb.DMatrix(df_X_test[list_feats_x])
df_X_test_dm

In [None]:
# 目标item卡34，平衡处理，1:1生成负样本
# 尾部item上采样（对齐'xxx'，5k）
# fusion + aspiration(replace) + v2 imp + debit card + flow cashier + xxpay + item 特征，709
# 2024.02.25~2024.03.17
clf_xgb = xgb.Booster(model_file='data/model/zf_cashier_bind_card_balance_tos_ht_v2_995_dc_fc_xxpay_34_20240225_20240317_xgb.model')

clf_xgb

In [None]:
pred_prob = clf_xgb.predict(df_X_test_dm)
print(len(pred_prob))
pred_prob

In [None]:
df_y = pd.DataFrame({'y_true': df_y_test, 'y_pred_prob': pred_prob})
print(df_y.shape)
df_y.head()

In [None]:
# 整体效果
roc_auc_score(df_y['y_true'], df_y['y_pred_prob'])

In [None]:
def cal_recall(df_y, y_true, y_pred, threshold=0.3):
    """Recall of the positive class when the top-scored share of rows is flagged.

    Rows are ranked by `y_pred` in descending order and the first
    ``int(n_rows * threshold)`` of them are treated as predicted positives.

    Args:
        df_y: DataFrame holding the label and score columns; it is not modified.
        y_true: name of the label column (a positive is a value equal to 1).
        y_pred: name of the score column used for ranking.
        threshold: share of rows (0..1) to flag as positive.

    Returns:
        float: flagged positives divided by all positives; 0.0 when `df_y`
        contains no positive row.
    """
    df_y_sort = df_y.sort_values(by=y_pred, ascending=False)

    # Positional slicing: exactly n_flag rows. The former `.loc[:n_flag]` on a
    # fresh 0..n-1 index is end-inclusive and flagged one row too many.
    n_flag = int(df_y_sort.shape[0] * threshold)

    # Comparing with 1 works for int, float and bool labels alike (the old
    # classification_report lookup by the key '1' failed for float labels).
    is_positive = df_y_sort[y_true] == 1
    n_positive = int(is_positive.sum())
    if n_positive == 0:
        return 0.0

    return float(is_positive.iloc[:n_flag].sum()) / n_positive

In [None]:
cal_recall(df_y, 'y_true', 'y_pred_prob', threshold=0.2)

* feature importance

In [None]:
# 目标item卡34，平衡处理，1:1生成负样本
# 尾部item上采样（对齐'xxx'，5k）
# fusion + aspiration(replace) + v2 imp + debit card + flow cashier + xxpay + item 特征，709
# 2024.02.25~2024.03.17
clf_xgb = xgb.Booster(model_file='data/model/zf_cashier_bind_card_balance_tos_ht_v2_995_dc_fc_xxpay_34_20240225_20240317_xgb.model')

clf_xgb

In [None]:
dict_card = utils.load_pickle('data/other/balance_34/dict_card_34.pickle')
list_feats_item = ['card_id_'+str(v) for _, v in dict_card.items()]

list_feats_x_fusion_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_fusion_std.pickle')
list_feats_x_asp1_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_asp1_std.pickle')
list_feats_x_asp2_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_asp2_std.pickle')
list_feats_x_dc_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_dc_std.pickle')
# list_feats_x_fc_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_fc_std.pickle')

list_feats_x_fc_std = utils.load_pickle('data/other/balance_34/xxpay/list_feats/list_feats_x_fc_std_arti.pickle')
list_feats_x_xxpay_std = utils.load_pickle('data/other/balance_34/xxpay/list_feats/list_feats_x_xxpay_std.pickle')

list_feats_x = list_feats_item \
    + list_feats_x_fusion_std \
    + list_feats_x_asp1_std \
    + list_feats_x_asp2_std \
    + list_feats_x_dc_std \
    + list_feats_x_fc_std \
    + list_feats_x_xxpay_std

print(len(list_feats_x))
list_feats_x[:50]

In [None]:
dict_feats = {'f{i}'.format(i=i): x for i, x in enumerate(list_feats_x)}
dict_feats

In [None]:
dict_fi = clf_xgb.get_score(importance_type='total_gain')
dict_fi

In [None]:
df_fn = pd.DataFrame.from_dict(dict_feats, orient='index', columns=['f_name'])
df_fn.reset_index(inplace=True)
df_fn.rename(columns={'index': 'f_index'}, inplace=True)
df_fn

In [None]:
df_fi = pd.DataFrame.from_dict(dict_fi, orient='index', columns=['f_importance'])
df_fi.reset_index(inplace=True)
df_fi.rename(columns={'index': 'f_index'}, inplace=True)
df_fi

In [None]:
# Attach the importance scores to the full feature list; features with no
# entry in df_fi come out of the left join as NaN and are set to 0.
df_feature_importance = df_fn.merge(df_fi, on='f_index', how='left')
# Assign the filled column back instead of fillna(..., inplace=True) on a
# column selection, which is a chained in-place update and does not modify
# the frame when pandas Copy-on-Write is enabled.
df_feature_importance['f_importance'] = df_feature_importance['f_importance'].fillna(0)
df_feature_importance

In [None]:
df_feature_importance.sort_values(by='f_importance', ascending=False, inplace=True)
df_feature_importance.reset_index(drop=True, inplace=True)
df_feature_importance

In [None]:
# df_feature_importance.to_csv('data/other/balance_34/feature_importance_20240225_20240317_xgb.csv', encoding='utf-8', index=False)
df_feature_importance.to_csv('data/other/balance_34/xxpay/feature_importance_20240225_20240317_xgb.csv', encoding='utf-8', index=False)

In [None]:
plt.rcParams['font.sans-serif'] = ['simhei']
plt.rcParams['font.serif'] = ['simhei']
plt.rcParams['axes.unicode_minus'] = False
sns.set(font_scale=1.5)
sns.set_style('darkgrid', {'font.sans-serif':['simhei', 'Droid Sans Fallback']})

plt.rcParams['figure.figsize'] = (12.0, 8.0)
fig, axes = plt.subplots(1, 1)
sns.barplot(x='f_importance', y='f_name', data=df_feature_importance.head(25), ax=axes)
axes.set_title('Top-25 importance features')

plt.show()

In [None]:
df_feature_importance.head(25)

In [None]:
df_feature_importance[df_feature_importance['f_name'].isin(list_feats_item)]

In [None]:
# df_feature_importance = pd.read_csv('data/other/balance_34/feature_importance_20240225_20240317_xgb.csv', encoding='utf-8')
df_feature_importance = pd.read_csv('data/other/balance_34/xxpay/feature_importance_20240225_20240317_xgb.csv', encoding='utf-8')

print(df_feature_importance.shape)
df_feature_importance.head()

In [None]:
dict_card = utils.load_pickle('data/other/balance_34/dict_card_34.pickle')
list_feats_item = ['card_id_'+str(v) for _, v in dict_card.items()]

df_fi25 = df_feature_importance[~df_feature_importance['f_name'].isin(list_feats_item)].head(25)
df_fi25

* compare

In [None]:
# 2024.03.24
# Load the tab-separated sample/label/feature tables dated 20240324, one file
# per feature group (fusion, asp1, asp2, dc, fc, xxpay -- names as they appear
# in the file names), and print each table's shape.
df_fusion = pd.read_csv('data/other/balance_34/sample_label_feature_fusion_20240324.txt', sep='\t', encoding='utf-8')
df_asp1 = pd.read_csv('data/other/balance_34/sample_label_feature_asp1_20240324.txt', sep='\t', encoding='utf-8')
df_asp2 = pd.read_csv('data/other/balance_34/sample_label_feature_asp2_20240324.txt', sep='\t', encoding='utf-8')
df_dc = pd.read_csv('data/other/balance_34/sample_label_feature_dc_20240324.txt', sep='\t', encoding='utf-8')
df_fc = pd.read_csv('data/other/balance_34/sample_label_feature_fc_20240324.txt', sep='\t', encoding='utf-8')

df_xxpay = pd.read_csv('data/other/balance_34/sample_label_feature_xxpay_20240324.txt', sep='\t', encoding='utf-8')

print(df_fusion.shape)
print(df_asp1.shape)
print(df_asp2.shape)
print(df_dc.shape)
print(df_fc.shape)
print(df_xxpay.shape)

In [None]:
# Convert the obs_dt column of every source table to pandas datetimes so the
# tables can later be joined on it with a consistent dtype.
for _frame in (df_fusion, df_asp1, df_asp2, df_dc, df_fc, df_xxpay):
    _frame['obs_dt'] = pd.to_datetime(_frame['obs_dt'])

# Number of fusion samples per observation date.
df_fusion['obs_dt'].value_counts()

In [None]:
# Preview the first rows of the fusion table.
df_fusion.head()

In [None]:
# Count of non-null uid values for every (card, label) combination.
df_fusion.groupby(by=['card', 'label'])[['uid']].count()

In [None]:
# Key columns shared by every source table (sample id, observation date,
# card, label); they are used as the join keys further down.
list_feats_id_dt_card_y = ['uid', 'obs_dt', 'card', 'label']

# Target item cards: 34; balanced, negatives generated 1:1
# Tail items are upsampled (aligned to 'xxx', 5k)
# fusion + aspiration(replace) + v2 imp + debit card + flow cashier + xxpay + item features, 709
list_feats_x_fusion_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_fusion_std.pickle')
list_feats_x_asp1_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_asp1_std.pickle')
list_feats_x_asp2_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_asp2_std.pickle')
list_feats_x_dc_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_dc_std.pickle')
list_feats_x_fc_std_arti = utils.load_pickle('data/other/balance_34/xxpay/list_feats/list_feats_x_fc_std_arti.pickle')
list_feats_x_xxpay_std = utils.load_pickle('data/other/balance_34/xxpay/list_feats/list_feats_x_xxpay_std.pickle')

# Size of each per-source feature list.
print(len(list_feats_x_fusion_std))
print(len(list_feats_x_asp1_std))
print(len(list_feats_x_asp2_std))
print(len(list_feats_x_dc_std))
print(len(list_feats_x_fc_std_arti))

print(len(list_feats_x_xxpay_std))

In [None]:
# Start from the fusion samples and left-join the selected features of every
# other source table on the shared key columns, in a fixed order.
df = df_fusion[list_feats_id_dt_card_y + list_feats_x_fusion_std]
for _frame, _feats in (
    (df_asp1, list_feats_x_asp1_std),
    (df_asp2, list_feats_x_asp2_std),
    (df_dc, list_feats_x_dc_std),
    (df_fc, list_feats_x_fc_std_arti),
    (df_xxpay, list_feats_x_xxpay_std),
):
    df = df.merge(
        _frame[list_feats_id_dt_card_y + _feats],
        on=list_feats_id_dt_card_y,
        how='left',
    )

print(df.shape)
df.head()

In [None]:
# Summarise the merged frame with the project helper utils.df_des and save the
# result as CSV. Presumably per-column descriptive statistics (earlier cells
# filter this helper's output on a 'std' column) -- confirm in utils.
df_des = utils.df_des(df)

# Earlier output locations, kept for reference.
# df_des.to_csv('data/other/balance_26/df_des_pos_20240324.csv', encoding='utf-8')
# df_des.to_csv('data/other/balance_34/df_des_pos_20240324.csv', encoding='utf-8')
df_des.to_csv('data/other/balance_34/xxpay/df_des_pos_20240324.csv', encoding='utf-8')

In [None]:
# Load the card -> id mapping for the 34-card setup (the commented-out line
# loads the 26-card mapping instead), then show its size and contents.
# dict_card = utils.load_pickle('data/other/balance_26/dict_card_26.pickle')
dict_card = utils.load_pickle('data/other/balance_34/dict_card_34.pickle')

print(len(dict_card))
dict_card

In [None]:
# Test on a subset of items: keep only the rows whose card is a key of
# dict_card, then renumber the rows from 0.
df = df.loc[df['card'].isin(list(dict_card))].reset_index(drop=True)
print(df.shape)

In [None]:
# Derive card_id from card through the dict_card mapping; values that are not
# keys of dict_card are left unchanged by replace().
# The result is assigned back rather than calling replace(inplace=True) on the
# column selection: that chained in-place form acts on a temporary object
# under pandas Copy-on-Write and would leave card_id equal to card.
df['card_id'] = df['card'].replace(dict_card)
df.head()

In [None]:
# Count of non-null uid values per (card, card_id) pair -- presumably a sanity
# check that every card maps to exactly one id.
df[['card', 'card_id', 'uid']].groupby(['card', 'card_id']).count()

In [None]:
# Number of distinct card_id values present in df.
df['card_id'].nunique()

In [None]:
# Target item cards: 34; balanced, negatives generated 1:1
# Tail items are upsampled (aligned to 'xxx', 5k)
# fusion + aspiration(replace) + v2 imp + debit card + flow cashier + xxpay + item features, 709
list_feats_x_fusion_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_fusion_std.pickle')
list_feats_x_asp1_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_asp1_std.pickle')
list_feats_x_asp2_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_asp2_std.pickle')
list_feats_x_dc_std = utils.load_pickle('data/other/balance_34/list_feats/list_feats_x_dc_std.pickle')
list_feats_x_fc_std_arti = utils.load_pickle('data/other/balance_34/xxpay/list_feats/list_feats_x_fc_std_arti.pickle')
list_feats_x_xxpay_std = utils.load_pickle('data/other/balance_34/xxpay/list_feats/list_feats_x_xxpay_std.pickle')

# Full list of non-item model features: the per-source lists concatenated in
# this fixed order (the item one-hot columns are added separately at scoring
# time).
list_feats_x = list_feats_x_fusion_std \
    + list_feats_x_asp1_std \
    + list_feats_x_asp2_std \
    + list_feats_x_dc_std \
    + list_feats_x_fc_std_arti \
    + list_feats_x_xxpay_std

print(len(list_feats_x))
list_feats_x[:10]

In [None]:
# Model scored by cal_each. Earlier variants, kept for reference:
#   data/model/zf_cashier_bind_card_balance_tos_ht_v2_995_dc_fc_26_20231008_20231029_xgb.model
#   data/model/zf_cashier_bind_card_balance_tos_ht_v2_995_dc_fc_34_20240225_20240317_xgb.model
_CAL_EACH_MODEL_FILE = 'data/model/zf_cashier_bind_card_balance_tos_ht_v2_995_dc_fc_xxpay_34_20240225_20240317_xgb.model'

# Boosters already loaded in this session, keyed by model file path.
_CAL_EACH_BOOSTERS = {}


def _load_booster(model_file):
    """Return the xgboost Booster stored at *model_file*, loading it at most once."""
    if model_file not in _CAL_EACH_BOOSTERS:
        _CAL_EACH_BOOSTERS[model_file] = xgb.Booster(model_file=model_file)
    return _CAL_EACH_BOOSTERS[model_file]


def cal_each(df, card_id):
    """Score every row of *df* as if it had been shown the card *card_id*.

    One indicator column ``card_id_<id>`` is built per value of the global
    ``dict_card`` (1 for *card_id*, 0 for all others) and placed in front of
    the columns named in the global ``list_feats_x``; the saved xgboost model
    is then applied to that matrix.

    Args:
        df: DataFrame containing at least the columns in ``list_feats_x``.
            It is not modified.
        card_id: The id (a value of ``dict_card``) to switch on.

    Returns:
        A single-column DataFrame named ``card_id_<card_id>`` holding the
        model output per row, in the row order of *df*, with a fresh
        0..n-1 index.
    """
    # Build all indicator columns in one step (in dict_card order) rather than
    # copying the whole wide frame and inserting columns one at a time.
    df_onehot = pd.DataFrame(
        {'card_id_' + str(v): int(v == card_id) for v in dict_card.values()},
        index=df.index,
    )
    # NOTE(review): only the indicator columns derived from dict_card are fed
    # to the model; any pre-existing 'card_id_*' column of df is ignored.
    df_X = pd.concat([df_onehot, df[list_feats_x]], axis=1)

    df_X_dm = xgb.DMatrix(df_X)

    # The model file is loop-invariant for callers that score every card, so
    # it is read from disk once and reused afterwards.
    clf_xgb = _load_booster(_CAL_EACH_MODEL_FILE)

    pred_prob = clf_xgb.predict(df_X_dm)

    return pd.DataFrame({'card_id_' + str(card_id): pred_prob})

In [None]:
# Ground-truth card id per sample. The index is reset so that the column-wise
# concat below lines up row by row with the 0..n-1 index of the per-card
# score frames returned by cal_each.
df_y = (
    df[['uid', 'obs_dt', 'card_id']]
    .rename(columns={'card_id': 'y_true'})
    .reset_index(drop=True)
)

# One score column per card id. The `with` block closes the progress bar on
# normal exit and on KeyboardInterrupt alike, so no explicit close() calls are
# needed (and none can hit an unbound `t`).
list_df = [df_y]
with tqdm(dict_card.values()) as t:
    for v in t:
        list_df.append(cal_each(df, v))

df_res = pd.concat(list_df, axis=1)
print(df_res.shape)
df_res.head()

In [None]:
%%time
# Score columns and the position of each one within that list.
list_card_id = [x for x in df_res.columns if x.startswith('card_id_')]
dict_card_index = {x: i for i, x in enumerate(list_card_id)}

# Rank (0 = highest score) of the true card among all score columns of a row.
# Computed on the whole score matrix at once instead of building a dict per
# row inside DataFrame.apply.
arr_true_pos = df_res['y_true'].map(
    lambda v: dict_card_index['card_id_' + str(v)]  # KeyError if no such column
).to_numpy()
# Column positions sorted by descending score, one row per sample. The order
# among exactly equal scores is whatever the sort produces.
arr_order = np.argsort(df_res[list_card_id].to_numpy(), axis=1)[:, ::-1]
# The true column occurs exactly once per row; argmax returns where.
df_res['y_true_rank'] = np.argmax(arr_order == arr_true_pos[:, None], axis=1)

df_res.head()

In [None]:
# Remove the limit on displayed columns (applies to the rest of the session),
# then show 10 randomly chosen rows of df_res.
pd.set_option('display.max_columns', None)
df_res.sample(10)

In [None]:
# Save df_res as a pickle under xxpay/ via the project helper; the
# commented-out lines are earlier output locations.
# utils.save_pickle(df_res, 'data/other/balance_26/df_pos_pred_20240324.pickle')
# utils.save_pickle(df_res, 'data/other/balance_34/df_pos_pred_20240324.pickle')
utils.save_pickle(df_res, 'data/other/balance_34/xxpay/df_pos_pred_20240324.pickle')

In [None]:
# Test on campaign ("activity") items
# Reload the saved predictions from the xxpay/ pickle (the commented-out line
# reads the copy in the parent directory).
# df_res = utils.load_pickle('data/other/balance_34/df_pos_pred_20240324.pickle')
df_res = utils.load_pickle('data/other/balance_34/xxpay/df_pos_pred_20240324.pickle')

print(df_res.shape)
df_res.head()

In [None]:
# Card -> id mapping of the 26-card setup.
dict_card_26 = utils.load_pickle('data/other/balance_26/dict_card_26.pickle')

# Restrict the predictions to samples whose true card is one of those 26 ids
# and keep only the matching score columns. Rows and columns are selected in a
# single .loc call and copied explicitly, because later cells add columns to
# df_res_26 and must not be writing into a view/derived slice of df_res.
df_res_26 = df_res.loc[
    df_res['y_true'].isin(list(dict_card_26.values())),
    ['uid', 'obs_dt', 'y_true'] + ['card_id_' + str(v) for v in dict_card_26.values()],
].copy()
df_res_26.head()

In [None]:
%%time
# Score columns of the 26-card subset and the position of each in that list.
list_card_id_26 = [x for x in df_res_26.columns if x.startswith('card_id_')]
dict_card_index_26 = {x: i for i, x in enumerate(list_card_id_26)}

# Rank (0 = highest score) of the true card among the 26 score columns of a
# row, computed on the whole score matrix at once instead of building a dict
# per row inside DataFrame.apply.
arr_true_pos_26 = df_res_26['y_true'].map(
    lambda v: dict_card_index_26['card_id_' + str(v)]  # KeyError if no such column
).to_numpy()
# Column positions sorted by descending score, one row per sample. The order
# among exactly equal scores is whatever the sort produces.
arr_order_26 = np.argsort(df_res_26[list_card_id_26].to_numpy(), axis=1)[:, ::-1]
# The true column occurs exactly once per row; argmax returns where.
df_res_26['y_true_rank'] = np.argmax(arr_order_26 == arr_true_pos_26[:, None], axis=1)

df_res_26.head()

In [None]:
# Remove the limit on displayed columns (applies to the rest of the session),
# then show 10 randomly chosen rows of df_res_26.
pd.set_option('display.max_columns', None)
df_res_26.sample(10)

In [None]:
# Hit flags: y_pred_top_k is 1 when the true card is ranked among the k
# highest-scored cards (y_true_rank < k) and 0 otherwise, for k = 1, 2, 3.
# The same three columns can be produced for the full df_res by swapping the
# frame.
_rank_26 = df_res_26['y_true_rank']
df_res_26['y_pred_top_1'] = (_rank_26 < 1).astype('int64')
df_res_26['y_pred_top_2'] = (_rank_26 < 2).astype('int64')
df_res_26['y_pred_top_3'] = (_rank_26 < 3).astype('int64')

df_res_26.head()

In [None]:
# Overall top-k recall: share of rows whose y_pred_top_k flag is 1.
# The commented-out lines report the same figures on the full df_res.
# print('Recall top 1: {}'.format(df_res['y_pred_top_1'].sum()/df_res.shape[0]))
# print('Recall top 2: {}'.format(df_res['y_pred_top_2'].sum()/df_res.shape[0]))
# print('Recall top 3: {}'.format(df_res['y_pred_top_3'].sum()/df_res.shape[0]))

print('Recall top 1: {}'.format(df_res_26['y_pred_top_1'].sum()/df_res_26.shape[0]))
print('Recall top 2: {}'.format(df_res_26['y_pred_top_2'].sum()/df_res_26.shape[0]))
print('Recall top 3: {}'.format(df_res_26['y_pred_top_3'].sum()/df_res_26.shape[0]))

In [None]:
# Invert the card -> id mapping of the 26-card setup so that ids can be turned
# back into card names (the commented-out line inverts the 34-card mapping).
# dict_card_reverse = {v:k for k, v in dict_card.items()}
dict_card_reverse = {v:k for k, v in dict_card_26.items()}
dict_card_reverse

In [None]:
# Per-item-card results: top-1/2/3 recall for each card id of the 26-card
# setup, computed on df_res_26. (To report on the full 34-card run instead,
# iterate dict_card.values() over df_res with the matching reverse mapping.)
#
# The `with` block closes the progress bar on normal exit and on
# KeyboardInterrupt alike, so no explicit close() calls are needed.
with tqdm(list(dict_card_26.values())) as t:
    for card_id in t:
        # Boolean mask instead of a label-based .loc[index] re-selection, so
        # the subset is right even if index labels were not unique.
        df_y_card = df_res_26[df_res_26['y_true'] == card_id]
        n_card = df_y_card.shape[0]
        print(df_y_card.shape)
        # A card without any rows yields 0/0 (NaN) for all three figures.
        print('card_id:{ci}, card:{c}, Recall top 1:{rt1}, Recall top 2:{rt2}, Recall top 3:{rt3},'.format(
            ci=card_id,
            c=dict_card_reverse[card_id],
            rt1=df_y_card['y_pred_top_1'].sum()/n_card,
            rt2=df_y_card['y_pred_top_2'].sum()/n_card,
            rt3=df_y_card['y_pred_top_3'].sum()/n_card))