# 增益模型
## X Learner lightgbm

In [1]:
import datetime
import numpy as np
import pandas as pd
import joblib
import warnings
import logging
import os
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import collections
import re
import copy
import xgboost as xgb
import lightgbm as lgb
import shap
import causalml
import statsmodels.api as sm

import utils
import propensity

from tqdm import tqdm
from dateutil.relativedelta import relativedelta
from joblib import Parallel, delayed
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from supervised.automl import AutoML
from causalml.inference.meta import BaseSClassifier, BaseTClassifier, BaseXClassifier, BaseRClassifier

from causalml_visualize.visualize_ import plot_all

# Notebook-wide display, warning, progress-bar and logging configuration.
pd.set_option('display.max_columns', None)
# Use the fully-qualified option key. The short pattern 'max_row' is resolved by
# regex search and raises OptionError as soon as it matches more than one option
# (e.g. 'display.max_rows' and 'styler.render.max_rows' on recent pandas).
pd.set_option('display.max_rows', 500)
warnings.filterwarnings('ignore')
# Registers DataFrame/Series.progress_apply with this bar description.
tqdm.pandas(desc='pandas bar')
logger = logging.getLogger('causalml')
logging.basicConfig(level=logging.INFO)
# NOTE(review): numexpr presumably reads this variable when it is first imported;
# pandas may already have imported it above, in which case this has no effect - confirm.
os.environ['NUMEXPR_MAX_THREADS'] = str(os.cpu_count())

KeyboardInterrupt: 

In [None]:
# Show the installed xgboost version (recorded for reproducibility).
xgb.__version__

In [None]:
# Show the installed lightgbm version (recorded for reproducibility).
lgb.__version__

## data

In [None]:
# Load the tab-separated training sample into df_train, then show its shape and first rows.
df_train = pd.read_csv('data/no_secret_bt/sample_label_feature_union_uplift_20220206_20220227_train.txt', sep='\t', encoding='utf-8')

print(df_train.shape)
df_train.head()

In [None]:
# Column dtypes, non-null counts and memory footprint of the raw training frame.
df_train.info()

In [None]:
# Number of non-null uid values for every (obs_dt, dt) combination.
df_train.groupby(by=['obs_dt', 'dt'])['uid'].count()

In [None]:
# Number of non-null uid values per coupon value (coupon is later used as the treatment column).
df_train.groupby(by=['coupon'])['uid'].count()

In [None]:
# Distribution of the outcome label in the training sample.
df_train['label'].value_counts()

In [None]:
# Share of 64223 within 1000000 + 64223 rows.
# Presumably the positive-label rate read off the value_counts above - TODO confirm.
64223/(1000000+64223)

In [None]:
# Ratio of 1000000 to 6650831.
# NOTE(review): looks like a down-sampling rate; the source of 6650831 is not shown in this notebook - confirm.
1000000 / 6650831

In [None]:
# Parse both date columns of the training frame into datetime values, then preview.
for date_col in ('obs_dt', 'dt'):
    df_train[date_col] = pd.to_datetime(df_train[date_col])
df_train.head()

In [None]:
# Build a description table of the training frame with the project helper utils.df_des
# (presumably per-column summary statistics - helper not visible here), write it to CSV and preview it.
df_train_des = utils.df_des(df_train)
df_train_des.to_csv('data/no_secret_bt/df_des_sample_label_feature_union_uplift_20220206_20220227_train.csv', encoding='utf-8')
df_train_des.head()

In [None]:
# Persist the training frame (with parsed dates) so later sections can reload it without re-reading the text file.
utils.save_pickle(df_train, 'data/no_secret_bt/df_sample_label_feature_union_uplift_20220206_20220227_train.pickle')

## feature process

In [None]:
# Reload the pickled training frame and show its shape and first rows.
df_train = utils.load_pickle('data/no_secret_bt/df_sample_label_feature_union_uplift_20220206_20220227_train.pickle')
print(df_train.shape)
df_train.head()

In [None]:
# Check the dtypes and non-null counts of the two date columns after the reload.
df_train[['obs_dt', 'dt']].info()

In [None]:
# Convert dt to its day of the week (Monday=0 ... Sunday=6).
# The integer code is used directly as a label-encoded feature.
# The vectorised .dt accessor replaces a per-row Python lambda; it requires dt to be
# a datetime column, which the pd.to_datetime conversion earlier in the notebook provides.
df_train['dt_weekday'] = df_train['dt'].dt.weekday
df_train.head()

In [None]:
# Split the training frame into four parts - id/date columns, treatment (coupon),
# outcome (label) and the feature matrix with dt_weekday as first column - and persist each one.
df_train_uid_obsDt = df_train[['uid', 'obs_dt', 'dt']]
df_train_treatment = df_train['coupon']
df_train_y = df_train['label']
# Every column that is neither an id/date, the treatment, the label nor dt_weekday
# follows dt_weekday in its original order.
df_train_X = df_train[
    ['dt_weekday']
    + df_train.columns[~df_train.columns.isin(['uid', 'obs_dt', 'dt', 'coupon', 'label', 'dt_weekday'])].tolist()
]

print(
    df_train_uid_obsDt.shape,
    df_train_treatment.shape,
    df_train_y.shape,
    df_train_X.shape,
    sep='\n',
)

utils.save_pickle(df_train_uid_obsDt, 'data/no_secret_bt/df_sample_label_feature_union_uplift_uid_obsDate_20220206_20220227_train.pickle')
utils.save_pickle(df_train_treatment, 'data/no_secret_bt/df_sample_label_feature_union_uplift_treatment_20220206_20220227_train.pickle')
utils.save_pickle(df_train_y, 'data/no_secret_bt/df_sample_label_feature_union_uplift_y_20220206_20220227_train.pickle')
utils.save_pickle(df_train_X, 'data/no_secret_bt/df_sample_label_feature_union_uplift_X_20220206_20220227_train.pickle')

In [None]:
# Preview the training feature matrix.
df_train_X.head()

In [None]:
# Translate each row's numeric coupon value into a string group label;
# 'control' is the control_name later passed to the learner.
df_train_treatment_map = df_train_treatment.map({0.0: 'control', 
                                                 0.2: 'treatment_1', 
                                                 0.5: 'treatment_2', 
                                                 1.0: 'treatment_3', 
                                                 1.5: 'treatment_4', 
                                                 1.8: 'treatment_5'})
# Series.map leaves NaN for every value that is missing from the dict. Fail loudly
# here instead of passing NaN group labels on to the learner.
if df_train_treatment_map.isna().any():
    raise ValueError('coupon values without a treatment label: {}'.format(
        df_train_treatment[df_train_treatment_map.isna()].unique().tolist()))
utils.save_pickle(df_train_treatment_map, 'data/no_secret_bt/df_sample_label_feature_union_uplift_treatment_map_20220206_20220227.pickle')
df_train_treatment_map

In [None]:
# Initialise the propensity scores. For now each treatment group gets a constant score
# equal to its share of (that treatment group + control) rows.
dict_p = {
    group: np.full(df_train.shape[0], share)
    for group, share in [
        ('treatment_1', 0.0671),
        ('treatment_2', 0.2996),
        ('treatment_3', 0.0696),
        ('treatment_4', 0.0533),
        ('treatment_5', 0.0423),
    ]
}
utils.save_pickle(dict_p, 'data/no_secret_bt/dict_p_feature_union_uplift_20220206_20220227.pickle')
dict_p

In [None]:
# Row counts per coupon value in the training sample:
# 0.0    635377
# 0.2     45707
# 0.5    271845
# 1.0     47513
# 1.5     35744
# 1.8     28037
# Share of each treatment group within (treatment group + control), one value per line.
print(*[n_treated/(n_treated+635377) for n_treated in (45707, 271845, 47513, 35744, 28037)], sep='\n')

## model

* train

In [None]:
# Reload the four persisted parts of the training data and report their shapes.
df_train_uid_obsDt = utils.load_pickle('data/no_secret_bt/df_sample_label_feature_union_uplift_uid_obsDate_20220206_20220227_train.pickle')
df_train_treatment = utils.load_pickle('data/no_secret_bt/df_sample_label_feature_union_uplift_treatment_20220206_20220227_train.pickle')
df_train_y = utils.load_pickle('data/no_secret_bt/df_sample_label_feature_union_uplift_y_20220206_20220227_train.pickle')
df_train_X = utils.load_pickle('data/no_secret_bt/df_sample_label_feature_union_uplift_X_20220206_20220227_train.pickle')

print(
    df_train_uid_obsDt.shape,
    df_train_treatment.shape,
    df_train_y.shape,
    df_train_X.shape,
    sep='\n',
)

In [None]:
# Reload the string treatment labels of the training rows and preview them.
df_train_treatment_map = utils.load_pickle('data/no_secret_bt/df_sample_label_feature_union_uplift_treatment_map_20220206_20220227.pickle')
print(df_train_treatment_map.shape)
df_train_treatment_map.head()

In [None]:
# Reload the per-treatment propensity score arrays saved in the feature-process section.
dict_p_train = utils.load_pickle('data/no_secret_bt/dict_p_feature_union_uplift_20220206_20220227.pickle')
dict_p_train

In [None]:
%%time
# X-learner-lgb
learner_x_lgb = BaseXClassifier(control_outcome_learner=lgb.LGBMClassifier(), 
                                treatment_outcome_learner=lgb.LGBMClassifier(), 
                                control_effect_learner=lgb.LGBMRegressor(), 
                                treatment_effect_learner=lgb.LGBMRegressor(), 
                                control_name='control')
pred_x = learner_x_lgb.fit_predict(df_train_X.values, df_train_treatment_map.values, df_train_y.values, p=dict_p_train)
joblib.dump(learner_x_lgb, 'data/model/no_secret_bt_20220206_20220227_union_XLearnerLgb.model')

In [None]:
# Inspect the effects predicted by fit_predict for the training rows.
pred_x

In [None]:
# Training-set AUUC per treatment group, plus a group-size-weighted mean.
df_auuc_x = pd.DataFrame({'model': ['X-learner-lgb']})
# Iterate in explicitly sorted order (as the test-set evaluation does), so the
# auuc_* columns and the weights below are guaranteed to line up.
treatment_groups_sorted = sorted(list(learner_x_lgb.t_groups))
for group in treatment_groups_sorted:
    print(group)
    # plot_all is a project helper; its return value is stored as the group's AUUC.
    auuc = plot_all(cate=pred_x, treatment_groups=treatment_groups_sorted, treatment_test=df_train_treatment_map.values, 
                    y_test=df_train_y.values, cost_test=df_train_treatment.values, title='X-learner-LGB-p multi-treatment-{g} uplift curve'.format(g=group), 
                    select_treatment_group=group)
    df_auuc_x['auuc_'+group] = [auuc]

# Weights are the row counts of each treatment group, taken from the data in the
# same order as the columns. A hard-coded list would silently depend on column order.
treatments_group_num = [int((df_train_treatment_map == group).sum()) for group in treatment_groups_sorted]
df_auuc_x['auuc_mean'] = df_auuc_x[['auuc_'+group for group in treatment_groups_sorted]].\
    agg(func=np.average, axis=1, weights=treatments_group_num)

In [None]:
# Show the training-set AUUC table (per group and weighted mean).
df_auuc_x

* test

In [None]:
# data
# Load the tab-separated test sample into df_test, then show its shape and first rows.
df_test = pd.read_csv('data/no_secret_bt/sample_label_feature_union_uplift_20220206_20220227_test.txt', sep='\t', encoding='utf-8')

print(df_test.shape)
df_test.head()

In [None]:
# Column dtypes, non-null counts and memory footprint of the raw test frame.
df_test.info()

In [None]:
# Number of non-null uid values for every (obs_dt, dt) combination in the test sample.
df_test.groupby(by=['obs_dt', 'dt'])['uid'].count()

In [None]:
# Number of non-null uid values per coupon value in the test sample.
df_test.groupby(by=['coupon'])['uid'].count()

In [None]:
# Distribution of the outcome label in the test sample.
df_test['label'].value_counts()

In [None]:
# Ratio of 16184 to 1346014.
# Presumably the positive-label rate of the test sample, read off the value_counts above - TODO confirm.
16184 / 1346014

In [None]:
# Parse both date columns of the test frame into datetime values, then preview.
for date_col in ('obs_dt', 'dt'):
    df_test[date_col] = pd.to_datetime(df_test[date_col])
df_test.head()

In [None]:
# Build a description table of the test frame with the project helper utils.df_des
# (presumably per-column summary statistics - helper not visible here), write it to CSV and preview it.
df_test_des = utils.df_des(df_test)
df_test_des.to_csv('data/no_secret_bt/df_des_sample_label_feature_union_uplift_20220206_20220227_test.csv', encoding='utf-8')
df_test_des.head()

In [None]:
# Persist the test frame (with parsed dates) so it can be reloaded without re-reading the text file.
utils.save_pickle(df_test, 'data/no_secret_bt/df_sample_label_feature_union_uplift_20220206_20220227_test.pickle')

In [None]:
# feature process
# Reload the pickled test frame and show its shape and first rows.
df_test = utils.load_pickle('data/no_secret_bt/df_sample_label_feature_union_uplift_20220206_20220227_test.pickle')
print(df_test.shape)
df_test.head()

In [None]:
# Check the dtypes and non-null counts of the two date columns after the reload.
df_test[['obs_dt', 'dt']].info()

In [None]:
# Convert dt to its day of the week (Monday=0 ... Sunday=6).
# The integer code is used directly as a label-encoded feature, as for the training data.
# The vectorised .dt accessor replaces a per-row Python lambda; it requires dt to be
# a datetime column, which the pd.to_datetime conversion earlier in this section provides.
df_test['dt_weekday'] = df_test['dt'].dt.weekday
df_test.head()

In [None]:
# Split the test frame into id/date columns, treatment (coupon), outcome (label)
# and the feature matrix with dt_weekday as first column.
df_test_uid_obsDt = df_test[['uid', 'obs_dt', 'dt']]
df_test_treatment = df_test['coupon']
df_test_y = df_test['label']
# Every column that is neither an id/date, the treatment, the label nor dt_weekday
# follows dt_weekday in its original order.
df_test_X = df_test[
    ['dt_weekday']
    + df_test.columns[~df_test.columns.isin(['uid', 'obs_dt', 'dt', 'coupon', 'label', 'dt_weekday'])].tolist()
]

print(
    df_test_uid_obsDt.shape,
    df_test_treatment.shape,
    df_test_y.shape,
    df_test_X.shape,
    sep='\n',
)

In [None]:
# Preview the test feature matrix.
df_test_X.head()

In [None]:
# Persist the test feature matrix; the feature-importance section reloads it from this path.
utils.save_pickle(df_test_X, 'data/no_secret_bt/df_sample_label_feature_union_uplift_20220206_20220227_test_X_transfor.pickle')

In [None]:
# Reload the test feature matrix from the pickle written in the previous cell.
df_test_X = utils.load_pickle('data/no_secret_bt/df_sample_label_feature_union_uplift_20220206_20220227_test_X_transfor.pickle')

In [None]:
# Translate each test row's numeric coupon value into the string group label
# used when the learner was trained.
df_test_treatment_map = df_test_treatment.map({0.0: 'control', 
                                               0.2: 'treatment_1', 
                                               0.5: 'treatment_2', 
                                               1.0: 'treatment_3', 
                                               1.5: 'treatment_4', 
                                               1.8: 'treatment_5'})
# Series.map leaves NaN for every value that is missing from the dict. Fail loudly
# here instead of evaluating against NaN group labels.
if df_test_treatment_map.isna().any():
    raise ValueError('coupon values without a treatment label: {}'.format(
        df_test_treatment[df_test_treatment_map.isna()].unique().tolist()))
df_test_treatment_map

In [None]:
# Initialise the propensity scores for the test rows. For now each treatment group gets
# a constant score equal to its share of (that treatment group + control) rows.
dict_p_test = {
    group: np.full(df_test.shape[0], share)
    for group, share in [
        ('treatment_1', 0.0653),
        ('treatment_2', 0.2908),
        ('treatment_3', 0.0653),
        ('treatment_4', 0.0476),
        ('treatment_5', 0.0367),
    ]
}
dict_p_test

In [None]:
# Row counts per coupon value in the test sample:
# 0.0    821864
# 0.2     57405
# 0.5    337024
# 1.0     57371
# 1.5     41075
# 1.8     31275
# Share of each treatment group within (treatment group + control), one value per line.
print(*[n_treated/(n_treated+821864) for n_treated in (57405, 337024, 57371, 41075, 31275)], sep='\n')

In [None]:
# Reload the fitted X-learner that the training section dumped with joblib.
learner_x_lgb = joblib.load('data/model/no_secret_bt_20220206_20220227_union_XLearnerLgb.model')
learner_x_lgb

In [None]:
# Single-record test: one propensity score per treatment group, and the first test row
# kept as a 2-D array.
dict_p_test_1 = {
    group: np.array([share])
    for group, share in [
        ('treatment_1', 0.0653),
        ('treatment_2', 0.2908),
        ('treatment_3', 0.0653),
        ('treatment_4', 0.0476),
        ('treatment_5', 0.0367),
    ]
}

data_1 = df_test_X.values[:1]

In [None]:
%%time
# Time a prediction for the single test row prepared above.
# The result is overwritten by the full test prediction in the next cell.
pred_x_test = learner_x_lgb.predict(data_1, p=dict_p_test_1)
# pred_x_test

In [None]:
# Predict treatment effects for every test row, using the constant test propensity scores.
pred_x_test = learner_x_lgb.predict(df_test_X.values, p=dict_p_test)
pred_x_test

In [None]:
# Persist the test predictions; the feature-importance section reloads them from this path.
utils.save_pickle(pred_x_test, 'data/no_secret_bt/df_sample_label_feature_union_uplift_pred_20220206_20220227_test.pickle')

In [None]:
# Test-set AUUC per treatment group, plus a group-size-weighted mean.
df_auuc_x = pd.DataFrame({'model': ['X-learner-LGB']})
treatment_groups_sorted = sorted(list(learner_x_lgb.t_groups))
for group in treatment_groups_sorted:
    print(group)
    # plot_all is a project helper; its return value is stored as the group's AUUC.
    auuc = plot_all(cate=pred_x_test, treatment_groups=treatment_groups_sorted, treatment_test=df_test_treatment_map.values, 
                    y_test=df_test_y.values, cost_test=df_test_treatment.values, title='X-learner-LGB multi-treatment-{g} uplift curve'.format(g=group), 
                    select_treatment_group=group)
    df_auuc_x['auuc_'+group] = [auuc]

# Weights are the row counts of each treatment group, taken from the test data in the
# same order as the columns. A hard-coded list would silently depend on column order
# and on this particular sample.
treatments_group_num = [int((df_test_treatment_map == group).sum()) for group in treatment_groups_sorted]
df_auuc_x['auuc_mean'] = df_auuc_x[['auuc_'+group for group in treatment_groups_sorted]].\
    agg(func=np.average, axis=1, weights=treatments_group_num)

In [None]:
# Show the test-set AUUC table (per group and weighted mean).
df_auuc_x

## feature importance

In [None]:
# Inputs for feature importance: the test feature matrix and the test predictions.
# Note that pred_x is rebound here; from this point it holds the test predictions,
# not the training predictions from the model section.
df_X = utils.load_pickle('data/no_secret_bt/df_sample_label_feature_union_uplift_20220206_20220227_test_X_transfor.pickle')
pred_x = utils.load_pickle('data/no_secret_bt/df_sample_label_feature_union_uplift_pred_20220206_20220227_test.pickle')

print(df_X.shape)
print(pred_x.shape)

In [None]:
# Reload the fitted X-learner so this section can run without the model section.
learner_x_lgb = joblib.load('data/model/no_secret_bt_20220206_20220227_union_XLearnerLgb.model')
learner_x_lgb

In [None]:
%%time
dict_feats_imp = learner_x_lgb.get_importance(X=df_X.values, tau=pred_x, method='permutation')
utils.save_pickle(dict_feats_imp, 'data/no_secret_bt/feature_importance_union_uplift_20220206_20220227.pickle')
dict_feats_imp

In [None]:
# Human-readable feature names: 'dt_weekday' first (it is the first column of the feature matrix),
# followed by a pickled name list.
# NOTE(review): presumably the pickled list is in the same order as the remaining feature columns - verify.
list_feats_x_union = ['dt_weekday'] + utils.load_pickle('data/no_secret_bt/list_feats/list_feats_x_pi_feature_union_20220206_20220227.pickle')
print(len(list_feats_x_union))
list_feats_x_union[:10]

In [None]:
# One row per feature: its readable name and a positional key 'Feature_NNN'
# (position zero-padded to three digits) used to join the importances.
df_fi = pd.DataFrame({
    'feature': list_feats_x_union,
    'feature_index': ['Feature_{i}'.format(i=str(pos).zfill(3)) for pos in range(len(list_feats_x_union))],
})

# Attach one importance column per treatment group, in sorted group order.
for x in sorted(dict_feats_imp):
    df_fi_tg = (
        pd.DataFrame(dict_feats_imp[x], columns=['permutation_importance_{}'.format(x)])
        .reset_index()
        .sort_values(by=['index'])
        .rename(columns={'index': 'feature_index'})
    )
    df_fi = df_fi.merge(df_fi_tg, on='feature_index', how='left')

# Group-size-weighted mean across the per-group columns.
# The weights are listed in the same sorted group order as the columns.
treatments_group_num = [57405, 337024, 57371, 41075, 31275]
df_fi['permutation_importance_mean'] = (
    df_fi[[col for col in df_fi.columns if col not in ['feature', 'feature_index']]]
    .agg(func=np.average, axis=1, weights=treatments_group_num)
)
df_fi = df_fi.sort_values(by=['permutation_importance_mean'], ascending=[False])
df_fi

In [None]:
# Deciles (0%, 10%, ..., 100%) of the weighted mean importance, shown as a single row.
df_fi[['permutation_importance_mean']].quantile([x/10 for x in range(11)]).T

In [None]:
# Export the sorted feature-importance table without the index column.
df_fi.to_csv('data/no_secret_bt/feature_importance_union_uplift_20220206_20220227.csv', encoding='utf-8', index=False)

In [None]:
# Font setup: use 'simhei' for sans-serif and serif text, and disable the Unicode minus sign.
# Presumably this lets CJK feature names and negative ticks render - confirm that the font is installed.
plt.rcParams['font.sans-serif'] = ['simhei']
plt.rcParams['font.serif'] = ['simhei']
plt.rcParams['axes.unicode_minus'] = False
# NOTE(review): sns.set() may reset rcParams set above; the font list is set again
# via set_style on the next line. Keep this statement order.
sns.set(font_scale=1.5)
sns.set_style('darkgrid', {'font.sans-serif':['simhei', 'Droid Sans Fallback']})

# Horizontal bar chart of the 25 features with the highest weighted mean importance
# (df_fi is already sorted in descending order).
plt.rcParams['figure.figsize'] = (12.0, 8.0)
fig, axes = plt.subplots(1, 1)
sns.barplot(x='permutation_importance_mean', y='feature', data=df_fi.head(25), ax=axes)
axes.set_title('Top-25 importance features')

plt.show()

In [None]:
# Tabular view of the same top-25 features shown in the chart above.
df_fi[['feature', 'permutation_importance_mean']].head(25)