In [2]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import econml # 0.14.1
from econml.dml import DML, LinearDML, SparseLinearDML, CausalForestDML
import numpy as np
from itertools import product
from sklearn.linear_model import (Lasso, LassoCV, LogisticRegression,
                                  LogisticRegressionCV,LinearRegression,
                                  MultiTaskElasticNet,MultiTaskElasticNetCV)
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
import matplotlib
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, clone
import lightgbm as lgb
import seaborn as sns
import glob
import os

In [None]:
%matplotlib inline

In [None]:
# The M_y(x) outcome model is a classifier, so it has to be wrapped to look like a regressor.
class RegressionWrapper(BaseEstimator):
    """Adapter that exposes a classifier through a regressor-style ``predict``.

    ``fit`` trains a fresh clone of the wrapped classifier; ``predict`` returns
    column 1 of the fitted classifier's ``predict_proba`` output instead of
    hard class labels.
    """

    def __init__(self, clf):
        # Unfitted classifier template; it is cloned on every ``fit`` call.
        self.clf = clf

    def fit(self, X, y, **kwargs):
        """Clone ``self.clf``, fit the clone on ``(X, y)`` and return ``self``.

        Extra keyword arguments are forwarded unchanged to the classifier's
        ``fit`` method.
        """
        self.clf_ = clone(self.clf)
        self.clf_.fit(X, y, **kwargs)
        return self

    def predict(self, X):
        """Return column 1 of ``predict_proba(X)`` from the fitted clone."""
        class_probabilities = self.clf_.predict_proba(X)
        return class_probabilities[:, 1]

### 1.加载数据和预处理

In [None]:
# Columns to keep, one name per line in cols_select.txt.
cols = pd.read_csv('cols_select.txt', header=None)
cols = list(cols[0])

# Training snapshots, listed in the order they are concatenated.
train_dataset_dates = [
    '2023-07-10',
    '2023-07-03',
    '2023-06-26',
    '2023-06-19',
    '2023-06-12',
    '2023-06-05',
]

# Read every snapshot and stack them row-wise. The index is intentionally not
# reset here; the filtering step further down calls reset_index(drop=True).
df = pd.concat(
    [pd.read_csv('train_dataset_{}.txt'.format(snapshot_date), sep='\t')
     for snapshot_date in train_dataset_dates],
    axis=0,
)

# Keep only the selected columns and replace missing values with 0.
df = df[cols]
df = df.fillna(0)

In [None]:
df

In [None]:
df['t1.label_purchase'].value_counts()

In [None]:
df['t1.product'].value_counts()

In [None]:
df['tmp'] = df['t1.product'] + df['t1.label_purchase'].astype(str)

In [None]:
df['tmp']

In [None]:
df['tmp'].value_counts()
df.pop('tmp')

In [None]:
# Product name -> integer treatment code.
# NOTE(review): the keys look redacted to 'xxx'. As written, the duplicate keys
# collapse to the single entry {'xxx': 4}; restore the real product names
# before running.
label_encode_dict = {
    'xxx':5,
    'xxx':2,
    'xxx':0,
    'xxx':1, 
    'xxx':3, 
    'xxx':4 }

# Keep only rows whose product has a code, renumber the index, then replace
# each product name by its integer code.
known_products = list(label_encode_dict.keys())
df = df[df['t1.product'].isin(known_products)].reset_index(drop=True)
df['t1.product'] = df['t1.product'].map(lambda product: label_encode_dict[product])

In [None]:
T, Y = df['t1.product'], df['t1.label_purchase']

In [None]:
X = df.iloc[:, :-2]

In [None]:
X

### 2.模型构建和训练

In [None]:

est = CausalForestDML(
    model_y= RegressionWrapper(lgb.LGBMClassifier(random_state=2023, num_leaves=63, learning_rate=0.05, n_estimators=200, objective='binary')),
    model_t = lgb.LGBMClassifier(random_state=2023, num_leaves=63, learning_rate=0.05, n_estimators=200, objective='multiclass'),
    discrete_treatment=True,
    max_depth=10,
    min_samples_split=10,
    cv=10,
    #,criterion='het'
    verbose=10,
    random_state=2023,
    n_estimators=100,
    subforest_size=2,
)

In [None]:
est.fit(Y.values, T=T.values,  X=X.values, W=None, cache_values=True)

In [None]:
est.summary()

### 3.模型效果评估

In [None]:
import glob
import os
import pandas as pd

cols = pd.read_csv('cols_select_test.txt', header=None)
cols = list(cols[0])

joined_files = os.path.join("./", "test_dataset_*.txt")
joined_list = glob.glob(joined_files)
print(joined_list)

df_test = pd.concat([pd.read_csv(i, sep='\t') for i in joined_list], ignore_index=True)

df_test = df_test[cols]
df_test = df_test.fillna(0)
df_test = df_test.rename(columns={'t1.label_invert': 't1.label_purchase'})

In [None]:
df_test

In [None]:
df_test = df_test[df_test['t1.product'].isin(list(label_encode_dict.keys()))].reset_index(drop=True)
df_test['t1.product'] = df_test['t1.product'].apply(lambda x: label_encode_dict[x])

T_test, Y_test = df_test['t1.product'], df_test['t1.label_purchase']
X_test = df_test.iloc[:, :-2]

In [None]:
res = est.const_marginal_effect(X_test)

In [None]:
res.shape

In [None]:
res[:, -1]

In [None]:
for p in range(len(label_encode_dict)-1):
    cnt = 0
    for i in res[:, p] > 0:
        if i: cnt += 1
    print(p, cnt)

In [None]:
df_test['tmp'] = df_test['t1.product'].astype(str) + df_test['t1.label_purchase'].astype(str)

In [None]:
df_test['tmp'].value_counts()

In [None]:
def pmg(df_test, predict_result):
    """Relative gain of the "assign the arm with the largest estimated effect" policy.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Must contain 't1.product' (integer treatment code actually received)
        and 't1.label_purchase' (observed outcome). Rows are matched to
        ``predict_result`` by position, not by index label.
    predict_result : array-like of shape (n_rows, n_arms - 1)
        Estimated effect of treatment codes 1..n_arms-1; a zero column is
        prepended for code 0 so that column ``j`` corresponds to code ``j``.

    Returns
    -------
    float
        ``(avg_gain - base) / base`` where ``base`` is the overall mean label
        and ``avg_gain`` is the count-weighted mean, over predicted arms, of
        the mean label among rows whose received treatment equals the
        predicted one. Predicted arms without any such matching row are left
        out of both the numerator and the weight total. The result is
        nan/inf when ``base`` is 0.

    Raises
    ------
    ValueError
        If ``predict_result`` and ``df_test`` have different numbers of rows.
    """
    # Positional alignment: drop whatever index df_test carries.
    labels = (
        df_test[['t1.product', 't1.label_purchase']]
        .rename(columns={'t1.product': 'treatment', 't1.label_purchase': 'label'})
        .reset_index(drop=True)
    )

    effects = np.asarray(predict_result)
    if effects.shape[0] != len(labels):
        raise ValueError(
            "predict_result has {} rows but df_test has {}".format(effects.shape[0], len(labels)))

    # Prepend the zero-effect column for treatment code 0, then pick the best
    # arm per row. argmax looks at ALL columns (the previous implementation
    # sliced off the last one, so the last arm could never be chosen) and
    # returns the first maximum on ties.
    effects_with_base = np.insert(effects, 0, np.zeros(effects.shape[0]), axis=1)
    df_summary = labels.assign(predict_treatment=effects_with_base.argmax(axis=1))
    df_summary['if_same'] = (df_summary['predict_treatment'] == df_summary['treatment']).astype(int)

    # Mean outcome among rows that actually received the predicted arm.
    df_overlap_summary = (
        df_summary[df_summary['if_same'] == 1]
        .groupby('predict_treatment')['label']
        .mean()
        .rename('mean')
        .reset_index()
    )
    # Number of rows the policy assigns to each arm.
    df_stg_summary = (
        df_summary.groupby('predict_treatment')['treatment']
        .count()
        .rename('num_stg_treatment')
        .reset_index()
    )

    df_final_summary = df_overlap_summary.merge(df_stg_summary, how='inner', on='predict_treatment')
    total_gain = (df_final_summary['mean'] * df_final_summary['num_stg_treatment']).sum()
    total_count = df_final_summary['num_stg_treatment'].sum()
    avg_gain = total_gain / total_count
    base = df_summary['label'].mean()
    gain = (avg_gain - base) / base

    return gain

In [None]:
pmg(df_test, res)

In [None]:
# import joblib

# modelpath = 'dml_v1_0.pkl'
# joblib.dump(est, filename=modelpath)

import cloudpickle
modelpath = 'dml_v1_0_.pkl'
with open(modelpath, 'wb') as f: 
    cloudpickle.dump(est, f)

In [None]:
# m = joblib.load(modelpath)
# print(m)

### 4.特征重要度及可视化

#### 4.1 feature_importance_

In [None]:
feature_importance_dict = {df.columns[i]: est.feature_importances_[i] for i in range(df.shape[1] - 2)}

In [None]:
feature_importance = pd.DataFrame({'feature_name': df.columns[:-2], 'importance': est.feature_importances_})

In [None]:
feature_importance = feature_importance.sort_values(by="importance", ascending=False)

In [None]:
feature_importance

In [None]:
feature_importance[feature_importance['importance'] > 0]

In [None]:
import seaborn as sns

plt.figure(figsize=(5, 20))
# data=feature_importance[feature_importance['importance'] > 0]
data = feature_importance.reset_index(drop=True).loc[:50, :]
sns.barplot(x="importance", y="feature_name", data=data, order=data["feature_name"], orient="h")

In [None]:
feature_importance.reset_index(drop=True).loc[:400, :][['feature_name']].to_csv('cols_select_importance.txt', header=None, index=False, sep='\t')

#### 4.2 shap_values

In [None]:
# SHAP values of the fitted estimator, computed on the training feature frame X.
shap_values = est.shap_values(X)

In [None]:
shap_values

### 线上打分pmg计算

In [None]:
import glob
import os
import pandas as pd

# Columns read from the online score files: the user id plus one score per product.
score_cols = [
    'insurance_driver_end_trip_popup_window_score.uid',
    'insurance_driver_end_trip_popup_window_score.score_0',
    'insurance_driver_end_trip_popup_window_score.score_1',
    'insurance_driver_end_trip_popup_window_score.score_2',
    'insurance_driver_end_trip_popup_window_score.score_5',
    'insurance_driver_end_trip_popup_window_score.score_6',
    'insurance_driver_end_trip_popup_window_score.score_7',
]
# Columns read from the test files: user id, received product and outcome.
label_cols = ['t1.uid', 't1.product', 't1.label_invert']

# glob returns paths in arbitrary order. Score file i is merged with test file i
# below, so both lists are sorted to make that positional pairing deterministic.
# NOTE(review): this assumes the sorted lists line up one-to-one (same suffix in
# both file names) -- verify against the files on disk.
score_files = os.path.join("./", "online_score_*.txt")
score_list = sorted(glob.glob(score_files))
print(score_list)

test_files = os.path.join("./", "test_dataset_*.txt")
test_list = sorted(glob.glob(test_files))
print(test_list)

# Inner-join each score file with its test file on the user id, then stack the results.
merged_parts = []
for file_idx, score_path in enumerate(score_list):
    scores = pd.read_csv(score_path, sep='\t')[score_cols]
    labels = pd.read_csv(test_list[file_idx], sep='\t')[label_cols]
    merged_parts.append(
        scores.merge(labels, how='inner', left_on=score_cols[0], right_on=label_cols[0]))
df_tmp = pd.concat(merged_parts, ignore_index=True)

# df_test = pd.concat([pd.read_csv(i, sep='\t') for i in joined_list], ignore_index=True)

In [None]:
df_tmp

In [None]:
product_name = ['xxx', 'xxx', 'xxx', 'xxx', 'xxx', 'xxx']
df_tmp = df_tmp.rename(columns={score_cols[i+1]: product_name[i] for i in range(len(product_name))})
df_tmp['predict_treatment'] = df_tmp[product_name].apply(
        lambda row: sorted(dict(row).items(), key=lambda x: x[1], reverse=True)[0][0], axis=1)

In [None]:
df_tmp = df_tmp[df_tmp['t1.product'].isin(product_name)].reset_index(drop=True)
df_tmp

In [None]:
# pmg
def pmg(df_tmp):
    """Relative gain of an already-computed treatment assignment.

    Note: this definition replaces the earlier two-argument ``pmg`` in the
    notebook namespace.

    Parameters
    ----------
    df_tmp : pandas.DataFrame
        Must contain 't1.product' (treatment actually received),
        't1.label_invert' (observed outcome) and 'predict_treatment'
        (treatment chosen by the scores).

    Returns
    -------
    float
        ``(avg_gain - base) / base`` where ``base`` is the overall mean label
        and ``avg_gain`` is the count-weighted mean, over predicted treatments,
        of the mean label among rows whose received treatment equals the
        predicted one.
    """
    summary = df_tmp[['t1.product', 't1.label_invert', 'predict_treatment']].rename(
        columns={'t1.product': 'treatment', 't1.label_invert': 'label'})
    summary['if_same'] = (summary['predict_treatment'] == summary['treatment']).astype(int)

    # Mean label per predicted treatment, restricted to rows that received it.
    matched_rows = summary[summary['if_same'] == 1]
    overlap_mean = matched_rows.groupby('predict_treatment').agg({'label': 'mean'})
    overlap_mean = overlap_mean.rename(columns={'label': 'mean'}).reset_index()

    # Number of rows assigned to each predicted treatment.
    assigned = summary.groupby('predict_treatment').count().reset_index()
    assigned = assigned.rename(columns={'treatment': 'num_stg_treatment'})
    assigned = assigned[['predict_treatment', 'num_stg_treatment']]

    combined = overlap_mean.merge(assigned, how='inner', on='predict_treatment')
    total_gain = sum(combined['mean'] * combined['num_stg_treatment'])
    total_count = combined.num_stg_treatment.sum()
    avg_gain = total_gain / total_count
    base = summary.label.mean()

    return (avg_gain - base) / base

In [None]:
# Metric for the online scores, using the single-argument pmg defined in the previous cell.
pmg(df_tmp)

### 附录：
#### 1. PMG


![pmg.png](pmg.png)

#### 2.特征筛选

In [None]:
from people_uplift_tree import PeopleUpliftTree

In [None]:
X

In [None]:
top50_cols = pd.read_csv('cols_select_importance.txt', header=None)
top50_cols = list(top50_cols[0])[:50]

In [None]:
top50_cols

In [None]:
from people_uplift_tree import PeopleUpliftTree
from IPython.display import Image

put = PeopleUpliftTree(max_depth=3, min_samples_leaf=100)

In [None]:
T1 = T.apply(lambda x: x if x == 0 else 1)

In [None]:
T1.shape

In [None]:
for i in top50_cols:
    tree = put.fit(x=df[i], treatment=T1, y=Y)
    graph = put.plot_tree(tree, x_name=i, score_name="convert_prob")
    
    display(Image(graph.create_png()))

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import econml
from econml.dml import DML, LinearDML, SparseLinearDML, CausalForestDML
import numpy as np
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
import matplotlib
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, clone
import lightgbm as lgb
import seaborn as sns
import glob
import os
import joblib
import cloudpickle
from reg_wrapper import RegressionWrapper
import argparse

In [None]:
TEST_FILE_DATE = ['2023-10-02', '2023-10-09']

In [None]:
print('-------------------------\n testing... \n-------------------------\n')
cols = pd.read_csv('./cols_select_test_50.txt', header=None)
cols = list(cols[0])

test_file_list = ['test_dataset_{}.txt'.format(i) for i in TEST_FILE_DATE]
df_test = pd.concat([pd.read_csv(i, sep='\t') for i in test_file_list], ignore_index=True)

df_test = df_test[cols]
df_test = df_test.fillna(0)
df_test = df_test.rename(columns={'t1.label_invert': 't1.label_purchase'})
df_test = df_test[df_test['t1.product'].isin(list(label_encode_dict.keys()))].reset_index(drop=True)
df_test['t1.product'] = df_test['t1.product'].apply(lambda x: label_encode_dict[x])
print(df_test['t1.product'].value_counts())