In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklift.metrics import uplift_at_k
from sklift.viz import plot_uplift_preds
from sklift.models import SoloModel
from catboost import CatBoostClassifier

In [2]:
# Чтение данных
df_clients = pd.read_csv('./Data/clients.csv', index_col='client_id')
df_train = pd.read_csv('./Data/uplift_train.csv', index_col='client_id')
df_test = pd.read_csv('./Data/uplift_test.csv', index_col='client_id')

# Извлечение признаков
df_features = df_clients.copy()
df_features['first_issue_time'] = \
    (pd.to_datetime(df_features['first_issue_date'])
     - pd.Timestamp('1970-01-01')) // pd.Timedelta('1s')
df_features['first_redeem_time'] = \
    (pd.to_datetime(df_features['first_redeem_date'])
     - pd.Timestamp('1970-01-01')) // pd.Timedelta('1s')
df_features['issue_redeem_delay'] = df_features['first_redeem_time'] \
    - df_features['first_issue_time']
df_features = df_features.drop(['first_issue_date', 'first_redeem_date'], axis=1)

indices_train = df_train.index
indices_test = df_test.index

In [10]:
indices_train = df_features.loc[indices_train, :].dropna().index
indices_test = df_features.loc[indices_test, :].dropna().index
indices_learn, indices_valid = train_test_split(indices_train, 
                                                test_size=0.4, 
                                                random_state=12)

In [11]:
df_features.head()

Unnamed: 0_level_0,age,gender,first_issue_time,first_redeem_time,issue_redeem_delay
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
000012768d,45,U,1501947648,1515094000.0,13146559.0
000036f903,72,F,1491832463,1492951000.0,1118613.0
000048b7a6,68,F,1544880791,,
000073194a,60,F,1495544174,1511522000.0,15978107.0
00007c7133,67,U,1495469828,1546277000.0,50806825.0


In [13]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [21]:
df_for_scale = df_features.dropna().drop(columns='gender', axis=1)
scaler = StandardScaler()
scaled_df = scaler.fit_transform(df_for_scale)
pca_ = PCA(n_components=2, svd_solver='full')
X_pca = pca_.fit_transform(scaled_df)
df_for_scale['pca1'] = X_pca[:, 0]
df_for_scale['pca2'] = X_pca[:, 1]
df_for_scale['gender'] = df_features.dropna()['gender']

In [36]:
X_train = df_for_scale.loc[indices_learn, :]
y_train = df_train.loc[indices_learn, 'target']
treat_train = df_train.loc[indices_learn, 'treatment_flg']

X_val = df_for_scale.loc[indices_valid, :]
y_val = df_train.loc[indices_valid, 'target']
treat_val =  df_train.loc[indices_valid, 'treatment_flg']

X_train_full = df_for_scale.loc[indices_train, :]
y_train_full = df_train.loc[:, 'target']
treat_train_full = df_train.loc[:, 'treatment_flg']

X_test = df_for_scale.loc[indices_test, :]

cat_features = ['gender']

models_results = {
    'approach': [],
    'uplift@30%': [],
    'f1_treated': [],
    'f1_control': [],
    'ROC AUC treated': [],
    'ROC AUC control': [],
    'log loss treated': [],
    'log loss control': []
}

In [26]:
from sklearn.metrics import f1_score, roc_auc_score, log_loss

In [58]:
sm = SoloModel(CatBoostClassifier(iterations=20, thread_count=2, random_state=42, silent=True))
sm = sm.fit(X_train, y_train, treat_train, estimator_fit_params={'cat_features': cat_features})

uplift_sm = sm.predict(X_val)

In [41]:
sm_score = uplift_at_k(y_true=y_val, uplift=uplift_sm, treatment=treat_val, strategy='by_group', k=0.3)

models_results['approach'].append('SoloModel')
models_results['uplift@30%'].append(sm_score)

In [30]:
import numpy as np

In [39]:
sm_trmnt_preds = sm.trmnt_preds_
sm_ctrl_preds = sm.ctrl_preds_
f1_t = f1_score(y_val, np.round(sm_trmnt_preds))
f1_c = f1_score(y_val, np.round(sm_ctrl_preds))
roc_auc_t = roc_auc_score(y_val, np.round(sm_trmnt_preds))
roc_auc_c = roc_auc_score(y_val, np.round(sm_ctrl_preds))
log_loss_t = log_loss(y_val, np.round(sm_trmnt_preds))
log_loss_c = log_loss(y_val, np.round(sm_ctrl_preds))
models_results['f1_treated'].append(f1_t)
models_results['f1_control'].append(f1_c)
models_results['ROC AUC treated'].append(roc_auc_t)
models_results['ROC AUC control'].append(roc_auc_c)
models_results['log loss treated'].append(log_loss_t)
models_results['log loss control'].append(log_loss_c)

In [59]:
sm_fi = pd.DataFrame({
    'feature_name': sm.estimator.feature_names_,
    'feature_score': sm.estimator.feature_importances_
}).sort_values('feature_score', ascending=False).reset_index(drop=True)

sm_fi

Unnamed: 0,feature_name,feature_score
0,first_redeem_time,64.759345
1,age,9.534838
2,first_issue_time,6.63371
3,treatment,5.197429
4,issue_redeem_delay,4.48841
5,pca1,4.077929
6,gender,3.147295
7,pca2,2.161044


In [43]:
param_grid = {'depth': [None, 5, 7, 12], 'learning_rate': [None, 0.1, 0.001, 0.0001], 'n_estimators': [50, 100, 200, 500]}

In [48]:
X_conf = pd.concat([X_train, treat_train], axis=1)
X_conf

Unnamed: 0_level_0,age,first_issue_time,first_redeem_time,issue_redeem_delay,pca1,pca2,gender,treatment_flg
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
b7d54d5442,63,1538214326,1.570192e+09,31977588.0,2.750131,-0.614157,U,1
ba6620d19e,58,1501015506,1.511463e+09,10447294.0,-1.315366,0.022321,F,0
73e5719380,39,1519815670,1.530048e+09,10231840.0,0.093816,0.511220,U,1
4fc8a46c30,38,1494059330,1.495111e+09,1051274.0,-2.385040,0.550559,F,0
69787fc4a3,11,1538651504,1.562580e+09,23928667.0,2.327205,0.055342,U,0
...,...,...,...,...,...,...,...,...
565d48034b,40,1533672931,1.542896e+09,9222877.0,1.077382,0.900997,U,0
b4fe37c41c,59,1516304369,1.540301e+09,23996664.0,0.632450,-0.562648,M,1
32e3b84c5e,32,1549311478,1.569060e+09,19749006.0,2.876726,0.551137,F,0
3815fe0a9c,67,1505563540,1.514030e+09,8466451.0,-1.092554,0.247453,U,0


In [54]:
model = CatBoostClassifier(thread_count=-1, cat_features=cat_features, random_state=42, silent=True)
result = model.grid_search(param_grid, X_conf, y_train)
result['params']


bestTest = 0.6331775883
bestIteration = 49

0:	loss: 0.6331776	best: 0.6331776 (0)	total: 1.22s	remaining: 1m 16s

bestTest = 0.6881028202
bestIteration = 49

1:	loss: 0.6881028	best: 0.6331776 (0)	total: 2.38s	remaining: 1m 13s

bestTest = 0.6926261859
bestIteration = 49

2:	loss: 0.6926262	best: 0.6331776 (0)	total: 3.5s	remaining: 1m 11s

bestTest = 0.6327665252
bestIteration = 86

3:	loss: 0.6327665	best: 0.6327665 (3)	total: 5.77s	remaining: 1m 26s

bestTest = 0.6835694011
bestIteration = 99

4:	loss: 0.6835694	best: 0.6327665 (3)	total: 8.03s	remaining: 1m 34s

bestTest = 0.6921137618
bestIteration = 99

5:	loss: 0.6921138	best: 0.6327665 (3)	total: 10.1s	remaining: 1m 37s

bestTest = 0.6323909642
bestIteration = 169

6:	loss: 0.6323910	best: 0.6323910 (6)	total: 20.5s	remaining: 2m 47s

bestTest = 0.6756674688
bestIteration = 199

7:	loss: 0.6756675	best: 0.6323910 (6)	total: 31.6s	remaining: 3m 41s

bestTest = 0.691087354
bestIteration = 199

8:	loss: 0.6910874	best: 0.6323910

{'depth': 5, 'iterations': 200, 'learning_rate': 0.1}

In [55]:
sm = SoloModel(CatBoostClassifier(**result['params'], random_state=42, silent=True))
sm = sm.fit(X_train, y_train, treat_train, estimator_fit_params={'cat_features': cat_features})

uplift_sm = sm.predict(X_val)
sm_score = uplift_at_k(y_true=y_val, uplift=uplift_sm, treatment=treat_val, strategy='by_group', k=0.3)

models_results['approach'].append('SoloModel(param selected)')
models_results['uplift@30%'].append(sm_score)
sm_trmnt_preds = sm.trmnt_preds_
sm_ctrl_preds = sm.ctrl_preds_
f1_t = f1_score(y_val, np.round(sm_trmnt_preds))
f1_c = f1_score(y_val, np.round(sm_ctrl_preds))
roc_auc_t = roc_auc_score(y_val, np.round(sm_trmnt_preds))
roc_auc_c = roc_auc_score(y_val, np.round(sm_ctrl_preds))
log_loss_t = log_loss(y_val, np.round(sm_trmnt_preds))
log_loss_c = log_loss(y_val, np.round(sm_ctrl_preds))
models_results['f1_treated'].append(f1_t)
models_results['f1_control'].append(f1_c)
models_results['ROC AUC treated'].append(roc_auc_t)
models_results['ROC AUC control'].append(roc_auc_c)
models_results['log loss treated'].append(log_loss_t)
models_results['log loss control'].append(log_loss_c)

In [57]:
sm_fi = pd.DataFrame({
    'feature_name': sm.estimator.feature_names_,
    'feature_score': sm.estimator.feature_importances_
}).sort_values('feature_score', ascending=False).reset_index(drop=True)

sm_fi

Unnamed: 0,feature_name,feature_score
0,first_redeem_time,77.507595
1,age,5.830796
2,first_issue_time,3.866163
3,pca1,3.543704
4,issue_redeem_delay,3.387721
5,pca2,2.168576
6,treatment,1.903456
7,gender,1.79199


In [60]:
from sklift.models import ClassTransformation


ct = ClassTransformation(CatBoostClassifier(iterations=20, thread_count=2, random_state=42, silent=True))
ct = ct.fit(X_train, y_train, treat_train, estimator_fit_params={'cat_features': cat_features})

uplift_ct = ct.predict(X_val)

ct_score = uplift_at_k(y_true=y_val, uplift=uplift_ct, treatment=treat_val, strategy='by_group', k=0.3)

  ct = ct.fit(X_train, y_train, treat_train, estimator_fit_params={'cat_features': cat_features})


In [62]:
models_results['approach'].append('ClassTransformation')
models_results['uplift@30%'].append(ct_score)
models_results['f1_treated'].append(None)
models_results['f1_control'].append(None)
models_results['ROC AUC treated'].append(None)
models_results['ROC AUC control'].append(None)
models_results['log loss treated'].append(None)
models_results['log loss control'].append(None)

In [64]:
ct_fi = pd.DataFrame({
    'feature_name': ct.estimator.feature_names_,
    'feature_score': ct.estimator.feature_importances_
}).sort_values('feature_score', ascending=False).reset_index(drop=True)

ct_fi

Unnamed: 0,feature_name,feature_score
0,first_redeem_time,74.797974
1,age,7.186103
2,issue_redeem_delay,4.6946
3,pca1,3.800551
4,first_issue_time,3.740684
5,pca2,3.122471
6,gender,2.657618


В этой модели я не понял как посчитать остальные метрики.

In [77]:
from sklift.models import TwoModels


tm = TwoModels(
    estimator_trmnt=CatBoostClassifier(iterations=20, thread_count=-1, random_state=42, silent=True), 
    estimator_ctrl=CatBoostClassifier(iterations=20, thread_count=-1, random_state=42, silent=True), 
    method='vanilla'
)
tm = tm.fit(
    X_train, y_train, treat_train,
    estimator_trmnt_fit_params={'cat_features': cat_features}, 
    estimator_ctrl_fit_params={'cat_features': cat_features}
)

uplift_tm = tm.predict(X_val)

tm_score = uplift_at_k(y_true=y_val, uplift=uplift_tm, treatment=treat_val, strategy='by_group', k=0.3)

models_results['approach'].append('TwoModels')
models_results['uplift@30%'].append(tm_score)
tm_trmnt_preds = tm.trmnt_preds_
tm_ctrl_preds = tm.ctrl_preds_
f1_t = f1_score(y_val, np.round(tm_trmnt_preds))
f1_c = f1_score(y_val, np.round(tm_ctrl_preds))
roc_auc_t = roc_auc_score(y_val, np.round(tm_trmnt_preds))
roc_auc_c = roc_auc_score(y_val, np.round(tm_ctrl_preds))
log_loss_t = log_loss(y_val, np.round(tm_trmnt_preds))
log_loss_c = log_loss(y_val, np.round(tm_ctrl_preds))
models_results['f1_treated'].append(f1_t)
models_results['f1_control'].append(f1_c)
models_results['ROC AUC treated'].append(roc_auc_t)
models_results['ROC AUC control'].append(roc_auc_c)
models_results['log loss treated'].append(log_loss_t)
models_results['log loss control'].append(log_loss_c)

In [65]:
tm_trmnt_fi = pd.DataFrame({
    'feature_name': tm.estimator_trmnt.feature_names_,
    'feature_score': tm.estimator_trmnt.feature_importances_
}).sort_values('feature_score', ascending=False).reset_index(drop=True)

tm_trmnt_fi

Unnamed: 0,feature_name,feature_score
0,first_redeem_time,67.060957
1,age,10.527632
2,first_issue_time,7.253696
3,issue_redeem_delay,5.838499
4,gender,4.707691
5,pca1,2.742404
6,pca2,1.869123


In [66]:
tm_ctrl_fi = pd.DataFrame({
    'feature_name': tm.estimator_ctrl.feature_names_,
    'feature_score': tm.estimator_ctrl.feature_importances_
}).sort_values('feature_score', ascending=False).reset_index(drop=True)

tm_ctrl_fi

Unnamed: 0,feature_name,feature_score
0,first_redeem_time,33.086619
1,age,18.439912
2,issue_redeem_delay,14.001784
3,first_issue_time,11.694744
4,pca1,10.385095
5,pca2,6.416918
6,gender,5.974927


In [78]:
tm_ctrl = TwoModels(
    estimator_trmnt=CatBoostClassifier(iterations=20, thread_count=2, random_state=42, silent=True), 
    estimator_ctrl=CatBoostClassifier(iterations=20, thread_count=2, random_state=42, silent=True), 
    method='ddr_control'
)
tm_ctrl = tm_ctrl.fit(
    X_train, y_train, treat_train,
    estimator_trmnt_fit_params={'cat_features': cat_features}, 
    estimator_ctrl_fit_params={'cat_features': cat_features}
)

uplift_tm_ctrl = tm_ctrl.predict(X_val)

tm_ctrl_score = uplift_at_k(y_true=y_val, uplift=uplift_tm_ctrl, treatment=treat_val, strategy='by_group', k=0.3)

models_results['approach'].append('TwoModels_ddr_control')
models_results['uplift@30%'].append(tm_ctrl_score)
tm_ctrl_trmnt_preds = tm_ctrl.trmnt_preds_
tm_ctrl_ctrl_preds = tm_ctrl.ctrl_preds_
f1_t = f1_score(y_val, np.round(tm_ctrl_trmnt_preds))
f1_c = f1_score(y_val, np.round(tm_ctrl_ctrl_preds))
roc_auc_t = roc_auc_score(y_val, np.round(tm_ctrl_trmnt_preds))
roc_auc_c = roc_auc_score(y_val, np.round(tm_ctrl_ctrl_preds))
log_loss_t = log_loss(y_val, np.round(tm_ctrl_trmnt_preds))
log_loss_c = log_loss(y_val, np.round(tm_ctrl_ctrl_preds))
models_results['f1_treated'].append(f1_t)
models_results['f1_control'].append(f1_c)
models_results['ROC AUC treated'].append(roc_auc_t)
models_results['ROC AUC control'].append(roc_auc_c)
models_results['log loss treated'].append(log_loss_t)
models_results['log loss control'].append(log_loss_c)

In [79]:
tm_ctrl_trmnt_fi = pd.DataFrame({
    'feature_name': tm_ctrl.estimator_trmnt.feature_names_,
    'feature_score': tm_ctrl.estimator_trmnt.feature_importances_
}).sort_values('feature_score', ascending=False).reset_index(drop=True)

tm_ctrl_trmnt_fi

Unnamed: 0,feature_name,feature_score
0,ddr_control,38.576581
1,first_redeem_time,23.404433
2,age,8.661961
3,issue_redeem_delay,7.598724
4,gender,7.070415
5,first_issue_time,6.633057
6,pca1,5.386503
7,pca2,2.668326


In [80]:
tm_ctrl_ctrl_fi = pd.DataFrame({
    'feature_name': tm_ctrl.estimator_ctrl.feature_names_,
    'feature_score': tm_ctrl.estimator_ctrl.feature_importances_
}).sort_values('feature_score', ascending=False).reset_index(drop=True)

tm_ctrl_ctrl_fi

Unnamed: 0,feature_name,feature_score
0,first_redeem_time,33.086619
1,age,18.439912
2,issue_redeem_delay,14.001784
3,first_issue_time,11.694744
4,pca1,10.385095
5,pca2,6.416918
6,gender,5.974927


In [81]:
tm_trmnt = TwoModels(
    estimator_trmnt=CatBoostClassifier(iterations=20, thread_count=2, random_state=42, silent=True), 
    estimator_ctrl=CatBoostClassifier(iterations=20, thread_count=2, random_state=42, silent=True), 
    method='ddr_treatment'
)
tm_trmnt = tm_trmnt.fit(
    X_train, y_train, treat_train,
    estimator_trmnt_fit_params={'cat_features': cat_features}, 
    estimator_ctrl_fit_params={'cat_features': cat_features}
)

uplift_tm_trmnt = tm_trmnt.predict(X_val)

tm_trmnt_score = uplift_at_k(y_true=y_val, uplift=uplift_tm_trmnt, treatment=treat_val, strategy='by_group', k=0.3)

models_results['approach'].append('TwoModels_ddr_treatment')
models_results['uplift@30%'].append(tm_trmnt_score)
tm_trmnt_trmnt_preds = tm_trmnt.trmnt_preds_
tm_trmnt_ctrl_preds = tm_trmnt.ctrl_preds_
f1_t = f1_score(y_val, np.round(tm_trmnt_trmnt_preds))
f1_c = f1_score(y_val, np.round(tm_trmnt_ctrl_preds))
roc_auc_t = roc_auc_score(y_val, np.round(tm_trmnt_trmnt_preds))
roc_auc_c = roc_auc_score(y_val, np.round(tm_trmnt_ctrl_preds))
log_loss_t = log_loss(y_val, np.round(tm_trmnt_trmnt_preds))
log_loss_c = log_loss(y_val, np.round(tm_trmnt_ctrl_preds))
models_results['f1_treated'].append(f1_t)
models_results['f1_control'].append(f1_c)
models_results['ROC AUC treated'].append(roc_auc_t)
models_results['ROC AUC control'].append(roc_auc_c)
models_results['log loss treated'].append(log_loss_t)
models_results['log loss control'].append(log_loss_c)

In [82]:
tm_trmnt_trmnt_fi = pd.DataFrame({
    'feature_name': tm_trmnt.estimator_trmnt.feature_names_,
    'feature_score': tm_trmnt.estimator_trmnt.feature_importances_
}).sort_values('feature_score', ascending=False).reset_index(drop=True)

tm_trmnt_trmnt_fi

Unnamed: 0,feature_name,feature_score
0,first_redeem_time,67.060957
1,age,10.527632
2,first_issue_time,7.253696
3,issue_redeem_delay,5.838499
4,gender,4.707691
5,pca1,2.742404
6,pca2,1.869123


In [83]:
tm_trmnt_ctrl_fi = pd.DataFrame({
    'feature_name': tm_trmnt.estimator_ctrl.feature_names_,
    'feature_score': tm_trmnt.estimator_ctrl.feature_importances_
}).sort_values('feature_score', ascending=False).reset_index(drop=True)

tm_trmnt_ctrl_fi

Unnamed: 0,feature_name,feature_score
0,ddr_treatment,41.934553
1,age,11.855829
2,issue_redeem_delay,9.939036
3,first_redeem_time,9.838277
4,pca1,9.507321
5,first_issue_time,8.244655
6,pca2,6.609344
7,gender,2.070984


In [84]:
pd.DataFrame(data=models_results).sort_values('uplift@30%', ascending=False)

Unnamed: 0,approach,uplift@30%,f1_treated,f1_control,ROC AUC treated,ROC AUC control,log loss treated,log loss control
2,ClassTransformation,0.06693,,,,,,
3,TwoModels,0.06405,0.778486,0.781558,0.515963,0.510457,12.228273,12.206516
4,TwoModels_ddr_control,0.061744,0.777698,0.781558,0.517858,0.510457,12.226851,12.206516
0,SoloModel,0.056783,0.778559,0.781719,0.515487,0.510673,12.232532,12.198472
1,SoloModel(param selected),0.04832,0.7795,0.782008,0.51613,0.511943,12.194207,12.171975
5,TwoModels_ddr_treatment,0.041754,0.778486,0.781131,0.515963,0.511004,12.228273,12.212666
