1. скачать набор данных маркетинговых кампаний отсюда https://www.kaggle.com/davinwijaya/customer-retention
2. поле conversion - это целевая переменная, а offer - коммуникация. Переименовать поля (conversion -> target, offer -> treatment) и привести поле treatment к бинарному виду (1 или 0, т.е было какое-то предложение или нет) - значение No Offer означает отсутствие коммуникации, а все остальные - наличие.
3. сделать разбиение набора данных на тренировочную и тестовую выборки
4. провести uplift-моделирование 3 способами:
    1. одна модель с признаком коммуникации (S-learner)
    2. модель с трансформацией таргета
    3. вариант с двумя независимыми моделями
5. в конце вывести единую таблицу сравнения метрик uplift@10%, uplift@20% 3 моделей
6. *для модели S-learner построить зависимость таргета (конверсии - поле conversion) от значения uplift:
    1. сделать прогноз и получить uplift для тестовой выборки
    2. отсортировать тестовую выборку по uplift по убыванию
    3. разбить на децили (pandas qcut вам в помощь)
    4. для каждого дециля посчитать среднюю conversion

In [1]:
!pip install scikit-uplift catboost



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

In [3]:
# Load the dataset from the local CSV file and preview its first ten rows.
df = pd.read_csv(filepath_or_buffer='data.csv')
df.head(n=10)

Unnamed: 0,recency,history,used_discount,used_bogo,zip_code,is_referral,channel,offer,conversion
0,10,142.44,1,0,Surburban,0,Phone,Buy One Get One,0
1,6,329.08,1,1,Rural,1,Web,No Offer,0
2,7,180.65,0,1,Surburban,1,Web,Buy One Get One,0
3,9,675.83,1,0,Rural,1,Web,Discount,0
4,2,45.34,1,0,Urban,0,Web,Buy One Get One,0
5,6,134.83,0,1,Surburban,0,Phone,Buy One Get One,1
6,9,280.2,1,0,Surburban,1,Phone,Buy One Get One,0
7,9,46.42,0,1,Urban,0,Phone,Buy One Get One,0
8,9,675.07,1,1,Rural,1,Phone,Discount,0
9,10,32.84,0,1,Urban,1,Web,Buy One Get One,0


In [4]:
# Give the outcome and communication columns the names used by the rest of
# the notebook: `conversion` becomes `target`, `offer` becomes `treatment`.
column_aliases = {"conversion": "target", "offer": "treatment"}
df.rename(columns=column_aliases, inplace=True)

In [5]:
# Collapse the offer type into a binary flag: 0 where the value is
# 'No Offer', 1 for any other value. Then preview the first five rows.
received_offer = df['treatment'].ne('No Offer')
df['treatment'] = received_offer.astype(int)
df.head(n=5)

Unnamed: 0,recency,history,used_discount,used_bogo,zip_code,is_referral,channel,treatment,target
0,10,142.44,1,0,Surburban,0,Phone,1,0
1,6,329.08,1,1,Rural,1,Web,0,0
2,7,180.65,0,1,Surburban,1,Web,1,0
3,9,675.83,1,0,Rural,1,Web,1,0
4,2,45.34,1,0,Urban,0,Web,1,0


In [6]:
# Print the column dtypes and non-null counts of the prepared frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64000 entries, 0 to 63999
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   recency        64000 non-null  int64  
 1   history        64000 non-null  float64
 2   used_discount  64000 non-null  int64  
 3   used_bogo      64000 non-null  int64  
 4   zip_code       64000 non-null  object 
 5   is_referral    64000 non-null  int64  
 6   channel        64000 non-null  object 
 7   treatment      64000 non-null  int64  
 8   target         64000 non-null  int64  
dtypes: float64(1), int64(6), object(2)
memory usage: 4.4+ MB


In [7]:
# Hold out 30% of the rows as the test set; the fixed random_state keeps the
# split reproducible between runs.
df_train, df_test = train_test_split(df, test_size=0.3, random_state=15)

In [8]:
# Model input columns: everything in the frame except the treatment flag
# and the target.
features = [
    'recency',
    'history',
    'used_discount',
    'used_bogo',
    'zip_code',
    'is_referral',
    'channel',
]

In [9]:
def _features_target_treatment(frame):
    """Return the (features, target, treatment) views of one data split."""
    return frame[features], frame['target'], frame['treatment']


# Same three-way decomposition for the training and the test split.
X_train, y_train, treat_train = _features_target_treatment(df_train)
X_test, y_test, treat_test = _features_target_treatment(df_test)

In [10]:
from sklift.metrics import uplift_at_k
from sklift.viz import plot_uplift_preds
from sklift.models import SoloModel

from catboost import CatBoostClassifier

In [11]:
# Accumulator for the comparison table: one independent list per column,
# filled in as each uplift approach is evaluated.
models_results = {
    column: [] for column in ('approach', 'uplift@20%', 'uplift@10%')
}

In [12]:
# Columns passed to CatBoost as categorical features.
cat_features = ['zip_code', 'channel']

In [13]:
# Approach 1: a single model with the communication flag as a feature (S-learner).
sm = SoloModel(
    CatBoostClassifier(iterations=20, random_state=15, silent=True, cat_features=cat_features)
).fit(X_train, y_train, treat_train)

# Predicted uplift for every test-set row.
uplift_sm = sm.predict(X_test)

# uplift@k on the test set for the top 20% and the top 10%, in that order.
sm_score_20, sm_score_10 = [
    uplift_at_k(y_true=y_test, uplift=uplift_sm, treatment=treat_test, strategy='by_group', k=share)
    for share in (0.2, 0.1)
]

# Add this approach's row to the comparison table.
for column, value in (
    ('approach', 'SoloModel'),
    ('uplift@20%', sm_score_20),
    ('uplift@10%', sm_score_10),
):
    models_results[column].append(value)

In [14]:
# Approach 2: a model trained on the transformed target (class transformation).
from sklift.models import ClassTransformation


ct = ClassTransformation(
    CatBoostClassifier(iterations=20, random_state=15, silent=True, cat_features=cat_features)
).fit(X_train, y_train, treat_train)

# Predicted uplift for every test-set row.
uplift_ct = ct.predict(X_test)

# uplift@k on the test set for the top 20% and the top 10%, in that order.
ct_score_20, ct_score_10 = [
    uplift_at_k(y_true=y_test, uplift=uplift_ct, treatment=treat_test, strategy='by_group', k=share)
    for share in (0.2, 0.1)
]

# Add this approach's row to the comparison table.
for column, value in (
    ('approach', 'ClassTransformation'),
    ('uplift@20%', ct_score_20),
    ('uplift@10%', ct_score_10),
):
    models_results[column].append(value)

In [15]:
# Approach 3: two independent models, one per treatment group.
from sklift.models import TwoModels


# Two separate estimator instances with identical settings.
treatment_estimator = CatBoostClassifier(iterations=20, random_state=15, silent=True, cat_features=cat_features)
control_estimator = CatBoostClassifier(iterations=20, random_state=15, silent=True, cat_features=cat_features)

tm = TwoModels(
    treatment_estimator,
    control_estimator,
    method='vanilla',  # independent models
).fit(X_train, y_train, treat_train)

# Predicted uplift for every test-set row.
uplift_tm = tm.predict(X_test)

# uplift@k on the test set for the top 20% and the top 10%, in that order.
tm_score_20, tm_score_10 = [
    uplift_at_k(y_true=y_test, uplift=uplift_tm, treatment=treat_test, strategy='by_group', k=share)
    for share in (0.2, 0.1)
]

# Add this approach's row to the comparison table.
for column, value in (
    ('approach', 'TwoModels'),
    ('uplift@20%', tm_score_20),
    ('uplift@10%', tm_score_10),
):
    models_results[column].append(value)

In [16]:
# Show the collected uplift@k scores as one comparison table, one row per approach.
pd.DataFrame(models_results)

Unnamed: 0,approach,uplift@20%,uplift@10%
0,SoloModel,0.082456,0.103039
1,ClassTransformation,0.081182,0.111484
2,TwoModels,0.086728,0.092554


По метрике uplift@10% (топ-10% клиентов по прогнозному uplift) лучший результат у модели с трансформацией таргета (0.111). По метрике uplift@20% (топ-20%) все модели очень близки, но немного лучше себя показал вариант с двумя независимыми моделями (0.087).

In [17]:
# Pair each S-learner uplift prediction with the observed test-set outcome,
# then order the rows from the highest predicted uplift to the lowest.
sm_uptarget = pd.DataFrame({'uplift': uplift_sm, 'target': y_test})
sm_uptarget = sm_uptarget.sort_values(by='uplift', ascending=False)

In [18]:
# Mean observed target within each tenth of `sm_uptarget`, taking the rows in
# the frame's current order.
#
# The split is computed on row positions and applied with `.iloc` instead of
# passing the DataFrame to np.array_split directly: splitting the frame itself
# goes through DataFrame.swapaxes, which pandas has deprecated (it emits a
# FutureWarning since pandas 2.1). Chunk boundaries are the same as before.
row_positions = np.arange(len(sm_uptarget))
chunks = [
    sm_uptarget.iloc[positions]
    for positions in np.array_split(row_positions, 10)
]

dependence = {
    # Labels '10%', '20%', ..., '100%' mark the upper bound of each chunk.
    'decile': [f'{n + 1}0%' for n in range(len(chunks))],
    'mean_target': [chunk['target'].mean() for chunk in chunks],
}

In [19]:
# Show the mean target per decile as a table.
pd.DataFrame(dependence)

Unnamed: 0,decile,mean_target
0,10%,0.232813
1,20%,0.173437
2,30%,0.16875
3,40%,0.157292
4,50%,0.156771
5,60%,0.133854
6,70%,0.122917
7,80%,0.119271
8,90%,0.113542
9,100%,0.111458


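Наибольшая средняя конверсия наблюдается в первом дециле (0.233); далее она монотонно убывает до 0.111 в последнем дециле, то есть клиенты с более высоким прогнозным uplift конвертируются чаще.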