In [8]:
import pandas as pd
import numpy as np


df = pd.read_csv("data/train_lenta.csv")
df.fillna(0, inplace=True)
df.shape

(714510, 195)

In [17]:
from sklearn.model_selection import train_test_split

stratify_cols = pd.concat([df.treatment, df.target], axis=1)

df_train, df_val, treat_train, treat_val, y_train, y_val = train_test_split(
    df.drop(['treatment', 'target'], axis=1),
    df.treatment,
    df.target,
    test_size=0.2,
    random_state=59,
    stratify=stratify_cols
)

print(f"Train shape: {df_train.shape}")
print(f"Validation shape: {df_val.shape}")

Train shape: (571608, 193)
Validation shape: (142902, 193)


In [18]:
df_train['maturity'] = df_train['age'].apply(lambda x: x >= 18)
df_val['maturity'] = df_val['age'].apply(lambda x: x >= 18)

In [19]:
top_10_features = set([
    'stdev_days_between_visits_15d', 'k_var_days_between_visits_15d',
    'perdelta_days_between_visits_15_30d', 'k_var_days_between_visits_1m',
    'k_var_disc_per_cheque_15d', 'cheque_count_6m_g40',
    'k_var_cheque_group_width_15d', 'k_var_cheque_category_width_15d',
    'k_var_discount_depth_15d', 'food_share_15d'
])

for feature_name in top_10_features:
    for second_feature in top_10_features:
        df_train[feature_name + '_' + second_feature] = df_train[feature_name] * df_train[second_feature]
        df_val[feature_name + '_' + second_feature] = df_val[feature_name] * df_val[second_feature]

  df_train[feature_name + '_' + second_feature] = df_train[feature_name] * df_train[second_feature]
  df_val[feature_name + '_' + second_feature] = df_val[feature_name] * df_val[second_feature]
  df_train[feature_name + '_' + second_feature] = df_train[feature_name] * df_train[second_feature]
  df_val[feature_name + '_' + second_feature] = df_val[feature_name] * df_val[second_feature]
  df_train[feature_name + '_' + second_feature] = df_train[feature_name] * df_train[second_feature]
  df_val[feature_name + '_' + second_feature] = df_val[feature_name] * df_val[second_feature]


In [23]:
from sklearn.base import clone, BaseEstimator


class OneModelApproach(BaseEstimator):

    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, treatment, y, fit_params={}):
        # assume treatment is binary
        s_learner = clone(self.estimator)
        X_train = X.copy()
        X_train.loc[:, 'control_feature'] = treatment
        s_learner.fit(X_train, y, **fit_params)

        self.s_learner = s_learner

    def predict_proba(self, X_test_original):
        X_test = X_test_original.copy()
        X_test.loc[:, 'control_feature'] = 0
        pred_w0 = self.s_learner.predict_proba(X_test)[:, 1]

        X_test.loc[:, 'control_feature'] = 1
        pred_w1 = self.s_learner.predict_proba(X_test)[:, 1]
        pred_uplift = pred_w1 - pred_w0
        return pred_uplift

In [24]:
from sklearn.ensemble import RandomForestClassifier

rf_estimator = RandomForestClassifier(max_depth=17, random_state=59)

one_model = OneModelApproach(estimator=rf_estimator)
one_model.fit(df_train, treat_train, y_train)

In [25]:
uplift_pred = one_model.predict_proba(df_val)
uplift_pred

array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
       5.66641991e-05, 0.00000000e+00, 0.00000000e+00])

In [26]:
from sklift.metrics import uplift_at_k

# Assuming y_true, uplift_pred, and treatment are your data
score = uplift_at_k(y_true=y_val, uplift=uplift_pred, treatment=treat_val, k=0.05, strategy='overall')
score

0.06768072735424245