## Всё, что вам нужно, — это глубина 😏. И немного времени, чтобы дождаться обучения RandomForest'а

## Устанавливаем необходимое

In [8]:
!pip install scikit-uplift

Collecting scikit-uplift
  Downloading scikit_uplift-0.5.1-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.1/42.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-uplift
Successfully installed scikit-uplift-0.5.1


## Считываем данные

In [15]:
import numpy as np
import pandas as pd

import os

# Kaggle input locations of the competition files.
path_train = "/kaggle/input/uplift-ai-talent-hub/train_lenta.csv"
path_test = "/kaggle/input/uplift-ai-talent-hub/test_lenta.csv"
path_sample_sub = "/kaggle/input/uplift-ai-talent-hub/sample_submission.csv"

# Read the train and test tables, then show their shapes as the cell output.
df, df_test = (pd.read_csv(csv_path) for csv_path in (path_train, path_test))
(df.shape, df_test.shape)

((714510, 195), (170297, 193))

## Заполняем пропуски нулями
Никакие медианы тут не нужны (экспериментально и эвристически протестил несколько раз, более умные подходы попробовать не успел)

In [3]:
# Zero-fill every missing value, in place, in both the train and test frames.
for _frame in (df, df_test):
    _frame.fillna(0, inplace=True)

# Total count of remaining NaNs per frame (train, test); both should be 0.
tuple(_frame.isna().sum().sum() for _frame in (df, df_test))

(0, 0)

## Сплитим на трейн и валидацию
Строго 80 на 20, закон Парето, все дела

In [4]:
from sklearn.model_selection import train_test_split

# Stratify on the (treatment, target) pair so both splits keep the same mix
# of the four treatment/outcome combinations.
stratify_cols = df[['treatment', 'target']]

# 80/20 split; features, treatment flag and target are split consistently.
df_train, df_val, treat_train, treat_val, y_train, y_val = train_test_split(
    df.drop(columns=['treatment', 'target']),
    df['treatment'],
    df['target'],
    stratify=stratify_cols,
    test_size=0.2,
    random_state=59,
)

print(f"Train shape: {df_train.shape}")
print(f"Validation shape: {df_val.shape}")

Train shape: (571608, 193)
Validation shape: (142902, 193)


## Прописываем двухмодельный подход

Немного меняем код из семинара, чтобы он работал с копиями данных, а не с исходными датафреймами

Это сохраняет данные в исходном виде (например, для подбора гиперпараметров), и при предсказании на тесте ничего не ломается

Пробовал также S-learner, но он с треском провалился

In [5]:
from sklearn.base import clone, BaseEstimator


class TwoModelApproach(BaseEstimator):
    """Two-model uplift estimator with a dependent treatment model.

    A clone of ``estimator`` is fitted on the control rows. Its predicted
    class-1 probability is appended to the features as ``control_feature``,
    and a second clone is fitted on the treated rows with that extra column.
    The uplift score is the treatment-model probability minus the
    control-model probability.

    Parameters
    ----------
    estimator : classifier exposing ``fit`` and ``predict_proba``
        Template model; it is cloned for each of the two fits and is never
        fitted itself.
    """

    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, treatment, y, fit_params=None):
        """Fit the control model, then the treatment model that depends on it.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature matrix. It is not modified; the treated subset is copied
            before the ``control_feature`` column is added.
        treatment : array-like of {0, 1}
            Rows equal to 1 are treated, rows equal to 0 are control.
        y : array-like
            Binary target.
        fit_params : dict, optional
            Extra keyword arguments forwarded to both underlying ``fit`` calls.

        Returns
        -------
        self
        """
        # None instead of a mutable ``{}`` default shared between calls.
        fit_params = {} if fit_params is None else fit_params

        # assume treatment is binary
        # NOTE(review): boolean masks are aligned by index when the inputs are
        # pandas objects, so X, treatment and y are assumed to share an index.
        is_treated = treatment == 1
        is_control = treatment == 0
        y = pd.Series(y)

        X_treatment = X.loc[is_treated].copy()
        y_treatment = y.loc[is_treated]
        X_control = X.loc[is_control].copy()
        y_control = y.loc[is_control]

        model_control = clone(self.estimator)
        model_control.fit(X_control, y_control, **fit_params)

        # The control model never saw the treated rows, so this feature is an
        # out-of-sample estimate of their response without treatment.
        X_treatment.loc[:, 'control_feature'] = model_control.predict_proba(X_treatment)[:, 1]

        model_treatment = clone(self.estimator)
        model_treatment.fit(X_treatment, y_treatment, **fit_params)

        self.model_control = model_control
        self.model_treatment = model_treatment
        # scikit-learn convention: fit returns the estimator to allow chaining.
        return self

    def predict_proba(self, X_test_original):
        """Return the predicted uplift for every row of ``X_test_original``.

        Despite the name, the result is not a class-probability matrix: it is
        a 1-D array of treatment-model probability minus control-model
        probability, so values can be negative. The input frame is left
        untouched; ``control_feature`` is added to a copy.
        """
        pred_control_test = self.model_control.predict_proba(X_test_original)[:, 1]
        X_test = X_test_original.copy()
        X_test.loc[:, 'control_feature'] = pred_control_test
        pred_treatment_test = self.model_treatment.predict_proba(X_test)[:, 1]
        pred_uplift = pred_treatment_test - pred_control_test
        return pred_uplift

## Обучаем модельку

Запускаем _RandomForest_ на 1000 деревьев с глубиной 45+ (здесь `max_depth=47`)

In [6]:
from sklearn.ensemble import RandomForestClassifier


# Template forest; TwoModelApproach clones it once for the control model and
# once for the treatment model, so it is fitted twice in total.
rf_estimator = RandomForestClassifier(
    n_estimators=1000,
    max_depth=47,
    random_state=59,
    n_jobs=4,
    verbose=True,
)

two_model = TwoModelApproach(estimator=rf_estimator)
two_model.fit(df_train, treat_train, y_train)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   28.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  2.0min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  4.6min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:  8.1min
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed: 10.3min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.8s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   12.4s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   27.7s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:   50.1s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:  1.0min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.6min
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  7.5min
[Paral

## Получаем предикт
И радуемся жизни

In [7]:
# Uplift scores for the validation rows. TwoModelApproach.predict_proba returns
# treatment-model probability minus control-model probability (a 1-D array),
# not class probabilities, so negative values are expected.
uplift_pred = two_model.predict_proba(df_val)
uplift_pred

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    3.5s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    8.1s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:   14.6s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:   18.4s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    4.6s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   11.4s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:   19.9s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:   25.1s finished


array([ 0.00695756,  0.00335482,  0.00998789, ...,  0.00504391,
       -0.00740696,  0.02596104])

In [9]:
from sklift.metrics import uplift_at_k

# uplift@5% on the validation split. With strategy='overall' the top 5% is
# taken over treated and control rows ranked together by predicted uplift
# (per the scikit-uplift documentation).
score = uplift_at_k(
    y_true=y_val,
    uplift=uplift_pred,
    treatment=treat_val,
    strategy='overall',
    k=0.05,
)
score

0.9576825711298222

Impressive, very nice

## Сабмитим тест

In [10]:
# Uplift scores for the competition test set, in the row order of df_test.
# The model was fitted on the training split only (not refitted on train+val).
test_uplift_pred = two_model.predict_proba(df_test)
test_uplift_pred

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    4.7s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   10.6s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:   19.4s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:   24.9s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    5.7s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:   12.7s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:   23.2s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:   29.5s finished


array([ 0.01315206, -0.03385089,  0.01115189, ...,  0.04552827,
        0.01530259,  0.0152981 ])

In [16]:
# Fill the sample submission with the test predictions and save it.
# NOTE(review): values are assigned by position, so this assumes the sample
# submission rows are in the same order as df_test — confirm.
sample_sub_df = pd.read_csv(path_sample_sub).assign(predicted_uplift=test_uplift_pred)
sample_sub_df.to_csv("Simple_RF_depth47_trees1000.csv", index=False)

Public LB: __0.933__, Enjoy!