# ✅ Импорты

In [1]:
import warnings
# Silence every Python warning for the rest of the kernel session to keep cell output compact.
# NOTE(review): this is a global filter, so deprecation and convergence warnings raised by
# the libraries used below are hidden as well.
warnings.filterwarnings('ignore')

In [2]:
import os
import pickle

import numpy as np
import pandas as pd

from catboost import CatBoostClassifier, Pool

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklift.models import SoloModel
from sklift.models import TwoModels
from sklift.models import ClassTransformation
from sklift.metrics import uplift_at_k
from sklift.datasets import fetch_hillstrom

from causalml.inference.meta.slearner import BaseSClassifier as CausalSoloModel
from causalml.inference.meta.tlearner import BaseTClassifier as CausalTwoModels

from upninja.pipelines import DataTransformers
from upninja.pipelines import BasePipeline
from upninja.utils.Score import upliftComparingHist, scoreUpliftAtK, scorePipelines
from upninja.models import findBestParams, baseModelSelection
from upninja.models import Spaces

import matplotlib.pyplot as plt
import seaborn as sns

# ✅ Uplift на Kevin Hillstrom датасете

## ⭐ Загрузка и обработка Kevin Hillstrom датасета

In [3]:
# Load the Kevin Hillstrom dataset through scikit-uplift and pull out its three parts.
data = fetch_hillstrom()
X = data['data']
y = data['target']
t = data['treatment']

# Simplify the treatment column to a binary flag:
# 1 - an e-mail was sent (women's or men's campaign), 0 - no e-mail was sent.
segment_to_flag = {'Womens E-Mail': 1, 'Mens E-Mail': 1, 'No E-Mail': 0}
t = t.map(segment_to_flag)

# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible.
split_parts = train_test_split(X, y, t, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test, t_train, t_test = split_parts

## ⭐ Выбор лучшей базовой модели

In [4]:
# Fit the project's Hillstrom transformer on the training features and keep an
# independent copy of the training target for the base-model search below.
hillstrom_transformer = DataTransformers.HillstromTransformer()
X_prepared = hillstrom_transformer.fit_transform(X_train)
y_prepared = y_train.copy()

In [5]:
# Run findBestParams for every candidate base classifier and pickle each result.
# Each entry: (pickle file stem, estimator class, search space taken from Spaces).
# The order is the same as in the original copy-pasted version.
base_model_searches = (
    ('log_reg', LogisticRegression, Spaces.log_reg_hp_space),
    ('knn', KNeighborsClassifier, Spaces.knn_hp_space),
    ('dt', DecisionTreeClassifier, Spaces.dt_hp_space),
    ('rf', RandomForestClassifier, Spaces.rf_hp_space),
    ('cb', CatBoostClassifier, Spaces.cb_hp_space),
)

# open(..., 'wb') does not create missing directories, so make sure the target
# folder exists before the first (potentially long) search starts.
best_base_dir = 'saved_models/best_base'
os.makedirs(best_base_dir, exist_ok=True)

best_base_results = {}
for file_stem, estimator_cls, hp_space in base_model_searches:
    search_result = findBestParams(estimator_cls, X_prepared, y_prepared, hp_space)

    # Persist each result right after its own search so that a failure in a
    # later search does not discard the ones that already finished.
    with open(f'{best_base_dir}/{file_stem}.pkl', 'wb') as f:
        pickle.dump(search_result, f)

    best_base_results[file_stem] = search_result

# Keep the original per-model variable names available to the rest of the notebook.
log_reg_best = best_base_results['log_reg']
knn_best = best_base_results['knn']
dt_best = best_base_results['dt']
rf_best = best_base_results['rf']
cb_best = best_base_results['cb']

  0%|                                     | 0/5 [00:00<?, ?trial/s, best loss=?]

  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


 20%|██        | 1/5 [00:04<00:18,  4.64s/trial, best loss: -0.6274522168450039]

  return _ForkingPickler.loads(res)


100%|██████████| 5/5 [00:13<00:00,  2.80s/trial, best loss: -0.6274522168450039]
100%|███████████| 5/5 [00:22<00:00,  4.52s/trial, best loss: -0.553012915389317]
100%|██████████| 5/5 [00:01<00:00,  4.71trial/s, best loss: -0.6031430900366076]
100%|██████████| 5/5 [00:09<00:00,  1.98s/trial, best loss: -0.6159847855399736]
100%|██████████| 5/5 [01:04<00:00, 12.96s/trial, best loss: -0.6120419386189359]


In [6]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # that were written by this notebook.
    with open(path, 'rb') as f:
        return pickle.load(f)


# Reload the saved search results so this cell does not depend on re-running the search.
log_reg_best = _load_pickle('saved_models/best_base/log_reg.pkl')
knn_best = _load_pickle('saved_models/best_base/knn.pkl')
dt_best = _load_pickle('saved_models/best_base/dt.pkl')
rf_best = _load_pickle('saved_models/best_base/rf.pkl')
cb_best = _load_pickle('saved_models/best_base/cb.pkl')

# --- Logistic regression: max_iter must be an integer -------------------------
log_reg_params = log_reg_best['best_params']
log_reg_params['max_iter'] = int(log_reg_params['max_iter'])

# --- KNN: fall back to defaults when the stored value is not a known name -----
knn_metric_names = {
    'pyfunc', 'l1', 'mahalanobis', 'minkowski',
    'braycurtis', 'l2', 'chebyshev', 'correlation',
    'seuclidean', 'sokalmichener', 'hamming', 'precomputed',
    'euclidean', 'haversine', 'cosine', 'dice', 'russellrao',
    'cityblock', 'sokalsneath', 'yule', 'infinity', 'sqeuclidean',
    'manhattan', 'nan_euclidean', 'p', 'canberra',
    'rogerstanimoto', 'jaccard',
}
knn_weight_names = {'distance', 'uniform'}

knn_params = knn_best['best_params']
if knn_params['metric'] not in knn_metric_names:
    knn_params['metric'] = 'euclidean'
if knn_params['weights'] not in knn_weight_names:
    knn_params['weights'] = 'distance'

# --- Tree models: criterion is stored as an index into this list --------------
criterion_names = ['gini', 'entropy']
for tree_params in (dt_best['best_params'], rf_best['best_params']):
    tree_params['criterion'] = criterion_names[tree_params['criterion']]
    tree_params['max_depth'] = int(tree_params['max_depth'])

rf_params = rf_best['best_params']
rf_params['n_estimators'] = int(rf_params['n_estimators'])

# --- CatBoost: keep training quiet --------------------------------------------
cb_best['best_params']['verbose'] = False

# Candidate base models, each built from its own tuned parameters.
# NOTE(review): 'LogisticRregressionSklearn' is misspelled; it is kept unchanged
# because it is the label that appears in the results table.
models = dict([
    ('LogisticRregressionSklearn', LogisticRegression(**log_reg_params)),
    ('KNearestSklearn', KNeighborsClassifier(**knn_params)),
    ('TreeClassifierSklearn', DecisionTreeClassifier(**dt_best['best_params'])),
    ('RandomForestSklearn', RandomForestClassifier(**rf_params)),
    ('GradientBoostingCatBoost', CatBoostClassifier(**cb_best['best_params'])),
])

res = baseModelSelection(models, X_prepared, y_prepared)

In [7]:
# Display the comparison table returned by baseModelSelection.
res

Unnamed: 0,model_name,fit_time,score_time,test_accuracy,test_roc_auc,test_f1
1,KNearestSklearn,0.020627,1.482143,0.847366,0.553325,0.017513
2,TreeClassifierSklearn,0.042022,0.012076,0.853058,0.599789,0.0
3,RandomForestSklearn,1.395977,0.103046,0.853058,0.615059,0.0
0,LogisticRregressionSklearn,1.865285,0.024664,0.853058,0.62749,0.0
4,GradientBoostingCatBoost,2.844391,0.021228,0.853036,0.613629,0.001819


## ⭐ Протестируем scikit-uplift

In [4]:
def _catboost_from(params):
    """Return a new CatBoostClassifier configured with ``params``."""
    return CatBoostClassifier(**params)


def _hillstrom_pipeline(step_name, uplift_model):
    """Wrap ``uplift_model`` in a BasePipeline behind a new HillstromTransformer."""
    return BasePipeline.BasePipeline([
        ('hilstrom-transformer', DataTransformers.HillstromTransformer()),
        (step_name, uplift_model),
    ])


# Reload the tuned CatBoost parameters and silence CatBoost's training log.
# NOTE(review): pickle.load can execute arbitrary code; only load files
# that were written by this notebook.
with open('saved_models/best_base/cb.pkl', 'rb') as f:
    cb_best = pickle.load(f)
cb_params = cb_best['best_params']
cb_params['logging_level'] = 'Silent'

# S-learner: a single model
s_learner_model = SoloModel(_catboost_from(cb_params))
s_learner_pipeline = _hillstrom_pipeline('slearner', s_learner_model)

# Two independent models
two_models_undepended_model = TwoModels(
    _catboost_from(cb_params),
    _catboost_from(cb_params),
    method='vanilla',
)
two_models_undepended_pipeline = _hillstrom_pipeline(
    'two-models-undepended', two_models_undepended_model
)

# Two dependent models
two_models_depended_model = TwoModels(
    _catboost_from(cb_params),
    _catboost_from(cb_params),
    method='ddr_control',
)
two_models_depended_pipeline = _hillstrom_pipeline(
    'two-models-depended', two_models_depended_model
)

# Class transformation
class_transform_model = ClassTransformation(_catboost_from(cb_params))
class_transform_pipeline = _hillstrom_pipeline('class-transform', class_transform_model)

# Collect the pipelines into a dictionary for scoring
pipelines_unfited = dict([
    ('slearner', s_learner_pipeline),
    ('two-models-undepended', two_models_undepended_pipeline),
    ('two-models-depended', two_models_depended_pipeline),
    ('class-transform', class_transform_pipeline),
])

In [5]:
# Pass the not-yet-fitted pipelines to scorePipelines together with the
# train/test features, targets and treatment flags (same positional order as before).
scoring_inputs = (X_train, y_train, X_test, y_test, t_train, t_test)
res = scorePipelines(pipelines_unfited, *scoring_inputs)

In [6]:
# Display the table returned by scorePipelines.
res

Unnamed: 0,model_name,fit_time,score_time,weighted_average_uplift_test,auqc_test,auuq_test
3,class-transform,2.911708,0.097244,0.065836,0.018426,0.010575
0,slearner,3.180722,0.197214,0.065959,0.030529,0.01724
1,two-models-undepended,3.580388,0.185339,0.066015,0.002657,0.001662
2,two-models-depended,3.620152,0.18889,0.065764,0.010999,0.006565


## ⭐ Протестируем Causal-ML

In [None]:
# CausalML S-learner on top of a CatBoost classifier; apart from the silenced
# log the classifier uses its default parameters (not the tuned cb_best ones).
causal_s_base = CatBoostClassifier(logging_level='Silent')
s_learner_model = CausalSoloModel(causal_s_base)

s_learner_steps = [
    ('hilstrom-transformer', DataTransformers.HillstromTransformer()),
    ('slearner', s_learner_model),
]
s_learner_pipeline = BasePipeline.BasePipeline(s_learner_steps)

In [None]:
%%time

# Fit the CausalML S-learner pipeline on the training split and time the whole cell.
# NOTE(review): arguments are passed as (X, y, treatment), while causalml learners
# themselves take fit(X, treatment, y) - confirm that BasePipeline maps them correctly.
s_learner_pipeline.fit(X_train, y_train, t_train)

In [None]:
# CausalML T-learner built from two CatBoost classifiers with silenced logs.
# NOTE(review): the two positional arguments presumably map to `learner` and
# `control_learner` of causalml's BaseTClassifier - confirm this is the intended pairing.
first_catboost = CatBoostClassifier(logging_level='Silent')
second_catboost = CatBoostClassifier(logging_level='Silent')
two_models_model = CausalTwoModels(first_catboost, second_catboost)

two_models_steps = [
    ('hilstrom-transformer', DataTransformers.HillstromTransformer()),
    ('two-models', two_models_model),
]
two_models_pipeline = BasePipeline.BasePipeline(two_models_steps)

In [None]:
%%time

# Fit the CausalML two-model pipeline on the training split and time the whole cell.
# NOTE(review): arguments are passed as (X, y, treatment), while causalml learners
# themselves take fit(X, treatment, y) - confirm that BasePipeline maps them correctly.
two_models_pipeline.fit(X_train, y_train, t_train)

In [None]:
# Test-split predictions of both CausalML pipelines, flattened and paired with
# the display name used by the comparison helpers below.
s_learner_preds = s_learner_pipeline.predict(X_test).flatten()
two_models_preds = two_models_pipeline.predict(X_test).flatten()

models_w_preds = (
    ('s-learner', s_learner_preds),
    ('two-models', two_models_preds),
)

In [None]:
# Unpack the (name, predictions) pairs and hand them to upliftComparingHist.
(first_name, first_preds), (second_name, second_preds) = models_w_preds

res = upliftComparingHist(
    model_name_1=first_name,
    model_predictions_1=first_preds,
    model_name_2=second_name,
    model_predictions_2=second_preds,
)

In [None]:
# Score both sets of predictions against the test targets and treatment flags;
# the call stays the last expression so the notebook displays its return value.
scoreUpliftAtK(models_w_preds, y_test, t_test)