In [1]:
from pathlib import Path
import sys
 
# Put the project checkout first on sys.path so the `src` package imported below resolves.
# NOTE(review): the path is an absolute, machine-specific location, and the
# `if sys.argv:` guard looks like it is always true in practice — confirm both.
if sys.argv:
    sys.path.insert(0, str(Path('/Users/ogrobertino/UpliftModelingResearch/').resolve()))
%load_ext autoreload
%autoreload 2
import pandas as pd
from src.datasets import sample_features, TorchDataset, NumpyDataset
from src.global_params import * 

In [183]:
import copy
import json
import os
import pickle
import time
from abc import ABC, abstractmethod
from typing import Callable

import numpy as np
import pandas as pd

from catboost import CatBoostClassifier
import causalml
import causalml.metrics as cmetrics
import causalml.inference.tree as ctree
import causalml.inference.meta.tlearner as tlearner
import causalml.inference.meta.slearner as slearner
import causalml.inference.meta.rlearner as rlearner
import causalml.inference.meta.xlearner as xlearner
from causalml.inference.tree import UpliftTreeClassifier, UpliftRandomForestClassifier


class IModelUplift(ABC):
    """
    Interface that uplift model wrappers implement.

    An instance is created in one of two modes:

    * from a configuration (``from_load`` falsy): ``self.config`` is set to
      ``config_json`` and ``self.model`` starts as ``None``;
    * from disk (``from_load`` truthy): ``self.load(path)`` is called and its
      ``(model, config)`` result is stored on the instance.
    """

    def __init__(self, config_json=None, from_load=False, path=None):
        """
        :param config_json: model configuration; required unless ``from_load`` is set.
        :param from_load: when truthy, restore the model and config via ``self.load(path)``.
        :param path: location handed to ``load``; required when ``from_load`` is set.
        :raises ValueError: if the argument required by the chosen mode is missing.
        """
        # Truthiness check instead of `== False`, so None/0 also select config mode.
        if not from_load:
            if config_json is None:
                raise ValueError("No config while contstructing model.")
            self.model = None
            self.config = config_json
        else:
            if path is None:
                raise ValueError("No config or model paths while contstructing model.")
            # Original author's note: after assigning the loaded model to self.model,
            # the fitted sub-models were observed to be missing from self.model.
            # NOTE(review): a subclass constructor that assigns self.model after
            # calling this one replaces the loaded model — check subclasses.
            self.model, self.config = self.load(path)

    @abstractmethod
    def fit(self, X, y):
        """
        Train the model.
        """
        pass

    @abstractmethod
    def predict(self, X):
        """
        Produce predictions.
        """
        pass

    @abstractmethod
    def load(self, path):
        """
        Load a trained model from disk.

        ``__init__`` unpacks the result as ``(model, config)``.
        """
        pass

class ICausalML(IModelUplift):
    """
    Shared fit / predict / persistence code for models wrapped in ``self.model``.

    Dataset arguments are accessed through ``data`` (indexed with ``.loc``),
    ``cols_features``, ``col_treatment`` and ``col_target``.

    NOTE(review): ``save`` pickles the whole wrapper into one file, whereas
    ``load`` reads ``model.pkl`` and ``config.json`` from a directory, so the
    two methods are not inverses of each other.
    """

    def __init__(self, config_json=None, from_load=False, path=None):
        super().__init__(config_json=config_json, from_load=from_load, path=path)

    def fit(self, train):
        """Fit the wrapped learner on the feature, treatment and target columns of ``train``."""
        frame = train.data
        features = frame.loc[:, train.cols_features].values
        treatment = frame.loc[:, train.col_treatment].values
        target = frame.loc[:, train.col_target].values
        self.model.fit(X=features, treatment=treatment, y=target)

    def predict(self, X):
        """
        Score a dataset.

        :return: a frame holding the ``score`` column plus the dataset's
                 treatment and target columns (taken from a deep copy of ``X.data``).
        """
        result = X.data.copy(deep=True)
        feature_frame = result.loc[:, X.cols_features]
        result['score'] = self.model.predict(feature_frame)
        wanted_columns = ['score', X.col_treatment, X.col_target]
        return result[wanted_columns]

    def save(self, path):
        """Pickle this wrapper object to the file ``path`` and print a confirmation."""
        with open(path, 'wb') as sink:
            pickle.dump(self, sink)
        print(f"Model saved to {path}.")

    def load(self, path):
        """
        Read ``model.pkl`` and ``config.json`` from the directory ``path``.

        :raises ValueError: if either file does not exist.
        :return: tuple ``(loaded_model, loaded_config)``.
        """
        config_path = path + "/config.json"
        model_path = path + "/model.pkl"
        for required in (config_path, model_path):
            if not os.path.exists(required):
                raise ValueError(f"No file found at '{required}'.")

        # NOTE: unpickling runs arbitrary code; only load artifacts you trust.
        with open(model_path, 'rb') as model_file:
            loaded_model = pickle.load(model_file)
        with open(config_path, 'rb') as config_file:
            loaded_config = json.load(config_file)

        print(f"Model loaded from {model_path}.")
        print(f"Config loaded from {config_path}.")

        return loaded_model, loaded_config

# Конкретная реализация модели
class TModel(ICausalML):
    """
    T-learner uplift model: causalml's ``BaseTClassifier`` with one
    ``CatBoostClassifier`` for the control group and one for the treatment group.

    Config entries read when building a new model:
        config['lvl_0']['meta']       -> extra keyword arguments for BaseTClassifier
        config['lvl_1']['control']    -> keyword arguments for the control CatBoostClassifier
        config['lvl_1']['treatment']  -> keyword arguments for the treatment CatBoostClassifier
    """

    def __init__(self, config_json=None, from_load=False, path=None):
        """
        :param config_json: configuration dict (layout above); used when not loading.
        :param from_load: when truthy, restore a previously saved model from ``path``.
        :param path: directory handed to ``load`` when ``from_load`` is set.
        """
        super().__init__(config_json, from_load, path)
        # The parent constructor leaves self.model as None only when building from
        # a config. When a model was loaded from disk it is already in self.model,
        # and constructing a fresh learner here would discard the fitted one.
        if self.model is None:
            self.model = tlearner.BaseTClassifier(
                control_learner=CatBoostClassifier(verbose=0, **self.config['lvl_1']['control']),
                treatment_learner=CatBoostClassifier(verbose=0, **self.config['lvl_1']['treatment']),
                **self.config['lvl_0']['meta']
            )

def ModelUpliftFactory(config_json, model_class):
    """
    Create an uplift model instance of the requested class.

    :param config_json: model configuration passed to the class constructor.
    :param model_class: class implementing the ``IModelUplift`` interface.
    :raises ValueError: if ``model_class`` is not a subclass of ``IModelUplift``.
    :return: ``model_class(config_json)``.
    """
    # The original check referenced `ModelUplift`, a name that is never defined,
    # so every call failed with NameError. The isinstance guard keeps a non-class
    # argument on the documented ValueError path instead of issubclass's TypeError.
    if not (isinstance(model_class, type) and issubclass(model_class, IModelUplift)):
        raise ValueError("model_class должен быть подклассом IModelUplift.")

    return model_class(config_json)

class IFactory(ABC):
    """Interface for factories that build a model together with its datasets."""

    @staticmethod
    @abstractmethod
    def create(*args, **kwargs):
        """
        Factory method that creates the model and dataset objects.

        The signature accepts arbitrary arguments so that concrete factories,
        which take parameters (e.g. ``TModelFactory.create(config_json,
        train_path, test_path)``), stay compatible with this declaration.
        """
        pass

class TModelFactory(IFactory):
    """Factory that pairs a ``TModel`` with ``NumpyDataset`` train and test sets."""

    @staticmethod
    def create(config_json, train_path, test_path):
        """
        :param config_json: configuration passed to ``TModel``.
        :param train_path: path passed to ``NumpyDataset`` for the train split.
        :param test_path: path passed to ``NumpyDataset`` for the test split.
        :return: tuple ``(model, train, test)``.
        """
        uplift_model = TModel(config_json)
        train_set, test_set = (NumpyDataset(split_path) for split_path in (train_path, test_path))
        return uplift_model, train_set, test_set

In [3]:
# model, train, test = TModelFactory.create(config, path_train, path_test)
# model.fit(train)
# predicted = model.predict(test)

In [4]:
# cmetrics.auuc_score(
#     predicted, 
#     outcome_col=col_target, 
#     treatment_col=col_treatment, 
# )

In [5]:
# ml_auuc, random_auuc = cmetrics.auuc_score(
#     predicted, 
#     outcome_col=col_target, 
#     treatment_col=col_treatment, 
# )

In [6]:
# cmetrics.plot_gain(
#     predicted,
#     treatment_col=col_treatment,
#     outcome_col=col_target,
# )

In [7]:
def get_paths_train_test(ds_name, features_percent, data_dir='../data'):
    """
    Build the train and test TSV paths for a dataset / feature-percentage pair.

    :param ds_name: dataset folder name.
    :param features_percent: feature-percentage folder name.
    :param data_dir: root directory holding the datasets (defaults to ``'../data'``).
    :return: tuple ``(train_path, test_path)``.
    """
    split_dir = f'{data_dir}/{ds_name}/{features_percent}'
    return f'{split_dir}/train.tsv', f'{split_dir}/test.tsv'

In [8]:
#TODO outpath
def train_test_model(ds_name, features_percent, factory, config, evaluate=False):
    """
    Train a model on the train split and score the test split.

    :param ds_name: dataset name used to locate the data files.
    :param features_percent: feature-percentage folder used to locate the data files.
    :param factory: object whose ``create(config, train_path, test_path)`` returns
                    ``(model, train, test)``.
    :param config: model configuration forwarded to the factory.
    :param evaluate: when True, also print the AUUC scores and draw the gain plot.
                     Defaults to False, which matches the previous behaviour: that
                     code sat after the ``return`` and never ran.
    :return: tuple ``(model, predicted)``.
    """
    train_path, test_path = get_paths_train_test(ds_name=ds_name, features_percent=features_percent)
    model, train, test = factory.create(config, train_path, test_path)
    model.fit(train)
    predicted = model.predict(test)

    if evaluate:
        # Use the dataset's own column names: ICausalML.predict in this file labels
        # the returned columns with X.col_target / X.col_treatment.
        ml_auuc, random_auuc = cmetrics.auuc_score(
            predicted,
            outcome_col=test.col_target,
            treatment_col=test.col_treatment,
        )
        print(ml_auuc, random_auuc)
        cmetrics.plot_gain(
            predicted,
            treatment_col=test.col_treatment,
            outcome_col=test.col_target,
        )

    return model, predicted
    
    

In [9]:
# Example configuration in the layout TModel reads.
config = {
    # Extra keyword arguments for the meta-learner.
    "lvl_0": {
        "meta": {"control_name": 0},
    },
    # Keyword arguments for the two base CatBoost classifiers.
    "lvl_1": {
        "treatment": {
            "iterations": 20,
            "learning_rate": 0.1,
            "depth": 6,
            "loss_function": "Logloss",
            "eval_metric": "AUC",
        },
        "control": {
            "iterations": 30,
            "learning_rate": 0.05,
            "depth": 4,
            "loss_function": "Logloss",
            "eval_metric": "AUC",
        },
    },
}

In [10]:
train_path, test_path = get_paths_train_test(ds_name='lazada', features_percent=100)

In [11]:
model, predicted = train_test_model(ds_name='lazada', features_percent=100, factory=TModelFactory, config=config)

In [19]:
predicted

Unnamed: 0,score,treatment,target
0,0.045564,1,0
1,0.055329,0,0
2,0.022363,0,0
3,0.010221,1,0
4,0.012022,0,0
...,...,...,...
181664,0.056568,1,1
181665,0.063529,1,0
181666,0.010840,1,0
181667,0.007119,1,0


In [26]:
model.config

{'lvl_0': {'meta': {'control_name': 0}},
 'lvl_1': {'treatment': {'iterations': 20,
   'learning_rate': 0.1,
   'depth': 6,
   'loss_function': 'Logloss',
   'eval_metric': 'AUC'},
  'control': {'iterations': 30,
   'learning_rate': 0.05,
   'depth': 4,
   'loss_function': 'Logloss',
   'eval_metric': 'AUC'}}}

In [71]:
import os
import json
import pickle
from typing import Any

def _write_files_(model: IModelUplift, predictions, ds_name, features_percent):
    """
    Метод создает папку в нужной директории и записывает туда бинарик модели, предикты модели и конфиг.
    """

    path_overall_stats = "../exps" 
    
    free_folder_number = 0
    os.makedirs(f'../exps/{ds_name}', exist_ok=True)
    os.makedirs(f'../exps/{ds_name}/{features_percent}', exist_ok=True)        
    while os.path.exists(os.path.join(f'../exps/{ds_name}/{features_percent}', str(free_folder_number))):
        free_folder_number += 1
    path_current_setup = f'../exps/{ds_name}/{features_percent}/{free_folder_number}'    
    os.makedirs(path_current_setup, exist_ok=True)

    # Сохранение модели
    model_path = os.path.join(path_current_setup, "model.pkl")
    with open(model_path, "wb") as model_file:
        pickle.dump(model.model, model_file)

    # Сохранение предсказаний
    predictions_path = os.path.join(path_current_setup, "predictions.tsv")
    predictions.to_csv(predictions_path)

    # Сохранение конфига
    config_path = os.path.join(path_current_setup, "config.json")
    with open(config_path, "w") as config_file:
        json.dump(model.config, config_file)

    return path_current_setup


def write(model: IModelUplift, predictions, ds_name, features_percent):
    """
    Persist one experiment run and report where it went.

    Delegates the file writing to ``_write_files_`` and prints the directory
    it returns. Returns nothing.
    """
    saved_to = _write_files_(model, predictions, ds_name, features_percent)
    print(f"Модель, предсказания и конфиг сохранены в директории {saved_to}")



In [72]:
write(model, predicted, ds_name='lazada', features_percent=100)

Модель, предсказания и конфиг сохранены в директории ../exps/lazada/100/0


In [160]:
model2 = TModel(from_load=True, path="../exps/lazada/100/0")

Model loaded from ../exps/lazada/100/0/model.pkl.
Config loaded from ../exps/lazada/100/0/config.json.


In [163]:
model2.load??

Signature: model2.load(path)
Docstring: Метод для загрузки обученной модели из файла.
Source:   
    def load(self, path):
        config_path = path + "/config.json" 
        model_path = path + "/model.pkl"
        if not os.path.exists(config_path):
            raise ValueError(f"No file found at '{config_path}'.")
        if not os.path.exists(mod

In [157]:
def load2(path):
    """
    Read ``model.pkl`` and ``config.json`` from the directory ``path``.

    :raises ValueError: if either file is missing (the config is checked first).
    :return: tuple ``(loaded_model, loaded_config)``.
    """
    config_path = path + "/config.json"
    model_path = path + "/model.pkl"
    for required_file in (config_path, model_path):
        if not os.path.exists(required_file):
            raise ValueError(f"No file found at '{required_file}'.")

    # NOTE: unpickling runs arbitrary code; only load artifacts you trust.
    with open(model_path, 'rb') as model_stream:
        loaded_model = pickle.load(model_stream)
    with open(config_path, 'rb') as config_stream:
        loaded_config = json.load(config_stream)

    print(f"Model loaded from {model_path}.")
    print(f"Config loaded from {config_path}.")

    return loaded_model, loaded_config

In [158]:
model3, _ = load2("../exps/lazada/100/0")

Model loaded from ../exps/lazada/100/0/model.pkl.
Config loaded from ../exps/lazada/100/0/config.json.


In [159]:
model3.models_t[1].best_score_

{'learn': {'Logloss': 0.20083979341393296}}

In [93]:
model.model.models_c[1]

{1: <catboost.core.CatBoostClassifier at 0x3416d22d0>}

In [90]:
model2.model.model_t.best_iteration_

In [97]:
# Use a context manager so the file handle is closed after loading.
# NOTE: unpickling runs arbitrary code; only load artifacts you trust.
with open("model.pkl", "rb") as model_file:
    model3 = pickle.load(model_file)

In [103]:
# Use a context manager so the file handle is closed after loading.
# NOTE: unpickling runs arbitrary code; only load artifacts you trust.
with open("../exps/lazada/100/0/model.pkl", "rb") as model_file:
    model3 = pickle.load(model_file)

In [104]:
model3.models_c

{1: <catboost.core.CatBoostClassifier at 0x3bcb74fd0>}

In [62]:
predicted

Unnamed: 0,score,treatment,target
0,0.045564,1,0
1,0.055329,0,0
2,0.022363,0,0
3,0.010221,1,0
4,0.012022,0,0
...,...,...,...
181664,0.056568,1,1
181665,0.063529,1,0
181666,0.010840,1,0
181667,0.007119,1,0


   * Название модели/класса (`.__class__`)
   * Конфиг (гиперпараметры)
   * Название датасета
   * Процент фичей
   * Путь до бинаря модели
   * Время работы (latency)
   * Размер бинаря?
   * AUUC на тесте
   * precision@[5, 100] на тесте

In [60]:
os.path.getsize("model.pkl") / 1e6

0.053735

In [None]:
pd.DataFrame(columns=[
    'Model',
    'Path',
    'Dataset',
    'Features Percent',
    'Latency',
    'Binary Size (MB)',
    
    
])

In [None]:
def measure_inference_time(model: Callable, data: pd.DataFrame, batch_size: int = 32) -> float:
    """
    Measure the mean inference time of a model over consecutive batches of ``data``.

    :param model: object exposing ``predict(batch)``; it is called once per batch.
    :param data: input frame, sliced row-wise with ``iloc`` into batches.
    :param batch_size: number of rows per batch; must be positive.
    :raises ValueError: if ``batch_size`` is not positive.
    :return: mean time of one ``predict`` call in milliseconds
             (``nan`` when ``data`` has no rows).
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}.")

    # Per-batch timings in milliseconds.
    inference_times = []

    for start in range(0, len(data), batch_size):
        # Slice outside the timed section so only predict() is measured.
        batch = data.iloc[start:start + batch_size]

        # perf_counter is monotonic and high-resolution, unlike wall-clock time.time().
        started_at = time.perf_counter()
        model.predict(batch)
        inference_times.append((time.perf_counter() - started_at) * 1000)

    # Avoid numpy's empty-mean warning when there were no batches.
    mean_inference_time = float(np.mean(inference_times)) if inference_times else float("nan")

    print(f"Среднее время инференса одного батча: {mean_inference_time:.2f} ms")
    return mean_inference_time


# Пример модели с методом `predict`
class DummyModel:
    """Stand-in model whose ``predict`` sleeps briefly and returns random values."""

    def predict(self, data_batch):
        """Sleep for 10 ms, then return a ``(len(data_batch), 1)`` array of random numbers."""
        time.sleep(0.01)  # emulate the time spent processing a batch
        n_rows = len(data_batch)
        return np.random.rand(n_rows, 1)


# Пример использования
# Usage example
if __name__ == "__main__":
    # Build a throwaway DataFrame for the demonstration.
    num_samples = 1000
    num_features = 10
    dummy_data = pd.DataFrame(
        np.random.rand(num_samples, num_features),
        columns=[f"feature_{i}" for i in range(num_features)],
    )

    # Bind the stand-in model to its own name: inside a notebook this guard is
    # true, and assigning to `model` would overwrite the model other cells use.
    dummy_model = DummyModel()

    # Measure the mean inference time per batch.
    average_time = measure_inference_time(dummy_model, dummy_data, batch_size=32)

In [55]:
# One-row summary table describing a single experiment run.
# NOTE(review): `dataset_name` is not defined (the recorded output of this cell is a
# NameError), and `features_percent`, `model_binary_path`, `latency`, `binary_size`,
# `auuc_test` and `precision_at_k` are not assigned in any visible cell either —
# define them before running this cell.
pd.DataFrame([{
        "Model/Class": model.__class__.__name__,
        "Config": config,
        "Dataset Name": dataset_name,
        "Features Percent": features_percent,
        "Model Path": model_binary_path,
        "Latency (ms)": latency,
        "Binary Size (KB)": binary_size,
        "AUUC (test)": auuc_test,
        "Precision@[5,100]": precision_at_k
    }])

NameError: name 'dataset_name' is not defined