In [2]:
import pathlib
import os
from pathlib import Path
import sys
 
# Put the parent of the current working directory at the front of the import
# path so that the `src` package can be imported from this notebook.
# NOTE(review): the `sys.argv` guard looks like it is expected to be truthy in
# a normal kernel -- confirm whether it is needed at all.
if sys.argv:
    sys.path.insert(
        0,
        str(pathlib.Path(os.path.abspath("")).parent.resolve()),
    )
%load_ext autoreload
%autoreload 2

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from abc import ABC, abstractmethod

from src.global_params import COL_TARGET, COL_TREATMENT

class IDataset(ABC):
    """Common base for the dataset wrappers in this notebook.

    It only guarantees that a ``data`` attribute exists; subclasses are
    expected to replace the initial ``None`` with their actual payload.
    """

    def __init__(self) -> None:
        """Create the ``data`` attribute, initially unset (``None``)."""
        self.data = None

class TorchDataset(IDataset, Dataset):
    """Tab-separated table loaded into float32 torch tensors.

    After construction the instance exposes:
      * ``pandas``    -- the raw DataFrame read from ``path``;
      * ``data``      -- every column except COL_TREATMENT and COL_TARGET;
      * ``target``    -- the COL_TARGET column;
      * ``treatment`` -- the COL_TREATMENT column.
    All three tensors are placed on CUDA when it is available, else on CPU.
    """

    def __init__(self, path):
        """Read the file at ``path`` (tab-separated) and build the tensors."""
        IDataset.__init__(self)
        Dataset.__init__(self)
        target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        def as_float_tensor(values):
            # Shared conversion: float32 tensor moved to the chosen device.
            return torch.tensor(values, dtype=torch.float32).to(target_device)

        self.pandas = pd.read_csv(path, sep='\t')
        feature_frame = self.pandas.drop(columns=[COL_TREATMENT, COL_TARGET])
        self.data = as_float_tensor(feature_frame.values)
        self.target = as_float_tensor(self.pandas[COL_TARGET].values)
        self.treatment = as_float_tensor(self.pandas[COL_TREATMENT].values)

In [42]:
import torch
import numpy as np
import pandas as pd
from src.datasets import IDataset, NumpyDataset
class PairedUpliftDataset(IDataset, torch.utils.data.Dataset):
    """
    Dataset of (treatment, control) example pairs with teacher predictions.

    Rows are read from a tab-separated file and split into treated rows
    (treatment == 1) and control rows (treatment == 0). The two groups are
    shuffled independently and matched position by position, so the number
    of pairs equals the size of the smaller group.
    """

    def __init__(self, path, teacher_model):
        """
        Load the table, score it with the teacher and build the first pairing.

        path: Path to a tab-separated file that contains the COL_TREATMENT
            and COL_TARGET columns; all remaining columns are features.
        teacher_model: Pre-trained teacher model. It is called as
            ``teacher_model.predict(NumpyDataset(path))`` and the ``"score"``
            entry of the result is used (assumed to hold one score per row
            of the file, in file order -- TODO confirm).
        """
        IDataset.__init__(self)
        torch.utils.data.Dataset.__init__(self)
        # Stored on the instance so every tensor lands on the same device and
        # other methods can refer to it (it used to be a local of __init__
        # that __getitem__ tried to read).
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.pandas = pd.read_csv(path, sep='\t')
        self.data = torch.tensor(
            self.pandas.drop([COL_TREATMENT, COL_TARGET], axis=1).values,
            dtype=torch.float32,
        ).to(self.device)
        self.target = torch.tensor(
            self.pandas[COL_TARGET].values, dtype=torch.float32
        ).to(self.device)
        self.treatment = torch.tensor(
            self.pandas[COL_TREATMENT].values, dtype=torch.float32
        ).to(self.device)

        # Split rows into treatment and control groups. The comparison is done
        # on a host-side NumPy copy because NumPy cannot read CUDA tensors.
        treatment_flags = self.treatment.cpu().numpy()
        self.treatment_indices = np.flatnonzero(treatment_flags == 1)
        self.control_indices = np.flatnonzero(treatment_flags == 0)

        # Teacher scores are kept on the same device as the other tensors so
        # that every element of an item lives in one place.
        self.teacher_preds = torch.tensor(
            teacher_model.predict(NumpyDataset(path))["score"].values,
            dtype=torch.float32
        ).to(self.device)

        self.pairs = self._create_pairs()

    def _create_pairs(self):
        """
        Build pairs from the treatment-group and control-group row indices.

        Both index arrays are shuffled in place with NumPy's global RNG and
        then zipped; surplus rows of the larger group are left unpaired.
        Returns a list of ``(treatment_row, control_row)`` tuples.
        """
        # Random matching of examples, as in the paper.
        np.random.shuffle(self.treatment_indices)
        np.random.shuffle(self.control_indices)

        n_pairs = min(len(self.treatment_indices), len(self.control_indices))

        return list(zip(self.treatment_indices[:n_pairs],
                        self.control_indices[:n_pairs]))

    def __len__(self):
        """Return the number of (treatment, control) pairs."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """
        Return the ``idx``-th pair as a flat 8-tuple.

        Order: features, treatment flag, outcome and teacher prediction of
        the treated row, followed by the same four values for the control
        row. All tensors are already on ``self.device``.
        """
        t_idx, c_idx = self.pairs[idx]

        # Treated example.
        t_features = self.data[t_idx]
        t_treatment = self.treatment[t_idx]
        t_outcome = self.target[t_idx]
        t_teacher_pred = self.teacher_preds[t_idx]

        # Control example.
        c_features = self.data[c_idx]
        c_treatment = self.treatment[c_idx]
        c_outcome = self.target[c_idx]
        c_teacher_pred = self.teacher_preds[c_idx]

        return (t_features, t_treatment, t_outcome, t_teacher_pred,
                c_features, c_treatment, c_outcome, c_teacher_pred)

    def shuffle_pairs(self):
        """
        Re-draw the random pairing.

        Call before a new epoch to increase the variety of pairs.
        """
        self.pairs = self._create_pairs()


In [8]:
from src.utils import get_paths_train_test, train_test_model
from src.models.CausalML.Models import UpliftRandomForestModel
from src.global_params import BASE_PATH
# Restore a previously saved uplift random forest from the experiment folder
# under BASE_PATH instead of training a new one.
model = UpliftRandomForestModel(
    path=BASE_PATH + "/exps2/lazada_v2/100/0",
    from_load=True,
)

Model loaded from /Users/ogrobertino/UpliftModelingResearch/exps2/lazada_v2/100/0/model.pkl.
Config loaded from /Users/ogrobertino/UpliftModelingResearch/exps2/lazada_v2/100/0/config.json.


In [11]:
# Bare attribute lookup evaluated only for interactive inspection.
# NOTE(review): looks like leftover debugging -- consider removing this cell.
model.predict

<src.models.CausalML.Models.UpliftRandomForestModel at 0x3476b0950>

In [13]:
# Dataset selection for this notebook; both values are forwarded to
# get_paths_train_test to obtain the train/test file paths.
ds_name = 'lazada_v2'
features_percent = 100  # presumably the share of features to keep -- TODO confirm
train_path, test_path = get_paths_train_test(ds_name=ds_name, features_percent=features_percent)

In [40]:
# Build the paired dataset from the training file, using the loaded model as
# the teacher.
ds_kdsm = PairedUpliftDataset(train_path, model)

In [21]:
# Score the training file with the loaded model.
# NOTE(review): `kek` is rebound to a DataLoader batch in a later cell; a
# descriptive, distinct name would avoid confusing the two.
kek = model.predict(NumpyDataset(train_path))

In [29]:
# Show the "score" entry of the predictions as a tensor. No dtype is given,
# so it follows the underlying array (the recorded output is float64).
torch.tensor(kek["score"].values)

tensor([0.0515, 0.0355, 0.0404,  ..., 0.0439, 0.0719, 0.0878],
       dtype=torch.float64)

In [49]:
from torch.utils.data import DataLoader
# Loader over the paired dataset: one pair per batch, order reshuffled on each
# pass, loading done in the main process (num_workers=0).
dl = DataLoader(dataset=ds_kdsm, num_workers=0, shuffle=True, batch_size=1)

In [50]:
# Pull a single batch from the loader. This rebinds `kek`, which an earlier
# cell used for the model predictions.
kek = next(iter(dl))

In [52]:
# Display the fetched batch.
# NOTE(review): this prints whole tensors; showing shapes would be more compact.
kek

[tensor([[ 1.0000e+00,  9.0000e+01,  3.7728e+00,  0.0000e+00,  2.0000e+00,
           0.0000e+00,  0.0000e+00,  0.0000e+00,  4.4543e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  9.0000e+01,  0.0000e+00,  1.0000e+00,
           1.0137e+01,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           1.0000e+02,  6.5033e-02,  0.0000e+00,  1.9459e+00,  9.5424e-01,
           1.7709e+00,  1.7709e+00,  1.7709e+00,  1.2000e+01,  0.0000e+00,
           9.5486e-01,  0.0000e+00,  7.5317e-02,  0.0000e+00,  2.0000e+00,
           0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,  1.0000e+00,
           0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  1.0000e+00,  1.0000e+00,  0.0000e+00,
           1.0000e+00,  0

In [41]:
# Display the first pair by indexing the dataset directly (no DataLoader).
ds_kdsm[0]

(tensor([ 0.0000e+00,  3.6500e+02,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  3.6500e+02,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          1.0000e+02,  0.0000e+00,  0.0000e+00,  0.0000e+00,  6.9897e-01,
          1.5682e+00,  1.5682e+00,  1.5682e+00,  1.7000e+01,  0.0000e+00,
          9.2991e-01,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          1.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,  1.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  1.0000e+00,  1.0000e+00,  0.0000e+00,
          1.0000e+00,  0.0000e+00,  0.