In [2]:
import pathlib
import os
from pathlib import Path
import sys
 
# Put the parent directory of the current working directory at the front of
# sys.path so that `src.*` imports in later cells resolve.
if sys.argv:
    _parent_dir = str(pathlib.Path(os.path.dirname(os.path.abspath(""))).resolve())
    # Re-running this cell used to prepend a new copy every time. Remove any
    # existing copies first, then insert at position 0 so the directory keeps
    # its import priority and sys.path does not grow.
    while _parent_dir in sys.path:
        sys.path.remove(_parent_dir)
    sys.path.insert(0, _parent_dir)
%load_ext autoreload
%autoreload 2

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from abc import ABC, abstractmethod

from src.global_params import COL_TARGET, COL_TREATMENT

class IDataset(ABC):
    """
    Common base class for the dataset wrappers in this notebook.

    It only reserves the ``data`` attribute; subclasses are expected to fill
    it in their own constructors.
    """

    def __init__(self) -> None:
        # Placeholder until a concrete subclass assigns the real payload.
        self.data = None

class TorchDataset(IDataset, Dataset):
    """
    Tab-separated file loaded into float32 torch tensors.

    Attributes set by the constructor:
        pandas:    the raw DataFrame read from ``path``.
        data:      every column except COL_TREATMENT and COL_TARGET.
        target:    the COL_TARGET column.
        treatment: the COL_TREATMENT column.
    All three tensors are placed on CUDA when it is available, otherwise on CPU.
    """

    def __init__(self, path):
        """
        Read ``path`` (TSV) and build the feature / target / treatment tensors.
        """
        IDataset.__init__(self)
        Dataset.__init__(self)

        target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        def as_float_tensor(values):
            # Single place for the dtype + device policy shared by all tensors.
            return torch.tensor(values, dtype=torch.float32).to(target_device)

        self.pandas = pd.read_csv(path, sep='\t')
        frame = self.pandas

        feature_values = frame.drop(columns=[COL_TREATMENT, COL_TARGET]).values
        self.data = as_float_tensor(feature_values)
        self.target = as_float_tensor(frame[COL_TARGET].values)
        self.treatment = as_float_tensor(frame[COL_TREATMENT].values)

In [100]:
import torch
import numpy as np
import pandas as pd
from src.datasets import IDataset, NumpyDataset
import pickle
class PairedUpliftDataset(IDataset, torch.utils.data.Dataset):
    """
    Dataset of (treatment, control) example pairs with teacher predictions.

    Every item couples one row whose treatment flag is 1 with one row whose
    treatment flag is 0. Rows are matched at random; the number of pairs is
    the size of the smaller group.
    """
    def __init__(self, teacher_model, path=None, from_saved_path=None):
        """
        Build the dataset from a TSV file or restore it from a saved pickle.

        Args:
            teacher_model: pre-trained teacher; its ``predict`` is called with
                ``NumpyDataset(path)`` and must return a DataFrame with
                ``score`` and ``treatment`` columns. Not used when restoring
                from ``from_saved_path``.
            path: TSV file with the feature columns plus COL_TREATMENT and
                COL_TARGET. Required unless ``from_saved_path`` is given.
            from_saved_path: file previously written by ``save``. When given,
                it takes precedence over ``path`` and the saved pairing is
                restored as-is (call ``shuffle_pairs`` to re-draw it).

        Raises:
            ValueError: if neither ``path`` nor ``from_saved_path`` is given.
            FileNotFoundError: if ``from_saved_path`` does not exist.
        """
        IDataset.__init__(self)
        torch.utils.data.Dataset.__init__(self)

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        if from_saved_path:
            # load() restores `pairs` too, so they must not be regenerated here;
            # otherwise the saved pairing would be thrown away.
            self.load(from_saved_path)
            return

        if path is None:
            raise ValueError("Either `path` or `from_saved_path` must be provided")

        self.pandas = pd.read_csv(path, sep='\t')
        self.data = torch.tensor(self.pandas.drop([COL_TREATMENT, COL_TARGET], axis=1).values, dtype=torch.float32).to(self.device)
        self.target = torch.tensor(self.pandas[COL_TARGET].values, dtype=torch.float32).to(self.device)
        self.treatment = torch.tensor(self.pandas[COL_TREATMENT].values, dtype=torch.float32).to(self.device)

        # Split the rows into treatment and control groups. The flags are
        # copied to host memory first: NumPy cannot read a CUDA tensor.
        treatment_flags = self.treatment.cpu().numpy()
        self.treatment_indices = np.where(treatment_flags == 1)[0]
        self.control_indices = np.where(treatment_flags == 0)[0]

        # Author's note: the teacher's `score` is treated as p - q with
        # p + q == 1, hence p = (1 + score) / 2 and q = (1 - score) / 2.
        # NOTE(review): assumes the prediction rows are in the same order as
        # the rows of `path` — confirm against the teacher's predict().
        teacher_preds = teacher_model.predict(NumpyDataset(path))
        score = teacher_preds['score']
        # Vectorized replacement for a row-wise DataFrame.apply: treated rows
        # get p, all other rows get q.
        teacher_preds['score2'] = np.where(
            teacher_preds['treatment'] == 1,
            (1 + score) / 2,
            (1 - score) / 2,
        )

        self.teacher_preds = torch.tensor(
            teacher_preds['score2'].values,
            dtype=torch.float32
        )

        self.pairs = self._create_pairs()

    @staticmethod
    def _to_numpy(value):
        """Return ``value`` as a NumPy array if it is a tensor, else unchanged."""
        if isinstance(value, torch.Tensor):
            return value.cpu().numpy()
        return value

    def _create_pairs(self):
        """
        Randomly match treatment rows with control rows.

        Both index arrays are shuffled in place with the global NumPy RNG and
        then zipped, so the result has ``min(n_treatment, n_control)`` pairs.

        Returns:
            list of ``(treatment_index, control_index)`` tuples.
        """
        np.random.shuffle(self.treatment_indices)
        np.random.shuffle(self.control_indices)

        n_pairs = min(len(self.treatment_indices), len(self.control_indices))

        return list(zip(self.treatment_indices[:n_pairs], self.control_indices[:n_pairs]))

    def __len__(self):
        """Number of (treatment, control) pairs."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """
        Return the ``idx``-th pair as a flat tuple of eight tensors:
        ``(features, treatment, outcome, teacher_pred)`` for the treatment row
        followed by the same four fields for the control row, all moved to
        ``self.device``.
        """
        t_idx, c_idx = self.pairs[idx]

        fields = (
            # Treatment-group example.
            self.data[t_idx],
            self.treatment[t_idx],
            self.target[t_idx],
            self.teacher_preds[t_idx],
            # Control-group example.
            self.data[c_idx],
            self.treatment[c_idx],
            self.target[c_idx],
            self.teacher_preds[c_idx],
        )

        # Restored datasets and teacher predictions are kept on the CPU, so the
        # device transfer happens per item.
        return tuple(tensor.to(self.device) for tensor in fields)

    def shuffle_pairs(self):
        """
        Re-draw the random pairing; call before a new epoch for more pair diversity.
        """
        self.pairs = self._create_pairs()


    def save(self, path):
        """
        Pickle the tensors (as NumPy arrays), the group indices and the pairs to ``path``.
        """
        # dirname is '' for a bare file name, and os.makedirs('') raises.
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)

        # Tensors are converted to host NumPy arrays so the file can be read
        # back on a machine without the original device.
        save_data = {
            'data': self._to_numpy(self.data),
            'treatment': self._to_numpy(self.treatment),
            'target': self._to_numpy(self.target),
            'teacher_preds': self._to_numpy(self.teacher_preds),
            'treatment_indices': self.treatment_indices,
            'control_indices': self.control_indices,
            'pairs': self.pairs
        }

        with open(path, 'wb') as f:
            pickle.dump(save_data, f)

        print(f"Dataset saved to {path}")

    def load(self, path):
        """
        Restore the dataset state written by ``save``.

        Raises:
            FileNotFoundError: if ``path`` does not exist.
        """
        if not os.path.exists(path):
            raise FileNotFoundError(f"Dataset file not found: {path}")

        # SECURITY: pickle.load can execute arbitrary code; only open files
        # that were produced by save() from a trusted source.
        with open(path, 'rb') as f:
            load_data = pickle.load(f)

        # Restore the attributes; tensors stay on the CPU and are moved to
        # self.device item by item in __getitem__.
        self.data = torch.tensor(load_data['data'], dtype=torch.float32)
        self.treatment = torch.tensor(load_data['treatment'], dtype=torch.float32)
        self.target = torch.tensor(load_data['target'], dtype=torch.float32)
        self.teacher_preds = torch.tensor(load_data['teacher_preds'], dtype=torch.float32)
        self.treatment_indices = load_data['treatment_indices']
        self.control_indices = load_data['control_indices']
        self.pairs = load_data['pairs']

        print(f"Dataset loaded from {path}")


In [80]:
from src.utils import get_paths_train_test, train_test_model
from src.models.CausalML.Models import UpliftRandomForestModel
from src.global_params import BASE_PATH
# Restore the saved uplift random forest from its experiment directory;
# it is passed to PairedUpliftDataset as the teacher model further down.
model = UpliftRandomForestModel(
    from_load=True,
    path=f"{BASE_PATH}/exps2/lazada_v2/100/0",
)

Model loaded from /Users/ogrobertino/UpliftModelingResearch/exps2/lazada_v2/100/0/model.pkl.
Config loaded from /Users/ogrobertino/UpliftModelingResearch/exps2/lazada_v2/100/0/config.json.


In [11]:
model.predict

<src.models.CausalML.Models.UpliftRandomForestModel at 0x3476b0950>

In [13]:
# Run configuration: dataset name and feature percentage
# (presumably the share of features kept — verify against get_paths_train_test).
ds_name, features_percent = 'lazada_v2', 100
train_path, test_path = get_paths_train_test(
    ds_name=ds_name,
    features_percent=features_percent,
)

In [101]:
ds_kdsm = PairedUpliftDataset(model, path = train_path)

In [102]:
ds_kdsm.save(BASE_PATH + "/data/lazada_v2_kdsm/train")

Dataset saved to /Users/ogrobertino/UpliftModelingResearch/data/lazada_v2_kdsm/train


In [103]:
ds_kdsm2 = PairedUpliftDataset(model, from_saved_path= BASE_PATH + "/data/lazada_v2_kdsm/train")

Dataset loaded from /Users/ogrobertino/UpliftModelingResearch/data/lazada_v2_kdsm/train


In [104]:
ds_kdsm2

<__main__.PairedUpliftDataset at 0x367007e90>

In [110]:
ds_kdsm2.teacher_preds

tensor([0.5257, 0.4823, 0.4798,  ..., 0.4780, 0.4641, 0.4561])

In [85]:
# Imported in this cell so that a top-to-bottom run works: in file order the
# notebook's only other DataLoader import appears in a later cell.
from torch.utils.data import DataLoader

# One pair per batch, reshuffled each pass, loaded in the main process.
dl = DataLoader(
    ds_kdsm2,
    batch_size=1,
    shuffle=True,
    num_workers=0
)

In [86]:
next(iter(dl))

[tensor([[ 0.0000e+00,  1.6000e+02,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  1.6000e+02,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           1.0000e+02,  0.0000e+00,  0.0000e+00,  0.0000e+00,  6.0206e-01,
           1.2304e+00,  1.2304e+00,  1.2304e+00,  2.3000e+01,  0.0000e+00,
           8.5907e-01,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,  1.0000e+00,
           0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
           0.0000e+00,  0.0000e+00,  1.0000e+00,  1.0000e+00,  0.0000e+00,
           1.0000e+00,  0

In [87]:
kek = model.predict(NumpyDataset(train_path))

In [90]:
kek['score']

0         0.051454
1         0.035456
2         0.040372
3         0.040695
4         0.038738
            ...   
694997    0.027096
694998    0.031598
694999    0.043906
695000    0.071883
695001    0.087825
Name: score, Length: 695002, dtype: float64

In [96]:
def f(x):
    """
    Convert one prediction row's score into a per-group value.

    Reads ``x.treatment`` and ``x['score']``. Rows with treatment == 1 get
    (1 + score) / 2; every other row gets (1 - score) / 2.
    """
    sign = 1 if x.treatment == 1 else -1
    return (1 + sign * x['score']) / 2
    
kek['score2'] = kek.apply(f, axis=1)

In [99]:
kek

Unnamed: 0,score,treatment,target,score2
0,0.051454,1,0,0.525727
1,0.035456,0,0,0.482272
2,0.040372,0,0,0.479814
3,0.040695,0,0,0.479652
4,0.038738,1,0,0.519369
...,...,...,...,...
694997,0.027096,0,0,0.486452
694998,0.031598,0,0,0.484201
694999,0.043906,0,0,0.478047
695000,0.071883,0,0,0.464059


In [92]:
# Element-wise selection. A plain Python `if`/`else` on a boolean Series raises
# "The truth value of a Series is ambiguous", so the choice is made per row
# with Series.where: treated rows keep (1 + score) / 2, the rest get (1 - score) / 2.
((1 + kek["score"]) / 2).where(kek["treatment"] == 1, (1 - kek["score"]) / 2)

In [91]:
(1 - kek["score"]) / 2

0         0.474273
1         0.482272
2         0.479814
3         0.479652
4         0.480631
            ...   
694997    0.486452
694998    0.484201
694999    0.478047
695000    0.464059
695001    0.456088
Name: score, Length: 695002, dtype: float64

In [29]:
torch.tensor(kek["score"].values)

tensor([0.0515, 0.0355, 0.0404,  ..., 0.0439, 0.0719, 0.0878],
       dtype=torch.float64)

In [62]:
from torch.utils.data import DataLoader
# Loader over the freshly built paired dataset: single-pair batches,
# shuffled, loaded in the main process.
dl = DataLoader(ds_kdsm, batch_size=1, shuffle=True, num_workers=0)

In [68]:
BASE_PATH + "/exps2/lazada_v2_kdsm/train"

'/Users/ogrobertino/UpliftModelingResearch/exps2/lazada_v2_kdsm/train'

In [63]:
kek = next(iter(dl))

In [41]:
ds_kdsm[0]

(tensor([ 0.0000e+00,  3.6500e+02,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  3.6500e+02,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          1.0000e+02,  0.0000e+00,  0.0000e+00,  0.0000e+00,  6.9897e-01,
          1.5682e+00,  1.5682e+00,  1.5682e+00,  1.7000e+01,  0.0000e+00,
          9.2991e-01,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          1.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,  1.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  1.0000e+00,  1.0000e+00,  0.0000e+00,
          1.0000e+00,  0.0000e+00,  0.