## Dataset processing

In [50]:
%load_ext autoreload
%autoreload 2
import pandas as pd
from src.distributions_check import check_feature_distributions_by_stat_test, check_feature_distributions_by_model

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Loading datasets from the internet

#### Criteo

https://huggingface.co/datasets/criteo/criteo-uplift https://www.uplift-modeling.com/en/latest/api/datasets/fetch_criteo.html

In [15]:
# Read the gzip-compressed Criteo uplift v2.1 CSV directly from the Hugging Face Hub.
# NOTE(review): the hf:// scheme presumably requires `huggingface_hub` to be installed — confirm.
df = pd.read_csv("hf://datasets/criteo/criteo-uplift/criteo-research-uplift-v2.1.csv.gz")

In [16]:
# For each outcome column, print its mean among rows with treatment == 1
# followed by its mean among rows with treatment == 0.
is_treated = df['treatment'] == 1
is_control = df['treatment'] == 0
for outcome in ('conversion', 'exposure', 'visit'):
    print(df.loc[is_treated, outcome].mean(), df.loc[is_control, outcome].mean())

0.0030894610674129645 0.0019375880152813366
0.036036727482199896 0.0
0.048543360048743316 0.03820095691954503


In [17]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows (rounded down) as the test set.
count_for_test = int(0.25 * len(df))
# Fix the shuffle seed so the split, and the distribution checks that run on
# it below, give the same result after a kernel restart.
train_data, test_data = train_test_split(df, test_size=count_for_test, random_state=42)

In [52]:
# Compare train vs. test feature distributions with the project's statistical-test helper.
# plot=False / print_=False presumably suppress plots and per-feature output — confirm in src.distributions_check.
check_feature_distributions_by_stat_test(train_data, test_data, plot=False, print_=False)


Распределения похожи для всех фичей (p-value >= 0.05).


In [53]:
# Model-based comparison of the train and test splits (presumably a classifier
# trained to tell the two apart — confirm in src.distributions_check).
# NOTE(review): the recorded output pairs AUC ~ 0.50 with a "datasets differ"
# message. An AUC this close to 0.5 normally means the splits cannot be told
# apart, so the decision rule inside the helper should be double-checked.
check_feature_distributions_by_model(train_data, test_data)

AUC: 0.4986
Тренировочные и тестовые датасеты РАЗЛИЧАЮТСЯ!


#### Lazada

#### Lenta

#### X5

In [None]:
def sample_features(self, percent, output_dir, seed=None):
    """Write a random subset of feature columns of the train/test arrays to TSV.

    The same column indices are applied to ``self.train_data`` and
    ``self.test_data``. Results are written to ``<output_dir>/train.tsv`` and
    ``<output_dir>/test.tsv`` as tab-separated text without index or header.

    Args:
        self: Object exposing 2-D ``train_data`` and ``test_data`` arrays that
            support ``[:, indices]`` indexing.
        percent: Fraction of columns to keep, in ``[0, 1]`` (``0.5`` keeps
            half of them). Despite the name this is not a 0-100 percentage.
        output_dir: Directory for the output files; created if missing.
        seed: Optional seed for a dedicated random generator, making the
            selection reproducible. With ``None`` (default) NumPy's global
            random state is used, as before.

    Raises:
        ValueError: If ``percent`` is outside ``[0, 1]``.
    """
    # Reject bad fractions up front: a negative value used to drop columns from
    # the end of the permutation and a value above 1 silently kept everything.
    if not 0 <= percent <= 1:
        raise ValueError(f"percent must be a fraction in [0, 1], got {percent!r}")

    os.makedirs(output_dir, exist_ok=True)

    # Number of columns (features) available and number to keep. The small
    # epsilon stops float error from truncating exact products, e.g.
    # 100 * 0.29 == 28.999999999999996 must yield 29, not 28.
    n_features = self.train_data.shape[1]
    n_selected = int(n_features * percent + 1e-9)

    # Shuffle column indices and keep the first n_selected of them.
    if seed is None:
        shuffled = np.random.permutation(n_features)
    else:
        shuffled = np.random.default_rng(seed).permutation(n_features)
    sampled_indices = shuffled[:n_selected]

    # Apply the identical column selection to both splits.
    train_sampled = self.train_data[:, sampled_indices]
    test_sampled = self.test_data[:, sampled_indices]

    # Persist the result.
    train_out_path = os.path.join(output_dir, 'train.tsv')
    test_out_path = os.path.join(output_dir, 'test.tsv')
    pd.DataFrame(train_sampled).to_csv(train_out_path, sep='\t', index=False, header=False)
    pd.DataFrame(test_sampled).to_csv(test_out_path, sep='\t', index=False, header=False)

In [5]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from abc import ABC, abstractmethod


class IDataset(ABC):
    """Common base class for the dataset wrappers defined in this notebook.

    It only declares the ``data`` attribute that subclasses fill in. No
    abstract methods are enforced at the moment (the single candidate below is
    commented out), so the class itself can still be instantiated.
    """

    def __init__(self):
        # Placeholder; subclasses overwrite this with the loaded data.
        self.data = None

    # @abstractmethod
    # def example(self):
    #     """Loads the data"""
    #     pass


class TorchDataset(IDataset, Dataset):
    """Torch dataset backed by one tab-separated file held fully in memory.

    The file is parsed with ``pandas.read_csv`` using its default header
    handling, and the resulting values are stored as a single float32 tensor
    in ``self.data``.
    """

    def __init__(self, path):
        """Read the tab-separated file at ``path`` into ``self.data``."""
        # Both bases are initialised explicitly rather than through super().
        IDataset.__init__(self)
        Dataset.__init__(self)

        table = pd.read_csv(path, sep='\t')
        self.data = torch.tensor(table.values, dtype=torch.float32)

    def __len__(self):
        """Return the length of the underlying tensor (its first dimension)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``self.data[idx]``."""
        return self.data[idx]

class NumpyDataset(IDataset):
    """Train/test dataset pair held as NumPy arrays, with feature sub-sampling.

    Intended flow: ``ds = NumpyDataset(); ds.load(folder);
    ds.sample_features(0.5, out_dir)``.
    """

    def __init__(self, path=None):
        """Create the dataset, optionally reading one TSV file into ``self.data``.

        Args:
            path: Optional path to a tab-separated file. When given, its values
                are stored in ``self.data`` as a float32 torch tensor (the
                previous behaviour). When omitted, ``self.data`` stays ``None``
                and the arrays are expected to come from ``load``.
        """
        IDataset.__init__(self)
        # Declared up front so an instance that has not been loaded fails with
        # a clear message in sample_features().
        self.train_data = None
        self.test_data = None
        if path is not None:
            # NOTE(review): a torch tensor inside a class named NumpyDataset
            # looks like a copy-paste from TorchDataset; kept as is so existing
            # NumpyDataset(path) callers see the same attribute type.
            self.data = torch.tensor(pd.read_csv(path, sep='\t').values, dtype=torch.float32)

    def load(self, path):
        """Load ``train.tsv`` and ``test.tsv`` from ``path`` as NumPy arrays.

        Both files are read with pandas' default header handling, i.e. the
        first row of each file is consumed as column names.
        """
        train_path = os.path.join(path, 'train.tsv')
        test_path = os.path.join(path, 'test.tsv')

        self.train_data = pd.read_csv(train_path, sep='\t').values
        self.test_data = pd.read_csv(test_path, sep='\t').values

    def sample_features(self, percent, output_dir, seed=None):
        """Write a random subset of feature columns of both splits to TSV.

        The same column indices are applied to ``train_data`` and
        ``test_data``. Results go to ``<output_dir>/train.tsv`` and
        ``<output_dir>/test.tsv`` (tab-separated, no index, no header row).

        NOTE(review): the output has no header row while ``load`` reads files
        with a header row, so feeding this output back into ``load`` would
        consume the first data row as column names — confirm the intended
        file format.

        Args:
            percent: Fraction of columns to keep, in ``[0, 1]`` (``0.5`` keeps
                half of them). Despite the name this is not a 0-100 percentage.
            output_dir: Directory for the output files; created if missing.
            seed: Optional seed for a dedicated random generator, making the
                selection reproducible. With ``None`` (default) NumPy's global
                random state is used, as before.

        Raises:
            ValueError: If ``percent`` is outside ``[0, 1]``.
            AttributeError: If ``load`` has not been called yet.
        """
        # Reject bad fractions up front: a negative value used to drop columns
        # from the end of the permutation and a value above 1 silently kept
        # everything.
        if not 0 <= percent <= 1:
            raise ValueError(f"percent must be a fraction in [0, 1], got {percent!r}")
        # AttributeError is used so the failure type matches what an unloaded
        # instance raised before the attributes were pre-declared.
        if self.train_data is None or self.test_data is None:
            raise AttributeError("train_data/test_data are not loaded; call load(path) first")

        os.makedirs(output_dir, exist_ok=True)

        # Number of columns (features) available and number to keep. The small
        # epsilon stops float error from truncating exact products, e.g.
        # 100 * 0.29 == 28.999999999999996 must yield 29, not 28.
        n_features = self.train_data.shape[1]
        n_selected = int(n_features * percent + 1e-9)

        # Shuffle column indices and keep the first n_selected of them.
        if seed is None:
            shuffled = np.random.permutation(n_features)
        else:
            shuffled = np.random.default_rng(seed).permutation(n_features)
        sampled_indices = shuffled[:n_selected]

        # Apply the identical column selection to both splits.
        train_sampled = self.train_data[:, sampled_indices]
        test_sampled = self.test_data[:, sampled_indices]

        # Persist the result.
        train_out_path = os.path.join(output_dir, 'train.tsv')
        test_out_path = os.path.join(output_dir, 'test.tsv')
        pd.DataFrame(train_sampled).to_csv(train_out_path, sep='\t', index=False, header=False)
        pd.DataFrame(test_sampled).to_csv(test_out_path, sep='\t', index=False, header=False)


# NOTE: this usage sketch is out of date — TorchDataset.__init__ takes a file
# `path` (there is no `split` argument) and TorchDataset defines neither
# load() nor sample_features().
# if __name__ == '__main__':
#     # TorchDataset usage example
#     path_to_data = 'path_to_your_dataset_folder'  # Path to the folder with the data
#     output_directory = 'output_folder'  # Folder where the sampled files are saved

#     # TorchDataset: training
#     torch_dataset = TorchDataset(split='train')
#     torch_dataset.load(path_to_data)
#     torch_dataset.sample_features(0.5, output_directory)

#     # TorchDataset: feeding a DataLoader
#     from torch.utils.data import DataLoader

#     train_loader = DataLoader(torch_dataset, batch_size=32, shuffle=True)
#     for batch in train_loader:
#         print(batch)  # Prints one batch (as an example)

#     # NumpyDataset usage example
#     numpy_dataset = NumpyDataset()
#     numpy_dataset.load(path_to_data)
#     numpy_dataset.sample_features(0.5, output_directory)

In [3]:
# !pip install numpy pandas torch