# ✅ Импорты

In [1]:
import warnings
# Install a global 'ignore' filter: every warning category is silenced from here on.
# NOTE(review): this also hides deprecation warnings emitted by the pandas/sklearn
# calls in later cells — confirm that blanket suppression is intended.
warnings.filterwarnings('ignore')

In [2]:
import os

import numpy as np
import pandas as pd

from sklearn.linear_model import HuberRegressor

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns

# ✅ Загрузим X5 Retail Hero dataset

In [3]:
# Directory the notebook reads the input CSV files from and writes the preprocessed CSVs to.
DATA_PATH = 'data/retailhero-uplift/data'
# Seed passed as `random_state` to the imputer and to the train/valid split.
SEED = 8

In [4]:
%%time

# Read the five source tables from DATA_PATH, in the same order as the names on the left.
clients, products, purchases, train, test = (
    pd.read_csv(os.path.join(DATA_PATH, file_name))
    for file_name in (
        'clients.csv',
        'products.csv',
        'purchases.csv',
        'uplift_train.csv',
        'uplift_test.csv',
    )
)

CPU times: user 19.1 s, sys: 5.04 s, total: 24.1 s
Wall time: 26.1 s


## ⚒️ Препроцессинг X5 Retail Hero dataset

In [5]:
%%time

# Product attributes attached to every purchase row.
product_cols = ['product_id', 'is_alcohol', 'is_own_trademark']

# NOTE(review): this rebinds `purchases`; re-running the cell without reloading the
# data would merge the same columns a second time.
purchases = purchases.merge(products[product_cols],
                            on='product_id',
                            how='left'
                           )

purchases['transaction_datetime'] = pd.to_datetime(purchases['transaction_datetime'])

# Date against which transaction recency is measured.
reference_date = pd.Timestamp('2019-03-18')

# Per-client aggregates:
client_features = purchases.groupby('client_id').agg(
    n_transactions=('transaction_id', 'nunique'),
    sum_trn_sum=('trn_sum_from_iss', 'sum'),
    mean_trn_sum=('trn_sum_from_iss', 'mean'),
    sum_quantity=('product_quantity', 'sum'),
    alcohol_bought=('is_alcohol', 'sum'),
    own_trademark_bought=('is_own_trademark', 'sum'),
    n_unique_products=('product_id', 'nunique'),
    # Helper column: latest transaction timestamp, turned into a day count below.
    last_transaction=('transaction_datetime', 'max'),
)

# Number of distinct calendar days with at least one transaction. Timestamps are
# truncated to midnight first; counting raw timestamps would count transactions
# made at different times of the same day separately.
client_features['n_days_active'] = (
    purchases['transaction_datetime']
    .dt.normalize()
    .groupby(purchases['client_id'])
    .nunique()
)

# Whole days between the reference date and the client's MOST RECENT transaction.
# (Taking the maximum of the differences instead would measure the earliest one.)
client_features['last_transaction_days_ago'] = (
    reference_date - client_features.pop('last_transaction')
).dt.days

CPU times: user 36.1 s, sys: 1.87 s, total: 38 s
Wall time: 38.3 s


In [6]:
%%time

# Parse both card-date columns into pandas datetimes.
for date_col in ('first_issue_date', 'first_redeem_date'):
    clients[date_col] = pd.to_datetime(clients[date_col])

# Issue date as a float timestamp, via Timestamp.timestamp applied to each value.
clients['issue_ts'] = clients['first_issue_date'].map(pd.Timestamp.timestamp)

# Redeem date as a float timestamp; rows without a redeem date stay NaN.
clients['redeem_ts'] = clients['first_redeem_date'].apply(
    lambda moment: np.nan if not pd.notna(moment) else moment.timestamp()
)

CPU times: user 504 ms, sys: 18.2 ms, total: 522 ms
Wall time: 521 ms


In [7]:
# Attach client attributes and the per-client purchase aggregates to each sample
# (left joins on client_id, so every row of train/test is kept).
train_full = (
    train
    .merge(clients, on='client_id', how='left')
    .merge(client_features, on='client_id', how='left')
)

test_full = (
    test
    .merge(clients, on='client_id', how='left')
    .merge(client_features, on='client_id', how='left')
)

In [8]:
def missing_stats(dataset):
    """Summarise how completely each column of ``dataset`` is filled.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column with ``feature`` (column name), ``hitrate``
        (share of non-null values, i.e. 1 minus the null fraction) and
        ``dtype`` (the column's dtype), sorted by ``hitrate`` ascending
        with a fresh 0..n-1 index.
    """
    rows = [
        [name, 1.0 - np.mean(dataset[name].isnull()), dataset[name].dtype]
        for name in dataset.columns
    ]
    stats = pd.DataFrame(rows, columns=['feature', 'hitrate', 'dtype'])
    return stats.sort_values(by=['hitrate'], ascending=True).reset_index(drop=True)

# Fill-rate summary of the merged training sample; the imputation cell further down
# selects its columns from this table.
res_missing = missing_stats(train_full)
# Bare last expression so the notebook renders the table.
res_missing

Unnamed: 0,feature,hitrate,dtype
0,redeem_ts,0.912287,float64
1,first_redeem_date,0.912287,datetime64[ns]
2,n_unique_products,1.0,int64
3,own_trademark_bought,1.0,int64
4,alcohol_bought,1.0,int64
5,sum_quantity,1.0,float64
6,mean_trn_sum,1.0,float64
7,sum_trn_sum,1.0,float64
8,n_transactions,1.0,int64
9,client_id,1.0,object


In [9]:
samples = [train_full, test_full]

to_dt = ['first_issue_date', 'first_redeem_date']

# Encode the date columns as whole seconds since the Unix epoch, with -1 for missing dates.
# Missing values are handled while still in datetime/timedelta form (NaT -> NaN -> -1),
# so no integer NaT sentinel has to be recognised after the division by one second.
# Assumes the dates are tz-naive — TODO confirm.
# NOTE(review): not idempotent — a second run would re-parse the integer seconds as datetimes.
unix_epoch = pd.Timestamp('1970-01-01')
one_second = pd.Timedelta(seconds=1)

for sample in samples:
    for feat in to_dt:
        parsed = pd.to_datetime(sample[feat], errors='coerce')
        epoch_seconds = (parsed - unix_epoch) // one_second
        sample[feat] = epoch_seconds.fillna(-1).astype('int64')

# Columns with gaps, chosen from the fill-rate table computed on the training sample:
# non-object, non-datetime columns go to the iterative imputer, object columns get the mode.
fill_mice = res_missing[
                (res_missing['hitrate'] < 1) &
                ~(res_missing['dtype'] == 'object') &
                ~(res_missing['dtype'] == 'datetime64[ns]')
            ]['feature']
fill_mode = res_missing[
                (res_missing['hitrate'] < 1) &
                (res_missing['dtype'] == 'object')
            ]['feature']

mice_cols = fill_mice.tolist()
mode_cols = fill_mode.tolist()

if mice_cols:
    # Fit the imputer on the training sample only and apply the same fitted model to
    # both samples, so train and test are imputed consistently.
    # NOTE(review): the imputer only sees the columns that have gaps; with a single
    # such column it presumably reduces to its initial (mean) imputation — confirm.
    imputer = IterativeImputer(
        estimator=HuberRegressor(),
        max_iter=10,
        random_state=SEED
    )
    imputer.fit(train_full[mice_cols])
    for sample in samples:
        sample[mice_cols] = imputer.transform(sample[mice_cols])

if mode_cols:
    # DataFrame.mode() returns a frame of modes; take its first row so that fillna
    # matches values by column name instead of aligning on the row index.
    train_modes = train_full[mode_cols].mode()
    if not train_modes.empty:
        most_frequent = train_modes.iloc[0]
        for sample in samples:
            sample[mode_cols] = sample[mode_cols].fillna(most_frequent)

In [10]:
# Numeric encoding of the gender codes; values outside this mapping become NaN.
gender_codes = {'M': 1, 'F': 0, 'U': -1}

for frame in (train_full, test_full):
    frame['gender'] = frame['gender'].map(gender_codes)

In [11]:
# Columns to exclude from both samples (currently none).
drop_feat = []

train_full = train_full.drop(drop_feat, axis=1).copy()
test_full = test_full.drop(drop_feat, axis=1).copy()

# Stratify on the combination of treatment flag and target so that both parts of the
# split keep the same mix of the (treatment, target) groups.
strata = train_full.treatment_flg.astype(str) + train_full.target.astype(str)

x_train_ids, x_valid_ids = train_test_split(train_full,
                                            test_size=0.2,
                                            random_state=SEED,
                                            stratify=strata
                                           )
# Keep only the index labels of the two parts.
x_train_ids, x_valid_ids = x_train_ids.index, x_valid_ids.index

# Mark every row with the part it belongs to. The ids are index LABELS, so they are
# applied with label-based .loc and an explicit column name; this stays correct even
# when the index is not 0..n-1 or 'split' is not the last column.
train_full['split'] = None
train_full.loc[x_train_ids, 'split'] = 'train'
train_full.loc[x_valid_ids, 'split'] = 'valid'

assert train_full['split'].notna().all(), "every training row must be assigned to 'train' or 'valid'"

In [12]:
# Write both preprocessed samples next to the raw data (the index is written as well).
for frame, file_name in (
    (train_full, 'train_full_preproc.csv'),
    (test_full, 'test_full_preproc.csv'),
):
    frame.to_csv(os.path.join(DATA_PATH, file_name))

# ✅ EDA