In [1]:
import warnings
# Blanket filter: suppresses every warning category for the rest of the session,
# including deprecation and dtype warnings raised by the cells that follow.
warnings.filterwarnings('ignore')

In [2]:
import os

import numpy as np
import pandas as pd

# ✅ Загрузим X5 Retail Hero dataset

In [3]:
# Directory holding the dataset CSV files loaded in the next cell.
# Relative path, so it is resolved against the kernel's working directory.
DATA_PATH = 'data/retailhero-uplift/data'

In [4]:
%%time

# Read the five source tables from DATA_PATH, in the same order as the names
# on the left-hand side; each file is parsed with pandas' default CSV settings.
clients, products, purchases, train, test = (
    pd.read_csv(os.path.join(DATA_PATH, csv_name))
    for csv_name in (
        'clients.csv',
        'products.csv',
        'purchases.csv',
        'uplift_train.csv',
        'uplift_test.csv',
    )
)

CPU times: user 18.7 s, sys: 5.27 s, total: 23.9 s
Wall time: 26 s


In [5]:
%%time

product_cols = ['product_id', 'is_alcohol', 'is_own_trademark']

# Attach the product flags to every purchase row. The merge is skipped when the
# flag columns are already present, so re-running this cell does not produce
# suffixed `_x` / `_y` duplicates that would break the aggregation below.
if not set(product_cols).issubset(purchases.columns):
    purchases = purchases.merge(products[product_cols],
                                on='product_id',
                                how='left'
                               )

purchases['transaction_datetime'] = pd.to_datetime(purchases['transaction_datetime'])

# Fixed date against which recency is measured.
reference_date = pd.Timestamp('2019-03-18')

# Per-client aggregates (one row per client_id, client_id becomes the index).
by_client = purchases.groupby('client_id')
client_features = by_client.agg(
    n_transactions=('transaction_id', 'nunique'),
    sum_trn_sum=('trn_sum_from_iss', 'sum'),
    mean_trn_sum=('trn_sum_from_iss', 'mean'),
    sum_quantity=('product_quantity', 'sum'),
    alcohol_bought=('is_alcohol', 'sum'),
    own_trademark_bought=('is_own_trademark', 'sum'),
    n_unique_products=('product_id', 'nunique'),
    last_transaction=('transaction_datetime', 'max'),
)

# Distinct calendar days with at least one purchase. Timestamps are truncated
# to midnight first; counting raw timestamps would count distinct moments
# (roughly one per transaction) instead of days.
transaction_day = purchases['transaction_datetime'].dt.normalize()
client_features['n_days_active'] = (
    transaction_day.groupby(purchases['client_id']).nunique()
)

# Whole days between the reference date and the client's MOST RECENT purchase.
# Taking the max of (reference - timestamp) would instead measure the oldest
# purchase, so the latest timestamp is reduced first and subtracted afterwards.
client_features['last_transaction_days_ago'] = (
    reference_date - client_features.pop('last_transaction')
).dt.days

CPU times: user 36.5 s, sys: 2.39 s, total: 38.9 s
Wall time: 39.5 s


In [6]:
def parse_birthdate(client_id_str):
    """Interpret the first six characters of an identifier as a YYMMDD date.

    The value is converted with ``str()`` and only its first six characters
    are inspected. NOTE(review): whether the identifier really encodes a
    birth date is an assumption of the caller -- confirm against the data.

    Parameters
    ----------
    client_id_str : object
        Identifier to inspect; missing values (per ``pd.isna``) are accepted.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        The date in the 1900s when that is a valid calendar date, otherwise
        the same month/day in the 2000s when that is valid. ``NaT`` is
        returned for missing input, a prefix that is not exactly six digits,
        a month or day equal to ``00`` or ``99``, or a date invalid in both
        centuries.
    """
    if pd.isna(client_id_str):
        return pd.NaT

    prefix = str(client_id_str)[:6]
    if not (len(prefix) == 6 and prefix.isdigit()):
        return pd.NaT

    year2, month, day = prefix[:2], prefix[2:4], prefix[4:6]

    # 00 / 99 in the month or day position are treated as "unknown" markers.
    placeholders = {'00', '99'}
    if month in placeholders or day in placeholders:
        return pd.NaT

    # Prefer the 1900s; fall back to the 2000s only when the 19YY date does
    # not parse (e.g. 29 February of a year that is not a leap year in 19YY).
    for century in ('19', '20'):
        try:
            return pd.Timestamp(f'{century}{year2}-{month}-{day}')
        except ValueError:
            continue
    return pd.NaT

In [7]:
%%time

# Parse the two date columns into datetimes (unparsed / missing values become NaT).
clients['first_issue_date'] = pd.to_datetime(clients['first_issue_date'])
clients['first_redeem_date'] = pd.to_datetime(clients['first_redeem_date'])

# Seconds since the Unix epoch, computed column-wise. Both columns go through
# the same conversion, so a NaT in either one yields NaN instead of raising
# (mapping `pd.Timestamp.timestamp` over a column fails on NaT).
# Assumes the parsed columns are tz-naive -- TODO confirm if the CSV ever
# carries UTC offsets.
unix_epoch = pd.Timestamp('1970-01-01')
clients['issue_ts'] = (clients['first_issue_date'] - unix_epoch).dt.total_seconds()
clients['redeem_ts'] = (clients['first_redeem_date'] - unix_epoch).dt.total_seconds()

# Seconds elapsed between issue and first redeem; NaN when either date is missing.
clients['redeem_issue_diff'] = clients['redeem_ts'] - clients['issue_ts']

# Age in (fractional) years at the fixed reference date, derived from the date
# that parse_birthdate extracts from client_id; NaN where no date is parsed.
birth_dates = clients['client_id'].apply(parse_birthdate)
clients['age'] = (pd.Timestamp('2019-03-18') - birth_dates).dt.days / 365.25

CPU times: user 1.06 s, sys: 29.5 ms, total: 1.08 s
Wall time: 1.08 s


In [8]:
# объединяем с агрегатами:
train_full = train.merge(clients, on='client_id', how='left')
train_full = train_full.merge(client_features, on='client_id', how='left')

test_full = test.merge(clients, on='client_id', how='left')
test_full = test_full.merge(client_features, on='client_id', how='left')