# Фичи по таргету   

Цель: для каждого пользователя предсказать взятие/невзятие каждого из четырёх продуктов **в течение месяца после отчётной даты**. Исторические данные по этим продуктам находятся в targets.

In [1]:
import numpy as np

import pandas as pd
from pandas.api.types import is_float_dtype, is_integer_dtype

from collections import Counter
from sklearn.utils import resample

from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

import gc
import glob
import pyarrow.parquet as pq
from tqdm import trange, tqdm

  from pandas.core import (


In [2]:
from typing import List, Optional, Tuple

In [3]:
import warnings

# Silence all warnings for the rest of the session so that cell output stays readable.
warnings.filterwarnings('ignore')
# NOTE(review): the blanket filter above already ignores every category, so this
# narrower pandas/UserWarning filter looks redundant — confirm before removing.
warnings.filterwarnings('ignore', category=UserWarning, module='pandas')

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Settings
# Remove the limit on the number of displayed columns
pd.set_option("display.max_columns", None)
# Set the default plotting theme: an 8-colour dark palette built from 'skyblue', reversed
sb_dark = sns.dark_palette('skyblue', 8, reverse=True) # teal
sns.set(palette=sb_dark)

In [5]:
# Enable tqdm for pandas so that progress_apply() can be used instead of plain apply()
tqdm.pandas() 
# Show every column and up to 200 rows when a frame is displayed
pd.options.display.max_columns = None
pd.options.display.max_rows = 200

In [6]:
# Small constant. It is not referenced in the visible part of the notebook;
# presumably meant as a guard against division by zero — TODO confirm or remove.
eps = 1e-6

In [7]:
# Base folder; the empty string makes every path relative to the working directory
PATH = ''
PATH_DATASET = f'{PATH}datasets/sber_source/'
PATH_DATASET_OUTPUT = f'{PATH}datasets/'

PATH_DATASET_TARGET_TRAIN = f'{PATH_DATASET}train_target.parquet/'
PATH_DATASET_TARGET_TEST = f'{PATH_DATASET}test_target_b.parquet/'

# Targets: list the parquet part files inside the train and test target folders
train_target_files, test_target_files = (
    glob.glob(target_folder + '/*.parquet')
    for target_folder in (PATH_DATASET_TARGET_TRAIN, PATH_DATASET_TARGET_TEST)
)

len(train_target_files), len(test_target_files)

(11, 11)

In [8]:
%%time
# Load the product-sale facts (targets) for the train clients from the
# previously saved compressed parquet file
targets_df = pq.read_table(PATH_DATASET_OUTPUT + 'compress_targets_08_06_2024.parquet').to_pandas()
# targets_df = targets_df.rename(columns={'mon': 'report_next_end'})
# Turn the stored index into regular columns
# (presumably it holds client_id and/or mon — TODO confirm against the saved file)
targets_df = targets_df.reset_index()
# Keep only the client key, the month and the four product targets
targets_df = targets_df[['client_id', 'mon', 'target_1', 'target_2', 'target_3', 'target_4']]
targets_df.shape

CPU times: total: 3.03 s
Wall time: 2.06 s


(11686066, 6)

In [9]:
# The data contains duplicated (client, reporting month) pairs. According to the
# author these rows always hold zeros, so the duplicates are simply dropped
# (drop_duplicates keeps the first occurrence of each pair).
targets_df = targets_df.drop_duplicates(subset=['client_id', 'mon'])
targets_df.shape

(11654375, 6)

In [10]:
%%time
# Рассчитываем факт приобретения клиентом когда-либо продукта 1 или 2/3/4
def get_group_targets(df: pd.DataFrame) -> pd.DataFrame:
    """Add product-combination flags derived from the four target columns.

    The flags are computed row by row from ``target_1`` .. ``target_4`` and
    written into ``df`` in place; the same frame is returned for convenience.

    Added columns, in this order:

    * ``is_target`` - at least one of the four products was taken;
    * ``is_target_A_B`` / ``is_target_ABC`` - at least one product of the
      listed pair / triple was taken (row-wise maximum);
    * ``is_target_A_and_B`` / ``is_target_and_ABC`` - every product of the
      listed pair / triple was taken;
    * ``is_target_cnt`` - number of products taken.

    Args:
        df: frame containing the columns ``target_1`` .. ``target_4``.

    Returns:
        The input frame with 22 additional columns.
    """
    products = ['target_1', 'target_2', 'target_3', 'target_4']

    # "Any of" flags: 1 when at least one of the listed products was taken.
    any_of = {
        'is_target': products,
        'is_target_1_2': ['target_1', 'target_2'],
        'is_target_1_3': ['target_1', 'target_3'],
        'is_target_1_4': ['target_1', 'target_4'],
        'is_target_2_3': ['target_2', 'target_3'],
        'is_target_2_4': ['target_2', 'target_4'],
        'is_target_3_4': ['target_3', 'target_4'],
        'is_target_123': ['target_1', 'target_2', 'target_3'],
        'is_target_134': ['target_1', 'target_3', 'target_4'],
        'is_target_124': ['target_1', 'target_2', 'target_4'],
        'is_target_234': ['target_2', 'target_3', 'target_4'],
    }
    for flag_name, cols in any_of.items():
        df[flag_name] = df[cols].max(axis=1)

    # "All of" flags: 1 only when every listed product was taken.
    all_of = {
        'is_target_1_and_2': ['target_1', 'target_2'],
        'is_target_1_and_3': ['target_1', 'target_3'],
        'is_target_1_and_4': ['target_1', 'target_4'],
        'is_target_2_and_3': ['target_2', 'target_3'],
        'is_target_2_and_4': ['target_2', 'target_4'],
        'is_target_3_and_4': ['target_3', 'target_4'],
        'is_target_and_123': ['target_1', 'target_2', 'target_3'],
        'is_target_and_134': ['target_1', 'target_3', 'target_4'],
        'is_target_and_124': ['target_1', 'target_2', 'target_4'],
        'is_target_and_234': ['target_2', 'target_3', 'target_4'],
    }
    for flag_name, cols in all_of.items():
        # Compare with the number of products in the group: the triples used to
        # be compared with 2 (copied from the pairs), which flagged "exactly two
        # of three" instead of "all three".
        df[flag_name] = np.where(df[cols].sum(axis=1) == len(cols), 1, 0)

    # Number of products taken
    df['is_target_cnt'] = df[products].sum(axis=1)

    return df

# Add the product-combination flags to the targets frame
# (the frame grows from 6 to 28 columns, see the shape printed below)
targets_df = get_group_targets(targets_df)
targets_df.shape

CPU times: total: 15.5 s
Wall time: 15.3 s


(11654375, 28)

In [11]:
# Target-derived columns used for the monthly aggregates.
# NOTE(review): the three-product "and" flags (is_target_and_123, is_target_and_134,
# is_target_and_124, is_target_and_234) produced by get_group_targets are not
# listed here — confirm that this omission is intentional.
target_columns = ['target_1', 'target_2', 'target_3', 'target_4',
                  'is_target', 'is_target_1_2', 'is_target_1_3',
                  'is_target_1_4', 'is_target_2_3', 'is_target_2_4', 'is_target_3_4',
                  'is_target_1_and_2', 'is_target_1_and_3', 'is_target_1_and_4',
                  'is_target_2_and_3', 'is_target_2_and_4', 'is_target_3_and_4',
                  'is_target_123', 'is_target_134', 'is_target_124', 'is_target_234', 
                  'is_target_cnt']
len(target_columns)

22

In [12]:
%%time
# Monthly totals over all clients: one row per `mon`, one `sum_<column>` column
# for every entry of `target_columns` (defined in the previous cell, same order
# as the aggregations that used to be spelled out here one by one).
# The string alias 'sum' is passed instead of the builtin `sum`: pandas maps the
# builtin to the same groupby sum today, but passing it as a callable is
# deprecated since pandas 2.1.
mon_targets_df = (
    targets_df
    .groupby(by='mon')
    .agg(**{f'sum_{col}': (col, 'sum') for col in target_columns})
    .reset_index()
)

# Join keys for attaching these totals to other reporting months. Rows are
# ordered by `mon` (groupby sorts its keys), therefore:
#   next_mon   - `mon` of the preceding row
#   pre_mon    - `mon` of the following row
#   prepre_mon - `mon` of the row two positions further
# The names describe how the key is used later: joining on `pre_mon` attaches a
# row's totals to the month that follows it, i.e. as "previous month" statistics.
mon_targets_df['next_mon'] = mon_targets_df['mon'].shift(1)
mon_targets_df['pre_mon'] = mon_targets_df['mon'].shift(-1)
mon_targets_df['prepre_mon'] = mon_targets_df['mon'].shift(-2)

mon_targets_df.shape

CPU times: total: 1.22 s
Wall time: 1.22 s


(12, 26)

In [13]:
%%time
# Build per-client features from the target history, one batch of rows per
# reporting month.
begin_date = datetime(2022, 1, 1, 0, 0, 0)
start_date = datetime(2022, 1, 1, 0, 0, 0)

end_date = datetime(2023, 3, 31, 0, 0, 0)

product_targets = ['target_1', 'target_2', 'target_3', 'target_4']

# Blank frame with one row per client
uniq_clients_df = targets_df[['client_id']].drop_duplicates()
# Per-month feature frames. They are concatenated once after the loop instead of
# growing the result with pd.concat on every iteration, which re-copied all
# previously accumulated rows each time.
monthly_client_frames = []

# Walk month by month; the statistics of a client use the current and all
# earlier months.
# NOTE(review): the iteration count is derived from a 30-day approximation and
# the loop also visits months without any target rows (the first month and the
# months after the last month present in targets_df) — confirm that feature rows
# for those months are wanted.
n_iterations = (end_date - start_date).days // 30 + 1
for _ in trange(n_iterations):
    end_date = start_date + relativedelta(months=1) - relativedelta(days=1)
    print(f'start: {start_date}, end: {end_date}')    
    select_mon_current_df = targets_df[targets_df['mon'].between(start_date, end_date)]
    select_mon_full_df = targets_df[targets_df['mon'].between(begin_date, end_date)]
    print(select_mon_current_df.shape, select_mon_full_df.shape)

    # The features are keyed by the last day of the month after the current one
    client_agg_df = uniq_clients_df.copy()
    report_next_end = start_date + relativedelta(months=2) - relativedelta(days=1)
    client_agg_df['report_next_end'] = report_next_end
    client_agg_df = client_agg_df.set_index('client_id')

    select_mon_full_df = select_mon_full_df.set_index('client_id')
    for cur_tar in product_targets + ['is_target']:
        # Dates of the first and the last purchase of the product
        buyers_df = select_mon_full_df[select_mon_full_df[cur_tar] == 1]
        min_max_date_buy = buyers_df.groupby(by='client_id').agg(
            first_day_buy=('mon', 'min'),
            last_day_buy=('mon', 'max'),
        )
        client_agg_df = client_agg_df.merge(min_max_date_buy, left_index=True, right_index=True, how='left')
        client_agg_df[f'days_first_buy_{cur_tar}'] = (client_agg_df['report_next_end'] - client_agg_df['first_day_buy']).dt.days
        client_agg_df[f'days_last_buy_{cur_tar}'] = (client_agg_df['report_next_end'] - client_agg_df['last_day_buy']).dt.days
        client_agg_df = client_agg_df.drop(columns=['first_day_buy', 'last_day_buy'])
        # NOTE(review): clients without any purchase get 0 days here, the same
        # value a purchase made exactly on report_next_end would produce.
        client_agg_df = client_agg_df.fillna(0)

    # Number of purchases per product over the whole period
    all_period_sums = select_mon_full_df.groupby(by='client_id').agg(
        **{f'sum_{tar}_by_all_period': (tar, 'sum') for tar in product_targets}
    )
    client_agg_df = client_agg_df.merge(all_period_sums, left_index=True, right_index=True, how='left')

    # Share of each product in all purchases of the client
    # (0/0 for clients without purchases gives NaN, which is replaced by 0)
    all_period_cols = [f'sum_{tar}_by_all_period' for tar in product_targets]
    client_agg_df['sum_all_target_by_all_period'] = client_agg_df[all_period_cols].sum(axis=1)
    for tar in product_targets:
        client_agg_df[f'prc_{tar}by_all_trgs'] = (
            client_agg_df[f'sum_{tar}_by_all_period'] / client_agg_df['sum_all_target_by_all_period']
        ).fillna(0)

    # Average number of products the client buys per month
    # (period length approximated as days / 30)
    cnt_month = (end_date - begin_date).days / 30
    client_agg_df['mean_all_target_by_per_mon'] = client_agg_df['sum_all_target_by_all_period'] / cnt_month
    for tar in product_targets:
        client_agg_df[f'mean_{tar}_by_per_mon'] = client_agg_df[f'sum_{tar}_by_all_period'] / cnt_month

    # Number of purchases per product over the last 1..6 months before report_next_end
    for n_mon in range(1, 7):
        window_start = report_next_end - relativedelta(months=n_mon)
        window_sums = select_mon_full_df[select_mon_full_df['mon'] >= window_start].groupby(by='client_id').agg(
            **{f'sum_{tar}_by_{n_mon}_mon': (tar, 'sum') for tar in product_targets}
        )
        client_agg_df = client_agg_df.merge(window_sums, left_index=True, right_index=True, how='left')

    # Inactivity period: gaps in days between consecutive months in which the
    # client did not take product 1.
    # NOTE(review): only target_1 is used although the column names are generic.
    period_noactive_target = select_mon_full_df[select_mon_full_df['target_1'] == 0].sort_values(by=['client_id', 'mon'])
    period_noactive_target['shift_mon'] = period_noactive_target.groupby('client_id')['mon'].shift(1)
    period_noactive_target['period_noactive_target'] = (period_noactive_target['mon'] - period_noactive_target['shift_mon']).dt.days.fillna(0)
    period_noactive_target = period_noactive_target.groupby(by='client_id').agg(
        max_period_noactive_target=('period_noactive_target', 'max'),
        min_period_noactive_target=('period_noactive_target', 'min'),
        avg_period_noactive_target=('period_noactive_target', 'mean'),
        median_period_noactive_target=('period_noactive_target', 'median'),
    )
    client_agg_df = client_agg_df.merge(period_noactive_target, left_index=True, right_index=True, how='left')

    monthly_client_frames.append(client_agg_df)
    start_date = start_date + relativedelta(months=1)

# Resulting dataset: all monthly batches stacked in chronological order
union_client_agg_df = pd.concat(monthly_client_frames)
union_client_agg_df.shape

  0%|          | 0/16 [00:00<?, ?it/s]

start: 2022-01-01 00:00:00, end: 2022-01-31 00:00:00
(0, 28) (0, 28)


  6%|▋         | 1/16 [00:03<00:59,  3.97s/it]

start: 2022-02-01 00:00:00, end: 2022-02-28 00:00:00
(994380, 28) (994380, 28)


 12%|█▎        | 2/16 [00:21<02:51, 12.22s/it]

start: 2022-03-01 00:00:00, end: 2022-03-31 00:00:00
(994380, 28) (1988760, 28)


 19%|█▉        | 3/16 [00:43<03:34, 16.47s/it]

start: 2022-04-01 00:00:00, end: 2022-04-30 00:00:00
(994380, 28) (2983140, 28)


 25%|██▌       | 4/16 [01:07<03:54, 19.55s/it]

start: 2022-05-01 00:00:00, end: 2022-05-31 00:00:00
(994380, 28) (3977520, 28)


 31%|███▏      | 5/16 [01:34<04:03, 22.10s/it]

start: 2022-06-01 00:00:00, end: 2022-06-30 00:00:00
(994380, 28) (4971900, 28)


 38%|███▊      | 6/16 [02:02<04:03, 24.31s/it]

start: 2022-07-01 00:00:00, end: 2022-07-31 00:00:00
(994380, 28) (5966280, 28)


 44%|████▍     | 7/16 [02:33<03:57, 26.40s/it]

start: 2022-08-01 00:00:00, end: 2022-08-31 00:00:00
(994380, 28) (6960660, 28)


 50%|█████     | 8/16 [03:05<03:45, 28.21s/it]

start: 2022-09-01 00:00:00, end: 2022-09-30 00:00:00
(994380, 28) (7955040, 28)


 56%|█████▋    | 9/16 [03:39<03:28, 29.84s/it]

start: 2022-10-01 00:00:00, end: 2022-10-31 00:00:00
(994380, 28) (8949420, 28)


 62%|██████▎   | 10/16 [04:16<03:13, 32.27s/it]

start: 2022-11-01 00:00:00, end: 2022-11-30 00:00:00
(948294, 28) (9897714, 28)


 69%|██████▉   | 11/16 [04:53<02:48, 33.67s/it]

start: 2022-12-01 00:00:00, end: 2022-12-31 00:00:00
(902769, 28) (10800483, 28)


 75%|███████▌  | 12/16 [05:31<02:19, 34.76s/it]

start: 2023-01-01 00:00:00, end: 2023-01-31 00:00:00
(853892, 28) (11654375, 28)


 81%|████████▏ | 13/16 [06:10<01:48, 36.20s/it]

start: 2023-02-01 00:00:00, end: 2023-02-28 00:00:00
(0, 28) (11654375, 28)


 88%|████████▊ | 14/16 [06:47<01:12, 36.35s/it]

start: 2023-03-01 00:00:00, end: 2023-03-31 00:00:00
(0, 28) (11654375, 28)


 94%|█████████▍| 15/16 [07:20<00:35, 35.37s/it]

start: 2023-04-01 00:00:00, end: 2023-04-30 00:00:00
(0, 28) (11654375, 28)


100%|██████████| 16/16 [07:51<00:00, 29.45s/it]

CPU times: total: 7min 50s
Wall time: 7min 51s





(15910080, 53)

In [14]:
%%time
# Уменьшение размера датафрейма, для таргетов, транзакцй и для фичей
def series_to_int(col_df: pd.Series) -> pd.Series:
    """Cast a series to the narrowest signed integer type that holds its range.

    The minimum and maximum of the series are compared with the limits of
    int8, int16 and int32 in turn; int64 is used when none of them fits.

    Args:
        col_df: series of whole numbers. Callers are expected to fill missing
            values before calling (``compression_df`` does so).

    Returns:
        The series converted to int8, int16, int32 or int64.
    """
    min_val = col_df.min()
    max_val = col_df.max()
    for int_type in ('int8', 'int16', 'int32'):
        limits = np.iinfo(int_type)
        if min_val >= limits.min and max_val <= limits.max:
            return col_df.astype(int_type)
    return col_df.astype('int64')


def _is_integral(values: np.ndarray) -> bool:
    """Return True when every value is a finite whole number that fits int64."""
    if not np.isfinite(values).all():
        return False
    if (np.abs(values) >= 2.0 ** 63).any():
        return False
    return bool((values == np.trunc(values)).all())


def compression_df(df: pd.DataFrame,
                   datetime_cols: Optional[List[str]] = None,
                   category_cols: Optional[List[str]] = None) -> pd.DataFrame:
    """Reduce the memory footprint of a frame (targets, transactions, features).

    Columns are converted in place, and the same frame is returned:

    * columns listed in ``category_cols`` become ``category``;
    * columns listed in ``datetime_cols`` are parsed with ``pd.to_datetime``
      when they have ``object`` dtype and are left untouched otherwise;
    * integer columns are narrowed with ``series_to_int`` (int8 is kept as is);
    * float columns that hold only whole numbers (missing values counted as 0)
      get their missing values replaced by 0 and are converted to the narrowest
      integer type;
    * remaining float64 columns are downcast to float32.

    The whole-number test runs on the original values, before any float32
    downcast: float32 cannot represent every integer above 2**24, so
    downcasting first would silently round such values before the integer
    conversion. Columns containing infinities are treated as non-integral
    instead of failing on the integer cast.

    Args:
        df: frame to compress; modified in place.
        datetime_cols: names of columns to treat as datetimes.
        category_cols: names of columns to convert to ``category``.

    Returns:
        The same frame with compressed column types.
    """
    datetime_cols = [] if datetime_cols is None else datetime_cols
    category_cols = [] if category_cols is None else category_cols

    for col in df.columns:
        column = df[col]
        if col in category_cols:
            df[col] = column.astype('category')
        elif col in datetime_cols:
            if column.dtype == 'object':
                df[col] = pd.to_datetime(column)
        elif is_integer_dtype(column):
            # int8 is already the narrowest type
            if column.dtype != 'int8':
                df[col] = series_to_int(column)
        elif is_float_dtype(column):
            filled = column.fillna(0)
            # Can the column be stored as integers without losing information?
            if _is_integral(np.asarray(filled, dtype='float64')):
                df[col] = series_to_int(filled)
            elif column.dtype == 'float64':
                df[col] = column.astype('float32')
    return df
# Compress the per-client feature frame. compression_df only acts on listed
# names that match an existing column, so listing 'report_end' as well is harmless.
union_client_agg_df = compression_df(union_client_agg_df, 
                            datetime_cols=['report_end' ,'report_next_end'],
                           )

# Compress the monthly totals; the month-key columns are listed as datetime
# columns so that they take the datetime branch and not the numeric ones.
mon_targets_df = compression_df(mon_targets_df, 
                            datetime_cols=['mon', 'pre_mon', 'next_mon', 'prepre_mon'],
                           )
union_client_agg_df.shape, mon_targets_df.shape

CPU times: total: 1min 2s
Wall time: 1min 3s


((15910080, 53), (12, 26))

## Объединяем с агрегированными данными по месяцам

In [15]:
# union_client_agg_df['report_next_end'].min()
# Move client_id from the index into a regular column and index the frame by
# report_next_end, so that the monthly totals can be joined on the index below.
union_client_agg_df = union_client_agg_df.reset_index('client_id').set_index('report_next_end')
union_client_agg_df.shape

(15910080, 53)

In [16]:
%%time
# Attach the monthly totals to every (report_next_end, client) row.
# `pre_mon` / `prepre_mon` in mon_targets_df hold the month one / two rows after
# the row's own `mon`, so joining on them brings in the totals of the month one /
# two rows before report_next_end. Totals of the current and the following month
# are deliberately not joined: that would be a kind of look-ahead, which is why
# the month before the previous one is added instead.
month_key_cols = ['mon', 'next_mon', 'pre_mon', 'prepre_mon']
for key_col, prefix in (('pre_mon', 'agg_premon_'), ('prepre_mon', 'agg_prepremon_')):
    unused_keys = [col for col in month_key_cols if col != key_col]
    monthly_stats = mon_targets_df.drop(columns=unused_keys).set_index(key_col).add_prefix(prefix)
    union_client_agg_df = union_client_agg_df.merge(
        monthly_stats,
        left_index=True,
        right_index=True,
        how='left',
    )
    gc.collect()
gc.collect()

# Months without matching totals get 0; afterwards the frame is indexed and
# sorted by (client_id, report_next_end).
union_client_agg_df = (
    union_client_agg_df
    .fillna(0)
    .reset_index()
    .rename(columns={'index': 'report_next_end'})
    .sort_values(by=['client_id', 'report_next_end'])
    .set_index(['client_id', 'report_next_end'])
)

union_client_agg_df.shape

CPU times: total: 47.6 s
Wall time: 45 s


(15910080, 96)

In [17]:
%%time
# Save the compressed feature frame to a parquet file in the output folder
union_client_agg_df.to_parquet(PATH_DATASET_OUTPUT + 'client_agg_target_09_06_2024.parquet')

CPU times: total: 27.9 s
Wall time: 25.4 s
