In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as pgo

from datetime import timedelta

In [2]:
train_actions = pd.read_parquet('../data/raw/train_actions.pq', engine='pyarrow')
stokman_catalog = pd.read_parquet('../data/raw/stokman_catalog_preprocessed.pq', engine = 'pyarrow')
catalog_vector_map = pd.read_parquet('../data/raw/catalog_vector_mapping.pq')
npz = np.load('../data/raw/vectors.npz')

In [3]:
vectors_df = pd.DataFrame(npz['arr_0'])
vectors_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.083243,0.048154,0.021908,0.046554,0.01602,0.01739,0.068655,0.003578,-0.101042,0.001614,...,-0.054967,-0.02656,-0.02437,-0.017317,-0.063028,-0.019044,-0.052011,-0.101291,0.106851,0.016866
1,-0.029639,0.046028,0.02195,0.022532,-0.110691,-0.053836,0.057446,0.030473,-0.027846,-0.031509,...,-0.030721,-0.021226,0.032937,0.052792,0.017208,0.001812,0.01339,0.001876,0.078863,-0.012866
2,-0.021771,0.093553,-0.050793,-0.000166,0.049989,0.075398,0.039691,-0.007549,-0.023111,-0.061027,...,-0.120556,-0.007603,-0.05752,0.018594,-0.047227,-0.030704,-0.002737,-0.181192,-0.027767,0.057662
3,-0.010277,0.072384,0.014594,0.097766,0.000443,-0.059049,0.039525,0.008496,-0.037287,-0.064468,...,-0.126487,0.01542,-0.012856,-0.025929,-0.033612,0.076626,0.127795,-0.148789,0.051517,0.061379
4,-0.012528,-0.019212,0.047336,0.084923,0.005401,0.02953,-0.033076,-0.002323,-0.042567,-0.027429,...,-0.091939,0.005534,0.009784,-0.035835,0.001122,0.054081,-0.015952,-0.100471,0.075144,0.01615


In [4]:
train_actions.head()

Unnamed: 0,user_id,loc_user_id,action,date,products,pageId
0,6fd49b56-8cc6-11ed-86e0-002590c0647c,c6e357dc-121d-449d-a744-e9a0b56c2380,7,2024-09-07 00:00:04,[],2571824865
1,6fd49b56-8cc6-11ed-86e0-002590c0647c,c6e357dc-121d-449d-a744-e9a0b56c2380,7,2024-09-07 00:00:08,[],3834364438
2,f9c498ec-5d3b-11ef-86e0-002590c0647c,120c9064-1131-4dc3-8048-44184531b42e,7,2024-09-07 00:00:08,[],2448628415
3,59386b5c-e64f-11ec-8086-002590c82437,f0745572-893f-4e50-bc52-5af47badff5a,7,2024-09-07 00:00:11,[],3875013967
4,badbd396-6cab-11ef-86e0-002590c0647c,73423d85-d47c-4332-8155-5200615302b5,7,2024-09-07 00:00:11,[],3025531174


In [5]:
dates = pd.to_datetime(train_actions.date)
print(f'min_date = {min(dates)}, max_date = {max(dates)}')

min_date = 2024-09-07 00:00:04, max_date = 2024-09-28 23:59:59


Минимальный препроцессинг:

In [6]:
train_actions_processed = train_actions.copy()
train_actions_processed.date = pd.to_datetime(train_actions_processed.date)
train_actions_processed = train_actions_processed.explode('products')
train_actions_processed.products = train_actions_processed.products.fillna(0)
train_actions_processed.products = train_actions_processed.products.astype(int)
train_actions_processed.rename({'products': 'productId'}, axis=1, inplace=True)

In [7]:
stokman_catalog.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89483 entries, 0 to 89482
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   add_date     89483 non-null  datetime64[us]
 1   shop_id      89483 non-null  object        
 2   product_id   89483 non-null  object        
 3   category_id  89483 non-null  object        
 4   price        89483 non-null  int32         
 5   title        89483 non-null  object        
 6   old_price    89483 non-null  int32         
dtypes: datetime64[us](1), int32(2), object(4)
memory usage: 4.1+ MB


In [8]:
stokman_catalog.add_date = pd.to_datetime(stokman_catalog.add_date)
stokman_catalog.product_id = stokman_catalog.product_id.astype(int)

In [9]:
print(train_actions_processed.head())
print(train_actions_processed.info())

                                user_id                           loc_user_id  \
0  6fd49b56-8cc6-11ed-86e0-002590c0647c  c6e357dc-121d-449d-a744-e9a0b56c2380   
1  6fd49b56-8cc6-11ed-86e0-002590c0647c  c6e357dc-121d-449d-a744-e9a0b56c2380   
2  f9c498ec-5d3b-11ef-86e0-002590c0647c  120c9064-1131-4dc3-8048-44184531b42e   
3  59386b5c-e64f-11ec-8086-002590c82437  f0745572-893f-4e50-bc52-5af47badff5a   
4  badbd396-6cab-11ef-86e0-002590c0647c  73423d85-d47c-4332-8155-5200615302b5   

   action                date  productId      pageId  
0       7 2024-09-07 00:00:04          0  2571824865  
1       7 2024-09-07 00:00:08          0  3834364438  
2       7 2024-09-07 00:00:08          0  2448628415  
3       7 2024-09-07 00:00:11          0  3875013967  
4       7 2024-09-07 00:00:11          0  3025531174  
<class 'pandas.core.frame.DataFrame'>
Index: 6859002 entries, 0 to 6580935
Data columns (total 6 columns):
 #   Column       Dtype         
---  ------       -----         
 0   user_

In [10]:
productId_pageId_mapping = train_actions[['products', 'pageId']][
    (train_actions['pageId'].notna()) & 
    (train_actions['products'].apply(len) == 1) & 
    train_actions['action'].isin([0, 7])]
productId_pageId_mapping = productId_pageId_mapping.explode('products')
productId_pageId_mapping.rename(columns={'products':'productId'}, inplace=True)

- 0 - view (просмотр товара)
- 1 - like (лайк товара)
- 2 - addB (добавление товара в корзину)
- 3 - delB (удаление товара из корзины)
- 4 - clearB (удаление всех товаров из корзины)
- 5 - order (оформление заказа)
- 6 - listB (посещение страницы корзины и вывод списка товаров в корзине)
- 7 - visit (посещение страницы с товаром)
- 8 - visitCategory (посещение страницы с группой товаров)
- 9 - search (поиск товара)

In [11]:
ACTIONS = {
    0: 'view',
    1: 'like',
    2: 'addB',
    3: 'delB',
    4: 'clearB',
    5: 'order',
    6: 'listB',
    7: 'visit',
    8: 'visitCategory',
    9: 'search'
}

In [12]:
user_features = train_actions_processed.groupby('user_id').agg(
    total_actions=('action', 'count'), # тотал кол-во действий
    nunique_products_number=('productId', pd.Series.nunique) # кол-во уникальных товаров, с которыми взаимодействовал пользователь
)

# кол-во действий по номеру
for action_code, action_name in ACTIONS.items():
    user_features[f'{action_name}_number'] = train_actions_processed[train_actions_processed['action'] == action_code].groupby('user_id').size()

# доля каждого действия от общего числа действий, возможно потом от этого избавимся т.к. фича примерно тоже самое что и предыдущая
for action_code, action_name in ACTIONS.items():
    user_features[f'fraction_{action_name}_ofAllActions'] = user_features[f'{action_name}_number'] / user_features['total_actions']

# Активность за последние 3 и 7 дней
max_date = train_actions_processed['date'].max()

last_3_days = max_date - timedelta(days=3)
last_7_days = max_date - timedelta(days=7)

user_features['activity_last3days'] = train_actions_processed[train_actions_processed['date'] >= last_3_days].groupby('user_id').size()
user_features['activity_last7days'] = train_actions_processed[train_actions_processed['date'] >= last_7_days].groupby('user_id').size()

In [24]:
user_features.head()

Unnamed: 0_level_0,total_actions,nunique_products_number,view_number,like_number,addB_number,delB_number,clearB_number,order_number,listB_number,visit_number,...,fraction_order_ofAllActions,fraction_listB_ofAllActions,fraction_visit_ofAllActions,fraction_visitCategory_ofAllActions,fraction_search_ofAllActions,activity_last3days,activity_last7days,max_purchase_amount,min_purchase_amount,avg_purchase_amount
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
/*,39,7,5.0,0.0,0.0,0.0,0.0,0.0,2.0,30.0,...,0.0,0.051282,0.769231,0.051282,0.0,0.0,0.0,0.0,0.0,0.0
0000bdba-5180-11eb-8a53-0cc47a6d2fef,29,4,5.0,1.0,1.0,0.0,0.0,0.0,2.0,15.0,...,0.0,0.068966,0.517241,0.172414,0.0,16.0,29.0,0.0,0.0,0.0
0000d5dc-78cf-11ef-86e0-002590c0647c,2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.5,0.5,0.0,0.0,2.0,0.0,0.0,0.0
0001151e-c2b9-11ee-bbb1-002590c82436,786,84,97.0,0.0,0.0,0.0,0.0,0.0,0.0,563.0,...,0.0,0.0,0.716285,0.160305,0.0,106.0,220.0,0.0,0.0,0.0
00014c1e-f9d5-11eb-8a53-0cc47a6d2fef,4,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Объединим по productId, чтобы получить цену каждого товара
tap_with_prices = train_actions_processed.merge(stokman_catalog[['product_id', 'price']], left_on='productId', right_on='product_id', how='left')

# Рассчитаем max, min, avg сумму покупок
purchase_actions = tap_with_prices[tap_with_prices['action'] == 5].groupby('user_id').agg(
    max_purchase_amount=('price', 'max'),
    min_purchase_amount=('price', 'min'),
    avg_purchase_amount=('price', 'mean')
)

# Объединим все вместе
user_features = user_features.merge(purchase_actions, on='user_id', how='left')

In [15]:
user_features.fillna(0, inplace=True)

In [23]:
user_features.columns

Index(['total_actions', 'nunique_products_number', 'view_number',
       'like_number', 'addB_number', 'delB_number', 'clearB_number',
       'order_number', 'listB_number', 'visit_number', 'visitCategory_number',
       'search_number', 'fraction_view_ofAllActions',
       'fraction_like_ofAllActions', 'fraction_addB_ofAllActions',
       'fraction_delB_ofAllActions', 'fraction_clearB_ofAllActions',
       'fraction_order_ofAllActions', 'fraction_listB_ofAllActions',
       'fraction_visit_ofAllActions', 'fraction_visitCategory_ofAllActions',
       'fraction_search_ofAllActions', 'activity_last3days',
       'activity_last7days', 'max_purchase_amount', 'min_purchase_amount',
       'avg_purchase_amount'],
      dtype='object')

In [16]:
user_features.head()

Unnamed: 0_level_0,total_actions,nunique_products_number,view_number,like_number,addB_number,delB_number,clearB_number,order_number,listB_number,visit_number,...,fraction_order_ofAllActions,fraction_listB_ofAllActions,fraction_visit_ofAllActions,fraction_visitCategory_ofAllActions,fraction_search_ofAllActions,activity_last3days,activity_last7days,max_purchase_amount,min_purchase_amount,avg_purchase_amount
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
/*,39,7,5.0,0.0,0.0,0.0,0.0,0.0,2.0,30.0,...,0.0,0.051282,0.769231,0.051282,0.0,0.0,0.0,0.0,0.0,0.0
0000bdba-5180-11eb-8a53-0cc47a6d2fef,29,4,5.0,1.0,1.0,0.0,0.0,0.0,2.0,15.0,...,0.0,0.068966,0.517241,0.172414,0.0,16.0,29.0,0.0,0.0,0.0
0000d5dc-78cf-11ef-86e0-002590c0647c,2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.5,0.5,0.0,0.0,2.0,0.0,0.0,0.0
0001151e-c2b9-11ee-bbb1-002590c82436,786,84,97.0,0.0,0.0,0.0,0.0,0.0,0.0,563.0,...,0.0,0.0,0.716285,0.160305,0.0,106.0,220.0,0.0,0.0,0.0
00014c1e-f9d5-11eb-8a53-0cc47a6d2fef,4,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
corr_matrix = user_features.corr()

fig = pgo.Figure(data=pgo.Heatmap(
    z=corr_matrix.values,  
    x=corr_matrix.columns, 
    y=corr_matrix.index,   
    colorscale='Viridis',
))

fig.show()

In [19]:
user_features.to_csv('../data/processed/user_features_v1.csv')