In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as pgo

from datetime import timedelta

In [2]:
# Load the raw inputs: the action log, the preprocessed catalog, the
# catalog-to-vector mapping file and the .npz archive with the vectors.
train_actions = pd.read_parquet(
    '../data/raw/train_actions.pq',
    engine='pyarrow',
)
stokman_catalog = pd.read_parquet(
    '../data/raw/stokman_catalog_preprocessed.pq',
    engine='pyarrow',
)
# No engine is passed for this file, so pandas falls back to its default choice.
catalog_vector_map = pd.read_parquet('../data/raw/catalog_vector_mapping.pq')
npz = np.load('../data/raw/vectors.npz')

In [3]:
# Wrap the array stored under the 'arr_0' key of the archive in a DataFrame
# (one DataFrame row per array row) and preview the first five rows.
vectors_df = pd.DataFrame(data=npz['arr_0'])
vectors_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.083243,0.048154,0.021908,0.046554,0.01602,0.01739,0.068655,0.003578,-0.101042,0.001614,...,-0.054967,-0.02656,-0.02437,-0.017317,-0.063028,-0.019044,-0.052011,-0.101291,0.106851,0.016866
1,-0.029639,0.046028,0.02195,0.022532,-0.110691,-0.053836,0.057446,0.030473,-0.027846,-0.031509,...,-0.030721,-0.021226,0.032937,0.052792,0.017208,0.001812,0.01339,0.001876,0.078863,-0.012866
2,-0.021771,0.093553,-0.050793,-0.000166,0.049989,0.075398,0.039691,-0.007549,-0.023111,-0.061027,...,-0.120556,-0.007603,-0.05752,0.018594,-0.047227,-0.030704,-0.002737,-0.181192,-0.027767,0.057662
3,-0.010277,0.072384,0.014594,0.097766,0.000443,-0.059049,0.039525,0.008496,-0.037287,-0.064468,...,-0.126487,0.01542,-0.012856,-0.025929,-0.033612,0.076626,0.127795,-0.148789,0.051517,0.061379
4,-0.012528,-0.019212,0.047336,0.084923,0.005401,0.02953,-0.033076,-0.002323,-0.042567,-0.027429,...,-0.091939,0.005534,0.009784,-0.035835,0.001122,0.054081,-0.015952,-0.100471,0.075144,0.01615


In [4]:
# Parse the action timestamps and report the time span covered by the log.
dates = pd.to_datetime(train_actions['date'])
# Series.min()/max() are vectorised and skip NaT; the builtin min()/max()
# iterate element by element in Python and give order-dependent results as
# soon as a NaT is present (every comparison against NaT is False).
print(f'min_date = {dates.min()}, max_date = {dates.max()}')

min_date = 2024-09-07 00:00:04, max_date = 2024-09-28 23:59:59


Минимальный препроцессинг:

In [5]:
# Build the processed action log as one chain, leaving `train_actions` untouched:
#   1. parse `date` into datetimes,
#   2. explode the list-like `products` column into one row per product,
#   3. replace missing products with 0 and cast the ids to int,
#   4. rename `products` to `productId`.
train_actions_processed = (
    train_actions
    .assign(date=lambda actions: pd.to_datetime(actions['date']))
    .explode('products')
    .assign(products=lambda actions: actions['products'].fillna(0).astype(int))
    .rename(columns={'products': 'productId'})
)

In [6]:
# Work on an independent copy so the raw catalog frame stays unchanged, then
# inspect column dtypes and non-null counts before any conversion.
stokman_catalog_processed = stokman_catalog.copy(deep=True)
stokman_catalog_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89483 entries, 0 to 89482
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   add_date     89483 non-null  datetime64[us]
 1   shop_id      89483 non-null  object        
 2   product_id   89483 non-null  object        
 3   category_id  89483 non-null  object        
 4   price        89483 non-null  int32         
 5   title        89483 non-null  object        
 6   old_price    89483 non-null  int32         
dtypes: datetime64[us](1), int32(2), object(4)
memory usage: 4.1+ MB


In [7]:
# Normalise dtypes in one pass: `add_date` goes through pd.to_datetime and the
# three identifier columns (stored as object) are cast to integers.
stokman_catalog_processed = stokman_catalog_processed.assign(
    add_date=pd.to_datetime(stokman_catalog_processed['add_date']),
).astype({
    'shop_id': int,
    'category_id': int,
    'product_id': int,
})
# Re-inspect the schema to confirm the casts took effect.
stokman_catalog_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89483 entries, 0 to 89482
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   add_date     89483 non-null  datetime64[us]
 1   shop_id      89483 non-null  int64         
 2   product_id   89483 non-null  int64         
 3   category_id  89483 non-null  int64         
 4   price        89483 non-null  int32         
 5   title        89483 non-null  object        
 6   old_price    89483 non-null  int32         
dtypes: datetime64[us](1), int32(2), int64(3), object(1)
memory usage: 4.1+ MB


In [9]:
# Integer action code -> short action label. Codes are consecutive and start
# at 0, so the position of a label in the tuple is its code.
ACTIONS = dict(enumerate((
    'view',
    'like',
    'addB',
    'delB',
    'clearB',
    'order',
    'listB',
    'visit',
    'visitCategory',
    'search',
)))

In [10]:
# Per-product feature table: catalog columns plus one counter per action type
# and two price-change features.
products_features = stokman_catalog_processed.copy()

# Add a counter for every action type.
for action_code, action_name in ACTIONS.items():
    action_counts = (
        train_actions_processed
        .loc[train_actions_processed['action'] == action_code]
        .groupby('productId')
        .size()
    )
    # Look the counts up through `product_id`. Assigning the grouped Series
    # directly would align its productId index against the catalog's positional
    # row index and attach the counts to unrelated rows.
    products_features[f'{action_name}_number'] = (
        products_features['product_id']
        .map(action_counts)
        .fillna(0)   # products that never received this action
        .astype(int)
    )

# Additional price features.
products_features['price_difference'] = products_features['price'] - products_features['old_price']
# Mask a zero old_price before dividing: x / 0 would yield +/-inf, which the
# fillna below does not clear. Masked rows become NaN and are filled with 0.
products_features['price_difference_percent'] = (
    products_features['price_difference']
    / products_features['old_price'].replace(0, np.nan)
) * 100

# Fill any remaining missing values with 0.
products_features = products_features.fillna(0)

In [24]:
# Attach each product's catalog `add_date` to every action on that product.
# The left join followed by dropna() discards rows containing any missing
# value, which includes actions whose productId has no catalog match.
merged_data = (
    train_actions_processed[['date', 'productId', 'action']]
    .merge(
        stokman_catalog_processed[['add_date', 'product_id']],
        how='left',
        left_on='productId',
        right_on='product_id',
    )
    .dropna()
)

# 1. Time elapsed between the action and the product's add date, in whole days.
#    `.dt.days` takes the days component, so negative durations round towards
#    minus infinity (e.g. -12 hours becomes -1).
elapsed = merged_data['date'] - merged_data['add_date']
merged_data['time_difference'] = elapsed.dt.days

# Smallest and average gap for every (product, action) pair.
result = (
    merged_data
    .groupby(['productId', 'action'])
    .agg(
        min_time_difference=('time_difference', 'min'),
        avg_time_difference=('time_difference', 'mean'),
    )
    .reset_index()
)

# Displayed value = (pairs with a negative min or mean gap) - (all pairs),
# i.e. the negated number of pairs that have no negative gap at all.
has_negative_gap = (result['min_time_difference'] < 0) | (result['avg_time_difference'] < 0)
len(result.loc[has_negative_gap]) - len(result)

-19255

In [17]:
# Sanity check: one row per product, i.e. as many distinct product_id values as rows.
len(products_features) == products_features['product_id'].nunique()

True

In [14]:
# Pairwise correlation of every feature column except the free-text `title`.
corr_matrix = products_features.drop(columns='title').corr()

# Render the matrix as an interactive heatmap; the x and y axes carry the
# feature names taken from the matrix's columns and index.
heatmap_trace = pgo.Heatmap(
    x=corr_matrix.columns,
    y=corr_matrix.index,
    z=corr_matrix.values,
    colorscale='Viridis',
)
fig = pgo.Figure(data=heatmap_trace)

fig.show()