In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from datetime import datetime, timedelta

import itertools

from typing import Union

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier

import mlflow
import mlflow.sklearn

In [3]:
ACTIONS = {
    0: 'view',
    1: 'like',
    2: 'addB',
    3: 'delB',
    4: 'clearB',
    5: 'order',
    6: 'listB',
    7: 'visit',
    8: 'visitCategory',
    9: 'search'
}

In [4]:
train_actions = pd.read_parquet('../data/raw/train_actions.pq', engine='pyarrow')
stokman_catalog = pd.read_parquet('../data/raw/stokman_catalog_preprocessed.pq', engine='pyarrow')

In [5]:
def min_preprocessing(actions, catalog):
    
    actions.date = pd.to_datetime(actions.date)
    actions = actions.explode('products')
    actions.products = actions.products.fillna(0)
    actions.products = actions.products.astype(int)
    actions.rename({'products': 'productId'}, axis=1, inplace=True)

    catalog.add_date = pd.to_datetime(catalog.add_date)
    catalog.shop_id = catalog.shop_id.astype(int)
    catalog.category_id = catalog.category_id.astype(int)
    catalog.product_id = catalog.product_id.astype(int)
    catalog.drop(['title'], axis=1, inplace=True)
    
    return actions, catalog

In [6]:
def generate_user_product_combinations(users, products):
    ''' Генерация всех возможных комбинаций пользователей и товаров '''
    # Используйте pd.MultiIndex для эффективного создания DataFrame
    index = pd.MultiIndex.from_product([users, products], names=['user_id', 'product_id'])
    
    # Создание DataFrame с индексацией
    df_combinations = pd.DataFrame(index=index).reset_index()
    
    return df_combinations

In [7]:
def create_user_features(actions, catalog):

    user_features = actions.groupby('user_id').agg(
        total_actions=('action', 'count'),  # тотал кол-во действий
        nunique_products_number=('productId', pd.Series.nunique)  # кол-во уникальных товаров, с которыми взаимодействовал пользователь
    )

    # кол-во действий по номеру
    for action_code, action_name in ACTIONS.items():
        user_features[f'{action_name}_number'] = actions[actions['action'] == action_code].groupby('user_id').size()

    # доля каждого действия от общего числа действий
    for action_code, action_name in ACTIONS.items():
        user_features[f'fraction_{action_name}_ofAllActions'] = user_features[f'{action_name}_number'] / user_features['total_actions']

    # Активность за последние 3 и 7 дней
    max_date = actions['date'].max()

    last_3_days = max_date - timedelta(days=3)
    last_7_days = max_date - timedelta(days=7)
    last_14_days = max_date - timedelta(days=14)

    user_features['activity_last3days'] = actions[actions['date'] >= last_3_days].groupby('user_id').size()
    user_features['activity_last7days'] = actions[actions['date'] >= last_7_days].groupby('user_id').size()
    user_features['activity_last14days'] = actions[actions['date'] >= last_14_days].groupby('user_id').size()

    user_features = user_features.reset_index()

    # Объединим по productId, чтобы получить цену каждого товара
    tap_with_prices = actions.merge(catalog[['product_id', 'price']], left_on='productId', right_on='product_id', how='left')

    # Рассчитаем max, min, avg сумму покупок
    purchase_actions = tap_with_prices[tap_with_prices['action'] == 5].groupby('user_id').agg(
        max_purchase_amount=('price', 'max'),
        min_purchase_amount=('price', 'min'),
        avg_purchase_amount=('price', 'mean')
    )

    # Объединим все вместе
    user_features = user_features.merge(purchase_actions, on='user_id', how='left')

    user_features.fillna(0, inplace=True)

    return user_features


In [8]:
def create_product_features(catalog, actions):
    
    product_features = catalog.copy()
    # Добавление счетчиков для каждого действия
    for action_code, action_name in ACTIONS.items():
        product_features[f'{action_name}_number'] = actions[actions['action'] == action_code].groupby('productId').agg({'action':'count'}).reset_index().sort_values(by='productId')['action']
    # Дополнительные признаки
    product_features['price_difference'] = product_features['price'] - product_features['old_price']
    product_features['price_difference_percent'] = (product_features['price_difference'] / product_features['old_price']) * 100

    # Заполните пропущенные значения на 0 (если присутствуют нулевые значения в actions)
    product_features.fillna(0, inplace=True)
    
    return product_features

In [9]:
def create_target(actions_df, combinations_df, test_start_date, test_end_date):
    # Фильтруем покупки в тестовом окне
    test_window_purchases = actions_df[
        (actions_df['action'] == 5) &
        (actions_df['date'] >= test_start_date) &
        (actions_df['date'] <= test_end_date)
    ]

    # Создаем временной DataFrame для меток
    target_df = test_window_purchases[['user_id', 'productId']].copy()
    target_df['target'] = 1
    print(target_df.columns, combinations_df.columns)
    # Объединяем с combinations_df, чтобы установить метки
    combinations_df = combinations_df.merge(target_df, left_on=['user_id', 'product_id'], right_on=['user_id', 'productId'], how='left')

    # Заполняем NaN нулями для случаев, когда покупки не произошло
    combinations_df['target'].fillna(0, inplace=True)

    del target_df, test_window_purchases

    return combinations_df

In [10]:
train_actions, stokman_catalog = min_preprocessing(train_actions, stokman_catalog)

In [56]:
users_actions = train_actions.groupby('user_id').agg({'action':'count'}).sort_values(by='action', ascending=False)
train_actions_cutted = train_actions[train_actions.user_id.isin(users_actions.sort_values(by='action', ascending=False)[:3000].index)]
# train_actions_cutted = train_actions_cutted.rename({'products':'productId'}, axis=1)
del users_actions

MemoryError: Unable to allocate 156. GiB for an array with shape (42006452105,) and data type int32 )))))))))))))))00

In [72]:
model = RandomForestClassifier(random_state=42, class_weight='balanced')

In [16]:
def create_df_all_features_and_target_in_time_window(start: pd.Timestamp, 
                                                     end: pd.Timestamp, 
                                                     actions: pd.DataFrame, 
                                                     catalog: pd.DataFrame,
                                                     combinations: pd.DataFrame) -> Union[pd.Series, pd.DataFrame]:
    ''' 
    '''
    
    actions = actions[(actions.date >= start) & (actions.date <= end)]
    product_features = create_product_features(actions=actions, catalog=catalog)
    user_features = create_user_features(actions=actions, catalog=catalog)
    
    combinations = create_target(actions_df=actions, combinations_df=combinations, test_start_date=start, test_end_date=end)
    del actions
    
    combinations =  combinations.merge(user_features, on='user_id', how='left')
    del user_features
    
    combinations = combinations.merge(product_features, on='product_id', how='left')
    del product_features
    
    return combinations.drop('target', axis=1), combinations['target']

In [57]:
train_size = pd.Timedelta(days=1)
window_size_test = pd.Timedelta(days=3)

train_actions_cutted.sort_values(by='date', inplace=True)
unique_dates = train_actions_cutted['date'].unique()

start_date = unique_dates[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_actions_cutted.sort_values(by='date', inplace=True)


In [58]:
def select_valuable_products(catalog):
    pass

product_features = create_product_features(actions=train_actions, catalog=stokman_catalog)
product_features.sort_values(by='order_number')
need_cat = product_features.category_id[0]
print(need_cat)
products_catalog_cutted = stokman_catalog[stokman_catalog.category_id == need_cat]

8922


In [59]:
df_combinations = generate_user_product_combinations(train_actions_cutted['user_id'].unique(), products_catalog_cutted['product_id'].unique())
df_combinations.shape[0]

792000

In [66]:
end_train = unique_dates[-1] - window_size_test
end_test = end_train + window_size_test

X, y = create_df_all_features_and_target_in_time_window(start=start_date, 
                                                        end=end_train, 
                                                        actions=train_actions_cutted, 
                                                        catalog=products_catalog_cutted, 
                                                        combinations=df_combinations)

Index(['user_id', 'productId', 'target'], dtype='object') Index(['user_id', 'product_id'], dtype='object')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combinations_df['target'].fillna(0, inplace=True)


In [96]:
model.fit(X=X.drop(['user_id', 'product_id', 'productId', 'add_date'], axis=1), y=y)

In [97]:
X_test, y_test = create_df_all_features_and_target_in_time_window(start=end_train, 
                                                        end=end_test, 
                                                        actions=train_actions_cutted, 
                                                        catalog=products_catalog_cutted, 
                                                        combinations=df_combinations)

preds = model.predict(X_test.drop(['user_id', 'product_id', 'productId', 'add_date'], axis=1))

Index(['user_id', 'productId', 'target'], dtype='object') Index(['user_id', 'product_id'], dtype='object')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combinations_df['target'].fillna(0, inplace=True)


In [100]:
probas = model.predict_proba(X_test.drop(['user_id', 'product_id', 'productId', 'add_date'], axis=1))

In [120]:
res = pd.concat([pd.DataFrame(preds), X_test[['user_id', 'product_id']]], axis=1)

In [99]:
from sklearn.metrics import accuracy_score

accuracy_score(y_true=y_test, y_pred=preds)

1.0

In [108]:
df_combinations_full = generate_user_product_combinations(train_actions_cutted['user_id'].unique(), 
                                                          stokman_catalog['product_id'].unique())
df_combinations_full.shape[0]

268449000

In [None]:
create_target(ac)

In [112]:
train_actions.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6859002 entries, 0 to 6580935
Data columns (total 6 columns):
 #   Column       Dtype         
---  ------       -----         
 0   user_id      object        
 1   loc_user_id  object        
 2   action       int64         
 3   date         datetime64[us]
 4   productId    int64         
 5   pageId       object        
dtypes: datetime64[us](1), int64(2), object(3)
memory usage: 366.3+ MB


In [None]:
probas.head()

In [35]:
while start_date + train_size + window_size_test <= unique_dates[-1]:
    end_train = start_date + train_size
    end_test = end_train + window_size_test
    
    X, y = create_df_all_features_and_target_in_time_window(start=start_date, 
                                                            end=end_train, 
                                                            actions=train_actions_cutted, 
                                                            catalog=products_catalog_cutted, 
                                                            combinations=df_combinations)
    
    print(X.head(), y)
    
    break 
    model.fit(X, y)
    
    train_size += pd.Timedelta(days=1)

Index(['user_id', 'productId', 'target'], dtype='object') Index(['user_id', 'product_id'], dtype='object')
                                user_id  product_id  productId  total_actions  \
0  59386b5c-e64f-11ec-8086-002590c82437     1025536        NaN         1596.0   
1  59386b5c-e64f-11ec-8086-002590c82437     1037902        NaN         1596.0   
2  59386b5c-e64f-11ec-8086-002590c82437     1037998        NaN         1596.0   
3  59386b5c-e64f-11ec-8086-002590c82437     1038253        NaN         1596.0   
4  59386b5c-e64f-11ec-8086-002590c82437     1038310        NaN         1596.0   

   nunique_products_number  view_number_x  like_number_x  addB_number_x  \
0                    126.0          171.0            0.0           26.0   
1                    126.0          171.0            0.0           26.0   
2                    126.0          171.0            0.0           26.0   
3                    126.0          171.0            0.0           26.0   
4                    126.0     

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combinations_df['target'].fillna(0, inplace=True)


In [None]:
# Шаг 2: Добавление признаков пользователя
start_date = '2023-01-01'
end_date = '2023-01-31'
user_features = create_user_features(train_actions_cutted, start_date, end_date)
df_combinations = df_combinations.merge(user_features, on='user_id', how='left')

# Шаг 3: Добавление признаков товара
product_features = create_product_features(stokman_catalog)
df_combinations = df_combinations.merge(product_features, on='product_id', how='left')

# Шаг 4: Создание таргета для тестового окна
test_start_date = '2023-02-01'
test_end_date = '2023-02-03'
df_with_target = create_target(train_actions, df_combinations, test_start_date, test_end_date)

# Шаг 5: Обучение модели и предсказание вероятностей
model, y_pred_proba = train_random_forest(df_with_target, target_column='target')
