In [16]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from datetime import datetime, timedelta

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

import mlflow
import mlflow.sklearn

In [15]:
ACTIONS = {
    0: 'view',
    1: 'like',
    2: 'addB',
    3: 'delB',
    4: 'clearB',
    5: 'order',
    6: 'listB',
    7: 'visit',
    8: 'visitCategory',
    9: 'search'
}

In [21]:
train_actions = pd.read_parquet('../data/raw/train_actions.pq', engine='pyarrow')
train_actions = train_actions.explode('products')
stokman_catalog = pd.read_parquet('../data/raw/stokman_catalog_preprocessed.pq', engine='pyarrow')

In [22]:
train_actions.date = pd.to_datetime(train_actions.date)
train_actions = train_actions.explode('products')
train_actions.products = train_actions.products.fillna(0)
train_actions.products = train_actions.products.astype(int)
train_actions.rename({'products': 'productId'}, axis=1, inplace=True)

stokman_catalog.add_date = pd.to_datetime(stokman_catalog.add_date)
stokman_catalog.product_id = stokman_catalog.product_id.astype(int)

In [52]:
def create_features(df):
    df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
    
    user_features = df.groupby('user_id').agg(
    total_actions=('action', 'count'), # тотал кол-во действий
    nunique_products_number=('productId', pd.Series.nunique), # кол-во уникальных товаров, с которыми взаимодействовал пользователь
    will_purchase=('will_purchase', 'max')
    )

    # кол-во действий по номеру
    for action_code, action_name in ACTIONS.items():
        user_features[f'{action_name}_number'] = df[df['action'] == action_code].groupby('user_id').size()

    # доля каждого действия от общего числа действий, возможно потом от этого избавимся т.к. фича примерно тоже самое что и предыдущая
    for action_code, action_name in ACTIONS.items():
        user_features[f'fraction_{action_name}_ofAllActions'] = user_features[f'{action_name}_number'] / user_features['total_actions']

    # Активность за последние 3 и 7 дней
    max_date = df['date'].max()

    last_3_days = max_date - timedelta(days=3)
    last_7_days = max_date - timedelta(days=7)

    user_features['activity_last3days'] = df[df['date'] >= last_3_days].groupby('user_id').size()
    user_features['activity_last7days'] = df[df['date'] >= last_7_days].groupby('user_id').size()
    
    user_features = user_features.reset_index()
    
    # Объединим по productId, чтобы получить цену каждого товара
    tap_with_prices = df.merge(stokman_catalog[['product_id', 'price']], left_on='productId', right_on='product_id', how='left')

    # Рассчитаем max, min, avg сумму покупок
    purchase_actions = tap_with_prices[tap_with_prices['action'] == 5].groupby('user_id').agg(
        max_purchase_amount=('price', 'max'),
        min_purchase_amount=('price', 'min'),
        avg_purchase_amount=('price', 'mean')
    )

    # Объединим все вместе
    user_features = user_features.merge(purchase_actions, on='user_id', how='left')
    
    user_features.fillna(0, inplace=True)
    
    return user_features

In [44]:
train_actions = train_actions.sort_values(by='date')
unique_dates = train_actions['date'].unique()

In [45]:
mlflow_dir = '../experiments/mlflow'
mlflow.set_tracking_uri(mlflow_dir)

In [56]:
start_train_size = pd.Timedelta(days=1)  # Начальная длина обучающего окна
window_size_test = pd.Timedelta(days=3)  # Тестовое окно фиксировано в 3 дня
step = pd.Timedelta(days=1)              # Шаг в один день

results = []
start_date = unique_dates[0]

while start_date + start_train_size + window_size_test <= unique_dates[-1]:
    end_train = start_date + start_train_size
    end_test = end_train + window_size_test

    # Определяем временные окна
    train_dates = unique_dates[(unique_dates >= start_date) & (unique_dates < end_train)]
    test_dates = unique_dates[(unique_dates >= end_train) & (unique_dates < end_test)]

    # Фильтрация данных по датам
    train_data = train_actions[train_actions['date'].isin(train_dates)]
    test_data = train_actions[train_actions['date'].isin(test_dates)]

    # Создаем фичи
    X_train = create_features(train_data)
    X_test = create_features(test_data)

    # Целевая переменная
    y_train = X_train['will_purchase']
    y_test = X_test['will_purchase']

    # Убираем целевую переменную
    X_train = X_train.drop(columns=['user_id', 'will_purchase'])
    X_test = X_test.drop(columns=['user_id', 'will_purchase'])

    # Пропускаем итерации с малым количеством классов
    if y_train.nunique() < 2:
        print(f"Skipping training for dates {train_dates[0]} to {train_dates[-1]}: only one class present.")
        start_train_size += step
        continue

    model = LogisticRegression(solver='liblinear')

    # Логирование параметров с помощью MLflow
    mlflow.set_experiment(f'LR_{start_date}')

    with mlflow.start_run():
        mlflow.log_param("model_type", "Logistic Regression")
        mlflow.log_param("train_window", f"{train_dates[0]} to {train_dates[-1]}")
        mlflow.log_param("test_window", f"{test_dates[0]} to {test_dates[-1]}")
        mlflow.log_param("train_size", len(X_train))
        mlflow.log_param("test_size", len(X_test))

        # Обучение модели
        model.fit(X_train, y_train)

        # Предсказание
        y_pred = model.predict(X_test)

        # Оценка модели
        accuracy = accuracy_score(y_test, y_pred)
        print(f'Accuracy from {train_dates[0]} to {train_dates[-1]}: {accuracy}')
        print(classification_report(y_test, y_pred))

        # Логирование метрик
        mlflow.log_metric("accuracy", accuracy)
        mlflow.sklearn.log_model(model, f"logistic_regression_model_{train_dates[0].date()}")
        mlflow.end_run()

    results.append((train_dates, accuracy))

    # Увеличиваем окно тренировки
    start_train_size += step

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
2024/10/12 20:14:53 INFO mlflow.tracking.fluent: Experiment with name 'LR_2024-09-07 00:00:04' does not exist. Creating a new experiment.


Accuracy from 2024-09-07 00:00:04 to 2024-09-08 00:00:00: 0.9998610602367028
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     78665
           1       1.00      0.98      0.99       506

    accuracy                           1.00     79171
   macro avg       1.00      0.99      0.99     79171
weighted avg       1.00      1.00      1.00     79171



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)


Accuracy from 2024-09-07 00:00:04 to 2024-09-09 00:00:03: 0.9997264974265895
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     79919
           1       0.99      0.97      0.98       519

    accuracy                           1.00     80438
   macro avg       0.99      0.98      0.99     80438
weighted avg       1.00      1.00      1.00     80438



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)


Accuracy from 2024-09-07 00:00:04 to 2024-09-10 00:00:03: 0.9998173271305746
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     81551
           1       1.00      0.98      0.99       563

    accuracy                           1.00     82114
   macro avg       1.00      0.99      0.99     82114
weighted avg       1.00      1.00      1.00     82114



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)


Accuracy from 2024-09-07 00:00:04 to 2024-09-11 00:00:02: 0.9995847277524
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     81285
           1       0.98      0.96      0.97       589

    accuracy                           1.00     81874
   macro avg       0.99      0.98      0.99     81874
weighted avg       1.00      1.00      1.00     81874



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)


Accuracy from 2024-09-07 00:00:04 to 2024-09-12 00:00:01: 0.9996106016832056
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     79034
           1       0.98      0.96      0.97       576

    accuracy                           1.00     79610
   macro avg       0.99      0.98      0.99     79610
weighted avg       1.00      1.00      1.00     79610



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)


Accuracy from 2024-09-07 00:00:04 to 2024-09-12 23:59:58: 0.9996801790997042
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     74525
           1       0.99      0.96      0.98       517

    accuracy                           1.00     75042
   macro avg       0.99      0.98      0.99     75042
weighted avg       1.00      1.00      1.00     75042



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)


Accuracy from 2024-09-07 00:00:04 to 2024-09-13 23:59:59: 0.9996874830153812
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     73070
           1       1.00      0.96      0.98       526

    accuracy                           1.00     73596
   macro avg       1.00      0.98      0.99     73596
weighted avg       1.00      1.00      1.00     73596



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)


Accuracy from 2024-09-07 00:00:04 to 2024-09-15 00:00:03: 0.9997182985230794
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     74016
           1       1.00      0.96      0.98       531

    accuracy                           1.00     74547
   macro avg       1.00      0.98      0.99     74547
weighted avg       1.00      1.00      1.00     74547



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)


Accuracy from 2024-09-07 00:00:04 to 2024-09-16 00:00:02: 0.9996934069156735
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     74469
           1       0.99      0.97      0.98       549

    accuracy                           1.00     75018
   macro avg       0.99      0.99      0.99     75018
weighted avg       1.00      1.00      1.00     75018



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)


Accuracy from 2024-09-07 00:00:04 to 2024-09-16 23:59:57: 0.9996220316450747
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     76133
           1       0.98      0.97      0.98       593

    accuracy                           1.00     76726
   macro avg       0.99      0.99      0.99     76726
weighted avg       1.00      1.00      1.00     76726



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)


Accuracy from 2024-09-07 00:00:04 to 2024-09-18 00:00:03: 0.9997027847774116
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     76752
           1       0.99      0.98      0.98       633

    accuracy                           1.00     77385
   macro avg       0.99      0.99      0.99     77385
weighted avg       1.00      1.00      1.00     77385



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)


Accuracy from 2024-09-07 00:00:04 to 2024-09-19 00:00:02: 0.9997389306599833
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     76000
           1       0.99      0.98      0.98       608

    accuracy                           1.00     76608
   macro avg       1.00      0.99      0.99     76608
weighted avg       1.00      1.00      1.00     76608



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)


Accuracy from 2024-09-07 00:00:04 to 2024-09-20 00:00:03: 0.9998015873015873
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     75015
           1       1.00      0.98      0.99       585

    accuracy                           1.00     75600
   macro avg       1.00      0.99      0.99     75600
weighted avg       1.00      1.00      1.00     75600



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)


Accuracy from 2024-09-07 00:00:04 to 2024-09-20 23:59:55: 0.9997697768114089
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     77593
           1       1.00      0.97      0.98       592

    accuracy                           1.00     78185
   macro avg       1.00      0.99      0.99     78185
weighted avg       1.00      1.00      1.00     78185



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)


Accuracy from 2024-09-07 00:00:04 to 2024-09-22 00:00:03: 0.9998174671745136
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     81557
           1       1.00      0.98      0.99       620

    accuracy                           1.00     82177
   macro avg       1.00      0.99      0.99     82177
weighted avg       1.00      1.00      1.00     82177



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)


Accuracy from 2024-09-07 00:00:04 to 2024-09-23 00:00:03: 0.9998297913728541
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     81703
           1       0.99      0.98      0.99       549

    accuracy                           1.00     82252
   macro avg       1.00      0.99      0.99     82252
weighted avg       1.00      1.00      1.00     82252



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)


Accuracy from 2024-09-07 00:00:04 to 2024-09-24 00:00:03: 0.999915403765741
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     82209
           1       1.00      0.99      0.99       537

    accuracy                           1.00     82746
   macro avg       1.00      0.99      1.00     82746
weighted avg       1.00      1.00      1.00     82746



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)


Accuracy from 2024-09-07 00:00:04 to 2024-09-24 23:59:58: 0.9998933598748756
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     83888
           1       1.00      0.98      0.99       508

    accuracy                           1.00     84396
   macro avg       1.00      0.99      1.00     84396
weighted avg       1.00      1.00      1.00     84396





In [55]:
window_size_train = pd.Timedelta(days=3)  # Количество дней для обучения
window_size_test = pd.Timedelta(days=3)    # Количество дней для тестирования
step = pd.Timedelta(days=1)                # Шаг в один день

results = []  # Список для хранения результатов

# Начинаем цикл с первой даты
start_date = unique_dates[0]

# Цикл по окнам
while start_date + window_size_train + window_size_test <= unique_dates[-1]:
    end_train = start_date + window_size_train
    end_test = end_train + window_size_test

    # Определение временных окон
    train_dates = unique_dates[(unique_dates >= start_date) & (unique_dates < end_train)]
    test_dates = unique_dates[(unique_dates >= end_train) & (unique_dates < end_test)]

    # Фильтрация данных по датам
    train_data = train_actions[train_actions['date'].isin(train_dates)]
    test_data = train_actions[train_actions['date'].isin(test_dates)]

    # Создание признаков
    X_train = create_features(train_data)
    X_test = create_features(test_data)

    # Определение целевой переменной
    y_train = X_train['will_purchase']
    y_test = X_test['will_purchase']

    # Удаление целевой переменной из признаков
    X_train = X_train.drop(columns=['user_id', 'will_purchase'])
    X_test = X_test.drop(columns=['user_id', 'will_purchase'])
    
    if y_train.nunique() < 2:
        print(f"Skipping training for dates {train_dates[0]} to {train_dates[-1]}: only one class present.")
        continue

    model = LogisticRegression(solver='liblinear')

    #  Лог
    mlflow.set_experiment(f'LR_{start_date}')

    with mlflow.start_run():
        mlflow.log_param("model_type", "Logistic Regression")
        mlflow.log_param("train_window", f"{train_dates[0]} to {train_dates[-1]}")
        mlflow.log_param("test_window", f"{test_dates[0]} to {test_dates[-1]}")

        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        # Оценка модели
        accuracy = accuracy_score(y_test, y_pred)
        print(f'Accuracy from {train_dates[0]} to {train_dates[-1]}: {accuracy}')
        print(classification_report(y_test, y_pred))

        # Лог
        mlflow.log_metric("accuracy", accuracy)
        mlflow.sklearn.log_model(model, f"logistic_regression_model_{train_dates[0].date()}")
        mlflow.end_run()

    results.append((train_dates, accuracy))
    
    start_date += step

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
2024/10/12 20:02:53 INFO mlflow.tracking.fluent: Experiment with name 'LR_2024-09-07 00:00:04' does not exist. Creating a new experiment.


Accuracy from 2024-09-07 00:00:04 to 2024-09-10 00:00:03: 0.9998173271305746
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     81551
           1       1.00      0.98      0.99       563

    accuracy                           1.00     82114
   macro avg       1.00      0.99      0.99     82114
weighted avg       1.00      1.00      1.00     82114



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
2024/10/12 20:02:59 INFO mlflow.tracking.fluent: Experiment with name 'LR_2024-09-08 00:00:04' does not exist. Creating a new experiment.


Accuracy from 2024-09-08 00:00:07 to 2024-09-11 00:00:02: 0.9994992305249529
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     81285
           1       0.97      0.96      0.97       589

    accuracy                           1.00     81874
   macro avg       0.98      0.98      0.98     81874
weighted avg       1.00      1.00      1.00     81874



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
2024/10/12 20:03:05 INFO mlflow.tracking.fluent: Experiment with name 'LR_2024-09-09 00:00:04' does not exist. Creating a new experiment.


Accuracy from 2024-09-09 00:00:14 to 2024-09-12 00:00:01: 0.9996106016832056
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     79034
           1       0.98      0.96      0.97       576

    accuracy                           1.00     79610
   macro avg       0.99      0.98      0.99     79610
weighted avg       1.00      1.00      1.00     79610



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
2024/10/12 20:03:11 INFO mlflow.tracking.fluent: Experiment with name 'LR_2024-09-10 00:00:04' does not exist. Creating a new experiment.


Accuracy from 2024-09-10 00:00:04 to 2024-09-12 23:59:58: 0.9998667412915434
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     74525
           1       1.00      0.98      0.99       517

    accuracy                           1.00     75042
   macro avg       1.00      0.99      1.00     75042
weighted avg       1.00      1.00      1.00     75042



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
2024/10/12 20:03:17 INFO mlflow.tracking.fluent: Experiment with name 'LR_2024-09-11 00:00:04' does not exist. Creating a new experiment.


Accuracy from 2024-09-11 00:00:04 to 2024-09-13 23:59:59: 0.9998369476601989
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     73070
           1       0.99      0.98      0.99       526

    accuracy                           1.00     73596
   macro avg       1.00      0.99      0.99     73596
weighted avg       1.00      1.00      1.00     73596



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
2024/10/12 20:03:23 INFO mlflow.tracking.fluent: Experiment with name 'LR_2024-09-12 00:00:04' does not exist. Creating a new experiment.


Accuracy from 2024-09-12 00:00:07 to 2024-09-15 00:00:03: 0.9997853703032986
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     74016
           1       0.99      0.98      0.98       531

    accuracy                           1.00     74547
   macro avg       1.00      0.99      0.99     74547
weighted avg       1.00      1.00      1.00     74547



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
2024/10/12 20:03:29 INFO mlflow.tracking.fluent: Experiment with name 'LR_2024-09-13 00:00:04' does not exist. Creating a new experiment.


Accuracy from 2024-09-13 00:00:09 to 2024-09-16 00:00:02: 0.9997333973179771
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     74469
           1       0.99      0.98      0.98       549

    accuracy                           1.00     75018
   macro avg       0.99      0.99      0.99     75018
weighted avg       1.00      1.00      1.00     75018



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
2024/10/12 20:03:34 INFO mlflow.tracking.fluent: Experiment with name 'LR_2024-09-14 00:00:04' does not exist. Creating a new experiment.


Accuracy from 2024-09-14 00:00:05 to 2024-09-16 23:59:57: 0.999556864687329
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     76133
           1       0.99      0.95      0.97       593

    accuracy                           1.00     76726
   macro avg       0.99      0.98      0.99     76726
weighted avg       1.00      1.00      1.00     76726



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
2024/10/12 20:03:40 INFO mlflow.tracking.fluent: Experiment with name 'LR_2024-09-15 00:00:04' does not exist. Creating a new experiment.


Accuracy from 2024-09-15 00:00:05 to 2024-09-18 00:00:03: 0.9995994055695548
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     76752
           1       0.99      0.96      0.98       633

    accuracy                           1.00     77385
   macro avg       0.99      0.98      0.99     77385
weighted avg       1.00      1.00      1.00     77385



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
2024/10/12 20:03:46 INFO mlflow.tracking.fluent: Experiment with name 'LR_2024-09-16 00:00:04' does not exist. Creating a new experiment.


Accuracy from 2024-09-16 00:00:07 to 2024-09-19 00:00:02: 0.9995170217209691
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     76000
           1       0.98      0.96      0.97       608

    accuracy                           1.00     76608
   macro avg       0.99      0.98      0.98     76608
weighted avg       1.00      1.00      1.00     76608



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
2024/10/12 20:03:52 INFO mlflow.tracking.fluent: Experiment with name 'LR_2024-09-17 00:00:04' does not exist. Creating a new experiment.


Accuracy from 2024-09-17 00:00:04 to 2024-09-20 00:00:03: 0.9997883597883598
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     75015
           1       0.99      0.98      0.99       585

    accuracy                           1.00     75600
   macro avg       1.00      0.99      0.99     75600
weighted avg       1.00      1.00      1.00     75600



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
2024/10/12 20:03:58 INFO mlflow.tracking.fluent: Experiment with name 'LR_2024-09-18 00:00:04' does not exist. Creating a new experiment.


Accuracy from 2024-09-18 00:00:05 to 2024-09-20 23:59:55: 0.999731406279977
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     77593
           1       0.99      0.97      0.98       592

    accuracy                           1.00     78185
   macro avg       1.00      0.99      0.99     78185
weighted avg       1.00      1.00      1.00     78185



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
2024/10/12 20:04:03 INFO mlflow.tracking.fluent: Experiment with name 'LR_2024-09-19 00:00:04' does not exist. Creating a new experiment.


Accuracy from 2024-09-19 00:00:06 to 2024-09-22 00:00:03: 0.9996471032040596
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     81557
           1       0.99      0.97      0.98       620

    accuracy                           1.00     82177
   macro avg       0.99      0.98      0.99     82177
weighted avg       1.00      1.00      1.00     82177



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
2024/10/12 20:04:09 INFO mlflow.tracking.fluent: Experiment with name 'LR_2024-09-20 00:00:04' does not exist. Creating a new experiment.


Accuracy from 2024-09-20 00:00:04 to 2024-09-23 00:00:03: 0.999841949131936
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     81703
           1       0.99      0.99      0.99       549

    accuracy                           1.00     82252
   macro avg       0.99      0.99      0.99     82252
weighted avg       1.00      1.00      1.00     82252



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
2024/10/12 20:04:15 INFO mlflow.tracking.fluent: Experiment with name 'LR_2024-09-21 00:00:04' does not exist. Creating a new experiment.


Accuracy from 2024-09-21 00:00:08 to 2024-09-24 00:00:03: 0.9998791482367728
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     82209
           1       0.99      0.99      0.99       537

    accuracy                           1.00     82746
   macro avg       0.99      1.00      1.00     82746
weighted avg       1.00      1.00      1.00     82746



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['will_purchase'] = df.groupby('user_id')['action'].shift(-3).apply(lambda x: 1 if x == 5 else 0)
2024/10/12 20:04:21 INFO mlflow.tracking.fluent: Experiment with name 'LR_2024-09-22 00:00:04' does not exist. Creating a new experiment.


Accuracy from 2024-09-22 00:00:04 to 2024-09-24 23:59:58: 0.999940755486042
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     83888
           1       1.00      0.99      1.00       508

    accuracy                           1.00     84396
   macro avg       1.00      1.00      1.00     84396
weighted avg       1.00      1.00      1.00     84396





In [50]:
user_features_v1 = pd.read_csv('../data/processed/user_features_v1.csv')
user_features_v1.head()

Unnamed: 0,user_id,total_actions,nunique_products_number,view_number,like_number,addB_number,delB_number,clearB_number,order_number,listB_number,...,fraction_order_ofAllActions,fraction_listB_ofAllActions,fraction_visit_ofAllActions,fraction_visitCategory_ofAllActions,fraction_search_ofAllActions,activity_last3days,activity_last7days,max_purchase_amount,min_purchase_amount,avg_purchase_amount
0,/*,39,7,5.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.051282,0.769231,0.051282,0.0,0.0,0.0,0.0,0.0,0.0
1,0000bdba-5180-11eb-8a53-0cc47a6d2fef,29,4,5.0,1.0,1.0,0.0,0.0,0.0,2.0,...,0.0,0.068966,0.517241,0.172414,0.0,16.0,29.0,0.0,0.0,0.0
2,0000d5dc-78cf-11ef-86e0-002590c0647c,2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.5,0.5,0.0,0.0,2.0,0.0,0.0,0.0
3,0001151e-c2b9-11ee-bbb1-002590c82436,786,84,97.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.716285,0.160305,0.0,106.0,220.0,0.0,0.0,0.0
4,00014c1e-f9d5-11eb-8a53-0cc47a6d2fef,4,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
predictions = model.predict_proba(user_features_v1)
predictions

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- user_id


In [75]:
y_pred_proba = model.predict_proba(user_features_v1.drop('user_id', axis=1))[:, 1]  # Предсказание вероятности для класса 1

# Сопоставляем вероятности с user_id
predictions_df = pd.DataFrame({
    'user_id': user_features_v1['user_id'],
    'purchase_probability': y_pred_proba
})

predictions_df = predictions_df.sort_values(by='purchase_probability', ascending=False)

In [66]:
actions = pd.read_parquet('../data/raw/train_actions.pq', engine='pyarrow')

In [88]:
last_10_days = actions['date'].max() - pd.Timedelta(20, unit='D')

# Выбираем все события добавления товара в корзину за последние 6 дней (action == 2)
basket_actions = actions[(actions['action'] == 2) & (actions['date'] > last_10_days)]
# Группируем выборку по уникальным пользователям и создаем рекомендации до 25 товаров
def processing(purchasers):
    purchasers = purchasers.map(lambda x: x[0])
    purchasers = list(purchasers)
    return purchasers[:25]
# Группируем пользователей и выбираем до 25 товаров, которые они добавляли в корзину
recommendations = basket_actions.groupby('user_id')['products'].apply(processing).reset_index()

# Объединяем предсказания с рекомендациями
final_predictions = predictions_df.merge(recommendations, on='user_id', how='left')

final_predictions['products'] = final_predictions['products'].apply(lambda x: [] if x is np.nan else x)
# Выбираем топ-3000 пользователей с наибольшей вероятностью покупки
top_3000_predictions = final_predictions.head(3000)

# Сохраняем результат в CSV
top_3000_predictions[['user_id', 'products']].to_csv("../output/predictions/LR_on_users_with_baseline_products.csv", index=False)
