In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from datetime import datetime, timedelta

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb

import mlflow
import mlflow.lightgbm

In [4]:
def min_preprocessing(actions, catalog):
    
    actions.date = pd.to_datetime(actions.date)
    actions = actions.explode('products')
    actions.products = actions.products.fillna(0)
    actions.products = actions.products.astype(int)
    actions.rename({'products': 'productId'}, axis=1, inplace=True)

    catalog.add_date = pd.to_datetime(catalog.add_date)
    catalog.shop_id = catalog.shop_id.astype(int)
    catalog.category_id = catalog.category_id.astype(int)
    catalog.product_id = catalog.product_id.astype(int)
    catalog.drop(['title'], axis=1, inplace=True)
    
    return actions, catalog

In [5]:
train_actions = pd.read_parquet('../data/raw/train_actions.pq', engine='pyarrow')
stokman_catalog = pd.read_parquet('../data/raw/stokman_catalog_preprocessed.pq', engine='pyarrow')

In [6]:
actions, catalog = min_preprocessing(train_actions, stokman_catalog)
del train_actions, stokman_catalog

In [7]:
users_actions = actions.groupby('user_id').agg({'action':'count'}).sort_values(by='action', ascending=False)
actions_cutted = actions[actions.user_id.isin(users_actions.sort_values(by='action', ascending=False)[:3000].index)]

In [8]:
del actions, users_actions

In [9]:
purchase_data = actions_cutted[actions_cutted['action'] == 5]

# Создаем целевую переменную
purchase_data['target'] = 1

# Генерируем все комбинации пользователей и товаров для предсказания
all_users = actions_cutted['user_id'].unique()
all_products = actions_cutted['productId'].unique()
pred_data = pd.DataFrame([(user, product) for user in all_users for product in all_products],
                         columns=['user_id', 'productId'])

# Добавляем флажок покупки для товаров, которые не покупали
pred_data['target'] = 0  # Предположим, что по умолчанию ничего не куплено

# Объединяем с данными о покупках, чтобы пометить покупки
pred_data = pred_data.merge(purchase_data[['user_id', 'productId']], on=['user_id', 'productId'], how='left', indicator=True)
pred_data['target'] = pred_data['_merge'].apply(lambda x: 1 if x == 'both' else 0)
pred_data.drop(columns=['_merge'], inplace=True)

# Кодируем категориальные переменные
pred_data['user_id'] = pred_data['user_id'].astype('category').cat.codes
pred_data['productId'] = pred_data['productId'].astype('category').cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  purchase_data['target'] = 1


In [10]:
# Обучаем модель
X = pred_data[['user_id', 'productId']]
y = pred_data['target']

# Разделяем на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Создаем датасеты для LightGBM
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Задаем параметры модели
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

# Обучаем модель
lgb_model = lgb.train(params, train_data, num_boost_round=100, valid_sets=test_data, force_col_wise=True)

[LightGBM] [Info] Number of positive: 5957, number of negative: 156738939
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.136303 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 156744896, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000038 -> initscore=-10.177770
[LightGBM] [Info] Start training from score -10.177770


: 

In [None]:
# Получаем предсказания для всех комбинаций пользователей и товаров
predictions = lgb_model.predict(pred_data[['user_id', 'productId']])

# Добавляем предсказания в датафрейм
pred_data['prediction'] = predictions

In [1]:
# Получаем топ-25 товаров для каждого пользователя
top_recommendations = pred_data.sort_values(['user_id', 'prediction'], ascending=[True, False])
top_recommendations = top_recommendations.groupby('user_id').head(25)

# Сохраняем в csv файл
top_recommendations[['user_id', 'productId', 'prediction']].to_csv('../output/predictions/LGB_result_1.csv', index=False)


NameError: name 'pred_data' is not defined