In [1]:
import optuna
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit

In [2]:
train = pd.read_csv('../../train.csv')
test = pd.read_csv('../../sample_submission.csv')

train.head()

Unnamed: 0,user_id,order_completed_at,cart
0,2,2015-03-22 09:25:46,399
1,2,2015-03-22 09:25:46,14
2,2,2015-03-22 09:25:46,198
3,2,2015-03-22 09:25:46,88
4,2,2015-03-22 09:25:46,157


Проверим, есть ли пропуски в данных

In [3]:
train.isnull().sum()

user_id               0
order_completed_at    0
cart                  0
dtype: int64

Все товары из `train` были куплены пользователями, зададим им `target` равный единице

In [4]:
train['order_completed_at'] = pd.to_datetime(train['order_completed_at'])
train['cart'] = train['cart'].astype('int64')
train['target'] = 1
train.head()

Unnamed: 0,user_id,order_completed_at,cart,target
0,2,2015-03-22 09:25:46,399,1
1,2,2015-03-22 09:25:46,14,1
2,2,2015-03-22 09:25:46,198,1
3,2,2015-03-22 09:25:46,88,1
4,2,2015-03-22 09:25:46,157,1


In [5]:
test.head()

Unnamed: 0,id,target
0,0;133,0
1,0;5,1
2,0;10,0
3,0;396,1
4,0;14,0


Преобразуем данные из файла с примером ответа (датафрейм `test`) к виду, удобному для будущих предсказаний

In [6]:
test['user_id'] = test['id'].apply(lambda x: x.split(';')[0]).astype('int64')
test['cart'] = test['id'].apply(lambda x: x.split(';')[1]).astype('int64')
test.drop('id', axis = 1, inplace=True)
test.head()

Unnamed: 0,target,user_id,cart
0,0,0,133
1,1,0,5
2,0,0,10
3,1,0,396
4,0,0,14


Дата последней совершенной покупки

In [7]:
train['order_completed_at'].max()

Timestamp('2020-09-03 23:45:45')

Для удобства добавим в тестовую выборку дату большую, чем дата последней совершенной покупки из обучающей выборки

In [8]:
test['order_completed_at'] = pd.Timestamp('2024-09-03 22:45:00')
test['order_completed_at'] = test['order_completed_at'].astype('datetime64[ns]')
test.head()

Unnamed: 0,target,user_id,cart,order_completed_at
0,0,0,133,2024-09-03 22:45:00
1,1,0,5,2024-09-03 22:45:00
2,0,0,10,2024-09-03 22:45:00
3,1,0,396,2024-09-03 22:45:00
4,0,0,14,2024-09-03 22:45:00


В датафрейме `train` имеется информация о товарах, которые пользователи _покупали_. Сформируем датафрейм `negative_cart_train`, который будет содержать товары, которые пользователи покупали в предыдущие сессии, но не купили в текущую сессию. Это поможет нашей модели предсказывать ситуации, когда пользователь не покупает товар, а также поможет сформировать дополнительные признаки

In [9]:
train = train.sort_values(['user_id', 'order_completed_at'])
grouped_orders = train.groupby(['user_id', 'order_completed_at']).cart

current_user = -1
current_cart = []
all_orders_carts = []
items_per_order = []

for (user, date), items in grouped_orders:
    if user == current_user:
        current_cart.extend(list(items))
        all_orders_carts.append(list(set(current_cart)))
    else:
        current_cart = list(items)
        all_orders_carts.append(list(set(current_cart)))
    current_user = user
    items_per_order.append(list(items))

all_orders_carts_series = pd.Series(all_orders_carts).apply(set)    
items_per_order_series = pd.Series(items_per_order).apply(set)

excluded_items = all_orders_carts_series - items_per_order_series

negative_cart_train = train[['user_id', 'order_completed_at']].drop_duplicates().reset_index(drop=True)
negative_cart_train['cart'] = excluded_items
negative_cart_train = negative_cart_train.explode('cart').dropna()
negative_cart_train['cart'] = negative_cart_train['cart'].astype('int64')
negative_cart_train.head()

# Предыдущая реализация
# negative_cart_train = train.groupby(['user_id', 'order_completed_at'])['cart'].apply(set).reset_index()
# negative_cart_train['cart_shift'] = grouped_data.groupby('user_id')['cart'].shift()
# negative_cart_train['negative_cart_sample'] = negative_cart_train['cart_shift'] - negative_cart_train['cart']
# negative_cart_train.dropna(inplace=True)

# negative_cart_train.drop(['cart', 'cart_shift'], axis = 1, inplace=True)
# negative_cart_train = negative_cart_train.explode('negative_cart_sample')
# negative_cart_train.dropna(inplace=True)

# negative_cart_train

Unnamed: 0,user_id,order_completed_at,cart
1,0,2020-08-24 08:55:32,20
1,0,2020-08-24 08:55:32,430
2,0,2020-09-02 07:38:25,5
2,0,2020-09-02 07:38:25,133
2,0,2020-09-02 07:38:25,10


В силу того, что пользователи данные товары не покупали, установим значение `target` нулевым

In [10]:
negative_cart_train['target'] = 0
negative_cart_train.head()

Unnamed: 0,user_id,order_completed_at,cart,target
1,0,2020-08-24 08:55:32,20,0
1,0,2020-08-24 08:55:32,430,0
2,0,2020-09-02 07:38:25,5,0
2,0,2020-09-02 07:38:25,133,0
2,0,2020-09-02 07:38:25,10,0


Для генерации признаков создадим единый датафрейм `all_data`

In [11]:
all_data = pd.concat([train, negative_cart_train, test], ignore_index=True)
all_data.head()

Unnamed: 0,user_id,order_completed_at,cart,target
0,0,2020-07-19 09:59:17,20,1
1,0,2020-07-19 09:59:17,82,1
2,0,2020-07-19 09:59:17,441,1
3,0,2020-07-19 09:59:17,57,1
4,0,2020-07-19 09:59:17,14,1


Создадим новые признаки на основе имеющихся данных

Популярность товара `cart_popularity` показывает, сколько раз товар `cart` был заказан

In [12]:
all_data['cart_popularity'] = all_data['cart'].map(all_data['cart'].value_counts())
all_data.head()

Unnamed: 0,user_id,order_completed_at,cart,target,cart_popularity
0,0,2020-07-19 09:59:17,20,1,74645
1,0,2020-07-19 09:59:17,82,1,93427
2,0,2020-07-19 09:59:17,441,1,62946
3,0,2020-07-19 09:59:17,57,1,186836
4,0,2020-07-19 09:59:17,14,1,175438


Счетчик заказов `previous_orders_count` отражает то, сколько раз пользователь уже заказывал товар `cart` до текущей покупки (по умолчанию признак равен нулю, так как заказов данного товара еще не было)

In [13]:
all_data.sort_values(by=['user_id', 'order_completed_at'], inplace = True)
all_data['orders_count'] = all_data.groupby(['user_id', 'cart'])['target'].cumsum()
all_data['previous_orders_count'] = all_data.groupby(['user_id', 'cart'])['orders_count'].shift()
all_data.drop('orders_count', axis=1, inplace=True)
all_data.fillna(0, inplace=True)
all_data.head()

Unnamed: 0,user_id,order_completed_at,cart,target,cart_popularity,previous_orders_count
0,0,2020-07-19 09:59:17,20,1,74645,0.0
1,0,2020-07-19 09:59:17,82,1,93427,0.0
2,0,2020-07-19 09:59:17,441,1,62946,0.0
3,0,2020-07-19 09:59:17,57,1,186836,0.0
4,0,2020-07-19 09:59:17,14,1,175438,0.0


Счетчик торговых сессий `user_order_count` отражает номер текущей сессии пользователя (единица, если пользователь совершает заказ впервые)

In [14]:
all_data = all_data.sort_values(by=['user_id', 'order_completed_at'])
all_data['user_order_count'] = all_data.groupby('user_id')['order_completed_at'].rank(method='dense').astype(int)
all_data.head()

Unnamed: 0,user_id,order_completed_at,cart,target,cart_popularity,previous_orders_count,user_order_count
0,0,2020-07-19 09:59:17,20,1,74645,0.0,1
1,0,2020-07-19 09:59:17,82,1,93427,0.0,1
2,0,2020-07-19 09:59:17,441,1,62946,0.0,1
3,0,2020-07-19 09:59:17,57,1,186836,0.0,1
4,0,2020-07-19 09:59:17,14,1,175438,0.0,1


Создадим столбец `seq_length`, который будет отражать, сколько раз подряд пользователь покупал или не покупал данный товар 

In [15]:
all_data.sort_values(by=['user_id', 'cart', 'order_completed_at'], inplace=True)

all_data['seq_length'] = all_data.groupby(
    ['user_id', 'cart', (all_data['target'] != all_data['target'].shift()).cumsum()]
).cumcount() + 1
all_data.head()

Unnamed: 0,user_id,order_completed_at,cart,target,cart_popularity,previous_orders_count,user_order_count,seq_length
9,0,2020-08-24 08:55:32,5,1,132316,0.0,2,1
3123066,0,2020-09-02 07:38:25,5,0,132316,1.0,3,1
12171846,0,2024-09-03 22:45:00,5,1,132316,1.0,4,1
11,0,2020-08-24 08:55:32,10,1,60467,0.0,2,1
3123068,0,2020-09-02 07:38:25,10,0,60467,1.0,3,1


Разделим столбец `seq_length` на два столбца, `zero_seq_length` и `one_run_length`, где первый будет содержать в себе информацию о том, сколько раз пользователь игнорировал данный товар, а второй — сколько раз пользователь уже покупал данный товар. Они и будут нашими признаками  

Столбец `seq_length` необходимо удалить в силу того, что он является линейной комбинацией столбцов `zero_seq_length` и `one_run_length`

In [16]:
is_zero = all_data['target'] == 0
is_one = all_data['target'] == 1

all_data['zero_seq_length'] = all_data['seq_length'].where(is_zero, 0)
all_data['one_seq_length'] = all_data['seq_length'].where(is_one, 0)

all_data['zero_seq_length'] = all_data.groupby(['user_id', 'cart'])['zero_seq_length'].cummax().shift()
all_data['one_seq_length'] = all_data.groupby(['user_id', 'cart'])['one_seq_length'].cummax().shift()

In [17]:
all_data.drop('seq_length', axis=1, inplace = True)
all_data.dropna(inplace=True)
all_data.sort_values('order_completed_at', inplace=True)
all_data.head()

Unnamed: 0,user_id,order_completed_at,cart,target,cart_popularity,previous_orders_count,user_order_count,zero_seq_length,one_seq_length
93,2,2015-03-22 09:25:46,23,1,165530,0.0,1,9.0,3.0
82,2,2015-03-22 09:25:46,14,1,175438,0.0,1,13.0,1.0
85,2,2015-03-22 09:25:46,157,1,87268,0.0,1,10.0,1.0
88,2,2015-03-22 09:25:46,16,1,113669,0.0,1,6.0,2.0
91,2,2015-03-22 09:25:46,808,1,85625,0.0,1,4.0,2.0


Разделим данные на обучающую и тестовую части

In [18]:
train_data = all_data.drop(all_data[all_data.order_completed_at == '2024-09-03 22:45:00'].index)

test_data = all_data[all_data.order_completed_at == '2024-09-03 22:45:00']
test_data.sort_values(by=['cart','order_completed_at', 'user_id'], inplace = True)
train_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data.sort_values(by=['cart','order_completed_at', 'user_id'], inplace = True)


Unnamed: 0,user_id,order_completed_at,cart,target,cart_popularity,previous_orders_count,user_order_count,zero_seq_length,one_seq_length
93,2,2015-03-22 09:25:46,23,1,165530,0.0,1,9.0,3.0
82,2,2015-03-22 09:25:46,14,1,175438,0.0,1,13.0,1.0
85,2,2015-03-22 09:25:46,157,1,87268,0.0,1,10.0,1.0
88,2,2015-03-22 09:25:46,16,1,113669,0.0,1,6.0,2.0
91,2,2015-03-22 09:25:46,808,1,85625,0.0,1,4.0,2.0
...,...,...,...,...,...,...,...,...,...
2529964,12702,2020-09-03 23:45:45,9,1,108912,1.0,7,0.0,1.0
11120925,12702,2020-09-03 23:45:45,199,0,35733,1.0,7,5.0,1.0
11120916,12702,2020-09-03 23:45:45,161,0,22489,2.0,7,2.0,2.0
11120928,12702,2020-09-03 23:45:45,84,0,158110,1.0,7,4.0,1.0


In [19]:
test_data

Unnamed: 0,user_id,order_completed_at,cart,target,cart_popularity,previous_orders_count,user_order_count,zero_seq_length,one_seq_length
12172068,7,2024-09-03 22:45:00,0,0,93564,1.0,12,0.0,1.0
12172124,8,2024-09-03 22:45:00,0,1,93564,1.0,9,7.0,1.0
12172270,9,2024-09-03 22:45:00,0,0,93564,1.0,47,42.0,1.0
12172453,12,2024-09-03 22:45:00,0,1,93564,2.0,22,17.0,1.0
12172528,13,2024-09-03 22:45:00,0,1,93564,3.0,18,7.0,2.0
...,...,...,...,...,...,...,...,...,...
12336558,3238,2024-09-03 22:45:00,880,1,119,2.0,72,64.0,2.0
12419889,4816,2024-09-03 22:45:00,880,0,119,1.0,24,15.0,1.0
12651314,10280,2024-09-03 22:45:00,880,0,119,2.0,10,7.0,2.0
12757677,13281,2024-09-03 22:45:00,880,1,119,1.0,5,3.0,1.0


Сформируем матрицы объект-признак и вектор целевой переменной.  
Стобец `order_completed_at` больше не нужен, время уже закодировано в других признаках

In [20]:
X_train = train_data.drop(['order_completed_at', 'target'], axis=1)
y_train = train_data.target

X_test = test_data.drop(['order_completed_at', 'target'], axis=1)
X_train

Unnamed: 0,user_id,cart,cart_popularity,previous_orders_count,user_order_count,zero_seq_length,one_seq_length
93,2,23,165530,0.0,1,9.0,3.0
82,2,14,175438,0.0,1,13.0,1.0
85,2,157,87268,0.0,1,10.0,1.0
88,2,16,113669,0.0,1,6.0,2.0
91,2,808,85625,0.0,1,4.0,2.0
...,...,...,...,...,...,...,...
2529964,12702,9,108912,1.0,7,0.0,1.0
11120925,12702,199,35733,1.0,7,5.0,1.0
11120916,12702,161,22489,2.0,7,2.0,2.0
11120928,12702,84,158110,1.0,7,4.0,1.0


In [21]:
X_test

Unnamed: 0,user_id,cart,cart_popularity,previous_orders_count,user_order_count,zero_seq_length,one_seq_length
12172068,7,0,93564,1.0,12,0.0,1.0
12172124,8,0,93564,1.0,9,7.0,1.0
12172270,9,0,93564,1.0,47,42.0,1.0
12172453,12,0,93564,2.0,22,17.0,1.0
12172528,13,0,93564,3.0,18,7.0,2.0
...,...,...,...,...,...,...,...
12336558,3238,880,119,2.0,72,64.0,2.0
12419889,4816,880,119,1.0,24,15.0,1.0
12651314,10280,880,119,2.0,10,7.0,2.0
12757677,13281,880,119,1.0,5,3.0,1.0


Использование различных Scaler-ов ухудшало значение метрики, от них в конечном итоге было решено отказаться

In [22]:
# sc = StandartScaler()
# X_train_process = sc.fit_transform(X_train)
# X_test_process = sc.transform(X_test)

Для предсказания будем использовать модель _LightFM_. (прочие бустинговые модели работали неприлично долго).  
Подбор гиперпараметров осуществлялся при помощи байесовской оптимизации (библиотека _optuna_).  
После подбора гиперпараметров среднее значение метрики `f1` на кроссвалидации по пяти фолдам составило $ \sim0.498$

In [23]:
# cv = TimeSeriesSplit(n_splits = 5)

# def objective_lgbm(trial):
# #     learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1)
#     learning_rate = 0.07819058013347185
    
# #     num_leaves = trial.suggest_int('num_leaves', 5, 100)
#     num_leaves = 65
# #     lambda_l2 = trial.suggest_float('lambda_l2', 1, 3)
#     lambda_l2 = 1.8227241075403795
    
#     min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 800, 900)
#     bagging_fraction = trial.suggest_float('bagging_fraction', 0.6, 0.95)
#     colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 0.8)
# #     scale_pos_weight = trial.suggest_float('scale_pos_weight', 2, 4)
#     scale_pos_weight = 3.0378377679796253
    
#     model = LGBMClassifier(learning_rate = learning_rate,
#                            n_estimators = 250,
#                            num_leaves = num_leaves,
#                            lambda_l2 = lambda_l2,
#                            min_data_in_leaf = min_data_in_leaf,
#                            bagging_fraction = bagging_fraction,
#                            colsample_bytree = colsample_bytree,
#                            scale_pos_weight = scale_pos_weight)
    
    
#     score =  cross_val_score(estimator=model, X=X_train_process, y=y_train, cv = cv, scoring='f1', n_jobs=-1).mean()
#     return score

# study = optuna.create_study(direction='maximize')
# study.optimize(objective_lgbm, n_trials=50)

Обучим модель на всех данных при помощи найденных гиперпараметров

In [24]:
lgbm = LGBMClassifier(random_state=42)
params = {'n_estimators': 250,
          'learning_rate': 0.07819058013347185,
          'num_leaves': 65,
          'lambda_l2': 1.8227241075403795,
          'min_data_in_leaf': 809,
          'bagging_fraction': 0.4481481399461171,
          'colsample_bytree': 0.6790283515524086,
          'scale_pos_weight': 3.0378377679796253
         }

# params = study.best_params
lgbm.set_params(**params)
lgbm.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 3123063, number of negative: 9048781
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.059911 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1188
[LightGBM] [Info] Number of data points in the train set: 12171844, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.256581 -> initscore=-1.063816
[LightGBM] [Info] Start training from score -1.063816


Сделаем предсказание для тестового множества

In [25]:
y_pred = lgbm.predict(X_test)

# f1_score(y_train, lgbm.predict(X_train))



Запишем предсказания в _csv_ файл

In [26]:
X_test = X_test.assign(
    user_id=X_test['user_id'].astype(str),
    cart=X_test['cart'].astype(str),
    id=(X_test['user_id'].astype(str) + ';' + X_test['cart'].astype(str)),
    target=y_pred.astype(int)
)

X_test[['id', 'target']].set_index('id').to_csv('ans.csv')

In [28]:
# pd.read_csv('ans.csv')['target']