# ITMO 
# Проект 3
# Рекомендательные системы

## Двухуровневая модель рекомендаций на основе lightGBM

In [1]:
!pip install implicit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!git clone https://github.com/SLVmain/Recsystem.git

Cloning into 'Recsystem'...
remote: Enumerating objects: 63, done.[K
remote: Counting objects: 100% (63/63), done.[K
remote: Compressing objects: 100% (60/60), done.[K
remote: Total 63 (delta 18), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (63/63), done.


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные функции
from Recsystem.src.metrics import precision_at_k, recall_at_k
from Recsystem.src.utils import prefilter_items
from Recsystem.src.recommenders import MainRecommender

In [6]:
#download data from yandex disk
!wget -O retail_train.csv https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/7zzy7loLn1mydA
!wget -O hh_demographic.csv https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/K5QgI5i7b9OCAg
!wget -O product.csv https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/YGMKawk5FPzM4w

--2022-11-01 15:01:22--  https://getfile.dokpub.com/yandex/get/https://disk.yandex.ru/d/7zzy7loLn1mydA
Resolving getfile.dokpub.com (getfile.dokpub.com)... 142.132.255.217
Connecting to getfile.dokpub.com (getfile.dokpub.com)|142.132.255.217|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://downloader.disk.yandex.ru/disk/2c6938c7b93040933d215df91dd912c9e82a49928cec3925243e2602a7767648/63615e1a/MNC-wvHOfDa0WFpeCBXdEeTllmFbRsuqTtMMo1bAHYmfZm06wNwADVaeVBrd1O94hx1t7mR55BRS-gIuH7IQKQ%3D%3D?uid=0&filename=retail_train.csv&disposition=attachment&hash=v53oKGpCiygFJ679NGHlZA0WPJn1sgCtV3OoRP2vKRXBZcMYaV5bIqW9uckUJx3GRAX03HPYQxcN%2B1YRYcCtZw%3D%3D%3A&limit=0&content_type=text%2Fplain&owner_uid=1130000061907806&fsize=143477004&hid=d107c6346479d71567899d724067670c&media_type=spreadsheet&tknv=v2 [following]
--2022-11-01 15:01:22--  https://downloader.disk.yandex.ru/disk/2c6938c7b93040933d215df91dd912c9e82a49928cec3925243e2602a7767648/63615e1a/MNC-wvHOfDa0WFpeCBXd

In [3]:
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

#уменьшим датасет, не хватает памяти
data = data[:1000000]

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)


# Важна схема обучения и валидации: делим на трейн и тест
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy() 
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(12)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0
5,2375,26984851516,1,826249,2,1.98,364,-0.6,1642,1,0.0,0.0
6,2375,26984851516,1,1043142,1,1.57,364,-0.68,1642,1,0.0,0.0
7,2375,26984851516,1,1085983,1,2.99,364,-0.4,1642,1,0.0,0.0
8,2375,26984851516,1,1102651,1,1.89,364,0.0,1642,1,0.0,0.0
9,2375,26984851516,1,6423775,1,2.0,364,-0.79,1642,1,0.0,0.0


In [4]:
# будем использовать топ 5000 товаров (см src/MainRecommender)

n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 52624 to 5001


In [19]:
# создаем Рекоммендер
recommender = MainRecommender(data_train_lvl_1)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/2479 [00:00<?, ?it/s]



  0%|          | 0/2479 [00:00<?, ?it/s]

In [18]:
recommender

<Recsystem.src.recommenders.MainRecommender at 0x7f2e417b5950>

Получаем кандидатов. Если модель рекомендует < N товаров, то рекомендации дополняются топ-популярными товарами до N

In [19]:
recommender.get_als_recommendations(2375, N=5)

[1011457, 970866, 974524, 971922, 911565]

### Измеряем recall@k

- Качество измеряем на data_val_lvl_1: следующие 6 недель после трейна

own recommendtions + top-popular лучший recall

In [16]:
result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1.columns=['user_id', 'actual']
result_lvl_1.head(2)

Unnamed: 0,user_id,actual
0,1,"[843559, 856942, 871570, 888996, 928891, 93800..."
1,2,"[861272, 1116476, 12187999, 12188436, 12263180..."


In [17]:
users_lvl_1 = pd.DataFrame(data_train_lvl_1['user_id'].unique())
users_lvl_1.columns = ['user_id']

In [24]:
K_num = 50
result_lvl_1['als_rec'] = users_lvl_1['user_id'].apply(lambda x: recommender.get_als_recommendations(x, N=K_num))

In [25]:
result_lvl_1.head(2)

Unnamed: 0,user_id,actual,als_rec
0,1,"[843559, 856942, 871570, 888996, 928891, 93800...","[1011457, 970866, 974524, 971922, 911565, 9683..."
1,2,"[861272, 1116476, 12187999, 12188436, 12263180...","[921504, 909800, 1011427, 970202, 983897, 1011..."


In [26]:
def calculate_precision_k(data, K): #data - pandas df
    for column in data.columns[2:]:
        yield column, data.apply(lambda row: precision_at_k(row[column], row['actual'], k=K), axis=1).mean()

In [27]:
# посмотрим precision_5
prec_K = 5
sorted(calculate_precision_k(result_lvl_1, prec_K), key=lambda x: x[1],reverse=True)

[('als_rec', 0.0036844591402928678)]

### Обучаем модель 2-ого уровня на выбранных кандидатах

- Обучаем на data_train_lvl_2
- Обучаем *только* на выбранных кандидатах
- (!) Если юзер купил < 50 товаров, то get_own_recommendations дополнит рекоммендации топ-популярными

In [None]:
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 

In [29]:
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']

# Пока только warm start, фильтруем юзеров
train_users = data_train_lvl_1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_als_recommendations(x, N=50))

In [30]:
users_lvl_2.head(2)

Unnamed: 0,user_id,candidates
0,816,"[999953, 956127, 920804, 1002178, 848107, 9337..."
1,1375,"[940726, 935993, 1012627, 879393, 887375, 8285..."


In [31]:
s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
users_lvl_2['flag'] = 1

users_lvl_2.head(4)

Unnamed: 0,user_id,item_id,flag
0,816,999953,1
0,816,956127,1
0,816,920804,1
0,816,1002178,1


In [32]:
users_lvl_2.shape[0]

105550

In [33]:
users_lvl_2['user_id'].nunique()

2111

In [34]:
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  # тут только покупки 

targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)
targets_lvl_2.drop('flag', axis=1, inplace=True)

In [35]:
targets_lvl_2.shape

(105651, 3)

In [36]:
targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,target
0,816,999953,0.0
1,816,956127,0.0


In [37]:
targets_lvl_2['target'].value_counts()

0.0    105016
1.0       635
Name: target, dtype: int64

(!) На каждого юзера 50 item_id-кандидатов

In [39]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [40]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [41]:
#объединим все признаки в один датафрейм
targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')
targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left')

targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,816,999953,0.0,1046,GROCERY,National,COLD CEREAL,ADULT CEREAL,13.5 OZ,25-34,U,50-74K,Unknown,Single Female,1,None/Unknown
1,816,956127,0.0,317,GROCERY,National,CHEESE,NATURAL CHEESE EXACT WT SLICES,8 OZ,25-34,U,50-74K,Unknown,Single Female,1,None/Unknown


In [42]:
# объединим все в одну df для построения новых фичей
df_feach_eng = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

df_feach_eng = df_feach_eng.merge(item_features, on='item_id', how='left')
df_feach_eng = df_feach_eng.merge(user_features, on='user_id', how='left')
df_feach_eng.columns

Index(['user_id', 'basket_id', 'day', 'item_id', 'quantity', 'sales_value',
       'store_id', 'retail_disc', 'trans_time', 'week_no', 'coupon_disc',
       'coupon_match_disc', 'manufacturer', 'department', 'brand',
       'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product',
       'age_desc', 'marital_status_code', 'income_desc', 'homeowner_desc',
       'hh_comp_desc', 'household_size_desc', 'kid_category_desc'],
      dtype='object')

In [43]:
# добавим фич и и для data_val_lvl_2(используем его для валидации модели)

df_val = data_val_lvl_2.copy()
df_val = df_val.merge(item_features, on='item_id', how='left')
df_val = df_val.merge(user_features, on='user_id', how='left')
df_val.columns

Index(['user_id', 'basket_id', 'day', 'item_id', 'quantity', 'sales_value',
       'store_id', 'retail_disc', 'trans_time', 'week_no', 'coupon_disc',
       'coupon_match_disc', 'manufacturer', 'department', 'brand',
       'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product',
       'age_desc', 'marital_status_code', 'income_desc', 'homeowner_desc',
       'hh_comp_desc', 'household_size_desc', 'kid_category_desc'],
      dtype='object')

In [44]:
# готовим датасет для загрузки в классификатор
targets_lvl_2 = targets_lvl_2.merge(data_train_lvl_2, on='user_id', how='left')

In [45]:
targets_lvl_2.columns

Index(['user_id', 'item_id_x', 'target', 'manufacturer', 'department', 'brand',
       'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product',
       'age_desc', 'marital_status_code', 'income_desc', 'homeowner_desc',
       'hh_comp_desc', 'household_size_desc', 'kid_category_desc', 'basket_id',
       'day', 'item_id_y', 'quantity', 'sales_value', 'store_id',
       'retail_disc', 'trans_time', 'week_no', 'coupon_disc',
       'coupon_match_disc'],
      dtype='object')

In [46]:
# генерим новые фичи
# средний чек на юзера
df = df_feach_eng.groupby(['user_id', 'basket_id'])['sales_value'].sum().reset_index()
df = df.groupby('user_id')['sales_value'].mean().reset_index()
df.columns = ['user_id', 'avg_bill']
targets_lvl_2 = targets_lvl_2.merge(df, on='user_id')
targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id_x,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,item_id_y,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,avg_bill
0,816,999953,0.0,1046,GROCERY,National,COLD CEREAL,ADULT CEREAL,13.5 OZ,25-34,...,869150,1,1.79,445,-0.04,32,37,0.0,0.0,36.756667
1,816,999953,0.0,1046,GROCERY,National,COLD CEREAL,ADULT CEREAL,13.5 OZ,25-34,...,883404,1,0.99,445,-0.3,32,37,0.0,0.0,36.756667


In [47]:
df = df_val.groupby(['user_id', 'basket_id'])['sales_value'].sum().reset_index()
df = df.groupby('user_id')['sales_value'].mean().reset_index()
df.columns = ['user_id', 'avg_bill']
df_val = df_val.merge(df, on='user_id')
df_val.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,avg_bill
0,2295,31336488347,293,822241,2,0.4,403,0.0,117,43,...,SOFT DRINK POWDER POUCHES,.17 OZ,35-44,B,35-49K,Renter,Single Male,1,None/Unknown,84.012857
1,2295,31336488347,293,822524,2,3.0,403,-1.58,117,43,...,TUNA,12 OZ,35-44,B,35-49K,Renter,Single Male,1,None/Unknown,84.012857


In [48]:
#Кол-во покупок юзера в каждой категории
df = df_feach_eng.groupby(['user_id', 'department'])['quantity'].sum().reset_index()
df = df.groupby('user_id')['quantity'].mean().reset_index()
df.columns = ['user_id', 'avg_count_pursh_dep']
targets_lvl_2 = targets_lvl_2.merge(df, on='user_id')
targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id_x,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,avg_bill,avg_count_pursh_dep
0,816,999953,0.0,1046,GROCERY,National,COLD CEREAL,ADULT CEREAL,13.5 OZ,25-34,...,1,1.79,445,-0.04,32,37,0.0,0.0,36.756667,21.714286
1,816,999953,0.0,1046,GROCERY,National,COLD CEREAL,ADULT CEREAL,13.5 OZ,25-34,...,1,0.99,445,-0.3,32,37,0.0,0.0,36.756667,21.714286


In [49]:
df = df_val.groupby(['user_id', 'department'])['quantity'].sum().reset_index()
df = df.groupby('user_id')['quantity'].mean().reset_index()
df.columns = ['user_id', 'avg_count_pursh_dep']
df_val = df_val.merge(df, on='user_id')
df_val.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,avg_bill,avg_count_pursh_dep
0,2295,31336488347,293,822241,2,0.4,403,0.0,117,43,...,.17 OZ,35-44,B,35-49K,Renter,Single Male,1,None/Unknown,84.012857,36.375
1,2295,31336488347,293,822524,2,3.0,403,-1.58,117,43,...,12 OZ,35-44,B,35-49K,Renter,Single Male,1,None/Unknown,84.012857,36.375


Фичи item_id: - Кол-во покупок в неделю - Среднее ол-во покупок 1 товара в категории в неделю - (Кол-во покупок в неделю) / (Среднее ол-во покупок 1 товара в категории в неделю) - Цена (Можно посчитать из retil_train.csv) - Цена / Средняя цена товара в категории

In [50]:
# Среднее кол-во покупок 1 товара в категории
df = df_feach_eng.groupby(['item_id', 'department'])['quantity'].sum().reset_index()
df = df.groupby('item_id')['quantity'].mean().reset_index()
df.columns = ['item_id_x', 'avg_count_item_dep']
targets_lvl_2 = targets_lvl_2.merge(df, on='item_id_x')
targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id_x,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,avg_bill,avg_count_pursh_dep,avg_count_item_dep
0,816,999953,0.0,1046,GROCERY,National,COLD CEREAL,ADULT CEREAL,13.5 OZ,25-34,...,1.79,445,-0.04,32,37,0.0,0.0,36.756667,21.714286,5.0
1,816,999953,0.0,1046,GROCERY,National,COLD CEREAL,ADULT CEREAL,13.5 OZ,25-34,...,0.99,445,-0.3,32,37,0.0,0.0,36.756667,21.714286,5.0


In [51]:
df = df_val.groupby(['item_id', 'department'])['quantity'].sum().reset_index()
df = df.groupby('item_id')['quantity'].mean().reset_index()
df.columns = ['item_id', 'avg_count_item_dep']
df_val = df_val.merge(df, on='item_id')
df_val.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,avg_bill,avg_count_pursh_dep,avg_count_item_dep
0,2295,31336488347,293,822241,2,0.4,403,0.0,117,43,...,35-44,B,35-49K,Renter,Single Male,1,None/Unknown,84.012857,36.375,37.0
1,955,31343676130,293,822241,2,0.4,367,0.0,1943,43,...,19-24,U,Under 15K,Unknown,Single Female,1,None/Unknown,20.09,18.4,37.0


In [52]:
#цена товара
targets_lvl_2['price'] = targets_lvl_2['sales_value']/targets_lvl_2['quantity']
targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id_x,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,avg_bill,avg_count_pursh_dep,avg_count_item_dep,price
0,816,999953,0.0,1046,GROCERY,National,COLD CEREAL,ADULT CEREAL,13.5 OZ,25-34,...,445,-0.04,32,37,0.0,0.0,36.756667,21.714286,5.0,1.79
1,816,999953,0.0,1046,GROCERY,National,COLD CEREAL,ADULT CEREAL,13.5 OZ,25-34,...,445,-0.3,32,37,0.0,0.0,36.756667,21.714286,5.0,0.99


In [53]:
df_val['price'] = df_val['sales_value']/df_val['quantity']
df_val.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,avg_bill,avg_count_pursh_dep,avg_count_item_dep,price
0,2295,31336488347,293,822241,2,0.4,403,0.0,117,43,...,B,35-49K,Renter,Single Male,1,None/Unknown,84.012857,36.375,37.0,0.2
1,955,31343676130,293,822241,2,0.4,367,0.0,1943,43,...,U,Under 15K,Unknown,Single Female,1,None/Unknown,20.09,18.4,37.0,0.2


Фичи пары user_id - item_id - (Средняя сумма покупки 1 товара в каждой категории (берем категорию item_id)) - (Цена item_id) - (Кол-во покупок юзером конкретной категории в неделю) - (Среднее кол-во покупок всеми юзерами конкретной категории в неделю) - (Кол-во покупок юзером конкретной категории в неделю) / (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)

In [54]:
# Среднее кол-во покупок всеми юзерами конкретной категории в неделю
df = df_feach_eng.groupby(['department', 'week_no'])['quantity'].sum().reset_index()
df = df.groupby('department')['quantity'].mean().reset_index()
df.columns = ['department', 'avg_sum_all_pursh_dep']
targets_lvl_2 = targets_lvl_2.merge(df, on='department')
targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id_x,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,avg_bill,avg_count_pursh_dep,avg_count_item_dep,price,avg_sum_all_pursh_dep
0,816,999953,0.0,1046,GROCERY,National,COLD CEREAL,ADULT CEREAL,13.5 OZ,25-34,...,-0.04,32,37,0.0,0.0,36.756667,21.714286,5.0,1.79,24758.333333
1,816,999953,0.0,1046,GROCERY,National,COLD CEREAL,ADULT CEREAL,13.5 OZ,25-34,...,-0.3,32,37,0.0,0.0,36.756667,21.714286,5.0,0.99,24758.333333


In [55]:
df = df_val.groupby(['department', 'week_no'])['quantity'].sum().reset_index()
df = df.groupby('department')['quantity'].mean().reset_index()
df.columns = ['department', 'avg_sum_all_pursh_dep']
df_val = df_val.merge(df, on='department')
df_val.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,avg_bill,avg_count_pursh_dep,avg_count_item_dep,price,avg_sum_all_pursh_dep
0,2295,31336488347,293,822241,2,0.4,403,0.0,117,43,...,35-49K,Renter,Single Male,1,None/Unknown,84.012857,36.375,37.0,0.2,20717.5
1,955,31343676130,293,822241,2,0.4,367,0.0,1943,43,...,Under 15K,Unknown,Single Female,1,None/Unknown,20.09,18.4,37.0,0.2,20717.5


In [56]:
# (Кол-во покупок юзером конкретной категории в неделю) / (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)

targets_lvl_2['user_count_per_dep_pursh'] = targets_lvl_2['avg_count_pursh_dep']/targets_lvl_2['avg_sum_all_pursh_dep']
targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id_x,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,trans_time,week_no,coupon_disc,coupon_match_disc,avg_bill,avg_count_pursh_dep,avg_count_item_dep,price,avg_sum_all_pursh_dep,user_count_per_dep_pursh
0,816,999953,0.0,1046,GROCERY,National,COLD CEREAL,ADULT CEREAL,13.5 OZ,25-34,...,32,37,0.0,0.0,36.756667,21.714286,5.0,1.79,24758.333333,0.000877
1,816,999953,0.0,1046,GROCERY,National,COLD CEREAL,ADULT CEREAL,13.5 OZ,25-34,...,32,37,0.0,0.0,36.756667,21.714286,5.0,0.99,24758.333333,0.000877


In [57]:
df_val['user_count_per_dep_pursh'] = df_val['avg_count_pursh_dep']/df_val['avg_sum_all_pursh_dep']
df_val.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,avg_bill,avg_count_pursh_dep,avg_count_item_dep,price,avg_sum_all_pursh_dep,user_count_per_dep_pursh
0,2295,31336488347,293,822241,2,0.4,403,0.0,117,43,...,Renter,Single Male,1,None/Unknown,84.012857,36.375,37.0,0.2,20717.5,0.001756
1,955,31343676130,293,822241,2,0.4,367,0.0,1943,43,...,Unknown,Single Female,1,None/Unknown,20.09,18.4,37.0,0.2,20717.5,0.000888


In [58]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2[['target']]

In [59]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7997691 entries, 0 to 7997690
Data columns (total 32 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   user_id                   int64  
 1   item_id_x                 int64  
 2   manufacturer              int64  
 3   department                object 
 4   brand                     object 
 5   commodity_desc            object 
 6   sub_commodity_desc        object 
 7   curr_size_of_product      object 
 8   age_desc                  object 
 9   marital_status_code       object 
 10  income_desc               object 
 11  homeowner_desc            object 
 12  hh_comp_desc              object 
 13  household_size_desc       object 
 14  kid_category_desc         object 
 15  basket_id                 int64  
 16  day                       int64  
 17  item_id_y                 int64  
 18  quantity                  int64  
 19  sales_value               float64
 20  store_id                

In [60]:
X_train = X_train.drop('item_id_y', axis=1)
X_train.columns.tolist()

['user_id',
 'item_id_x',
 'manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc',
 'basket_id',
 'day',
 'quantity',
 'sales_value',
 'store_id',
 'retail_disc',
 'trans_time',
 'week_no',
 'coupon_disc',
 'coupon_match_disc',
 'avg_bill',
 'avg_count_pursh_dep',
 'avg_count_item_dep',
 'price',
 'avg_sum_all_pursh_dep',
 'user_count_per_dep_pursh']

In [61]:
df_val = df_val.rename(columns={'item_id': 'item_id_x'})
df_val.columns.tolist()

['user_id',
 'basket_id',
 'day',
 'item_id_x',
 'quantity',
 'sales_value',
 'store_id',
 'retail_disc',
 'trans_time',
 'week_no',
 'coupon_disc',
 'coupon_match_disc',
 'manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc',
 'avg_bill',
 'avg_count_pursh_dep',
 'avg_count_item_dep',
 'price',
 'avg_sum_all_pursh_dep',
 'user_count_per_dep_pursh']

In [62]:
cat_feats =['department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

In [63]:
for c in cat_feats:
    
    X_train[c] = X_train[c].astype('category')

In [64]:
for c in cat_feats:
    
    df_val[c] = df_val[c].astype('category')

In [65]:
lgb = LGBMClassifier(objective='binary',
                     max_depth=8,
                     n_estimators=300,
                     learning_rate=0.05,
                     categorical_column=cat_feats)

lgb.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


LGBMClassifier(categorical_column=['department', 'brand', 'commodity_desc',
                                   'sub_commodity_desc', 'curr_size_of_product',
                                   'age_desc', 'marital_status_code',
                                   'income_desc', 'homeowner_desc',
                                   'hh_comp_desc', 'household_size_desc',
                                   'kid_category_desc'],
               learning_rate=0.05, max_depth=8, n_estimators=300,
               objective='binary')

In [66]:
train_preds = lgb.predict(X_train)

In [67]:
train_preds

array([0., 0., 0., ..., 0., 0., 0.])

In [68]:
val_preds = lgb.predict_proba(df_val)

In [69]:
val_preds

array([[9.99798436e-01, 2.01563683e-04],
       [9.98841265e-01, 1.15873465e-03],
       [9.99677894e-01, 3.22105500e-04],
       ...,
       [9.99878216e-01, 1.21784337e-04],
       [9.99393894e-01, 6.06106325e-04],
       [9.99955155e-01, 4.48447651e-05]])

In [70]:
result_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_2.columns=['user_id', 'actual']
result_lvl_2.head(2)

Unnamed: 0,user_id,actual
0,1,"[835796, 856942, 858450, 918390, 934369, 94094..."
1,3,"[892920, 1092502, 1119761]"


In [71]:
pred_ds = df_val[['user_id', 'item_id_x']].copy()
pred_ds['proba'] = val_preds[:,1]
pred_ds = pred_ds.groupby(['user_id', 'item_id_x'])['proba'].mean().reset_index()
pred_s = pred_ds.groupby('user_id').apply(lambda x: x.sort_values('proba', ascending=False)['item_id_x'].tolist())

def get_LGBM_recommendations(user_id, N=5):
    recommendations = pred_s[user_id][:N]
    
    overall_top_purchases = data_val_lvl_2.groupby('item_id')['quantity'].count().reset_index()
    overall_top_purchases.sort_values('quantity', ascending=False, inplace=True)
    overall_top_purchases = overall_top_purchases[overall_top_purchases['item_id'] != 999999]
    overall_top_purchases = overall_top_purchases.item_id.tolist()
    
    if len(recommendations) < N:
            recommendations.extend(overall_top_purchases[:N])
            recommendations = recommendations[:N]
    
    return recommendations

In [72]:
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']


#только warm start
train_users = data_train_lvl_1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

K_num = 50
result_lvl_2['LGBM'] = result_lvl_2['user_id'].apply(lambda x: get_LGBM_recommendations(x, N=K_num))

In [73]:
result_lvl_2.head(2)

Unnamed: 0,user_id,actual,LGBM
0,1,"[835796, 856942, 858450, 918390, 934369, 94094...","[995242, 1077490, 1074612, 1018600, 934369, 92..."
1,3,"[892920, 1092502, 1119761]","[892920, 1119761, 1092502, 1082185, 6534178, 1..."


In [74]:
def calculate_precision_k(data, K): #data - pandas df
    for column in data.columns[2:]:
        yield column, data.apply(lambda row: precision_at_k(row[column], row['actual'], k=K), axis=1).mean()

In [75]:
# Посчитаем precision_5 LGBM  
sorted(calculate_precision_k(result_lvl_2, 5), key=lambda x: x[1],reverse=True)

[('LGBM', 0.9529039070749735)]

In [76]:
# inference - получение 50 рекомендаций для одного юзера
# номер юзера - 23
get_LGBM_recommendations(23, N=50) #inference

[981760,
 6534178,
 909338,
 982009,
 1033062,
 10200444,
 1000329,
 1053690,
 1065593,
 822407,
 908846,
 976998,
 931777,
 971325,
 1110695,
 7441668,
 828331,
 1084423,
 1000728,
 1138858,
 1076306,
 908408,
 1135552,
 907647,
 926233,
 1053022,
 9831557,
 1096290,
 959737,
 958023,
 888783,
 983710,
 930331,
 1047316,
 833754,
 6552995,
 10457385,
 5567705,
 8119156,
 997112,
 1131490,
 885261,
 1101910,
 1046201,
 1064798,
 1082185,
 6534178,
 1029743,
 995242,
 1106523]