# Course project


- В проекте реализована двухуровневая модель рекомендаций покупок на основе данных о покупках пользователей, характеристиках товаров и пользователей
- Все используемые методы и классы реализованы в [данном проекте](https://github.com/VasiliyS178/my_first_recommender_system). Данный ноутбук и реализованные модули из папки src, данные из data нужно разместить в одной директории с ноутбуком, сохранив структуру папок
- На первом уровне отбираются 500 товаров-кандидатов, которые ранжируются с помощью модели LGBMClassifier на втором уровне. 
- Товары-кандидаты побираются для каждого пользователя на основе ранее сделанных покупок (ItemItemRecommender(K=1), базовой модели ALS, модели подбора аналогичных товаров, купленным ранее (на основе ALS) и модели подбора товаров на основе схожести пользователей (на основе ALS)
- Целевая метрика precision@5
- В качестве бейзлайн решения испльзовался [MainRecommender](https://github.com/geangohn/recsys-tutorial/blob/master/src/recommenders.py)
- В работе не реализовано получение рекомендаций для "холодных" пользователей, рекомендации выдаются только для тех пользователей, по которым есть собранные характеристики в user_features

## Import libs

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.recommenders import MainRecommender
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.utils import postfilter_items
from src.utils import calc_precision
from src.utils import calc_recall
from src.utils import eval_recall
from src.utils import rerank

## Read data

In [2]:
data = pd.read_csv('data/retail_train.csv')
item_features = pd.read_csv('data/product.csv')
user_features = pd.read_csv('data/hh_demographic.csv')

In [3]:
data.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


In [4]:
item_features.head()

Unnamed: 0,PRODUCT_ID,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ


In [5]:
user_features.head()

Unnamed: 0,AGE_DESC,MARITAL_STATUS_CODE,INCOME_DESC,HOMEOWNER_DESC,HH_COMP_DESC,HOUSEHOLD_SIZE_DESC,KID_CATEGORY_DESC,household_key
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7
2,25-34,U,25-34K,Unknown,2 Adults Kids,3,1,8
3,25-34,U,75-99K,Homeowner,2 Adults Kids,4,2,13
4,45-54,B,50-74K,Homeowner,Single Female,1,None/Unknown,16


## Data analysis

In [6]:
data.shape

(2396804, 12)

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2396804 entries, 0 to 2396803
Data columns (total 12 columns):
 #   Column             Dtype  
---  ------             -----  
 0   user_id            int64  
 1   basket_id          int64  
 2   day                int64  
 3   item_id            int64  
 4   quantity           int64  
 5   sales_value        float64
 6   store_id           int64  
 7   retail_disc        float64
 8   trans_time         int64  
 9   week_no            int64  
 10  coupon_disc        float64
 11  coupon_match_disc  float64
dtypes: float64(4), int64(8)
memory usage: 219.4 MB


In [8]:
# Checking for NULL
data.isna().sum()[data.isna().sum() != 0]

Series([], dtype: int64)

In [9]:
data.describe()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
count,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0,2396804.0
mean,1271.904,33366430000.0,363.977,2827247.0,100.3763,3.100821,3048.227,-0.5400708,1561.714,52.68156,-0.01638696,-0.002897905
std,726.5644,4284798000.0,175.9385,3732798.0,1152.379,4.210229,8785.542,1.245824,401.5691,25.1331,0.2168615,0.03974618
min,1.0,26984850000.0,1.0,25671.0,0.0,0.0,1.0,-130.02,0.0,1.0,-55.93,-7.7
25%,655.0,30087140000.0,216.0,916993.0,1.0,1.29,330.0,-0.69,1307.0,32.0,0.0,0.0
50%,1271.0,32419980000.0,366.0,1027569.0,1.0,2.0,370.0,-0.02,1614.0,53.0,0.0,0.0
75%,1914.0,35145800000.0,515.0,1132178.0,1.0,3.49,422.0,0.0,1844.0,74.0,0.0,0.0
max,2500.0,41656790000.0,663.0,18024560.0,89638.0,840.0,34280.0,3.99,2359.0,95.0,0.0,0.0


In [10]:
user_features.shape

(801, 8)

In [11]:
user_features.nunique()

AGE_DESC                 6
MARITAL_STATUS_CODE      3
INCOME_DESC             12
HOMEOWNER_DESC           5
HH_COMP_DESC             6
HOUSEHOLD_SIZE_DESC      5
KID_CATEGORY_DESC        4
household_key          801
dtype: int64

In [12]:
user_features.isna().sum()[user_features.isna().sum() != 0]

Series([], dtype: int64)

In [13]:
item_features.nunique()

PRODUCT_ID              92353
MANUFACTURER             6476
DEPARTMENT                 44
BRAND                       2
COMMODITY_DESC            308
SUB_COMMODITY_DESC       2383
CURR_SIZE_OF_PRODUCT     4345
dtype: int64

In [14]:
item_features.isna().sum()[item_features.isna().sum() != 0]

Series([], dtype: int64)

In [15]:
item_features.head()

Unnamed: 0,PRODUCT_ID,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
3,26190,69,GROCERY,Private,FRUIT - SHELF STABLE,APPLE SAUCE,50 OZ
4,26355,69,GROCERY,Private,COOKIES/CONES,SPECIALTY COOKIES,14 OZ


## Set global const

In [16]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'
ACTUAL_COL = 'actual'
PROBA_COL = 'proba_item_purchase'
N_POPULAR = 5000 # Number items for prefilter
N_REC = 5 # Number of recommendations
N_CANDIDATES = 500 # Number of item-candidate
TOPK_RECALL = 5
TOPK_PRECISION = 5

## Process features dataset

In [17]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': ITEM_COL}, inplace=True)
user_features.rename(columns={'household_key': USER_COL }, inplace=True)

## Split dataset for train, eval, test

In [18]:
# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)

# VAL_MATCHER_WEEKS = 2 ('own_rec', 0.09629218081315837) ('als_rec', 0.07590267437523225) ('sim_item_rec', 0.052594555176119356)
# VAL_MATCHER_WEEKS = 3 ('own_rec', 0.10041296731092798) ('als_rec', 0.07592196196927876) ('sim_item_rec', 0.05356238749805944)
# VAL_MATCHER_WEEKS = 5 ('own_rec', 0.09729557304852297) ('als_rec', 0.06970155654164245) ('sim_item_rec', 0.054667935647957)
# VAL_MATCHER_WEEKS = 6 ('own_rec', 0.09211914788591925) ('als_rec', 0.06989723382189614) ('sim_item_rec', 0.05122039092146674)
# VAL_MATCHER_WEEKS = 10 ('own_rec', 0.08992459100259059) ('als_rec', 0.06517115990894459) ('sim_item_rec', 0.05432735787818124)
VAL_MATCHER_WEEKS = 3
VAL_RANKER_WEEKS = 3

In [19]:
# берем данные для тренировки matching модели
data_train_matcher = data[data['week_no'] < data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)]

# берем данные для валидации matching модели
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
                      (data['week_no'] < data['week_no'].max() - (VAL_RANKER_WEEKS))]

# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться

# берем данные для теста ranking, matching модели
data_val_ranker = data[data['week_no'] >= data['week_no'].max() - VAL_RANKER_WEEKS]

In [20]:
# сделаем объединенный сет данных для первого уровня (матчинга)
df_join_train_matcher = pd.concat([data_train_matcher, data_val_matcher])

In [21]:
def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data[USER_COL].nunique()} Items: {df_data[ITEM_COL].nunique()}")

In [22]:
print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (2193515, 12) Users: 2499 Items: 85334
val_matcher
Shape: (84975, 12) Users: 1889 Items: 20567
train_ranker
Shape: (84975, 12) Users: 1889 Items: 20567
val_ranker
Shape: (118314, 12) Users: 2042 Items: 24329


## Prefilter items

In [23]:
n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, take_n_popular=N_POPULAR)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['price'] = data['sales_value'] / (np.maximum(data['quantity'], 1))


Decreased # items from 85334 to 5001


In [24]:
data_train_matcher.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0,1.39
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0,1.21
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0,1.5
6,2375,26984851516,1,1043142,1,1.57,364,-0.68,1642,1,0.0,0.0,1.57
7,2375,26984851516,1,1085983,1,2.99,364,-0.4,1642,1,0.0,0.0,2.99


## Make cold-start to warm-start

In [25]:
# ищем общих пользователей
#common_users = list(set(data_train_matcher.user_id.values)&(set(data_val_matcher.user_id.values))&set(data_val_ranker.user_id.values))
common_users = list(set(data_train_matcher.user_id.values)&(set(data_val_matcher.user_id.values))&set(data_val_ranker.user_id.values)&set(user_features.user_id.values))

# оставляем общих пользователей
data_train_matcher = data_train_matcher[data_train_matcher.user_id.isin(common_users)]
data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (743207, 13) Users: 743 Items: 4997
val_matcher
Shape: (47872, 12) Users: 743 Items: 15534
train_ranker
Shape: (47872, 12) Users: 743 Items: 15534
val_ranker
Shape: (64714, 12) Users: 743 Items: 18237


## Init/train recommender

In [26]:
recommender = MainRecommender(data_train_matcher)



HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4997.0), HTML(value='')))




### Testing recommendations with help of our functions

Можно потом все эти варианты соединить в один

(!) Если модель рекомендует < N товаров, то рекомендации дополняются топ-популярными товарами до N

In [20]:
# Take user 2375

In [None]:
recommender.get_als_recommendations(2375, n=5)

In [14]:
recommender.get_own_recommendations(2375, n=5)

[1036501, 847962, 1052920, 887219, 894360]

In [15]:
recommender.get_similar_items_recommendation(2375, n=5)

[867709, 1046545, 9527160, 845208, 1092945]

In [16]:
recommender.get_similar_users_recommendation(2375, n=5)

[894360, 935578, 1046689, 1038745, 1102416]

## Eval recall of matching

### Измеряем recall@k

In [27]:
REC_FUNCTIONS = {'own_rec': recommender.get_own_recommendations, 
                    'sim_item_rec': recommender.get_similar_items_recommendation, 
                    'als_rec': recommender.get_als_recommendations}

In [28]:
result_eval_matcher = data_val_matcher.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_matcher.columns=[USER_COL, ACTUAL_COL]
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual
0,1,"[829323, 835108, 836423, 851515, 875240, 87737..."
1,7,"[882477, 922307, 965797, 1022003, 1064441, 108..."


In [29]:
eval_recall(result_eval_matcher, USER_COL, REC_FUNCTIONS)
result_eval_matcher.head()

Model own_rec has done
Model sim_item_rec has done
Model als_rec has done


Unnamed: 0,user_id,actual,own_rec,sim_item_rec,als_rec
0,1,"[829323, 835108, 836423, 851515, 875240, 87737...","[1029743, 1082185, 1106523, 1127831, 951590]","[856345, 826597, 9526411, 5577022, 975633]","[1100972, 912704, 832678, 823721, 1027569]"
1,7,"[882477, 922307, 965797, 1022003, 1064441, 108...","[1029743, 1082185, 1106523, 1127831, 951590]","[1038985, 999999, 896369, 1133018, 993639]","[987724, 1082185, 1096036, 1034686, 893018]"
2,8,"[840361, 846830, 853567, 861961, 870735, 87365...","[1029743, 1082185, 1106523, 1127831, 951590]","[915086, 5569845, 5568249, 1133018, 1110843]","[965766, 844179, 1051211, 1029743, 981760]"
3,13,"[6544236, 841549, 857736, 912835, 942687, 9952...","[1029743, 1082185, 1106523, 1127831, 951590]","[1082185, 1019247, 9677874, 981760, 897125]","[1039156, 940766, 918335, 859075, 893018]"
4,16,"[866227, 1084551, 9835695, 13007710]","[1029743, 1082185, 1106523, 1127831, 951590]","[1016260, 9834988, 868972, 848319, 1029743]","[1082185, 1029743, 995242, 981760, 1127831]"


### Recall@5 of matching

In [30]:
print(*sorted(calc_recall(result_eval_matcher, TOPK_RECALL, ACTUAL_COL), key=lambda x: x[1], reverse=True), sep='\n')

('als_rec', 0.018214322284259035)
('own_rec', 0.017982226069655844)
('sim_item_rec', 0.013986970259124772)


### Precision@5 of matching

In [31]:
print(*sorted(calc_precision(result_eval_matcher, TOPK_PRECISION, ACTUAL_COL), key=lambda x: x[1], reverse=True), sep='\n')

('own_rec', 0.17873485868102376)
('als_rec', 0.14374158815612467)
('sim_item_rec', 0.12436069986541112)


*Вывод: лучшие результаты дает модель на основе ранее сделанных покупок ItemItemRecommender(K=1). Далее товары-кандидаты для модели ранжирования (2-го уровня) будем генерировать с помощью нее.*

## Ranking part

### Обучаем модель 2-ого уровня на выбранных кандидатах
- Обучаем на data_train_ranking
- Обучаем *только* на выбранных кандидатах
- Генерируни топ-500 кадидиатов с помощью get_own_recommendations, т.к. эта модель показывает ЛУЧШИЙ результат по precision
- (!) Если юзер купил < 500 товаров, то get_own_recommendations дополнит рекоммендации топ-популярными

### Подготовка данных для трейна

In [32]:
# взяли пользователей из трейна для ранжирования
df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique())
df_match_candidates.columns = [USER_COL]

In [33]:
# собираем кандитатов с первого этапа (matcher)
df_match_candidates['candidates'] = df_match_candidates[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, n=N_CANDIDATES))

In [34]:
df_match_candidates.head(2)

Unnamed: 0,user_id,candidates
0,1753,"[1029743, 1082185, 1106523, 1127831, 951590, 9..."
1,2200,"[1029743, 1082185, 1106523, 1127831, 951590, 9..."


In [35]:
# разворачиваем товары
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'

In [36]:
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)

In [37]:
df_match_candidates.head(4)

Unnamed: 0,user_id,item_id
0,1753,1029743
0,1753,1082185
0,1753,1106523
0,1753,1127831


### Check warm start

In [38]:
print_stats_data(df_match_candidates, 'match_candidates')

match_candidates
Shape: (371500, 2) Users: 743 Items: 500


### Создаем трейн сет для ранжирования с учетом кандидатов с этапа 1 

In [39]:
df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL]].copy()
df_ranker_train['target'] = 1  # тут только покупки 

df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')

df_ranker_train['target'].fillna(0, inplace=True)

In [40]:
df_ranker_train.target.value_counts()

0.0    363245
1.0     10201
Name: target, dtype: int64

### Подготавливаем фичи для обучения модели

- Выберем LightGBM c loss = binary. Это классическая бинарная классификация

### Описательные фичи

In [36]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [62]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [42]:
df_ranker_train = df_ranker_train.merge(item_features, on='item_id', how='left')
df_ranker_train = df_ranker_train.merge(user_features, on='user_id', how='left')
df_ranker_train.head(5)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,1753,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,U,35-49K,Homeowner,Unknown,1,None/Unknown
1,1753,1082185,0.0,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,45-54,U,35-49K,Homeowner,Unknown,1,None/Unknown
2,1753,1106523,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,U,35-49K,Homeowner,Unknown,1,None/Unknown
3,1753,1127831,0.0,5937,PRODUCE,National,BERRIES,STRAWBERRIES,16 OZ,45-54,U,35-49K,Homeowner,Unknown,1,None/Unknown
4,1753,951590,0.0,910,GROCERY,National,BAKED BREAD/BUNS/ROLLS,MAINSTREAM WHITE BREAD,20 OZ,45-54,U,35-49K,Homeowner,Unknown,1,None/Unknown


### Поведенческие фичи

##### Чтобы считать поведенческие фичи, нужно учесть все данные что были до data_val_ranker

In [43]:
df_join_train_matcher.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


### Создаем новые фичи 

In [44]:
df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('sales_value').sum().rename('total_item_sales_value'), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('total_quantity_value'), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq'), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq'), how='left',on=USER_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('sales_value').sum().rename('total_user_sales_value'), how='left',on=USER_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on=USER_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_baskter')/df_join_train_matcher.basket_id.nunique(), how='left',on=USER_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=USER_COL)

In [45]:
df_ranker_train.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,total_quantity_value,item_freq,user_freq,total_user_sales_value,item_quantity_per_week,user_quantity_per_week,item_quantity_per_basket,user_quantity_per_baskter,item_freq_per_basket,user_freq_per_basket
0,1753,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,...,15015,12737,1242,2862.82,165.0,2151.483516,0.061233,0.798441,0.051943,0.005065
1,1753,1082185,0.0,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,45-54,...,27104,26127,1242,2862.82,297.846154,2151.483516,0.110534,0.798441,0.10655,0.005065
2,1753,1106523,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,...,10207,8722,1242,2862.82,112.164835,2151.483516,0.041626,0.798441,0.03557,0.005065
3,1753,1127831,0.0,5937,PRODUCE,National,BERRIES,STRAWBERRIES,16 OZ,45-54,...,7469,5211,1242,2862.82,82.076923,2151.483516,0.03046,0.798441,0.021251,0.005065
4,1753,951590,0.0,910,GROCERY,National,BAKED BREAD/BUNS/ROLLS,MAINSTREAM WHITE BREAD,20 OZ,45-54,...,6765,5417,1242,2862.82,74.340659,2151.483516,0.027589,0.798441,0.022091,0.005065


In [46]:
X_train = df_ranker_train.drop('target', axis=1)
y_train = df_ranker_train[['target']]

In [47]:
cat_feats = [
    'manufacturer',
    'department',
    'brand',
    'commodity_desc',
    'sub_commodity_desc',
    'curr_size_of_product',
    'age_desc',
    'marital_status_code',
    'income_desc',
    'homeowner_desc',
    'hh_comp_desc',
    'household_size_desc',
    'kid_category_desc'
]
X_train[cat_feats] = X_train[cat_feats].astype('category')

### Обучение модели ранжирования (baseline)

In [48]:
lgb = LGBMClassifier(objective='binary',
                     max_depth=8,
                     n_estimators=100,
                     learning_rate=0.05,
                     categorical_column=cat_feats)

lgb.fit(X_train, y_train)

train_preds = lgb.predict_proba(X_train)

  return f(**kwargs)


In [49]:
df_ranker_predict = df_ranker_train.copy()

In [50]:
df_ranker_predict[PROBA_COL] = train_preds[:,1]

In [51]:
df_ranker_predict.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,item_freq,user_freq,total_user_sales_value,item_quantity_per_week,user_quantity_per_week,item_quantity_per_basket,user_quantity_per_baskter,item_freq_per_basket,user_freq_per_basket,proba_item_purchase
0,1753,1029743,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,...,12737,1242,2862.82,165.0,2151.483516,0.061233,0.798441,0.051943,0.005065,0.252065
1,1753,1082185,0.0,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,45-54,...,26127,1242,2862.82,297.846154,2151.483516,0.110534,0.798441,0.10655,0.005065,0.489003
2,1753,1106523,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,45-54,...,8722,1242,2862.82,112.164835,2151.483516,0.041626,0.798441,0.03557,0.005065,0.160677
3,1753,1127831,0.0,5937,PRODUCE,National,BERRIES,STRAWBERRIES,16 OZ,45-54,...,5211,1242,2862.82,82.076923,2151.483516,0.03046,0.798441,0.021251,0.005065,0.026971
4,1753,951590,0.0,910,GROCERY,National,BAKED BREAD/BUNS/ROLLS,MAINSTREAM WHITE BREAD,20 OZ,45-54,...,5417,1242,2862.82,74.340659,2151.483516,0.027589,0.798441,0.022091,0.005065,0.090106


## Evaluation on test dataset

In [52]:
result_eval_ranker = data_val_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_ranker.columns=[USER_COL, ACTUAL_COL]
result_eval_ranker.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,7,"[840386, 889774, 898068, 909714, 929067, 95347..."


## Eval matching on test dataset

In [54]:
%%time
result_eval_ranker['own_rec'] = result_eval_ranker[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, n=N_REC))

Wall time: 1.56 s


In [55]:
# замеряем precision только для лучшей модели 1-го уровня, чтобы далее понять влияение ранжирования на метрики

print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION, ACTUAL_COL), key=lambda x: x[1], reverse=True), sep='\n')

('own_rec', 0.21561238223418588)


## Eval re-ranked matched result on test dataset   

In [56]:
result_eval_ranker['reranked_own_rec'] = result_eval_ranker[USER_COL].\
    apply(lambda user_id: rerank(USER_COL, user_id, df_ranker_predict, PROBA_COL))

In [57]:
print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION, ACTUAL_COL), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.23580080753701219)
('own_rec', 0.21561238223418588)


### Сделаем подбор гиперпараметров для LGBMClassifier (модели 2-го уровня) с помощью алгоритма random search

In [58]:
import itertools

def sample_hyperparameters():
    while True:
        yield {            
            'max_depth': np.random.choice([2, 4, 6, 8, 10, 20, 30, 40, 100]),
            'n_estimators': np.random.choice([100, 200, 300, 500, 1000, 5000]),
            'learning_rate': np.random.choice([0.5, 0.1, 0.05, 0.01, 0.001, 0.0001])            
        }
                                          
def random_search(X_train, y_train, cat_feats, num_samples=20, num_threads=4):
    
    i = 1
    
    for hyperparams in itertools.islice(sample_hyperparameters(), num_samples):
        model = LGBMClassifier(**hyperparams, objective='binary', categorical_column=cat_feats, random_state=42)
        model.fit(X_train, y_train)
        train_preds = model.predict_proba(X_train)
        df_ranker_predict = df_ranker_train.copy()
        df_ranker_predict['proba_item_purchase'] = train_preds[:,1]                            
        result_eval_ranker['reranked_own_rec_' + str(i)] = result_eval_ranker[USER_COL].apply(lambda user_id: rerank(USER_COL, user_id, df_ranker_predict, PROBA_COL))
        print('Sample #{} with hyperparameters {}'.format(i, hyperparams))
        i += 1 

In [59]:
random_search(X_train, y_train, cat_feats, num_samples=20, num_threads=4)

  return f(**kwargs)


Sample #1 with hyperparameters {'max_depth': 4, 'n_estimators': 500, 'learning_rate': 0.01}
Sample #2 with hyperparameters {'max_depth': 4, 'n_estimators': 1000, 'learning_rate': 0.5}
Sample #3 with hyperparameters {'max_depth': 30, 'n_estimators': 5000, 'learning_rate': 0.001}
Sample #4 with hyperparameters {'max_depth': 4, 'n_estimators': 100, 'learning_rate': 0.1}
Sample #5 with hyperparameters {'max_depth': 2, 'n_estimators': 500, 'learning_rate': 0.001}
Sample #6 with hyperparameters {'max_depth': 100, 'n_estimators': 200, 'learning_rate': 0.05}
Sample #7 with hyperparameters {'max_depth': 30, 'n_estimators': 300, 'learning_rate': 0.5}
Sample #8 with hyperparameters {'max_depth': 10, 'n_estimators': 500, 'learning_rate': 0.1}
Sample #9 with hyperparameters {'max_depth': 30, 'n_estimators': 100, 'learning_rate': 0.01}
Sample #10 with hyperparameters {'max_depth': 40, 'n_estimators': 5000, 'learning_rate': 0.01}
Sample #11 with hyperparameters {'max_depth': 6, 'n_estimators': 100, '

In [60]:
print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION, ACTUAL_COL), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec_10', 0.27267833109017464)
('reranked_own_rec_14', 0.2724091520861369)
('reranked_own_rec_8', 0.2659488559892324)
('reranked_own_rec_13', 0.2602960969044412)
('reranked_own_rec_16', 0.2602960969044411)
('reranked_own_rec_6', 0.24495289367429338)
('reranked_own_rec_15', 0.24360699865410482)
('reranked_own_rec_20', 0.24360699865410482)
('reranked_own_rec_3', 0.23876177658142672)
('reranked_own_rec_12', 0.23795423956931364)
('reranked_own_rec_4', 0.23687752355316288)
('reranked_own_rec', 0.23580080753701219)
('reranked_own_rec_19', 0.23553162853297457)
('reranked_own_rec_17', 0.2336473755047107)
('reranked_own_rec_1', 0.23310901749663537)
('reranked_own_rec_9', 0.23283983849259768)
('reranked_own_rec_5', 0.23230148048452232)
('reranked_own_rec_18', 0.23122476446837165)
('own_rec', 0.21561238223418588)
('reranked_own_rec_11', 0.19004037685060632)
('reranked_own_rec_7', 0.08398384925975795)
('reranked_own_rec_2', 0.060565275908479245)


**Вывод: лучшие параметры были в сэмпле #10 {'max_depth': 40, 'n_estimators': 5000, 'learning_rate': 0.01}. Далее будем использовать их для подготовки рекомендаций**

In [61]:
lgb_best = LGBMClassifier(objective='binary',
                     max_depth=40,
                     n_estimators=5000,
                     learning_rate=0.01,
                     categorical_column=cat_feats)

lgb_best.fit(X_train, y_train)

LGBMClassifier(categorical_column=['manufacturer', 'department', 'brand',
                                   'commodity_desc', 'sub_commodity_desc',
                                   'curr_size_of_product', 'age_desc',
                                   'marital_status_code', 'income_desc',
                                   'homeowner_desc', 'hh_comp_desc',
                                   'household_size_desc', 'kid_category_desc'],
               learning_rate=0.01, max_depth=40, n_estimators=5000,
               objective='binary')

## Оценка на тесте для сдачи результатов курсового проекта

In [62]:
df_test = pd.read_csv('data/retail_test1.csv')

In [63]:
df_test.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0
2,2070,41652857291,664,995242,5,9.1,311,-0.6,46,96,0.0,0.0
3,1602,41665647035,664,827939,1,7.99,334,0.0,1741,96,0.0,0.0
4,1602,41665647035,664,927712,1,0.59,334,-0.4,1741,96,0.0,0.0


In [64]:
print_stats_data(df_test,'df_test')

df_test
Shape: (88734, 12) Users: 1885 Items: 20497


In [65]:
# Убираем холодных пользователей и пользователей, по которым нет данных
common_users = list(set(data_train_matcher.user_id.values)&\
                    set(data_val_matcher.user_id.values)&\
                    set(data_val_ranker.user_id.values)&\
                    set(user_features.user_id.values))
df_test = df_test[df_test.user_id.isin(common_users)]
print_stats_data(df_test,'df_test')

df_test
Shape: (47823, 12) Users: 713 Items: 15132


In [66]:
result_test = df_test.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_test.columns=[USER_COL, ACTUAL_COL]
result_test.head(2)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,7,"[847270, 855557, 859987, 863407, 895454, 90663..."


In [67]:
df_test_candidates = pd.DataFrame(df_test[USER_COL].unique())
df_test_candidates.columns = [USER_COL]
df_test_candidates['candidates'] = df_test_candidates[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, n=N_CANDIDATES))

In [68]:
df_test_candidates.head(5)

Unnamed: 0,user_id,candidates
0,588,"[1029743, 1082185, 1106523, 1127831, 951590, 9..."
1,2070,"[1029743, 1082185, 1106523, 1127831, 951590, 9..."
2,117,"[1029743, 1082185, 1106523, 1127831, 951590, 9..."
3,1762,"[1029743, 1082185, 1106523, 1127831, 951590, 9..."
4,762,"[1029743, 1082185, 1106523, 1127831, 951590, 9..."


In [69]:
# разворачиваем товары
df_items = df_test_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'
df_test_candidates = df_test_candidates.drop('candidates', axis=1).join(df_items)

In [70]:
df_test_candidates.head(5)

Unnamed: 0,user_id,item_id
0,588,1029743
0,588,1082185
0,588,1106523
0,588,1127831
0,588,951590


In [71]:
X_test = df_test_candidates[[USER_COL, ITEM_COL]].copy()

In [72]:
X_test = X_test.merge(item_features, on='item_id', how='left')
X_test = X_test.merge(user_features, on='user_id', how='left')

X_test.head(5)

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,588,1029743,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,35-44,A,50-74K,Homeowner,2 Adults Kids,5+,3+
1,588,1082185,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,35-44,A,50-74K,Homeowner,2 Adults Kids,5+,3+
2,588,1106523,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,35-44,A,50-74K,Homeowner,2 Adults Kids,5+,3+
3,588,1127831,5937,PRODUCE,National,BERRIES,STRAWBERRIES,16 OZ,35-44,A,50-74K,Homeowner,2 Adults Kids,5+,3+
4,588,951590,910,GROCERY,National,BAKED BREAD/BUNS/ROLLS,MAINSTREAM WHITE BREAD,20 OZ,35-44,A,50-74K,Homeowner,2 Adults Kids,5+,3+


In [73]:
X_test[cat_feats] = X_test[cat_feats].astype('category')

In [74]:
X_test = X_test.merge(df_test.groupby(by=ITEM_COL).agg('sales_value').sum().rename('total_item_sales_value'), how='left',on=ITEM_COL)

X_test = X_test.merge(df_test.groupby(by=ITEM_COL).agg('quantity').sum().rename('total_quantity_value'), how='left',on=ITEM_COL)

X_test = X_test.merge(df_test.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq'), how='left',on=ITEM_COL)

X_test = X_test.merge(df_test.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq'), how='left',on=USER_COL)

X_test = X_test.merge(df_test.groupby(by=USER_COL).agg('sales_value').sum().rename('total_user_sales_value'), how='left',on=USER_COL)

X_test = X_test.merge(df_test.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on=ITEM_COL)

X_test = X_test.merge(df_test.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on=USER_COL)

X_test = X_test.merge(df_test.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=ITEM_COL)

X_test = X_test.merge(df_test.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_baskter')/df_join_train_matcher.basket_id.nunique(), how='left',on=USER_COL)

X_test = X_test.merge(df_test.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=ITEM_COL)

X_test = X_test.merge(df_test.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=USER_COL)

In [75]:
X_test.head()

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,...,total_quantity_value,item_freq,user_freq,total_user_sales_value,item_quantity_per_week,user_quantity_per_week,item_quantity_per_basket,user_quantity_per_baskter,item_freq_per_basket,user_freq_per_basket
0,588,1029743,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,35-44,A,...,195.0,175.0,60,173.1,2.142857,197.252747,0.000795,0.073203,0.000714,0.000245
1,588,1082185,2,PRODUCE,National,TROPICAL FRUIT,BANANAS,40 LB,35-44,A,...,637.0,608.0,60,173.1,7.0,197.252747,0.002598,0.073203,0.00248,0.000245
2,588,1106523,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,35-44,A,...,127.0,117.0,60,173.1,1.395604,197.252747,0.000518,0.073203,0.000477,0.000245
3,588,1127831,5937,PRODUCE,National,BERRIES,STRAWBERRIES,16 OZ,35-44,A,...,10.0,8.0,60,173.1,0.10989,197.252747,4.1e-05,0.073203,3.3e-05,0.000245
4,588,951590,910,GROCERY,National,BAKED BREAD/BUNS/ROLLS,MAINSTREAM WHITE BREAD,20 OZ,35-44,A,...,138.0,115.0,60,173.1,1.516484,197.252747,0.000563,0.073203,0.000469,0.000245


In [76]:
test_preds = lgb_best.predict_proba(X_test)
df_test_predict = X_test.copy()
df_test_predict[PROBA_COL] = test_preds[:,1]

In [77]:
result_test['own_rec'] = result_test[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, n=N_REC))

In [78]:
result_test['reranked_own_rec'] = result_test[USER_COL].apply(lambda user_id: rerank(USER_COL, user_id, df_test_predict, PROBA_COL))

In [79]:
# Отберем 20 товаров из результатов ранжирования для последующей постфильтрации
result_test['reranked_own_rec_for_pf'] = result_test[USER_COL].apply(lambda user_id: rerank(USER_COL, user_id, df_test_predict, PROBA_COL, n=20))

In [80]:
# Сдлеаем постфильтрацию товаров для сравнения результатов
result_test['reranked_own_rec_pf'] = result_test['reranked_own_rec_for_pf'].apply(lambda reranked_list: postfilter_items(reranked_list, item_features))

In [81]:
result_test = result_test.drop('reranked_own_rec_for_pf', axis=1)

In [82]:
result_test.head()

Unnamed: 0,user_id,actual,own_rec,reranked_own_rec,reranked_own_rec_pf
0,1,"[880007, 883616, 931136, 938004, 940947, 94726...","[1029743, 1082185, 1106523, 1127831, 951590]","[988791, 909714, 990797, 896369, 1052912]","[988791, 1052912, 1069621, 880150, 1051211]"
1,7,"[847270, 855557, 859987, 863407, 895454, 90663...","[1029743, 1082185, 1106523, 1127831, 951590]","[909714, 896369, 832678, 988791, 990797]","[909714, 832678, 990797, 1069621, 880150]"
2,8,"[846334, 850834, 857503, 862139, 865891, 87829...","[1029743, 1082185, 1106523, 1127831, 951590]","[909714, 896369, 988791, 832678, 990797]","[909714, 988791, 1069621, 844165, 873627]"
3,13,"[878996, 923746, 942525, 943076, 951590, 98517...","[1029743, 1082185, 1106523, 1127831, 951590]","[1069621, 873627, 844165, 868764, 873654]","[1069621, 844165, 873654, 1052912, 1100972]"
4,16,"[9677923, 12263788]","[1029743, 1082185, 1106523, 1127831, 951590]","[1069621, 896369, 832678, 1052912, 1100972]","[1069621, 832678, 1021164, 1005186, 988791]"


In [83]:
print(*sorted(calc_precision(result_test, TOPK_PRECISION, ACTUAL_COL), key=lambda x: x[1], reverse=True), sep='\n')

('own_rec', 0.17419354838709794)
('reranked_own_rec', 0.036746143057503435)
('reranked_own_rec_pf', 0.0336605890603085)


In [84]:
result_final = result_test.drop(['actual', 'reranked_own_rec', 'reranked_own_rec_pf'], axis=1)

In [85]:
result_final.to_csv('predictions.csv', index=False)