# Двухуровневые модели рекомендаций

---

# Import libs

In [1]:
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
%matplotlib inline

# Модель второго уровня
from lightgbm import LGBMClassifier

In [2]:
# Написанные нами функции
from metrics import precision_at_k, recall_at_k
from utils import prefilter_items, time_to_cat
from recommenders import MainRecommender
import MyServiceFunctions

## Read data

In [3]:
data = pd.read_csv('C:/Users/Вадим/Desktop/GeekBrains/Recommendation-systems/Lectures/Lecture_2/webinar_2/webinar_2'
                   '/data/retail_train.csv')
item_features = pd.read_csv('C:/Users/Вадим/Desktop/GeekBrains/Recommendation-systems/Lectures/Lecture_2/webinar_2/webinar_2'
                            '/data/product.csv')
user_features = pd.read_csv('C:/Users/Вадим/Desktop/GeekBrains/Recommendation-systems/Lectures/Lecture_2/webinar_2/webinar_2'
                            '/data/hh_demographic.csv')

# Process features dataset

In [4]:
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


### Features

- user_id - Уникально идентифицирует каждое домашнее хозяйство.
- basket_id - Uniquely identifies a purchase occasion. Однозначно идентифицирует случай покупки.
- day - Day when transaction occurred. День, когда произошла транзакция.
- item_id - Uniquely identifies each product. Уникально идентифицирует каждый продукт.
- quantity - Number of the products purchased during the trip. Количество продуктов, приобретенных во время визита в магазин.
- sales_value - Amount of dollars retailer receives from sale. Сумма долларов, которую ритейлер получает от продажи.
- store_id - Identifies unique stores. Однозначно идентифицирует магазины.
- retail_disc - Discount applied due to retailer's loyalty card program. Скидка, применяемая в связи с программой карты лояльности ритейлера.
- trans_time - Time of day when the transaction occurred. Время суток, когда произошла транзакция.
- week_no - Week of the transaction. Ranges 1 - 102. Неделя транзакции. Диапазон 1 - 102.
- coupon_disc - Discount applied due to manufacturer coupon. Скидка, применяемая в связи с купоном производителя.
- coupon_match_disc - Discount applied due to retailer's match of manufacturer coupon. Скидка применяется в связи с совпадением купона продавца с купоном производителя.

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2396804 entries, 0 to 2396803
Data columns (total 12 columns):
 #   Column             Dtype  
---  ------             -----  
 0   user_id            int64  
 1   basket_id          int64  
 2   day                int64  
 3   item_id            int64  
 4   quantity           int64  
 5   sales_value        float64
 6   store_id           int64  
 7   retail_disc        float64
 8   trans_time         int64  
 9   week_no            int64  
 10  coupon_disc        float64
 11  coupon_match_disc  float64
dtypes: float64(4), int64(8)
memory usage: 219.4 MB


In [6]:
data.isna().sum()

user_id              0
basket_id            0
day                  0
item_id              0
quantity             0
sales_value          0
store_id             0
retail_disc          0
trans_time           0
week_no              0
coupon_disc          0
coupon_match_disc    0
dtype: int64

### 1. Selection and construction of new features

#### Введём признак домохозяйства - в какое время дня чаще всего совершаются покупки, для этого расчетаем среднее время совершания покупок.

In [7]:
df_cat = data.groupby(['user_id'], as_index=False).mean()[['user_id','trans_time']]
df_cat = pd.DataFrame(df_cat)
df_cat.rename(columns={'trans_time': 'avr_trans_time'}, inplace=True)
df_cat['avr_trans_time'] = df_cat['avr_trans_time'].astype(int)
df_cat.head(2)

Unnamed: 0,user_id,avr_trans_time
0,1,1384
1,2,1805


In [8]:
data = data.merge(df_cat, on=['user_id'])
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,avr_trans_time
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0,1237
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0,1237


In [9]:
data = time_to_cat(data)
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,avr_trans_time,time_cat
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0,1237,day
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0,1237,day


#### Создадим список покупок каждого домохозяйства

In [10]:
df_cat = data.groupby(['user_id'], as_index=False).agg({'basket_id': lambda x: set(x)})
df_cat.columns = ['user_id', 'list_basket_id']
df_cat.head(2)

Unnamed: 0,user_id,list_basket_id
0,1,"{32259826179, 28106322445, 31818756113, 325567..."
1,2,"{30035866883, 41008217091, 29157132423, 406185..."


#### Создадим признак - количество покупок каждого домохозяйства

In [11]:
df_cat['num_of_purchases'] = df_cat['list_basket_id'].apply(lambda x: len(x))
df_cat.drop('list_basket_id', axis=1,  inplace=True)
df_cat.head(2)

Unnamed: 0,user_id,num_of_purchases
0,1,79
1,2,44


In [12]:
data = data.merge(df_cat, on=['user_id'])
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,avr_trans_time,time_cat,num_of_purchases
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0,1237,day,105
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0,1237,day,105


#### Создадим признак - кол-во недель, в какие совершались покупки

In [13]:
df_cat_1 = data.groupby(['user_id'], as_index=False)['week_no'].nunique()
df_cat_1.columns = ['user_id', 'num_week_no']
df_cat_1.head(2)

Unnamed: 0,user_id,num_week_no
0,1,64
1,2,34


In [14]:
data = data.merge(df_cat_1, on=['user_id'])
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,avr_trans_time,time_cat,num_of_purchases,num_week_no
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0,1237,day,105,51
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0,1237,day,105,51


#### Создадим признак - сумма в деньгах всех покупок каждого домохозяйства

In [15]:
df_cat = data.groupby('user_id', as_index=False)['sales_value'].sum()
df_cat = pd.DataFrame(df_cat)
df_cat.rename(columns={'sales_value': 'total_amount_of_purchases'}, inplace=True)
df_cat.sort_values('total_amount_of_purchases')
df_cat.head(2)

Unnamed: 0,user_id,total_amount_of_purchases
0,1,3959.91
1,2,1823.45


In [16]:
data = data.merge(df_cat, on=['user_id'])
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,avr_trans_time,time_cat,num_of_purchases,num_week_no,total_amount_of_purchases
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0,1237,day,105,51,2486.42
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0,1237,day,105,51,2486.42


#### Создадим признак - средний чек покупки каждого домохозяйства

In [17]:
data['avr_purchase_receipt'] = data['total_amount_of_purchases'] / data['num_of_purchases']
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,avr_trans_time,time_cat,num_of_purchases,num_week_no,total_amount_of_purchases,avr_purchase_receipt
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0,1237,day,105,51,2486.42,23.68019
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0,1237,day,105,51,2486.42,23.68019


#### Создадим признак - средний недельный чек покупки каждого домохозяйства

In [18]:
data['avr_weekly_purchase_receipt'] = data['total_amount_of_purchases'] / data['num_week_no']
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,avr_trans_time,time_cat,num_of_purchases,num_week_no,total_amount_of_purchases,avr_purchase_receipt,avr_weekly_purchase_receipt
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0,1237,day,105,51,2486.42,23.68019,48.753333
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0,1237,day,105,51,2486.42,23.68019,48.753333


#### Создадим признак - количество дней между первой и последней покупкой домохозяйств

In [19]:
df_cat = data.groupby(['user_id'], as_index=False).agg({'day': lambda x: list(x)})
df_cat.columns = ['user_id', 'list_day']
df_cat['num_days'] = df_cat['list_day'].apply(lambda x: (max(x)-min(x)))
df_cat.replace({'num_days': {0: 1}}, inplace=True)
df_cat.drop('list_day', axis=1,  inplace=True)
df_cat.head(5)

Unnamed: 0,user_id,num_days
0,1,609
1,2,519
2,3,527
3,4,523
4,5,504


In [20]:
data = data.merge(df_cat, on=['user_id'])
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,avr_trans_time,time_cat,num_of_purchases,num_week_no,total_amount_of_purchases,avr_purchase_receipt,avr_weekly_purchase_receipt,num_days
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0,1237,day,105,51,2486.42,23.68019,48.753333,659
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0,1237,day,105,51,2486.42,23.68019,48.753333,659


#### Создадим признак - среднее число покупок домохозяйств в день

In [21]:
data['avr_num_of_purchases_per_day'] = data['num_of_purchases'] / data['num_days']
data.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,coupon_match_disc,avr_trans_time,time_cat,num_of_purchases,num_week_no,total_amount_of_purchases,avr_purchase_receipt,avr_weekly_purchase_receipt,num_days,avr_num_of_purchases_per_day
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,...,0.0,1237,day,105,51,2486.42,23.68019,48.753333,659,0.159332
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,...,0.0,1237,day,105,51,2486.42,23.68019,48.753333,659,0.159332


In [22]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'

In [23]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': ITEM_COL}, inplace=True)
user_features.rename(columns={'household_key': USER_COL }, inplace=True)

In [24]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [25]:
item_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92353 entries, 0 to 92352
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   item_id               92353 non-null  int64 
 1   manufacturer          92353 non-null  int64 
 2   department            92353 non-null  object
 3   brand                 92353 non-null  object
 4   commodity_desc        92353 non-null  object
 5   sub_commodity_desc    92353 non-null  object
 6   curr_size_of_product  92353 non-null  object
dtypes: int64(2), object(5)
memory usage: 4.9+ MB


In [26]:
item_features.isna().sum()

item_id                 0
manufacturer            0
department              0
brand                   0
commodity_desc          0
sub_commodity_desc      0
curr_size_of_product    0
dtype: int64

In [27]:
item_features.loc[item_features['item_id'] == 26081]

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [28]:
len(item_features.loc[item_features['sub_commodity_desc'] == ' '])

15

In [29]:
len(item_features.loc[item_features['curr_size_of_product'] == ' '])

30607

In [30]:
item_features.loc[item_features['curr_size_of_product'] == ' ']

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,
8,26636,69,PASTRY,Private,BREAKFAST SWEETS,SW GDS: SW ROLLS/DAN,
11,26889,32,DRUG GM,National,MAGAZINE,TV/MOVIE-MAGAZINE,
13,27021,2,GROCERY,National,AIR CARE,AIR CARE - AEROSOLS,
...,...,...,...,...,...,...,...
92348,18293142,6384,DRUG GM,National,BOOKSTORE,PAPERBACK BOOKS,
92349,18293439,6393,DRUG GM,National,BOOKSTORE,CHILDRENS LOW END,
92350,18293696,6406,DRUG GM,National,BOOKSTORE,PAPERBACK BEST SELLER,
92351,18294080,6442,DRUG GM,National,BOOKSTORE,PAPERBACK BOOKS,


In [31]:
item_features.replace({'department': {' ': 'unknown'}, 'commodity_desc': {' ': 'unknown'}, 'sub_commodity_desc': {' ': 'unknown'}}, inplace=True)

In [32]:
item_features.replace({'curr_size_of_product': {' ': 'not_defined'}}, inplace=True)

In [33]:
len(item_features.loc[item_features['curr_size_of_product'] == ' '])

0

In [34]:
item_features['commodity_desc'] = item_features['commodity_desc'].apply(
    lambda x: MyServiceFunctions.lower_text(x))


In [35]:
item_features['sub_commodity_desc'] = item_features['sub_commodity_desc'].apply(
    lambda x: MyServiceFunctions.lower_text(x))


In [36]:
item_features['commodity_desc'] = item_features['commodity_desc'].apply(lambda x: MyServiceFunctions.replace_commodity_desc(x))

In [37]:
item_features['sub_commodity_desc'] = item_features['sub_commodity_desc'].apply(lambda x: MyServiceFunctions.replace_sub_commodity_desc(x))

In [38]:
item_features['commodity_desc'] = item_features['commodity_desc'].apply(
    lambda x: MyServiceFunctions.text_without_stopword(x))


In [39]:
item_features['sub_commodity_desc'] = item_features['sub_commodity_desc'].apply(
    lambda x: MyServiceFunctions.text_without_stopword(x))


In [40]:
item_features.head(5)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,frzn ice,ice crushed cubed,22 LB
1,26081,2,MISC. TRANS.,National,commodity description,subcommodity description,not_defined
2,26093,69,PASTRY,Private,bread,bread italian french,not_defined
3,26190,69,GROCERY,Private,fruit shelf stable,apple sauce,50 OZ
4,26355,69,GROCERY,Private,cookies cones,specialty cookies,14 OZ


In [41]:
item_features['sub_commodity_desc'].unique().tolist()

['ice crushed cubed',
 'subcommodity description',
 'bread italian french',
 'apple sauce',
 'specialty cookies',
 'spices seasonings',
 'tray pack choc chip cookies',
 'vitamin minerals',
 'sweet goods sweet rolls dan',
 'honey',
 'traditional',
 'tv movie magazine',
 'air care aerosols',
 'string cheese',
 'vegetable salad oil',
 'instant decaf flavor coffee w',
 'diet control liquors nutritional',
 'paper foam drinking cups',
 'mainstream white bread',
 'natural cheese exact  chunks',
 'cranberry sauce',
 'mixed vegetables',
 'mexican beans refried',
 'skillet dinners',
 'hot mustard specialty mustard',
 'plastic heavy paper premium plate',
 'individually wrapped slice single cheese',
 'frozen bagged vegetables plain',
 'macaroni cheese dinner',
 'ground coffee',
 'pie filling mincemeat glazes',
 'potatoes sweet',
 'english muffins waffles',
 'plstc ctlrytblclthstthpksst',
 'fresh',
 'miscellaneous cheese',
 'frozen fruit',
 'seafood frozen raw fillets',
 'ammonia',
 'sweet goods mu

In [42]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [45]:
user_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 801 entries, 0 to 800
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   age_desc             801 non-null    object
 1   marital_status_code  801 non-null    object
 2   income_desc          801 non-null    object
 3   homeowner_desc       801 non-null    object
 4   hh_comp_desc         801 non-null    object
 5   household_size_desc  801 non-null    object
 6   kid_category_desc    801 non-null    object
 7   user_id              801 non-null    int64 
dtypes: int64(1), object(7)
memory usage: 50.2+ KB


---

# Split dataset for train, eval, test

In [46]:
VAL_MATCHER_WEEKS = 6
VAL_RANKER_WEEKS = 3

In [47]:
# берем данные для тренировки matching модели
data_train_matcher = data[data['week_no'] < data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)]

# берем данные для валидации matching модели
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
                        (data['week_no'] < data['week_no'].max() - (VAL_RANKER_WEEKS))]


# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться

# берем данные для теста ranking, matching модели
data_val_ranker = data[data['week_no'] >= data['week_no'].max() - VAL_RANKER_WEEKS]

In [48]:
def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data[USER_COL].nunique()} Items: {df_data[ITEM_COL].nunique()}")

In [49]:
print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (2108779, 21) Users: 2498 Items: 83685
val_matcher
Shape: (169711, 21) Users: 2154 Items: 27649
train_ranker
Shape: (169711, 21) Users: 2154 Items: 27649
val_ranker
Shape: (118314, 21) Users: 2042 Items: 24329


In [50]:
data_train_matcher.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,coupon_match_disc,avr_trans_time,time_cat,num_of_purchases,num_week_no,total_amount_of_purchases,avr_purchase_receipt,avr_weekly_purchase_receipt,num_days,avr_num_of_purchases_per_day
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,...,0.0,1237,day,105,51,2486.42,23.68019,48.753333,659,0.159332
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,...,0.0,1237,day,105,51,2486.42,23.68019,48.753333,659,0.159332


# Prefilter items

In [51]:
n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


In [52]:
data_train_matcher.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,avr_trans_time,time_cat,num_of_purchases,num_week_no,total_amount_of_purchases,avr_purchase_receipt,avr_weekly_purchase_receipt,num_days,avr_num_of_purchases_per_day,price
7,2375,26984851516,1,1085983,1,2.99,364,-0.4,1642,1,...,1237,day,105,51,2486.42,23.68019,48.753333,659,0.159332,2.99
11,2375,27115132404,12,827919,1,2.79,364,0.0,1936,2,...,1237,day,105,51,2486.42,23.68019,48.753333,659,0.159332,2.79


# Make cold-start to warm-start

In [53]:
# ищем общих пользователей
common_users = data_train_matcher.user_id.values

data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (861401, 22) Users: 2495 Items: 5001
val_matcher
Shape: (169615, 21) Users: 2151 Items: 27644
train_ranker
Shape: (169615, 21) Users: 2151 Items: 27644
val_ranker
Shape: (118282, 21) Users: 2040 Items: 24325


# Init/train recommender

In [54]:
recommender = MainRecommender(data_train_matcher)



  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

# Eval recall of matching

In [55]:
ACTUAL_COL = 'actual'

In [56]:
result_eval_matcher = data_val_matcher.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_matcher.columns=[USER_COL, ACTUAL_COL]
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [57]:
# N = Neighbors
N_PREDICT = 50

In [58]:
def evalRecall(df_result, result_col_name, target_col_name, recommend_model, N_PREDICT):
    df_result[result_col_name] = df_result[target_col_name].apply(lambda x: recommend_model(x, N_PREDICT))
    return df_result.apply(lambda row: recall_at_k(row[result_col_name], row[ACTUAL_COL], k=N_PREDICT), axis=1).mean()

In [59]:
evalRecall(result_eval_matcher, 'result', USER_COL, recommender.get_own_recommendations, 25)

0.044119547395835505

In [60]:
def calc_recall(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: recall_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

In [61]:
def calc_precision(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

In [62]:
result_eval_matcher['own_rec_45'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, 45))
result_eval_matcher['als_rec_5'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_als_recommendations(x, 5))
result_eval_matcher['own_rec_45_+_als_rec_5'] = result_eval_matcher['own_rec_45'] + result_eval_matcher['als_rec_5']

In [63]:
result_eval_matcher.apply(lambda row: recall_at_k(row['own_rec_45_+_als_rec_5'], row[ACTUAL_COL], k=50), axis=1).mean()

0.0667090010081626

In [64]:
result_eval_matcher['own_rec_50'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, 50))
result_eval_matcher.apply(lambda row: recall_at_k(row['own_rec_50'], row[ACTUAL_COL], k=50), axis=1).mean()

0.06525657038145175

In [65]:
result_eval_matcher['als_rec_50'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_als_recommendations(x, 50))
result_eval_matcher.apply(lambda row: recall_at_k(row['als_rec_50'], row[ACTUAL_COL], k=50), axis=1).mean()

0.054910942926714806

### Recall@50 of matching

In [66]:
TOPK_RECALL = 50

In [67]:
sorted(calc_recall(result_eval_matcher, TOPK_RECALL), key=lambda x: x[1],reverse=True)

[('own_rec_45_+_als_rec_5', 0.0667090010081626),
 ('own_rec_50', 0.06525657038145175),
 ('own_rec_45', 0.06212243669915392),
 ('als_rec_50', 0.054910942926714806),
 ('result', 0.044119547395835505),
 ('als_rec_5', 0.015864391764492336)]

### Precision@5 of matching

In [68]:
TOPK_PRECISION = 5

In [69]:
sorted(calc_precision(result_eval_matcher, TOPK_PRECISION), key=lambda x: x[1],reverse=True)

[('result', 0.17712691771268974),
 ('own_rec_45', 0.17712691771268974),
 ('own_rec_45_+_als_rec_5', 0.17712691771268974),
 ('own_rec_50', 0.17712691771268974),
 ('als_rec_5', 0.13314737331473564),
 ('als_rec_50', 0.13314737331473564)]

# Ranking part

## Подготовка данных для трейна

In [70]:
# взяли пользователей из трейна для ранжирования
df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique())
df_match_candidates.columns = [USER_COL]

In [71]:
df_match_candidates

Unnamed: 0,user_id
0,2375
1,1364
2,1130
3,1173
4,98
...,...
2146,540
2147,338
2148,615
2149,1077


In [72]:
# собираем кандитатов с первого этапа (matcher)
df_match_candidates['candidates'] = df_match_candidates[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

In [73]:
df_match_candidates.head(2)

Unnamed: 0,user_id,candidates
0,2375,"[948640, 918046, 847962, 907099, 873980, 88469..."
1,1364,"[1101378, 8090570, 857176, 947013, 1065979, 10..."


In [74]:
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items

0        948640
0        918046
0        847962
0        907099
0        873980
         ...   
2150     909714
2150     995478
2150    1075214
2150    1138443
2150    1023720
Length: 107550, dtype: int64

In [75]:
df_items.name = 'item_id'
df_items

0        948640
0        918046
0        847962
0        907099
0        873980
         ...   
2150     909714
2150     995478
2150    1075214
2150    1138443
2150    1023720
Name: item_id, Length: 107550, dtype: int64

In [76]:
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)
df_match_candidates.head(4)

Unnamed: 0,user_id,item_id
0,2375,948640
0,2375,918046
0,2375,847962
0,2375,907099


### Check warm start

In [77]:
print_stats_data(df_match_candidates, 'match_candidates')

match_candidates
Shape: (107550, 2) Users: 2151 Items: 4574


In [78]:
df_ranker_train = data_train_ranker.copy()
df_ranker_train['target'] = 1  # тут только покупки

In [79]:
df_ranker_train.head(5)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,avr_trans_time,time_cat,num_of_purchases,num_week_no,total_amount_of_purchases,avr_purchase_receipt,avr_weekly_purchase_receipt,num_days,avr_num_of_purchases_per_day,target
563,2375,40630425577,594,835300,1,4.73,364,0.0,1305,86,...,1237,day,105,51,2486.42,23.68019,48.753333,659,0.159332,1
564,2375,40630425577,594,848029,1,11.21,364,-4.5,1305,86,...,1237,day,105,51,2486.42,23.68019,48.753333,659,0.159332,1
565,2375,40630425577,594,862714,3,2.37,364,-0.6,1305,86,...,1237,day,105,51,2486.42,23.68019,48.753333,659,0.159332,1
566,2375,40630425577,594,896613,2,6.24,364,-2.71,1305,86,...,1237,day,105,51,2486.42,23.68019,48.753333,659,0.159332,1
567,2375,40630425577,594,923746,1,1.49,364,0.0,1305,86,...,1237,day,105,51,2486.42,23.68019,48.753333,659,0.159332,1


#### Не хватает нулей в датасете, поэтому добавляем наших кандитатов в качество нулей

In [80]:
df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')

# чистим дубликаты
df_ranker_train = df_ranker_train.drop_duplicates(subset=[USER_COL, ITEM_COL])

df_ranker_train['target'].fillna(0, inplace= True)

## Подготавливаем фичи для обучения модели

In [81]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,frzn ice,ice crushed cubed,22 LB
1,26081,2,MISC. TRANS.,National,commodity description,subcommodity description,not_defined


In [82]:
df_ranker_train = df_ranker_train.merge(item_features, on='item_id', how='left')
df_ranker_train = df_ranker_train.merge(user_features, on='user_id', how='left')

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,basket_id,day,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2375,948640,,,,,,,,,...,oral hygiene products,whitening systems,3 OZ,,,,,,,
1,2375,918046,,,,,,,,,...,cigarettes,cigarettes,971838 PK,,,,,,,


In [83]:
X_train = df_ranker_train.drop('target', axis=1)
y_train = df_ranker_train[['target']]

In [84]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack
from sklearn.pipeline import Pipeline, make_pipeline

In [85]:
from sklearn.pipeline import FeatureUnion

In [86]:
def rerank(user_id):
    return df_ranker_predict[df_ranker_predict[USER_COL]==user_id].sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist()

In [87]:
from sklearn.ensemble import RandomForestClassifier

In [88]:
continuos_cols = ['manufacturer', 'day', 'quantity', 'sales_value', 'store_id', 'retail_disc', 'trans_time', 'week_no', 'coupon_disc',
                  'coupon_match_disc', 'avr_trans_time', 'num_of_purchases', 'num_week_no', 'total_amount_of_purchases',
                  'avr_purchase_receipt', 'avr_weekly_purchase_receipt', 'num_days', 'avr_num_of_purchases_per_day']
cat_cols = ['income_desc', 'age_desc', 'hh_comp_desc', 'marital_status_code', 'household_size_desc', 'kid_category_desc', 'time_cat']  #  , 'homeowner_des'
base_cols = ['brand']
text_cols = ['department', 'commodity_desc', 'sub_commodity_desc']

In [89]:
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,basket_id,day,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2375,948640,,,,,,,,,...,oral hygiene products,whitening systems,3 OZ,,,,,,,
1,2375,918046,,,,,,,,,...,cigarettes,cigarettes,971838 PK,,,,,,,


In [91]:
X_train.loc[X_train['age_desc'].isna(), 'age_desc'] = 'Unknown'
X_train['age_desc']

0         Unknown
1         Unknown
2         Unknown
3         Unknown
4         Unknown
           ...   
106967    Unknown
106968    Unknown
106969    Unknown
106970    Unknown
106971    Unknown
Name: age_desc, Length: 106972, dtype: object

In [92]:
X_train.loc[X_train['marital_status_code'].isna(), 'marital_status_code'] = 'Unknown'
X_train.loc[X_train['income_desc'].isna(), 'income_desc'] = 'Unknown'
X_train.loc[X_train['homeowner_desc'].isna(), 'homeowner_desc'] = 'Unknown'
X_train.loc[X_train['hh_comp_desc'].isna(), 'hh_comp_desc'] = 'Unknown'
X_train.loc[X_train['household_size_desc'].isna(), 'household_size_desc'] = 'Unknown'
X_train.loc[X_train['kid_category_desc'].isna(), 'kid_category_desc'] = 'Unknown'

In [93]:
continuos_transformers = []
cat_transformers = []
base_transformers = []
text_transformers = []

for cont_col in continuos_cols:
    transfomer =  Pipeline([
        ('selector', MyServiceFunctions.NumberSelector(key=cont_col)),
        ('standard', StandardScaler())
    ])
    continuos_transformers.append((cont_col, transfomer))

for cat_col in cat_cols:
    cat_transformer = Pipeline([
        ('selector', MyServiceFunctions.FeatureSelector(key=cat_col)),
        ('ohe', MyServiceFunctions.OHEEncoder(key=cat_col))
    ])
    cat_transformers.append((cat_col, cat_transformer))

for base_col in base_cols:
    base_transformer = Pipeline([
        ('selector', MyServiceFunctions.NumberSelector(key=base_col)),
        ('ohe_bin', MyServiceFunctions.OHEEncoderBin(key=base_col))
    ])
    base_transformers.append((base_col, base_transformer))

for text_col in text_cols:
    text_transformer = Pipeline([
        ('selector', MyServiceFunctions.FeatureSelector(key=text_col)),
        ('description_tfidf', TfidfVectorizer(analyzer='word'))
    ])
    text_transformers.append((text_col, text_transformer))

In [94]:
feats = FeatureUnion(continuos_transformers+cat_transformers+base_transformers+text_transformers)
# feats = FeatureUnion(continuos_transformers+base_transformers+text_transformers)
feature_processing = Pipeline([('feats', feats)])

In [95]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 106972 entries, 0 to 106971
Data columns (total 34 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   user_id                       106972 non-null  int64  
 1   item_id                       106972 non-null  int64  
 2   basket_id                     7795 non-null    float64
 3   day                           7795 non-null    float64
 4   quantity                      7795 non-null    float64
 5   sales_value                   7795 non-null    float64
 6   store_id                      7795 non-null    float64
 7   retail_disc                   7795 non-null    float64
 8   trans_time                    7795 non-null    float64
 9   week_no                       7795 non-null    float64
 10  coupon_disc                   7795 non-null    float64
 11  coupon_match_disc             7795 non-null    float64
 12  avr_trans_time                7795 non-null 

In [189]:
feature_processing.fit(X_train)

Pipeline(steps=[('feats',
                 FeatureUnion(transformer_list=[('manufacturer',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='manufacturer')),
                                                                 ('standard',
                                                                  StandardScaler())])),
                                                ('day',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='day')),
                                                                 ('standard',
                                                                  StandardScaler())])),
                                                ('quantity',
                                                 Pipeline(steps=[('selector',
                             

In [456]:
cat_feats = X_train.columns[2:].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

['basket_id',
 'day',
 'quantity',
 'sales_value',
 'store_id',
 'retail_disc',
 'trans_time',
 'week_no',
 'coupon_disc',
 'coupon_match_disc',
 'avr_trans_time',
 'time_cat',
 'num_of_purchases',
 'num_week_no',
 'total_amount_of_purchases',
 'avr_purchase_receipt',
 'avr_weekly_purchase_receipt',
 'num_days',
 'avr_num_of_purchases_per_day',
 'manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

In [227]:
# classifier_lgb = Pipeline([
#     ('features', feats),
#     ('classifier', LogisticRegression()),
# ])


In [705]:
from catboost import CatBoost, Pool

In [706]:
# df_bin_feat = pd.get_dummies(X_train)
#
# model = CatBoost(params ={'objective': "Logloss",
#                           "iterations":1000,
#                           "max_depth":8,
#                           "task_type":"GPU",
#                           "devices":"0:1:2:3"
#             })
# model.fit(df_bin_feat, y_train, silent=True)
#
# train_preds = model.predict(df_bin_feat,prediction_type="Probability")

In [96]:
classifier_lgb = Pipeline([
    ('features', feats),
    ('classifier', LGBMClassifier(objective='binary',
                                  boosting_type='gbdt',
                                  max_depth=55,
                                  num_leaves=10,
                                  min_child_samples=9,
                                  n_estimators=200,
                                  # learning_rate=0.327855,
                                  # learning_rate=0.0032785714285714285,
                                  learning_rate=0.0012785714285714285,
                                  # categorical_column=cat_feats,
                                  n_jobs=-1,
                                  #                      verbose=0
                                  )),
])


In [97]:
classifier_lgb.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('manufacturer',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='manufacturer')),
                                                                 ('standard',
                                                                  StandardScaler())])),
                                                ('day',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='day')),
                                                                 ('standard',
                                                                  StandardScaler())])),
                                                ('quantity',
                                                 Pipeline(steps=[('selector',
                          

In [98]:
train_preds = classifier_lgb.predict_proba(X_train)

In [99]:
df_ranker_predict = df_ranker_train.copy()

In [100]:
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

In [101]:
df_ranker_predict['proba_item_purchase'][:10]

0    0.056427
1    0.056427
2    0.282688
3    0.056427
4    0.056427
5    0.056427
6    0.056427
7    0.056427
8    0.056427
9    0.056427
Name: proba_item_purchase, dtype: float64

# Evaluation on test dataset

In [102]:
result_eval_ranker = data_val_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_ranker.columns=[USER_COL, ACTUAL_COL]
result_eval_ranker.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


## Eval matching on test dataset

In [103]:
%%time
result_eval_ranker['own_rec'] = result_eval_ranker[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

CPU times: total: 203 ms
Wall time: 190 ms


In [104]:
# померяем precision только модели матчинга, чтобы понимать влияение ранжирования на метрики

sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True)

[('own_rec', 0.1444117647058813)]

## Eval re-ranked matched result on test dataset
    Вспомним df_match_candidates сет, который был получен own_recommendations на юзерах, набор пользователей мы фиксировали и он одинаков, значит и прогноз одинаков, поэтому мы можем использовать этот датафрейм для переранжирования.


In [105]:
result_eval_ranker['reranked_own_rec'] = result_eval_ranker[USER_COL].apply(lambda user_id: rerank(user_id))

In [106]:
tuple_res = sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True)[0]

In [107]:
tuple_res[0]

'reranked_own_rec'

In [108]:
print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.19446475195822263)
('own_rec', 0.1444117647058813)
