+# Вебинар 6. Двухуровневые модели рекомендаций


Код для src, utils, metrics вы можете скачать из [этого](https://github.com/geangohn/recsys-tutorial) github репозитория

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

import warnings

warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv('data/retail_train.csv')
item_features = pd.read_csv('data/product.csv')
user_features = pd.read_csv('data/hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)


# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


In [4]:
recommender = MainRecommender(data_train_lvl_1)



HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))




In [5]:
recommender.get_als_recommendations(2375, N=5)

[899624, 1044078, 1106523, 883932, 871756]

In [6]:
recommender.get_own_recommendations(2375, N=5)

[948640, 918046, 847962, 907099, 873980]

In [7]:
recommender.get_similar_items_recommendation(2375, N=5)

[1046545, 1044078, 1042907, 907099, 1133312]

In [8]:
recommender.get_similar_users_recommendation(2375, N=5)

[963686, 12523928, 1012801, 5574377, 945909]

### Задание 1

A) Попробуйте различные варианты генерации кандидатов. Какие из них дают наибольший recall@k ?
- Пока пробуем отобрать 50 кандидатов (k=50)
- Качество измеряем на data_val_lvl_1: следующие 6 недель после трейна

Дают ли own recommendtions + top-popular лучший recall?  

B)* Как зависит recall@k от k? Постройте для одной схемы генерации кандидатов эту зависимость для k = {20, 50, 100, 200, 500}  
C)* Исходя из прошлого вопроса, как вы думаете, какое значение k является наиболее разумным?


In [9]:
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']

# Пока только warm start
train_users = data_train_lvl_1['user_id'].unique()

result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1 = result_lvl_1.query('user_id in @train_users')
result_lvl_1.columns=['user_id', 'actual']
result_lvl_1.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [10]:
# result_lvl_1 = result_lvl_1.head(255)

#### Зависимость от способа генерации кандидатов

In [11]:
# your_code
result_lvl_1['als_rec'] = [recommender.get_als_recommendations(i, N=50) for i in list(result_lvl_1['user_id'])]

result_lvl_1.apply(lambda x: recall_at_k(x['als_rec'], x['actual'], 50), axis=1).mean() * 100

4.889174566169511

In [12]:
result_lvl_1['own_rec'] = [recommender.get_own_recommendations(i, N=50) for i in list(result_lvl_1['user_id'])]

result_lvl_1.apply(lambda x: recall_at_k(x['own_rec'], x['actual'], 50), axis=1).mean() * 100

6.525657038145175

In [13]:
result_lvl_1['simr_it_rec'] = [recommender.get_similar_items_recommendation(i, N=50) for i in list(result_lvl_1['user_id'])]

result_lvl_1.apply(lambda x: recall_at_k(x['simr_it_rec'], x['actual'], 50), axis=1).mean() * 100

3.2977727875453597

Лучший результат показал get_own_recommendations

#### Зависимость от k

get_own_recommendations

In [14]:
for k in (20, 50, 100, 200, 500):
    result_lvl_1['own_rec'] = [recommender.get_own_recommendations(i, k) for i in list(result_lvl_1['user_id'])]
    print("For k =", str(k) + ":", result_lvl_1.apply(lambda x: recall_at_k(x['own_rec'], x['actual'], k), axis=1).mean() * 100)

For k = 20: 3.928427679372909
For k = 50: 6.525657038145175
For k = 100: 9.604492955885034
For k = 200: 13.537278412833242
For k = 500: 18.20532455550868


get_als_recommendations

In [15]:
for k in (20, 50, 100, 200, 500):
    result_lvl_1['als_rec'] = [recommender.get_als_recommendations(i, k) for i in list(result_lvl_1['user_id'])]
    print("For k =", k, ":", result_lvl_1.apply(lambda x: recall_at_k(x['als_rec'], x['actual'], k), axis=1).mean() * 100)

For k = 20 : 2.9194072113265794
For k = 50 : 4.889174566169511
For k = 100 : 6.981706270629508
For k = 200 : 9.98905252984215
For k = 500 : 14.739812503199001


get_similar_items_recommendation

In [16]:
for k in (20, 50, 100, 200, 500):
    result_lvl_1['simr_it_rec'] = [recommender.get_similar_items_recommendation(i, k) for i in list(result_lvl_1['user_id'])]
    print("For k =", k, ":", result_lvl_1.apply(lambda x: recall_at_k(x['simr_it_rec'], x['actual'], k), axis=1).mean() * 100)

For k = 20 : 1.6923420203268276
For k = 50 : 3.2977727875453597
For k = 100 : 5.255132296757268
For k = 200 : 8.575727411349504
For k = 500 : 13.645991282737373


#### Вывод:

С ростом k растёт и качество отбора кандидатов, поэтому следует взять максимально возможное k

### Задание 2.

Обучите модель 2-ого уровня, при этом:
    - Добавьте минимум по 2 фичи для юзера, товара и пары юзер-товар
    - Измерьте отдельно precision@5 модели 1-ого уровня и двухуровневой модели на data_val_lvl_2
    - Вырос ли precision@5 при использовании двухуровневой модели?

In [17]:
# your_code
data_train_lvl_2.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2104867,2070,40618492260,594,1019940,1,1.0,311,-0.29,40,86,0.0,0.0
2107468,2021,40618753059,594,840361,1,0.99,443,0.0,101,86,0.0,0.0
2107469,2021,40618753059,594,856060,1,1.77,443,-0.09,101,86,0.0,0.0
2107470,2021,40618753059,594,869344,1,1.67,443,-0.22,101,86,0.0,0.0
2107471,2021,40618753059,594,896862,2,5.0,443,-2.98,101,86,0.0,0.0


In [18]:
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']

# Пока только warm start
train_users = data_train_lvl_1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=5))

In [19]:
users_lvl_2.head(2)

Unnamed: 0,user_id,candidates
0,2070,"[1105426, 1097350, 879194, 948640, 928263]"
1,2021,"[950935, 1119454, 835578, 863762, 1019142]"


In [20]:
s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
users_lvl_2['flag'] = 1

users_lvl_2.head(4)

Unnamed: 0,user_id,item_id,flag
0,2070,1105426,1
0,2070,1097350,1
0,2070,879194,1
0,2070,948640,1


In [21]:
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  # тут только покупки 

targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)
targets_lvl_2.drop('flag', axis=1, inplace=True)

In [22]:
targets_lvl_2['target'].mean()

0.2867504835589942

In [23]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [24]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [25]:
targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')
targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left')

targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


### Генерация признаков

#### Для пользователя:

In [26]:
# генерация признаков:
# для пользователя:

# общая сумма покупок за период покупателя:
for i in user_features["user_id"]:
       user_features.loc[user_features['user_id'] == i, 'total_sum'] = data_train_lvl_2[data_train_lvl_2['user_id'] == i]['sales_value'].sum()

# средний чек:
for i in user_features["user_id"]:
    receipts = list()
    for j in data_train_lvl_2[data_train_lvl_2['user_id'] == i]['basket_id'].unique():
        receipts.append(data_train_lvl_2[data_train_lvl_2['user_id'] == i][data_train_lvl_2['basket_id'] == j]['sales_value'].sum())
    if len(receipts) != 0:
        user_features.loc[user_features['user_id'] == i, 'average_receipt'] = sum(receipts)/len(receipts)
    else: user_features.loc[user_features['user_id'] == i, 'average_receipt'] = 0

# самый популярный магазин:
for i in user_features["user_id"]:
    max_quantity = 0
    most_popular = 0
    for j in data_train_lvl_2[data_train_lvl_2['user_id'] == i]['store_id'].unique():
        new_m = len(data_train_lvl_2[(data_train_lvl_2['user_id'] == i)&(data_train_lvl_2['store_id'] == j)]['basket_id'].unique())
        if max_quantity < new_m:
            max_max_quantity = new_m
            most_popular = j
    user_features.loc[user_features['user_id'] == i, 'most_popular_store'] = j

In [27]:
user_features.head(3)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id,total_sum,average_receipt,most_popular_store
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1,341.78,48.825714,436.0
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7,187.65,37.53,359.0
2,25-34,U,25-34K,Unknown,2 Adults Kids,3,1,8,304.14,50.69,321.0


In [28]:
user_features

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id,total_sum,average_receipt,most_popular_store
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1,341.78,48.825714,436.0
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7,187.65,37.530000,359.0
2,25-34,U,25-34K,Unknown,2 Adults Kids,3,1,8,304.14,50.690000,321.0
3,25-34,U,75-99K,Homeowner,2 Adults Kids,4,2,13,1105.91,48.083043,323.0
4,45-54,B,50-74K,Homeowner,Single Female,1,None/Unknown,16,24.05,12.025000,309.0
...,...,...,...,...,...,...,...,...,...,...,...
796,35-44,U,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,2494,145.54,72.770000,333.0
797,45-54,A,75-99K,Homeowner,Unknown,3,1,2496,346.86,69.372000,370.0
798,45-54,U,35-49K,Unknown,Single Male,1,None/Unknown,2497,560.65,31.147222,343.0
799,25-34,U,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,2498,184.34,14.180000,31862.0


#### Для товаров:

In [29]:
# генерация признаков:
# для товаров:

# покупок в неделю:
for i in item_features["item_id"]:
    weeks = np.array(data_train_lvl_2[(data_train_lvl_2['item_id'] == i)]['week_no'].unique())
    if len(weeks) != 0:
        item_features.loc[item_features['item_id'] == i, 'average_weekly_quantity'] = data_train_lvl_2[(data_train_lvl_2['item_id'] == i)]['quantity'].sum()/len(weeks)
    else: item_features.loc[item_features['item_id'] == i, 'average_weekly_quantity'] = 0
        
# средняя цена в категории:
for i in item_features["commodity_desc"].unique():
    items_list = np.array(item_features[item_features["commodity_desc"] == i]["item_id"])
    item_features.loc[item_features["commodity_desc"] == i, 'average_cat_value'] = data_train_lvl_2[(data_train_lvl_2['item_id'].isin(items_list))]['sales_value'].sum()/len(items_list)
    
# соотношение цены товара к средней в категории:
for i in item_features['item_id']:
    item_features.loc[item_features['item_id'] == i, 'worth'] = data_train_lvl_2[data_train_lvl_2['item_id'] == i]['sales_value'].mean()/item_features[item_features['item_id'] == i]['average_cat_value']

In [30]:
item_features[item_features['average_weekly_quantity'] != 0]

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,average_weekly_quantity,average_cat_value,worth
60,28116,69,GROCERY,Private,DRY MIX DESSERTS,GELATIN,.3 OZ,1.0,2.248187,0.146785
61,28117,69,GROCERY,Private,DRY MIX DESSERTS,GELATIN,.3 OZ,1.0,2.248187,0.151233
62,28143,69,GROCERY,Private,DRY MIX DESSERTS,GELATIN,.3 OZ,1.0,2.248187,0.146785
66,28186,69,GROCERY,Private,BAKING MIXES,BROWNIE MIX,10.25 OZ,1.0,4.512189,0.175081
74,28304,69,GROCERY,Private,SOUP,CONDENSED SOUP,10.75 OZ,1.0,7.499703,0.070669
...,...,...,...,...,...,...,...,...,...,...
91681,17330511,5042,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,,2.0,1.432143,6.968579
91687,17382205,5456,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,,1.0,1.432143,5.579052
91690,17383227,6422,DRUG GM,National,BOOKSTORE,CHILDRENS LOW END,,1.0,1.180511,3.803436
91695,17827644,906,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,32 OZ,1.0,1.432143,1.745636


#### Для пары товар-пользователь:

In [31]:
user_items_features = data_train_lvl_2[['user_id', 'item_id', 'quantity']].copy()
user_items_features = user_items_features.merge(item_features[['item_id', 'commodity_desc', 'average_weekly_quantity']], on=['item_id'], how='left')

user_items_features

Unnamed: 0,user_id,item_id,quantity,commodity_desc,average_weekly_quantity
0,2070,1019940,1,SOFT DRINKS,2.000000
1,2021,840361,1,EGGS,78.666667
2,2021,856060,1,CANNED JUICES,10.833333
3,2021,869344,1,FRZN VEGETABLE/VEG DSH,2.500000
4,2021,896862,2,BACON,10.166667
...,...,...,...,...,...
169706,222,1120741,1,SOFT DRINKS,17.500000
169707,462,993339,1,YOGURT,2.333333
169708,462,995242,1,FLUID MILK PRODUCTS,197.500000
169709,462,10180324,1,LUNCHMEAT,2.333333


In [32]:
# генерация признаков:
# для пары товар-пользователь:

# коэффициент популярности среди покупателей для каждой категории
for j in list(user_items_features['commodity_desc'].unique()):
    popularity = len(user_items_features[user_items_features['commodity_desc'] == j]['user_id'].unique())
    if popularity < 100:
        user_items_features.loc[user_items_features['commodity_desc'] == j, 'is_popular'] = 0.8
    elif popularity < 600:
        user_items_features.loc[user_items_features['commodity_desc'] == j, 'is_popular'] = 0.6
    elif popularity < 1000:
        user_items_features.loc[user_items_features['commodity_desc'] == j, 'is_popular'] = 0.4
    else:
        user_items_features.loc[user_items_features['commodity_desc'] == j, 'is_popular'] = 0.2
        
# насколько популярен товар для пользователя, учитывая его популярность среди всех пользователей
for i in list(user_items_features['user_id'].unique()):
    for j in list(user_items_features['is_popular'].unique()):
        user_items_features.loc[(user_items_features['user_id'] == i) & (user_items_features['is_popular'] == j), 'is_favorite_cat'] = user_items_features[(user_items_features['user_id'] == i) & (user_items_features['is_popular'] == j)]['quantity'].sum() * user_items_features['is_popular']

In [33]:
user_items_features

Unnamed: 0,user_id,item_id,quantity,commodity_desc,average_weekly_quantity,is_popular,is_favorite_cat
0,2070,1019940,1,SOFT DRINKS,2.000000,0.2,17.6
1,2021,840361,1,EGGS,78.666667,0.2,3.4
2,2021,856060,1,CANNED JUICES,10.833333,0.4,6.0
3,2021,869344,1,FRZN VEGETABLE/VEG DSH,2.500000,0.4,6.0
4,2021,896862,2,BACON,10.166667,0.6,25.2
...,...,...,...,...,...,...,...
169706,222,1120741,1,SOFT DRINKS,17.500000,0.2,2.6
169707,462,993339,1,YOGURT,2.333333,0.6,11.4
169708,462,995242,1,FLUID MILK PRODUCTS,197.500000,0.2,2.6
169709,462,10180324,1,LUNCHMEAT,2.333333,0.4,7.2


In [34]:
user_items_features = user_items_features.drop(['quantity', 'commodity_desc', 'average_weekly_quantity', 'item_id'], axis=1)

In [35]:
user_items_features

Unnamed: 0,user_id,is_popular,is_favorite_cat
0,2070,0.2,17.6
1,2021,0.2,3.4
2,2021,0.4,6.0
3,2021,0.4,6.0
4,2021,0.6,25.2
...,...,...,...
169706,222,0.2,2.6
169707,462,0.6,11.4
169708,462,0.2,2.6
169709,462,0.4,7.2


In [36]:
# генерация признаков:
# для пары товар-пользователь:

# (Кол-во покупок юзером конкретной категории в неделю) - (Среднее кол-во покупок всеми юзерами конкретной категории в неделю)
# for i in item_features["commodity_desc"].unique():
#     items_list = np.array(item_features[item_features["commodity_desc"] == i]["item_id"])
#     weeks = len(data_train_lvl_2['week_no'].unique())
#     mean_value = data_train_lvl_2[(data_train_lvl_2["item_id"].isin(items_list))]['quantity'].mean()/weeks
#     if (weeks != 0):
#         for j in user_features['user_id'].unique():
#             user_pref = data_train_lvl_2[(data_train_lvl_2["user_id"] == j) & (data_train_lvl_2["item_id"].isin(items_list))]['quantity'].sum()/weeks
#             user_items_features.loc[(user_items_features['commodity_desc'] == i)&(user_items_features['user_id'] == j), 'is_popular_cat'] = user_pref - mean_value

# (Кол-во покупок юзером конкретной категории в неделю)/(Среднее кол-во покупок всеми юзерами конкретной категории в неделю)
# for i in item_features["commodity_desc"].unique():
#     items_list = np.array(item_features[item_features["commodity_desc"] == i]["item_id"])
#     weeks = len(data_train_lvl_2['week_no'].unique())
#     mean_value = data_train_lvl_2[(data_train_lvl_2["item_id"].isin(items_list))]['quantity'].mean()/weeks
#     if (weeks != 0):
#         for j in user_features['user_id'].unique():
#             user_pref = data_train_lvl_2[(data_train_lvl_2["user_id"] == j) & (data_train_lvl_2["item_id"].isin(items_list))]['quantity'].sum()/weeks
#             user_items_features.loc[(user_items_features['commodity_desc'] == i)&(user_items_features['user_id'] == j), 'is_popular_cat'] = user_pref/mean_value
    

### Результат без фичей:

In [37]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2[['target']]

In [38]:
cat_feats = X_train.columns[2:].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

In [39]:
X_train.head()

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,1105426,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1097350,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
2,2070,879194,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,14 CT,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
3,2070,948640,1213,DRUG GM,National,ORAL HYGIENE PRODUCTS,WHITENING SYSTEMS,3 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
4,2070,928263,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,13 CT,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


In [40]:
lgb = LGBMClassifier(objective='binary', max_depth=7, categorical_column=cat_feats)
lgb.fit(X_train, y_train)

train_preds = lgb.predict(X_train)

Берем топ-k предсказаний, ранжированных по вероятности, для каждого юзера

In [41]:
from sklearn.metrics import classification_report

In [42]:
precision_at_k(train_preds, y_train, k=5)

1770.0

In [43]:
print(classification_report(y_train, train_preds))

              precision    recall  f1-score   support

         0.0       0.88      0.97      0.92      8850
         1.0       0.91      0.66      0.77      3558

    accuracy                           0.88     12408
   macro avg       0.89      0.82      0.84     12408
weighted avg       0.89      0.88      0.88     12408



### Результат с фичами:

In [69]:
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  # тут только покупки 

targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)
targets_lvl_2.drop('flag', axis=1, inplace=True)

In [70]:
targets_lvl_2['target'].mean()

0.2867504835589942

In [71]:
targets_lvl_2

Unnamed: 0,user_id,item_id,target
0,2070,1105426,0.0
1,2070,1097350,0.0
2,2070,879194,0.0
3,2070,948640,0.0
4,2070,928263,0.0
...,...,...,...
12403,1745,902377,0.0
12404,1745,988697,1.0
12405,1745,1016785,0.0
12406,1745,1107173,0.0


In [72]:
targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')

In [73]:
targets_lvl_2

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,average_weekly_quantity,average_cat_value,worth
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,1.666667,4.303889,1.545114
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,1.000000,5.387610,2.039865
2,2070,879194,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,14 CT,0.000000,14.148287,
3,2070,948640,0.0,1213,DRUG GM,National,ORAL HYGIENE PRODUCTS,WHITENING SYSTEMS,3 OZ,0.000000,2.655038,
4,2070,928263,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,13 CT,2.200000,14.148287,0.621206
...,...,...,...,...,...,...,...,...,...,...,...,...
12403,1745,902377,0.0,69,MEAT-PCKGD,Private,BREAKFAST SAUSAGE/SANDWICHES,OTHER,2 LB,2.500000,12.278244,0.304400
12404,1745,988697,1.0,1089,MEAT-PCKGD,National,DINNER SAUSAGE,SMOKED/COOKED - BEEF,14 OZ,2.000000,6.519204,0.560651
12405,1745,1016785,0.0,1046,GROCERY,National,COFFEE,GROUND DECAFFINATED,11.5 OZ,1.500000,8.064967,0.742099
12406,1745,1107173,0.0,1075,GROCERY,National,CRACKERS/MISC BKD FD,SNACK CRACKERS,9.5 OZ,1.000000,5.535244,0.648571


In [74]:
targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left')

targets_lvl_2

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,average_weekly_quantity,...,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,total_sum,average_receipt,most_popular_store
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,1.666667,...,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,617.29,14.355581,327.0
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,1.000000,...,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,617.29,14.355581,327.0
2,2070,879194,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,14 CT,0.000000,...,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,617.29,14.355581,327.0
3,2070,948640,0.0,1213,DRUG GM,National,ORAL HYGIENE PRODUCTS,WHITENING SYSTEMS,3 OZ,0.000000,...,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,617.29,14.355581,327.0
4,2070,928263,0.0,69,DRUG GM,Private,DIAPERS & DISPOSABLES,BABY DIAPERS,13 CT,2.200000,...,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,617.29,14.355581,327.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12403,1745,902377,0.0,69,MEAT-PCKGD,Private,BREAKFAST SAUSAGE/SANDWICHES,OTHER,2 LB,2.500000,...,45-54,A,Under 15K,Unknown,Single Male,2,None/Unknown,13.97,13.970000,292.0
12404,1745,988697,1.0,1089,MEAT-PCKGD,National,DINNER SAUSAGE,SMOKED/COOKED - BEEF,14 OZ,2.000000,...,45-54,A,Under 15K,Unknown,Single Male,2,None/Unknown,13.97,13.970000,292.0
12405,1745,1016785,0.0,1046,GROCERY,National,COFFEE,GROUND DECAFFINATED,11.5 OZ,1.500000,...,45-54,A,Under 15K,Unknown,Single Male,2,None/Unknown,13.97,13.970000,292.0
12406,1745,1107173,0.0,1075,GROCERY,National,CRACKERS/MISC BKD FD,SNACK CRACKERS,9.5 OZ,1.000000,...,45-54,A,Under 15K,Unknown,Single Male,2,None/Unknown,13.97,13.970000,292.0


In [75]:
targets_lvl_2.tail()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,average_weekly_quantity,...,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,total_sum,average_receipt,most_popular_store
12403,1745,902377,0.0,69,MEAT-PCKGD,Private,BREAKFAST SAUSAGE/SANDWICHES,OTHER,2 LB,2.5,...,45-54,A,Under 15K,Unknown,Single Male,2,None/Unknown,13.97,13.97,292.0
12404,1745,988697,1.0,1089,MEAT-PCKGD,National,DINNER SAUSAGE,SMOKED/COOKED - BEEF,14 OZ,2.0,...,45-54,A,Under 15K,Unknown,Single Male,2,None/Unknown,13.97,13.97,292.0
12405,1745,1016785,0.0,1046,GROCERY,National,COFFEE,GROUND DECAFFINATED,11.5 OZ,1.5,...,45-54,A,Under 15K,Unknown,Single Male,2,None/Unknown,13.97,13.97,292.0
12406,1745,1107173,0.0,1075,GROCERY,National,CRACKERS/MISC BKD FD,SNACK CRACKERS,9.5 OZ,1.0,...,45-54,A,Under 15K,Unknown,Single Male,2,None/Unknown,13.97,13.97,292.0
12407,1745,1109778,0.0,3751,DELI,National,CHEESES,CHEESE: PROCESSED BULK,,1.5,...,45-54,A,Under 15K,Unknown,Single Male,2,None/Unknown,13.97,13.97,292.0


In [76]:
targets_lvl_2 = targets_lvl_2.merge(user_items_features, on='user_id', how='left')

In [78]:
targets_lvl_2

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,average_weekly_quantity,...,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,total_sum,average_receipt,most_popular_store,is_popular,is_favorite_cat
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,1.666667,...,50-74K,Unknown,Unknown,1,None/Unknown,617.29,14.355581,327.0,0.2,17.6
1,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,1.666667,...,50-74K,Unknown,Unknown,1,None/Unknown,617.29,14.355581,327.0,0.2,17.6
2,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,1.666667,...,50-74K,Unknown,Unknown,1,None/Unknown,617.29,14.355581,327.0,0.2,17.6
3,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,1.666667,...,50-74K,Unknown,Unknown,1,None/Unknown,617.29,14.355581,327.0,0.4,6944.0
4,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,1.666667,...,50-74K,Unknown,Unknown,1,None/Unknown,617.29,14.355581,327.0,0.2,17.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1131966,1745,1107173,0.0,1075,GROCERY,National,CRACKERS/MISC BKD FD,SNACK CRACKERS,9.5 OZ,1.000000,...,Under 15K,Unknown,Single Male,2,None/Unknown,13.97,13.970000,292.0,0.2,0.6
1131967,1745,1107173,0.0,1075,GROCERY,National,CRACKERS/MISC BKD FD,SNACK CRACKERS,9.5 OZ,1.000000,...,Under 15K,Unknown,Single Male,2,None/Unknown,13.97,13.970000,292.0,0.2,0.6
1131968,1745,1109778,0.0,3751,DELI,National,CHEESES,CHEESE: PROCESSED BULK,,1.500000,...,Under 15K,Unknown,Single Male,2,None/Unknown,13.97,13.970000,292.0,0.6,1.2
1131969,1745,1109778,0.0,3751,DELI,National,CHEESES,CHEESE: PROCESSED BULK,,1.500000,...,Under 15K,Unknown,Single Male,2,None/Unknown,13.97,13.970000,292.0,0.2,0.6


In [79]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2[['target']]

In [82]:
X_train

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,average_weekly_quantity,average_cat_value,...,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,total_sum,average_receipt,most_popular_store,is_popular,is_favorite_cat
0,2070,1105426,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,1.666667,4.303889,...,50-74K,Unknown,Unknown,1,None/Unknown,617.29,14.355581,327.0,0.2,17.6
1,2070,1105426,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,1.666667,4.303889,...,50-74K,Unknown,Unknown,1,None/Unknown,617.29,14.355581,327.0,0.2,17.6
2,2070,1105426,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,1.666667,4.303889,...,50-74K,Unknown,Unknown,1,None/Unknown,617.29,14.355581,327.0,0.2,17.6
3,2070,1105426,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,1.666667,4.303889,...,50-74K,Unknown,Unknown,1,None/Unknown,617.29,14.355581,327.0,0.4,6944.0
4,2070,1105426,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,1.666667,4.303889,...,50-74K,Unknown,Unknown,1,None/Unknown,617.29,14.355581,327.0,0.2,17.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1131966,1745,1107173,1075,GROCERY,National,CRACKERS/MISC BKD FD,SNACK CRACKERS,9.5 OZ,1.000000,5.535244,...,Under 15K,Unknown,Single Male,2,None/Unknown,13.97,13.970000,292.0,0.2,0.6
1131967,1745,1107173,1075,GROCERY,National,CRACKERS/MISC BKD FD,SNACK CRACKERS,9.5 OZ,1.000000,5.535244,...,Under 15K,Unknown,Single Male,2,None/Unknown,13.97,13.970000,292.0,0.2,0.6
1131968,1745,1109778,3751,DELI,National,CHEESES,CHEESE: PROCESSED BULK,,1.500000,4.889273,...,Under 15K,Unknown,Single Male,2,None/Unknown,13.97,13.970000,292.0,0.6,1.2
1131969,1745,1109778,3751,DELI,National,CHEESES,CHEESE: PROCESSED BULK,,1.500000,4.889273,...,Under 15K,Unknown,Single Male,2,None/Unknown,13.97,13.970000,292.0,0.2,0.6


In [83]:
cat_feats = X_train.columns[2:].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'average_weekly_quantity',
 'average_cat_value',
 'worth',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc',
 'total_sum',
 'average_receipt',
 'most_popular_store',
 'is_popular',
 'is_favorite_cat']

In [84]:
X_train

Unnamed: 0,user_id,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,average_weekly_quantity,average_cat_value,...,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,total_sum,average_receipt,most_popular_store,is_popular,is_favorite_cat
0,2070,1105426,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,1.666667,4.303889,...,50-74K,Unknown,Unknown,1,None/Unknown,617.29,14.355581,327.0,0.2,17.6
1,2070,1105426,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,1.666667,4.303889,...,50-74K,Unknown,Unknown,1,None/Unknown,617.29,14.355581,327.0,0.2,17.6
2,2070,1105426,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,1.666667,4.303889,...,50-74K,Unknown,Unknown,1,None/Unknown,617.29,14.355581,327.0,0.2,17.6
3,2070,1105426,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,1.666667,4.303889,...,50-74K,Unknown,Unknown,1,None/Unknown,617.29,14.355581,327.0,0.4,6944.0
4,2070,1105426,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,1.666667,4.303889,...,50-74K,Unknown,Unknown,1,None/Unknown,617.29,14.355581,327.0,0.2,17.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1131966,1745,1107173,1075,GROCERY,National,CRACKERS/MISC BKD FD,SNACK CRACKERS,9.5 OZ,1.000000,5.535244,...,Under 15K,Unknown,Single Male,2,None/Unknown,13.97,13.970000,292.0,0.2,0.6
1131967,1745,1107173,1075,GROCERY,National,CRACKERS/MISC BKD FD,SNACK CRACKERS,9.5 OZ,1.000000,5.535244,...,Under 15K,Unknown,Single Male,2,None/Unknown,13.97,13.970000,292.0,0.2,0.6
1131968,1745,1109778,3751,DELI,National,CHEESES,CHEESE: PROCESSED BULK,,1.500000,4.889273,...,Under 15K,Unknown,Single Male,2,None/Unknown,13.97,13.970000,292.0,0.6,1.2
1131969,1745,1109778,3751,DELI,National,CHEESES,CHEESE: PROCESSED BULK,,1.500000,4.889273,...,Under 15K,Unknown,Single Male,2,None/Unknown,13.97,13.970000,292.0,0.2,0.6


In [85]:
lgb = LGBMClassifier(objective='binary', max_depth=7, categorical_column=cat_feats)
lgb.fit(X_train, y_train)

train_preds = lgb.predict(X_train)

Берем топ-k предсказаний, ранжированных по вероятности, для каждого юзера

In [86]:
from sklearn.metrics import classification_report

In [87]:
precision_at_k(train_preds, y_train, k=5)

120339.8

In [88]:
print(classification_report(y_train, train_preds))

              precision    recall  f1-score   support

         0.0       0.95      0.99      0.97    601699
         1.0       0.98      0.94      0.96    530272

    accuracy                           0.97   1131971
   macro avg       0.97      0.96      0.96   1131971
weighted avg       0.97      0.97      0.97   1131971



### Финальный проект

Мы уже прошли всю необходимуб теорию для финального проекта. Проект осуществляется на данных из вебинара (данные считаны в начале ДЗ).
Рекомендуем вам **начать делать проект сразу после этого домашнего задания**
- Целевая метрика - precision@5. Порог для уcпешной сдачи проекта precision@5 > 25%
- Будет public тестовый датасет, на котором вы сможете измерять метрику
- Также будет private тестовый датасет для измерения финального качества
- НЕ обязательно, но крайне желательно использовать 2-ух уровневые рекоммендательные системы в проекте
- Вы сдаете код проекта в виде github репозитория и csv файл с рекомендациями 