# Курсовой проект. 2-х уровневая рекомендательная система.

In [195]:
import warnings
warnings.filterwarnings('ignore')

In [196]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
# from src.utils import prefilter_items
# from src.recommenders import MainRecommender

In [197]:
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

# Важна схема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [198]:
def prefilter_items(data, take_n_popular=5000):

    "функция расчета popularity, так как будем использовать неоднократно"
    def popularity_calc(data):
        
        popularity = data.groupby('item_id')['user_id'].nunique().reset_index()
        popularity['user_id'] = popularity['user_id'] / data['user_id'].nunique()
        popularity.rename(columns={'user_id': 'share_unique_users'}, inplace=True)
        
        return popularity
    
    # считаем popularity
    popularity = popularity_calc(data)
    
    # Уберем самые популярные товары (их и так купят)
    top_popular = popularity[popularity['share_unique_users'] > 0.5].item_id.tolist()
    data = data[~data['item_id'].isin(top_popular)]
    
    # Уберем самые НЕ популярные товары (их и так НЕ купят)
    top_notpopular = popularity[popularity['share_unique_users'] < 0.005].item_id.tolist()
    data = data[~data['item_id'].isin(top_notpopular)]
        
    # считаем popularity, еще раз, после того как исключили популярные и непопулярные товары
    popularity = popularity_calc(data)
    
    # Возьмем топ по популярности
    top_n = popularity.sort_values('share_unique_users', ascending=False).head(take_n_popular).item_id.tolist()
 
    # Заведем фиктивный item_id (если юзер покупал товары из топ-5000, то он "купил" такой товар)
    data.loc[~data['item_id'].isin(top_n), 'item_id'] = 999999
    
    # если захотим просто исключить айтемы не из топ-N
    # data = data[data['item_id'].isin(top_n)]

    return data

In [199]:
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(data_train_lvl_1, take_n_popular=5000)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


In [200]:
import pandas as pd
import numpy as np

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import ItemItemRecommender
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

class MainRecommender:

    def __init__(self, data):

        self.user_item_matrix = self.prepare_matrix(data)  
        self.id_to_itemid, self.id_to_userid, self.itemid_to_id, self.userid_to_id = self.prepare_dicts(self.user_item_matrix)

        self.als_recommender = self.fit_als_recommender(self.user_item_matrix)
        self.own_recommender = self.fit_own_recommender(self.user_item_matrix)
        
        self.sparse_user_item = csr_matrix(self.user_item_matrix).tocsr()
               
        self.popularity = data[data['item_id'] != 999999].\
        groupby('item_id').count().reset_index().sort_values('user_id', ascending=False)
    
    @staticmethod
    def prepare_matrix(data):

        user_item_matrix = pd.pivot_table(data,
                                          index='user_id', columns='item_id',
                                          values='quantity',  # Можно пробовать другие варианты
                                          aggfunc='count',
                                          fill_value=0
                                          )

        user_item_matrix = user_item_matrix.astype(float)  # необходимый тип матрицы для implicit

        return user_item_matrix

    @staticmethod
    def prepare_dicts(user_item_matrix):
        """Подготавливает вспомогательные словари"""

        userids = user_item_matrix.index.values
        itemids = user_item_matrix.columns.values

        matrix_userids = np.arange(len(userids))
        matrix_itemids = np.arange(len(itemids))

        id_to_itemid = dict(zip(matrix_itemids, itemids))
        id_to_userid = dict(zip(matrix_userids, userids))

        itemid_to_id = dict(zip(itemids, matrix_itemids))
        userid_to_id = dict(zip(userids, matrix_userids))

        return id_to_itemid, id_to_userid, itemid_to_id, userid_to_id


    @staticmethod
    def fit_own_recommender(user_item_matrix):
        """Обучает модель, которая рекомендует товары, среди товаров, купленных юзером"""

        own_recommender = ItemItemRecommender(K=5, num_threads=4)
        own_recommender.fit(csr_matrix(user_item_matrix).tocsr())

        return own_recommender

    @staticmethod
    def fit_als_recommender(user_item_matrix, n_factors=20, regularization=0.001, iterations=15, num_threads=4):
        """Обучает ALS"""

        als_recommender = AlternatingLeastSquares(factors=n_factors,
                                        regularization=regularization,
                                        iterations=iterations,
                                        num_threads=num_threads)
        als_recommender.fit(csr_matrix(user_item_matrix).tocsr())

        return als_recommender
    
    
    def _get_recommendations(self, user, model, N=5):
        """Функция получения рекоммендаций"""

        # если нет пользователя с таким id, то возвращается топ N айтемов
        if user not in self.user_item_matrix.index:
            return self.popularity['item_id'][:N].tolist() 
        
        
        res = [self.id_to_itemid[rec] for rec in 
                    model.recommend(userid=self.userid_to_id[user], 
                                    user_items=self.sparse_user_item[self.userid_to_id[user]], # на вход user-item matrix
                                    N=N, 
                                    filter_already_liked_items=False, 
                                    filter_items=[self.itemid_to_id[999999]],
                                    recalculate_user=False)[0]]
        
        # если кол-во рекомендаций меньше заданного, то возвращается топ N айтемов
        if len(res) < N:
            res = self.popularity['item_id'][:N].tolist()
        
       
        return res
    
    
    
    """Рекомендации через стардартные библиотеки implicit""" 
    def get_own_recommendations(self, user, N=5):
        "item-item рекоммендор"
        
        return self._get_recommendations(user, model=self.own_recommender, N=N)
                         
                                         
    def get_als_recommendations(self, user, N=5):
        "als рекоммендор"
        
        return self._get_recommendations(user, model=self.als_recommender, N=N)

### Обучаем модели 1-ого уровня ItemItem и ALS

In [201]:
recommender = MainRecommender(data_train_lvl_1)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

In [202]:
recommender.get_own_recommendations(2120, N=5)

[6534178, 1106523, 1133018, 883404, 951590]

In [203]:
recommender.get_als_recommendations(2120, N=5)

[6534178, 1106523, 826249, 904360, 1058997]

Создадим датасет с фактическими покупками на валидации:

In [204]:
result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1.columns=['user_id', 'actual']
result_lvl_1.head()

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67..."
3,6,"[1024306, 1102949, 6548453, 835394, 940804, 96..."
4,7,"[836281, 843306, 845294, 914190, 920456, 93886..."


Отбираем по 50 кандидатов для каждого юзера обоими рекоммендаторами

In [205]:
%%time

result_lvl_1['itemitem'] = result_lvl_1['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=50))

CPU times: total: 188 ms
Wall time: 207 ms


In [206]:
%%time

result_lvl_1['als'] = result_lvl_1['user_id'].apply(lambda x: recommender.get_als_recommendations(x, N=50))

CPU times: total: 0 ns
Wall time: 369 ms


In [207]:
result_lvl_1.head()

Unnamed: 0,user_id,actual,itemitem,als
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[6534178, 1106523, 951590, 1133018, 883404, 84...","[6534178, 862349, 885290, 995965, 5978656, 108..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[6534178, 1106523, 951590, 1133018, 5569230, 8...","[1106523, 6534178, 5569230, 1133018, 1053690, ..."
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67...","[6534178, 1106523, 951590, 1133018, 883404, 96...","[6534178, 1106523, 826249, 5569230, 951590, 80..."
3,6,"[1024306, 1102949, 6548453, 835394, 940804, 96...","[6534178, 1106523, 951590, 1133018, 883404, 84...","[878996, 1024306, 834484, 854852, 860776, 1023..."
4,7,"[836281, 843306, 845294, 914190, 920456, 93886...","[6534178, 1106523, 951590, 1133018, 883404, 11...","[6534178, 849843, 883404, 826249, 840361, 1058..."


Проверяем средние метрики recall_at_k и precision_at_k по всем юзерам

In [208]:
result_lvl_1.apply(lambda x: recall_at_k(x.als, x.actual, 50), axis=1).mean()

0.10039803736732429

In [209]:
result_lvl_1.apply(lambda x: recall_at_k(x.itemitem, x.actual, 50), axis=1).mean()

0.1123711632472873

In [210]:
result_lvl_1.apply(lambda x: precision_at_k(x.als, x.actual, 5), axis=1).mean()

0.18449396471680593

In [211]:
result_lvl_1.apply(lambda x: precision_at_k(x.itemitem, x.actual, 5), axis=1).mean()

0.1820798514391829

Метрика precision@5 показывает одинаковые результаты на обоих рекоммендорах, но у itemitem максимальный recall выше, поэтому возьмем его в качестве основы для модели первого уровня.

***Отбираем по 50 кандидатов для каждого юзера из второго тренировочного сета (= первого валидационного), если такого юзера нет в рекоммендоре, то рекомендация юудет топ-N айтемов (так построили модель)***

In [212]:
result_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
result_lvl_2.columns = ['user_id']

In [213]:
result_lvl_2['candidates'] = result_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=50))

In [214]:
result_lvl_2.head()

Unnamed: 0,user_id,candidates
0,2070,"[6534178, 1106523, 951590, 1133018, 1085604, 1..."
1,2021,"[6534178, 1106523, 951590, 1133018, 883404, 84..."
2,1753,"[6534178, 1106523, 951590, 1133018, 883404, 10..."
3,2120,"[6534178, 1106523, 1133018, 883404, 951590, 90..."
4,1346,"[6534178, 1106523, 951590, 1133018, 866227, 86..."


In [215]:
s = result_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True).astype('int')
s.name = 'item_id'

In [216]:
s.head(3)

0    6534178
0    1106523
0     951590
Name: item_id, dtype: int32

In [217]:
result_lvl_2 = result_lvl_2.drop('candidates', axis=1).join(s)
result_lvl_2.head(3)

Unnamed: 0,user_id,item_id
0,2070,6534178
0,2070,1106523
0,2070,951590


In [218]:
data_train_lvl_2.tail(3)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2282322,462,41297773713,635,995242,1,1.0,304,-0.89,2040,91,0.0,0.0
2282323,462,41297773713,635,10180324,1,3.0,304,-0.29,2040,91,0.0,0.0
2282324,462,41297773713,635,12731714,1,4.08,304,0.0,2040,91,0.0,0.0


In [219]:
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  # тут только покупки 

targets_lvl_2 = result_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)

In [220]:
targets_lvl_2.head(3)

Unnamed: 0,user_id,item_id,target
0,2070,6534178,1.0
1,2070,6534178,1.0
2,2070,1106523,0.0


In [221]:
# средняя точность предсказаний айтемов
targets_lvl_2['target'].mean()

0.19045243419920868

### Обучаем модель 2-ого уровня

### Градиентный бустинг

In [222]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [223]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [224]:
targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')
targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left')

targets_lvl_2.tail(3)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
116257,1745,847789,0.0,69,GROCERY,Private,BAKED BREAD/BUNS/ROLLS,MAINSTREAM WHEAT/MULTIGRAIN BR,20 OZ,45-54,A,Under 15K,Unknown,Single Male,2,None/Unknown
116258,1745,870547,0.0,516,GROCERY,National,MARGARINES,MARGARINE: TUBS AND BOWLS,45 OZ,45-54,A,Under 15K,Unknown,Single Male,2,None/Unknown
116259,1745,1026346,0.0,69,GROCERY,Private,REFRGRATD JUICES/DRNKS,DAIRY CASE FRUIT DRINKS (NO JU,1 GA,45-54,A,Under 15K,Unknown,Single Male,2,None/Unknown


In [225]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2[['target']]

In [226]:
cat_feats = X_train.columns[2:].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

# категориальные признаки
cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

In [227]:
# проверка на пропуски
X_train.isna().sum()

user_id                     0
item_id                     0
manufacturer                0
department                  0
brand                       0
commodity_desc              0
sub_commodity_desc          0
curr_size_of_product        0
age_desc                71138
marital_status_code     71138
income_desc             71138
homeowner_desc          71138
hh_comp_desc            71138
household_size_desc     71138
kid_category_desc       71138
dtype: int64

In [228]:
# список столбцов с пропущенными значениями
nan_cols = X_train.isna().sum().loc[X_train.isna().sum() > 0].index.tolist()

# заменяем пропуски модами
for col in nan_cols:
    X_train[col].fillna(X_train[col].mode()[0], inplace=True)

In [229]:
# проверка на пропуски после обработки
X_train.isna().sum().sum()

0

In [230]:
# обучаем модель
lgb = LGBMClassifier(objective='binary', max_depth=7, categorical_column=cat_feats)
lgb.fit(X_train, y_train)

In [231]:
train_preds = lgb.predict_proba(X_train)
train_preds[:, 1]

array([0.58404382, 0.58404382, 0.32906101, ..., 0.33119462, 0.26186964,
       0.13112552])

Добавим предсказания модели

In [232]:
targets_lvl_2_predict = targets_lvl_2.copy()
targets_lvl_2_predict['proba_item_purchase'] = train_preds[:, 1]
targets_lvl_2_predict.tail(3)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,proba_item_purchase
116257,1745,847789,0.0,69,GROCERY,Private,BAKED BREAD/BUNS/ROLLS,MAINSTREAM WHEAT/MULTIGRAIN BR,20 OZ,45-54,A,Under 15K,Unknown,Single Male,2,None/Unknown,0.331195
116258,1745,870547,0.0,516,GROCERY,National,MARGARINES,MARGARINE: TUBS AND BOWLS,45 OZ,45-54,A,Under 15K,Unknown,Single Male,2,None/Unknown,0.26187
116259,1745,1026346,0.0,69,GROCERY,Private,REFRGRATD JUICES/DRNKS,DAIRY CASE FRUIT DRINKS (NO JU,1 GA,45-54,A,Under 15K,Unknown,Single Male,2,None/Unknown,0.131126


### Тест

In [233]:
data_val_lvl_2.head(3)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2277416,338,41260573635,636,840173,1,1.99,369,0.0,112,92,0.0,0.0
2277417,338,41260573635,636,1037348,1,0.89,369,-0.3,112,92,0.0,0.0
2277418,338,41260573635,636,5592737,2,1.58,369,-0.2,112,92,0.0,0.0


Создадим датасет с фактическими покупками на валидации-2:

In [234]:
result_test = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_test.columns=['user_id', 'actual']
result_test.head()

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412..."


In [235]:
%%time

result_test['candidates'] = result_test['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=50))

CPU times: total: 172 ms
Wall time: 197 ms


In [236]:
result_test.head(3)

Unnamed: 0,user_id,actual,candidates
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[6534178, 1106523, 951590, 1133018, 883404, 84..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[6534178, 1106523, 951590, 1133018, 1053690, 8..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[6534178, 1106523, 951590, 1133018, 883404, 84..."


***Precision@5***

In [237]:
result_test.apply(lambda x: precision_at_k(x.candidates, x.actual, 5), axis=1).mean()

0.15798237022526934

Получили результат 0.15798 на модели первого уровня

In [238]:
# скопируем для финального тестирования
final_test = result_test.copy()

In [239]:
s = result_test.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True).astype('int')
s.name = 'item_id'
result_test = result_test.drop(['actual','candidates'], axis=1).join(s)
result_test.tail(3)

Unnamed: 0,user_id,item_id
2041,2500,9526410
2041,2500,909714
2041,2500,1023720


In [240]:
targets_test = data_val_lvl_2[['user_id', 'item_id']].copy()
targets_test['target'] = 1  # тут только покупки 

targets_test = result_test.merge(targets_test, on=['user_id', 'item_id'], how='left')

targets_test['target'].fillna(0, inplace= True)

In [241]:
targets_test.head(5)

Unnamed: 0,user_id,item_id,target
0,1,6534178,0.0
1,1,1106523,0.0
2,1,951590,0.0
3,1,1133018,0.0
4,1,883404,0.0


In [242]:
targets_test['target'].mean()

0.1428183011438215

In [243]:
targets_test = targets_test.merge(item_features, on='item_id', how='left')
targets_test = targets_test.merge(user_features, on='user_id', how='left')

targets_test.tail(3)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
106657,2500,9526410,0.0,544,GROCERY,National,BAG SNACKS,POTATO CHIPS,11.5 OZ,,,,,,,
106658,2500,909714,0.0,2,PRODUCE,National,APPLES,APPLES GRANNY SMITH (BULK&BAG),,,,,,,,
106659,2500,1023720,0.0,2,PRODUCE,National,STONE FRUIT,PEACHES YELLOW FLESH,,,,,,,,


In [244]:
X_test = targets_test.drop('target', axis=1)
y_test = targets_test[['target']]

In [245]:
cat_feats = X_test.columns[2:].tolist()
X_test[cat_feats] = X_test[cat_feats].astype('category')

In [246]:
# список столбцов с пропущенными значениями
nan_cols = X_test.isna().sum().loc[X_test.isna().sum() > 0].index.tolist()

# заменяем пропуски модами
for col in nan_cols:
    X_test[col].fillna(X_test[col].mode()[0], inplace=True)

In [247]:
targets_test.tail(5)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
106655,2500,962229,0.0,1636,GROCERY,National,BAKED BREAD/BUNS/ROLLS,MAINSTREAM WHITE BREAD,24 OZ,,,,,,,
106656,2500,6533889,0.0,69,MISC SALES TRAN,Private,COUPON/MISC ITEMS,GASOLINE-REG UNLEADED,,,,,,,,
106657,2500,9526410,0.0,544,GROCERY,National,BAG SNACKS,POTATO CHIPS,11.5 OZ,,,,,,,
106658,2500,909714,0.0,2,PRODUCE,National,APPLES,APPLES GRANNY SMITH (BULK&BAG),,,,,,,,
106659,2500,1023720,0.0,2,PRODUCE,National,STONE FRUIT,PEACHES YELLOW FLESH,,,,,,,,


In [248]:
test_preds = lgb.predict_proba(X_test)
test_preds[:, 1]

array([0.57753276, 0.28412669, 0.12713546, ..., 0.00456661, 0.25578265,
       0.01372349])

In [249]:
targets_test['proba_item_purchase'] = test_preds[:, 1]

In [250]:
targets_test[targets_test['user_id']==1].sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist()

[940947, 6534178, 856942, 856942, 5577022]

In [251]:
final_test.head(3)

Unnamed: 0,user_id,actual,candidates
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[6534178, 1106523, 951590, 1133018, 883404, 84..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[6534178, 1106523, 951590, 1133018, 1053690, 8..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[6534178, 1106523, 951590, 1133018, 883404, 84..."


In [252]:
%%time

final_test['candidates_ranked'] = final_test['user_id'].apply(lambda x:\
        targets_test[targets_test['user_id']== x].sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist())

CPU times: total: 1.05 s
Wall time: 796 ms


In [253]:
final_test.head(3)

Unnamed: 0,user_id,actual,candidates,candidates_ranked
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[6534178, 1106523, 951590, 1133018, 883404, 84...","[940947, 6534178, 856942, 856942, 5577022]"
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[6534178, 1106523, 951590, 1133018, 1053690, 8...","[7167218, 9526563, 1022003, 6534178, 870547]"
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[6534178, 1106523, 951590, 1133018, 883404, 84...","[1041259, 6534178, 1037863, 994928, 1119051]"


***Precision@5***

In [254]:
final_test.apply(lambda x: precision_at_k(x.candidates_ranked, x.actual, 5), axis=1).mean()

0.2552399608227228

### Вывод

После применения модели второго уровня (LGBMClassifier) - ранжирования кандидатов, увеличили метрику Precision@5 с 0.15798 до 0.25523 и преодолели необходимый порог для сдачи проекта - метрика > 25%