# Рекомендательные системы. Финальный проект

### Загрузите задание на проверку до 6 июля, 20:00 +03:00 UTC

In [66]:
import pandas as pd
import numpy as np

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

from gensim.models import Word2Vec

# Модель второго уровня
from catboost import CatBoostClassifier

import os, sys
sys.path.insert(1, os.getcwd() + '/src/')


from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

import warnings
warnings.filterwarnings('ignore')

In [71]:
print(sys.version)

3.10.9 | packaged by Anaconda, Inc. | (main, Mar  1 2023, 18:18:15) [MSC v.1916 64 bit (AMD64)]


In [72]:
print("Pandas version:", pd.__version__)
print("NumPy version:", np.__version__)
print("Scipy version:", scipy.__version__)
print("Sklearn version:", sklearn.__version__)

Pandas version: 2.0.2
NumPy version: 1.25.0
Scipy version: 1.10.1
Sklearn version: 1.2.2


In [2]:
data = pd.read_csv('C:\\Users\\SAMOL\\000 Рекомендательные системы\\retail_train.csv')
item_features = pd.read_csv('C:\\Users\\SAMOL\\000 Рекомендательные системы\\product.csv')
user_features = pd.read_csv('C:\\Users\\SAMOL\\000 Рекомендательные системы\\hh_demographic.csv')

In [3]:
N_PREDICT = 100

In [4]:
# Добавим необходимые функции

def print_stats_data(df_data, name_df):
    print(name_df)
    print(f"Shape: {df_data.shape} Users: {df_data[USER_COL].nunique()} Items: {df_data[ITEM_COL].nunique()}")
    
def make_recommendations(df_result, recommend_model, N_PREDICT=N_PREDICT, USER_COL='user_id'):
    return df_result[USER_COL].apply(lambda x: recommend_model(x, N=N_PREDICT))

def calc_recall(df_data, top_k, ACTUAL_COL='actual'):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: recall_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()
        
def calc_precision(df_data, top_k, ACTUAL_COL='actual'):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()
        
def rerank(user_id, df, USER_COL='user_id', proba_col_name='proba_item_purchase', N=5):
    return df[df[USER_COL]==user_id].sort_values(proba_col_name, ascending=False).head(N).item_id.tolist()

def get_scores(df_result, recommend_model, N_PREDICT=N_PREDICT, USER_COL='user_id'):
    return df_result[USER_COL].apply(lambda x: recommend_model(x, N=N_PREDICT))

In [5]:
#3. Подготовка данных
#Проведем переименования определенных признаков и приведем названия к единообразному формату.

ITEM_COL = 'item_id'
USER_COL = 'user_id'

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': ITEM_COL}, inplace=True)
user_features.rename(columns={'household_key': USER_COL }, inplace=True)

In [6]:
#Разделим выборку на обучающий, валидационный и тестовый датасеты.

VAL_MATCHER_WEEKS = 5
VAL_RANKER_WEEKS = 3

# берем данные для тренировки matching модели
data_train_matcher = data[data['week_no'] < data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)]

# берем данные для валидации matching модели
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
                      (data['week_no'] < data['week_no'].max() - (VAL_RANKER_WEEKS))]

# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться

# берем данные для теста ranking, matching модели
data_val_ranker = data[data['week_no'] >= data['week_no'].max() - VAL_RANKER_WEEKS]

In [7]:
print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (2136728, 12) Users: 2498 Items: 84180
val_matcher
Shape: (141762, 12) Users: 2097 Items: 25770
train_ranker
Shape: (141762, 12) Users: 2097 Items: 25770
val_ranker
Shape: (118314, 12) Users: 2042 Items: 24329


In [8]:
#Проведем префильтрацию данных.
n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, take_n_popular=10000)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 84180 to 10001


In [9]:
#Оставим только пользователей, которые встречаются в тренировочном датасете, чтобы избежать проблемы холодного старта.

# ищем общих пользователей
common_users = data_train_matcher.user_id.values

data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]

print_stats_data(data_train_matcher,'train_matcher')
print_stats_data(data_val_matcher,'val_matcher')
print_stats_data(data_train_ranker,'train_ranker')
print_stats_data(data_val_ranker,'val_ranker')

train_matcher
Shape: (872777, 13) Users: 2496 Items: 10001
val_matcher
Shape: (141737, 12) Users: 2095 Items: 25768
train_ranker
Shape: (141737, 12) Users: 2095 Items: 25768
val_ranker
Shape: (118282, 12) Users: 2040 Items: 24325


In [10]:
#4. Построение модели первого уровня
#Создадим экземляр класса MainRecommender.
recommender = MainRecommender(data_train_matcher)

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/10001 [00:00<?, ?it/s]

  0%|          | 0/10001 [00:00<?, ?it/s]

  0%|          | 0/10001 [00:00<?, ?it/s]

  0%|          | 0/10001 [00:00<?, ?it/s]

In [11]:
ACTUAL_COL = 'actual'
result_eval_matcher = data_val_matcher.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_matcher.columns=[USER_COL, ACTUAL_COL]
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual
0,1,"[1005186, 907466, 909497, 940947, 963542, 1067..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [14]:
result_eval_matcher['own_rec'] = make_recommendations(result_eval_matcher, 
                                                      recommender.get_own_recommendations, 
                                                      N_PREDICT=N_PREDICT)

result_eval_matcher['als_rec'] = make_recommendations(result_eval_matcher, 
                                                      recommender.get_als_recommendations, 
                                                      N_PREDICT=N_PREDICT)

result_eval_matcher['bm25_rec'] = make_recommendations(result_eval_matcher, 
                                                      recommender.get_bm25_recommendations, 
                                                      N_PREDICT=N_PREDICT)

result_eval_matcher['tfidf_rec'] = make_recommendations(result_eval_matcher, 
                                                      recommender.get_tfidf_recommendations, 
                                                      N_PREDICT=N_PREDICT)

result_eval_matcher['cosine_rec'] = make_recommendations(result_eval_matcher, 
                                                      recommender.get_cosine_recommendations, 
                                                      N_PREDICT=N_PREDICT)
result_eval_matcher.head(2)

Unnamed: 0,user_id,actual,own_rec,als_rec,bm25_rec,tfidf_rec,cosine_rec
0,1,"[1005186, 907466, 909497, 940947, 963542, 1067...","[865334, 1029743, 1106523, 5569230, 916122, 84...","[872137, 1029743, 5569374, 871570, 859191, 878...","[865334, 1017711, 12384779, 1000328, 1056413, ...","[865334, 1017711, 983646, 847292, 1001369, 998...","[865334, 1017711, 1029743, 1106523, 5569230, 9..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[865334, 1029743, 1106523, 5569230, 916122, 84...","[930118, 12302069, 5569845, 5569230, 1068719, ...","[865334, 1017711, 12384779, 1000328, 1056413, ...","[865334, 1017711, 983646, 847292, 1001369, 998...","[865334, 1017711, 1029743, 1106523, 5569230, 9..."


In [15]:
TOPK_RECALL = 100
sorted(calc_recall(result_eval_matcher, TOPK_RECALL), key=lambda x: x[1],reverse=True)

[('als_rec', 0.12413498441309054),
 ('own_rec', 0.06061789561946516),
 ('cosine_rec', 0.06039421560342006),
 ('bm25_rec', 0.05545359377601677),
 ('tfidf_rec', 0.055362395918756566)]

### Генерация признаков для модели второго уровня
Cформируем датасет для построения модели ранжирования, сгенерируем новые признаки и построим модель, которая будет работать с отобранными товарами для построения окончательных рекомендаций.

In [25]:
# Пользователи из трейна для ранжирования
# Создали DataFrame с уникальными идентификаторами пользователей.
df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique(), columns=[USER_COL])
# Сгенерировали рекомендации, используя функцию рекомендации_модели.
df_match_candidates['candidates'] = df_match_candidates[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))
# Вычислим баллы для каждого пользователя  в кадре данных df_match_candidates на основе собственного механизма оценки рекомендаций
df_match_candidates['candidates_scores'] = get_scores(df_match_candidates,recommender.get_own_scores,N_PREDICT=N_PREDICT)
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().\
            reset_index(level=1, drop=True)
df_scores = df_match_candidates.apply(lambda x: pd.Series(x['candidates_scores']), axis=1).stack().\
            reset_index(level=1, drop=True)
# предоставляем описательную метку для столбца,упрощая обращение к DataFrame и работу с ним в последующем коде.
df_items.name = 'item_id'
# удалим столбцы «candidates» и «candidates_scores» из df_match_candidates и заменим их столбцом «item_id» из df_items.
# заменим исходные столбцы новым столбцом, содержащим идентификаторы элементов.
df_match_candidates = df_match_candidates.drop(['candidates', 'candidates_scores'], axis=1).join(df_items)
# добавим столбец item_score в df_match_candidates и заполним его оценками, полученными из серии df_scores.
df_match_candidates['item_score'] = df_scores
df_match_candidates.head()

Unnamed: 0,user_id,item_id,item_score
0,1827,865334,11168.2
0,1827,1029743,11168.2
0,1827,1106523,-1.797693e+308
0,1827,5569230,-1.797693e+308
0,1827,916122,-1.797693e+308


In [26]:
df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL]].copy()
df_ranker_train['target'] = 1  # тут только покупки 

df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')

# чистим дубликаты
df_ranker_train = df_ranker_train.drop_duplicates(subset=[USER_COL, ITEM_COL])

df_ranker_train['target'].fillna(0, inplace= True)

df_ranker_train.head()

Unnamed: 0,user_id,item_id,item_score,target
0,1827,865334,11168.2,0.0
1,1827,1029743,11168.2,1.0
3,1827,1106523,-1.797693e+308,0.0
4,1827,5569230,-1.797693e+308,0.0
5,1827,916122,-1.797693e+308,0.0


In [27]:
df_ranker_train.target.value_counts()

target
0.0    202695
1.0      6805
Name: count, dtype: int64

In [28]:
df_ranker_train = df_ranker_train.merge(item_features, on='item_id', how='left')
df_ranker_train = df_ranker_train.merge(user_features, on='user_id', how='left')

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,1827,865334,11168.201881,0.0,870,DRUG GM,National,LAXATIVES,LAXATIVES,,,,,,,,
1,1827,1029743,11168.201881,1.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,,,,,,,


### Сгенерируем новые типовые признаки и добавим их к датасету для обучения.

In [29]:
# Добавим параметр категории к исходному обучающему датасету для удобства создания новых фичей
data_department = data_train_ranker.merge(item_features[['item_id', 'department']], on='item_id', how='inner')
data_department.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,department
0,1827,40702967646,601,891141,2,2.73,33923,0.0,7,87,0.0,0.0,PRODUCE
1,496,40739402373,603,891141,1,1.83,445,0.0,2226,87,0.0,0.0,PRODUCE


In [30]:
# Средная цена купленных товаров пользователем
users_sales = data_train_ranker.groupby(USER_COL)[['sales_value', 'quantity']].sum().reset_index()
users_sales['avg_price'] = users_sales['sales_value'] / users_sales['quantity']
df_ranker_train = df_ranker_train.merge(users_sales[['user_id', 'avg_price']], on='user_id', how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,avg_price
0,1827,865334,11168.201881,0.0,870,DRUG GM,National,LAXATIVES,LAXATIVES,,,,,,,,,2.208947
1,1827,1029743,11168.201881,1.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,,,,,,,,2.208947


In [31]:
# Количество покупок в каждой категории и средная сумма покупки в каждой категории для пользователя
users_sales_department = data_department.groupby([USER_COL, 'department'])\
                        [['sales_value', 'quantity']].sum().reset_index()
users_sales_department.rename(columns={'quantity': 'n_sold_category'}, inplace=True)
users_sales_department['avg_transaction_category'] = users_sales_department['sales_value']\
                                                    /users_sales_department['n_sold_category']
users_sales_department.drop(columns=['sales_value'], inplace=True)

df_ranker_train = df_ranker_train.merge(
    users_sales_department, on=[USER_COL, 'department'], how='left')
df_ranker_train['Missing n_sold_category'] = 0
df_ranker_train.loc[df_ranker_train['n_sold_category'].isna(), 'Missing n_sold_category'] = 1
df_ranker_train['n_sold_category'].fillna(0, inplace=True)

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,avg_price,n_sold_category,avg_transaction_category,Missing n_sold_category
0,1827,865334,11168.201881,0.0,870,DRUG GM,National,LAXATIVES,LAXATIVES,,...,,,,,,,2.208947,2.0,1.0,0
1,1827,1029743,11168.201881,1.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,...,,,,,,,2.208947,25.0,2.2408,0


In [32]:
# Средняя сумма покупки в категории
department_sales = data_department.groupby('department')['sales_value'].mean().reset_index()
department_sales.rename(columns={'sales_value': 'mean_sales_value_category'}, inplace=True)
department_sales.tail(2)

n_weeks = data_department['week_no'].max() - data_department['week_no'].min() + 1

# Количество покупок юзером конкретной категории в неделю
users_department = data_department.groupby([USER_COL, 'department'])['quantity'].sum().reset_index()
users_department['quantity'] /= n_weeks
users_department.rename(columns={'quantity': 'n_sold_category_user_week'}, inplace=True)

df_ranker_train = df_ranker_train.merge(department_sales, on='department', how='left')
df_ranker_train = df_ranker_train.merge(users_department, on=[USER_COL, 'department'], how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,avg_price,n_sold_category,avg_transaction_category,Missing n_sold_category,mean_sales_value_category,n_sold_category_user_week
0,1827,865334,11168.201881,0.0,870,DRUG GM,National,LAXATIVES,LAXATIVES,,...,,,,,2.208947,2.0,1.0,0,3.994219,0.4
1,1827,1029743,11168.201881,1.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,...,,,,,2.208947,25.0,2.2408,0,2.541433,5.0


In [33]:
# Цена
items_sales = data_department.groupby(ITEM_COL)[['sales_value', 'quantity']].sum().reset_index()
items_sales['price'] = items_sales['sales_value'] / items_sales['quantity']
items_sales['price'].fillna(0, inplace=True)

# Количество покупок товара в неделю
items_sales['quantity_per_week'] = items_sales['quantity'] / n_weeks


df_ranker_train = df_ranker_train.merge(items_sales[[ITEM_COL,'price', 'quantity_per_week']],
                                        on=ITEM_COL, how='left')

df_ranker_train['Missing price'] = 0
df_ranker_train.loc[df_ranker_train['price'].isna(), 'Missing price'] = 1
df_ranker_train['price'].fillna(0, inplace=True)

df_ranker_train['Missing quantity per week'] = 0
df_ranker_train.loc[df_ranker_train['quantity_per_week'].isna(), 'Missing quantity per week'] = 1
df_ranker_train['quantity_per_week'].fillna(0, inplace=True)

df_ranker_train.head()

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,avg_price,n_sold_category,avg_transaction_category,Missing n_sold_category,mean_sales_value_category,n_sold_category_user_week,price,quantity_per_week,Missing price,Missing quantity per week
0,1827,865334,11168.2,0.0,870,DRUG GM,National,LAXATIVES,LAXATIVES,,...,2.208947,2.0,1.0,0,3.994219,0.4,0.0,0.0,1,1
1,1827,1029743,11168.2,1.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,...,2.208947,25.0,2.2408,0,2.541433,5.0,2.424173,198.4,0,0
2,1827,1106523,-1.797693e+308,0.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,...,2.208947,25.0,2.2408,0,2.541433,5.0,2.454024,132.2,0,0
3,1827,5569230,-1.797693e+308,0.0,1208,GROCERY,National,SOFT DRINKS,SOFT DRINKS 12/18&15PK CAN CAR,12 OZ,...,2.208947,25.0,2.2408,0,2.541433,5.0,4.180207,38.6,0,0
4,1827,916122,-1.797693e+308,0.0,4314,MEAT,National,CHICKEN,CHICKEN BREAST BONELESS,,...,2.208947,0.0,,1,6.552912,,3.782692,88.4,0,0


In [34]:
# Количество уникальных магазинов, в которых продавался товар
items_stores = data_department.groupby(ITEM_COL)['store_id'].nunique().reset_index()
items_stores.rename(columns={'store_id': 'n_unique_stores'}, inplace=True)
df_ranker_train = df_ranker_train.merge(items_stores, on=ITEM_COL, how='left')

df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,n_sold_category,avg_transaction_category,Missing n_sold_category,mean_sales_value_category,n_sold_category_user_week,price,quantity_per_week,Missing price,Missing quantity per week,n_unique_stores
0,1827,865334,11168.201881,0.0,870,DRUG GM,National,LAXATIVES,LAXATIVES,,...,2.0,1.0,0,3.994219,0.4,0.0,0.0,1,1,
1,1827,1029743,11168.201881,1.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,...,25.0,2.2408,0,2.541433,5.0,2.424173,198.4,0,0,106.0


In [35]:
# Среднее количество транзакций клиента в неделю
users_transactions = data_department.groupby(USER_COL)[ITEM_COL].count().reset_index()
users_transactions.rename(columns={'item_id': 'n_transactions_per_week'}, inplace=True)
users_transactions['n_transactions_per_week'] /= n_weeks


df_ranker_train = df_ranker_train.merge(users_transactions, on=USER_COL, how='left')

df_ranker_train.tail(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,avg_transaction_category,Missing n_sold_category,mean_sales_value_category,n_sold_category_user_week,price,quantity_per_week,Missing price,Missing quantity per week,n_unique_stores,n_transactions_per_week
209498,1745,944836,-1.797693e+308,0.0,5154,MEAT,National,BEEF,ANGUS,1 LB,...,,1,6.552912,,3.966471,13.6,0,0,26.0,0.6
209499,1745,8065410,-1.797693e+308,0.0,397,MEAT-PCKGD,National,BACON,ECONOMY,16 OZ,...,2.99,0,3.82821,0.4,2.753881,13.4,0,0,32.0,0.6


In [36]:
# Средний чек
users_sales = data_train_ranker.groupby(USER_COL)['sales_value'].mean().reset_index()
users_sales.rename(columns={'sales_value': 'avg_cheque'}, inplace=True)
df_ranker_train = df_ranker_train.merge(users_sales[['user_id', 'avg_cheque']], on='user_id', how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,Missing n_sold_category,mean_sales_value_category,n_sold_category_user_week,price,quantity_per_week,Missing price,Missing quantity per week,n_unique_stores,n_transactions_per_week,avg_cheque
0,1827,865334,11168.201881,0.0,870,DRUG GM,National,LAXATIVES,LAXATIVES,,...,0,3.994219,0.4,0.0,0.0,1,1,,6.0,2.798
1,1827,1029743,11168.201881,1.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,...,0,2.541433,5.0,2.424173,198.4,0,0,106.0,6.0,2.798


In [37]:
# Среднее количество уникальных категорий в корзине
users_baskets = data_department.groupby([USER_COL, 'basket_id'])['department'].nunique().reset_index()
users_baskets = users_baskets.groupby(USER_COL)['department'].mean().reset_index()
users_baskets.rename(columns={'department': 'avg_basket_department'}, inplace=True)
df_ranker_train = df_ranker_train.merge(users_baskets[['user_id', 'avg_basket_department']], on='user_id', how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,mean_sales_value_category,n_sold_category_user_week,price,quantity_per_week,Missing price,Missing quantity per week,n_unique_stores,n_transactions_per_week,avg_cheque,avg_basket_department
0,1827,865334,11168.201881,0.0,870,DRUG GM,National,LAXATIVES,LAXATIVES,,...,3.994219,0.4,0.0,0.0,1,1,,6.0,2.798,2.4
1,1827,1029743,11168.201881,1.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,...,2.541433,5.0,2.424173,198.4,0,0,106.0,6.0,2.798,2.4


In [38]:
# Признак, отражающий средний интервал между покупками пользователя.

users_days = data_department.groupby(USER_COL)['day'].unique().reset_index()
users_days['day'] = users_days['day'].apply(lambda x: sorted(x))
users_days.head()

Unnamed: 0,user_id,day
0,1,"[606, 608, 610, 620, 622, 632]"
1,2,"[608, 614, 620, 622]"
2,4,"[605, 617, 627]"
3,6,"[603, 607, 610, 611, 616, 619, 620, 624, 627, ..."
4,7,"[606, 610, 614, 623, 629]"


In [39]:
def avg_ndays(days):
    diff = 0
    if len(days) > 1:
        for i in range(len(days) - 1):
            diff += days[i+1] - days[i]
        return diff / (len(days) - 1)
    else:
        return 0
    
users_days['avg_interval'] = users_days['day'].apply(avg_ndays)

df_ranker_train = df_ranker_train.merge(users_days[['user_id', 'avg_interval']], on='user_id', how='left')
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,n_sold_category_user_week,price,quantity_per_week,Missing price,Missing quantity per week,n_unique_stores,n_transactions_per_week,avg_cheque,avg_basket_department,avg_interval
0,1827,865334,11168.201881,0.0,870,DRUG GM,National,LAXATIVES,LAXATIVES,,...,0.4,0.0,0.0,1,1,,6.0,2.798,2.4,7.0
1,1827,1029743,11168.201881,1.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,...,5.0,2.424173,198.4,0,0,106.0,6.0,2.798,2.4,7.0


### Построим признак, в котором будет закодировано место товара в пяти последних покупках клиента.

In [40]:
users_items = data_train_ranker.groupby(USER_COL)[ITEM_COL].apply(list).reset_index()
users_items['item_id'] = users_items['item_id'].apply(lambda x: x[-5:])
users_items.head()

Unnamed: 0,user_id,item_id
0,1,"[5577022, 8293439, 9526676, 9527558, 10149640]"
1,2,"[7407562, 10149597, 13776981, 15572067, 17215077]"
2,4,"[963365, 1038692, 1133312, 5570830, 7431408]"
3,6,"[1099058, 895268, 1017061, 1082185, 1119051]"
4,7,"[9837501, 12524016, 13072715, 13987153, 13987338]"


In [42]:
def code_last_sales(x, df=users_items):
    last_sales = df.loc[df['user_id'] == x[0], 'item_id'].item()
    code = str()
    last_sales.reverse()
    for item in last_sales:
        code += '1' if item == x[1] else '0'
    return code

df_ranker_train['Last5sales'] = df_ranker_train[[USER_COL, ITEM_COL]].apply(code_last_sales, axis=1)
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,price,quantity_per_week,Missing price,Missing quantity per week,n_unique_stores,n_transactions_per_week,avg_cheque,avg_basket_department,avg_interval,Last5sales
0,1827,865334,11168.201881,0.0,870,DRUG GM,National,LAXATIVES,LAXATIVES,,...,0.0,0.0,1,1,,6.0,2.798,2.4,7.0,0
1,1827,1029743,11168.201881,1.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,...,2.424173,198.4,0,0,106.0,6.0,2.798,2.4,7.0,10000


#### Построим модель Word2Vec для получения эмбеддингов товаров, а затем посчитаем длину вектора, описывающего каждый товар, и расстояние от word2vec эмбеддинга товара до среднего эмбеддинга товаров, купленных клиентом.


In [43]:
df_ = data_train_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
df_.head()

Unnamed: 0,user_id,item_id
0,1,"[1005186, 907466, 909497, 940947, 963542, 1067..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67..."
3,6,"[873654, 994928, 1098844, 1122879, 8357613, 98..."
4,7,"[836281, 843306, 845294, 914190, 920456, 93886..."


In [44]:
purchases = []

for user in df_['user_id']:
    purchases.append([str(item) for item in df_[df_['user_id'] == user].item_id.values[0]])
    
print(f"Total # of Sessions: {len(purchases)}")

Total # of Sessions: 2095


In [45]:
w2v_model = Word2Vec(min_count=1, vector_size=100, sg=1, workers=3)
w2v_model.build_vocab(purchases, progress_per=100)
w2v_model.train(purchases, total_examples=w2v_model.corpus_count, epochs=10, report_delay=1)

(1209182, 1214930)

In [46]:
def word2vec_len(itemid):
    try:
        return sum([i**2 for i in w2v_model.wv[str(itemid)]])
    except:
        return -1

df_ranker_train['Word2Vec_length'] = df_ranker_train[ITEM_COL].apply(lambda x: word2vec_len(x))
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,quantity_per_week,Missing price,Missing quantity per week,n_unique_stores,n_transactions_per_week,avg_cheque,avg_basket_department,avg_interval,Last5sales,Word2Vec_length
0,1827,865334,11168.201881,0.0,870,DRUG GM,National,LAXATIVES,LAXATIVES,,...,0.0,1,1,,6.0,2.798,2.4,7.0,0,-1.0
1,1827,1029743,11168.201881,1.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,...,198.4,0,0,106.0,6.0,2.798,2.4,7.0,10000,7.04723


In [47]:
def avg_word2vec(items):
    return sum([w2v_model.wv[str(item)] for item in items]) / len(items)

df_['Avg_Word2Vec'] = df_[ITEM_COL].apply(avg_word2vec)
df_.head()

Unnamed: 0,user_id,item_id,Avg_Word2Vec
0,1,"[1005186, 907466, 909497, 940947, 963542, 1067...","[-0.10414569, 0.18625408, 0.20200114, 0.032644..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[-0.09097483, 0.25257882, 0.19576512, 0.071112..."
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67...","[-0.09394967, 0.23061757, 0.19857715, 0.042350..."
3,6,"[873654, 994928, 1098844, 1122879, 8357613, 98...","[-0.08779907, 0.22913173, 0.19350904, 0.047811..."
4,7,"[836281, 843306, 845294, 914190, 920456, 93886...","[-0.10020275, 0.21766569, 0.20840691, 0.030765..."


In [48]:
def get_w2v_distance(x, df=df_):
    avg_w2v = df.loc[df_[USER_COL] == x[0], 'Avg_Word2Vec'].item()
    try:
        return sum((w2v_model.wv[str(x[1])] - avg_w2v) ** 2)
    except:
        return -1
    
df_ranker_train['Word2Vec_distance_from_avg'] = df_ranker_train[[USER_COL, ITEM_COL]].\
                                                apply(get_w2v_distance, axis=1)
df_ranker_train.head(2)

Unnamed: 0,user_id,item_id,item_score,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,...,Missing price,Missing quantity per week,n_unique_stores,n_transactions_per_week,avg_cheque,avg_basket_department,avg_interval,Last5sales,Word2Vec_length,Word2Vec_distance_from_avg
0,1827,865334,11168.201881,0.0,870,DRUG GM,National,LAXATIVES,LAXATIVES,,...,1,1,,6.0,2.798,2.4,7.0,0,-1.0,-1.0
1,1827,1029743,11168.201881,1.0,69,GROCERY,Private,FLUID MILK PRODUCTS,FLUID MILK WHITE ONLY,1 GA,...,0,0,106.0,6.0,2.798,2.4,7.0,10000,7.04723,3.347656


### 6. Построение модели второго уровня.

Теперь обучим модель ранжирования и посчитаем метрики.

In [49]:
X_train = df_ranker_train.drop(['target', 
                                'Missing n_sold_category', 
                                'n_sold_category_user_week', 
                                'mean_sales_value_category',], axis=1)
y_train = df_ranker_train['target']

In [50]:
cat_feats = ['manufacturer', 
             'department', 
             'brand', 
             'commodity_desc',
             'sub_commodity_desc',
             'curr_size_of_product',
             'age_desc',
             'marital_status_code',
             'income_desc',
             'homeowner_desc',
             'hh_comp_desc',
             'household_size_desc',
             'kid_category_desc',
             'Missing price',
             'Missing quantity per week',
             'Last5sales',
            ]


for column in cat_feats:
    X_train[column].fillna(0, inplace=True)
    
X_train[cat_feats] = X_train[cat_feats].astype('category')

In [51]:
%%time
cb = CatBoostClassifier(learning_rate=0.1,
                        max_depth=12,
                        n_estimators=500,
                        random_state=42, 
                        cat_features=cat_feats, 
                        silent=False)

cb.fit(X_train, y_train)

train_preds = cb.predict_proba(X_train)

0:	learn: 0.5318703	total: 533ms	remaining: 4m 25s
1:	learn: 0.4297466	total: 574ms	remaining: 2m 22s
2:	learn: 0.3343490	total: 1.37s	remaining: 3m 47s
3:	learn: 0.2724333	total: 1.72s	remaining: 3m 33s
4:	learn: 0.2356844	total: 1.85s	remaining: 3m 3s
5:	learn: 0.1954762	total: 2.84s	remaining: 3m 54s
6:	learn: 0.1696132	total: 3.05s	remaining: 3m 34s
7:	learn: 0.1542356	total: 3.16s	remaining: 3m 14s
8:	learn: 0.1414689	total: 4s	remaining: 3m 38s
9:	learn: 0.1341344	total: 4.27s	remaining: 3m 29s
10:	learn: 0.1295019	total: 4.35s	remaining: 3m 13s
11:	learn: 0.1246707	total: 5.18s	remaining: 3m 30s
12:	learn: 0.1205085	total: 5.91s	remaining: 3m 41s
13:	learn: 0.1177923	total: 6.75s	remaining: 3m 54s
14:	learn: 0.1155156	total: 7.55s	remaining: 4m 4s
15:	learn: 0.1134816	total: 8.28s	remaining: 4m 10s
16:	learn: 0.1114508	total: 9.12s	remaining: 4m 19s
17:	learn: 0.1099796	total: 9.97s	remaining: 4m 27s
18:	learn: 0.1091563	total: 10.2s	remaining: 4m 17s
19:	learn: 0.1077443	total:

In [52]:
fi = pd.DataFrame(cb.feature_importances_, index=X_train.columns, columns=['importance'])
fi.sort_values(by='importance', ascending=False)

Unnamed: 0,importance
avg_transaction_category,16.185627
n_sold_category,14.924851
Word2Vec_distance_from_avg,7.217838
n_transactions_per_week,5.817501
avg_cheque,5.555884
avg_interval,5.548832
user_id,5.409517
avg_price,4.580082
avg_basket_department,4.39099
price,4.3015


### Оценим качество построенной модели с помощью метрики precision@5 на валидационном датасете.

In [53]:
df_ranker_predict = df_ranker_train.copy()
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

In [54]:
N_PREDICT = 100
TOPK_PRECISION = 5

result_eval_ranker = data_val_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_ranker.columns=[USER_COL, ACTUAL_COL]
result_eval_ranker['own_rec'] = make_recommendations(result_eval_ranker, 
                                                     recommender.get_own_recommendations, N_PREDICT=N_PREDICT)

sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True)

[('own_rec', 0.10107843137254902)]

In [55]:
result_eval_ranker['reranked_own_rec'] = result_eval_ranker[USER_COL].\
                                            apply(lambda user_id: rerank(user_id, df_ranker_predict))
print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.18250666666666668)
('own_rec', 0.10107843137254902)


Как видим с помощью модели второго уровня удалось значительно поднять метрики для построенных рекомендаций.

### 7. Рекомендации для тестового датасета

Теперь построим предсказания для итогового датасета.


In [56]:
data_test = pd.read_csv('retail_test1.csv')
data_test.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0
2,2070,41652857291,664,995242,5,9.1,311,-0.6,46,96,0.0,0.0
3,1602,41665647035,664,827939,1,7.99,334,0.0,1741,96,0.0,0.0
4,1602,41665647035,664,927712,1,0.59,334,-0.4,1741,96,0.0,0.0


In [57]:
df_test_candidates = pd.DataFrame(data_test[USER_COL].unique())
df_test_candidates.columns = [USER_COL]

df_test_candidates['recommendations'] = df_test_candidates[USER_COL].\
                                        apply(lambda user_id: rerank(user_id, df_ranker_predict))

In [59]:
# сохраняем полученные рекомендации в файл.

df_test_candidates.to_csv('recommendations.csv', index=False)
df_test_candidates.head()

Unnamed: 0,user_id,recommendations
0,1340,"[1037840, 852856, 1075368, 986912, 1029743]"
1,588,"[899624, 933835, 907631, 1106523, 866211]"
2,2070,"[899624, 916122, 865456, 913210, 838186]"
3,1602,"[1070820, 832678, 872137, 901062, 6034857]"
4,447,"[899624, 1004906, 1106523, 909714, 1029743]"
