# Курсовой проект Рекомендательные системы. 

## Двухуровневая модель рекомендаций

**Основное**
- Дедлайн - 17 ноября 23:59
- Целевая метрика map@5 (необходимо получить значение >=0.2)
- Бейзлайн решения - MainRecommender
- Файл recommendations.csv (user_id | [rec_1, rec_2, ...] с рекомендациями. rec_i - реальные id item-ов (из retail_train.csv)

**Hints:** 

Сначала просто попробуйте разные параметры MainRecommender:  
- N в топ-N товарах при формировании user-item матирцы (сейчас топ-5000)  
- Различные веса в user-item матрице (0/1, кол-во покупок, log(кол-во покупок + 1), сумма покупки, ...)  
- Разные взвешивания матрицы (TF-IDF, BM25 - у него есть параметры)  
- Разные смешивания рекомендаций (обратите внимание на бейзлайн - прошлые покупки юзера)  

Сделайте MVP - минимально рабочий продукт - (пусть даже top-popular), а потом его улучшайте

Если вы делаете двухуровневую модель - следите за валидацией 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные функции
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

In [2]:
data = pd.read_csv('../data/retail_train.csv')
item_features = pd.read_csv('../data/product.csv')
user_features = pd.read_csv('../data/hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)


# Важна схема обучения и валидации: делим на трейн и тест
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy() 
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
# будем использовать топ 5000 товаров (см src/MainRecommender)

n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=5000)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


In [4]:
recommender = MainRecommender(data_train_lvl_1)



  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

### Измеряем presicion@k

- Качество измеряем на data_val_lvl_1: следующие 6 недель после трейна

In [5]:
result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1.columns=['user_id', 'actual']
result_lvl_1.head(2)

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."


In [6]:
users_lvl_1 = pd.DataFrame(data_train_lvl_1['user_id'].unique())
users_lvl_1.columns = ['user_id']

In [7]:
K_num = 50
result_lvl_1['als_rec'] = users_lvl_1['user_id'].apply(lambda x: recommender.get_als_recommendations(x, N=K_num))
result_lvl_1['own_rec'] = users_lvl_1['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=K_num))

In [8]:
result_lvl_1.head(2)

Unnamed: 0,user_id,actual,als_rec,own_rec
0,1,"[853529, 865456, 867607, 872137, 874905, 87524...","[871756, 1044078, 844179, 899624, 1000753, 809...","[948640, 918046, 847962, 907099, 873980, 88469..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870...","[1135834, 5582712, 954424, 940947, 923670, 967...","[1101378, 8090570, 857176, 947013, 1065979, 10..."


In [9]:
def calculate_precision_k(data, K): #data - pandas df
    for column in data.columns[2:]:
        yield column, data.apply(lambda row: precision_at_k(row[column], row['actual'], k=K), axis=1).mean()

In [10]:
prec_K = 5
sorted(calculate_precision_k(result_lvl_1, prec_K), key=lambda x: x[1],reverse=True)

[('als_rec', 0.026369545032497756), ('own_rec', 0.004456824512534818)]

### Обучаем модель 2-ого уровня на выбранных кандидатах

- Обучаем на data_train_lvl_2
- Обучаем *только* на выбранных кандидатах
- (!) Если юзер купил < 50 товаров, то get_own_recommendations дополнит рекоммендации топ-популярными

In [11]:
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 

In [12]:
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']

# Пока только warm start, фильтруем юзеров
train_users = data_train_lvl_1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=50))
users_lvl_2.head(2)

Unnamed: 0,user_id,candidates
0,2070,"[1105426, 1097350, 879194, 948640, 928263, 944..."
1,2021,"[950935, 1119454, 835578, 863762, 1019142, 102..."


In [13]:
s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
users_lvl_2['flag'] = 1

users_lvl_2.head(4)

Unnamed: 0,user_id,item_id,flag
0,2070,1105426,1
0,2070,1097350,1
0,2070,879194,1
0,2070,948640,1


In [14]:
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1

targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)
targets_lvl_2.drop('flag', axis=1, inplace=True)

In [15]:
targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,target
0,2070,1105426,0.0
1,2070,1097350,0.0


In [16]:
targets_lvl_2['target'].value_counts()

0.0    99669
1.0    11644
Name: target, dtype: int64

In [17]:
#объединим все признаки в один датафрейм
targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')
targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left')

targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown
1,2070,1097350,0.0,2468,GROCERY,National,DOMESTIC WINE,VALUE GLASS WINE,4 LTR,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown


In [18]:
# объединим все в одну df для построения новых фичей
df_feach_eng = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

df_feach_eng = df_feach_eng.merge(item_features, on='item_id', how='left')
df_feach_eng = df_feach_eng.merge(user_features, on='user_id', how='left')
df_feach_eng.columns

Index(['user_id', 'basket_id', 'day', 'item_id', 'quantity', 'sales_value',
       'store_id', 'retail_disc', 'trans_time', 'week_no', 'coupon_disc',
       'coupon_match_disc', 'manufacturer', 'department', 'brand',
       'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product',
       'age_desc', 'marital_status_code', 'income_desc', 'homeowner_desc',
       'hh_comp_desc', 'household_size_desc', 'kid_category_desc'],
      dtype='object')

In [19]:
# добавим фичи для data_val_lvl_2(используем его для валидации модели)

df_val = data_val_lvl_2.copy()
df_val = df_val.merge(item_features, on='item_id', how='left')
df_val = df_val.merge(user_features, on='user_id', how='left')
df_val.columns

Index(['user_id', 'basket_id', 'day', 'item_id', 'quantity', 'sales_value',
       'store_id', 'retail_disc', 'trans_time', 'week_no', 'coupon_disc',
       'coupon_match_disc', 'manufacturer', 'department', 'brand',
       'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product',
       'age_desc', 'marital_status_code', 'income_desc', 'homeowner_desc',
       'hh_comp_desc', 'household_size_desc', 'kid_category_desc'],
      dtype='object')

In [20]:
targets_lvl_2 = targets_lvl_2.merge(data_train_lvl_2, on='user_id', how='left')

In [21]:
targets_lvl_2.columns

Index(['user_id', 'item_id_x', 'target', 'manufacturer', 'department', 'brand',
       'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product',
       'age_desc', 'marital_status_code', 'income_desc', 'homeowner_desc',
       'hh_comp_desc', 'household_size_desc', 'kid_category_desc', 'basket_id',
       'day', 'item_id_y', 'quantity', 'sales_value', 'store_id',
       'retail_disc', 'trans_time', 'week_no', 'coupon_disc',
       'coupon_match_disc'],
      dtype='object')

### Новые признаки

In [22]:
# средний чек
df = df_feach_eng.groupby(['user_id', 'basket_id'])['sales_value'].sum().reset_index()
df = df.groupby('user_id')['sales_value'].mean().reset_index()
df.columns = ['user_id', 'avg_bill']
targets_lvl_2 = targets_lvl_2.merge(df, on='user_id')
targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id_x,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,item_id_y,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,avg_bill
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,...,1019940,1,1.0,311,-0.29,40,86,0.0,0.0,14.355581
1,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,...,1019940,1,1.0,311,-0.29,201,86,0.0,0.0,14.355581


In [23]:
df = df_val.groupby(['user_id', 'basket_id'])['sales_value'].sum().reset_index()
df = df.groupby('user_id')['sales_value'].mean().reset_index()
df.columns = ['user_id', 'avg_bill']
df_val = df_val.merge(df, on='user_id')
df_val.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,avg_bill
0,338,41260573635,636,840173,1,1.99,369,0.0,112,92,...,CARDS SEASONAL,,,,,,,,,31.249333
1,338,41260573635,636,1037348,1,0.89,369,-0.3,112,92,...,PEACHES,15 OZ,,,,,,,,31.249333


In [24]:
#Кол-во покупок юзера в каждой категории
df = df_feach_eng.groupby(['user_id', 'department'])['quantity'].sum().reset_index()
df = df.groupby('user_id')['quantity'].mean().reset_index()
df.columns = ['user_id', 'avg_count_pursh_dep']
targets_lvl_2 = targets_lvl_2.merge(df, on='user_id')
targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id_x,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,avg_bill,avg_count_pursh_dep
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,...,1,1.0,311,-0.29,40,86,0.0,0.0,14.355581,1755.0
1,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,...,1,1.0,311,-0.29,201,86,0.0,0.0,14.355581,1755.0


In [25]:
df = df_val.groupby(['user_id', 'department'])['quantity'].sum().reset_index()
df = df.groupby('user_id')['quantity'].mean().reset_index()
df.columns = ['user_id', 'avg_count_pursh_dep']
df_val = df_val.merge(df, on='user_id')
df_val.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,avg_bill,avg_count_pursh_dep
0,338,41260573635,636,840173,1,1.99,369,0.0,112,92,...,,,,,,,,,31.249333,17.777778
1,338,41260573635,636,1037348,1,0.89,369,-0.3,112,92,...,15 OZ,,,,,,,,31.249333,17.777778


In [26]:
#цена товара
targets_lvl_2['price'] = targets_lvl_2['sales_value']/targets_lvl_2['quantity']
targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id_x,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,avg_bill,avg_count_pursh_dep,price
0,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,...,1.0,311,-0.29,40,86,0.0,0.0,14.355581,1755.0,1.0
1,2070,1105426,0.0,69,DELI,Private,SANDWICHES,SANDWICHES - (COLD),,45-54,...,1.0,311,-0.29,201,86,0.0,0.0,14.355581,1755.0,1.0


In [27]:
df_val['price'] = df_val['sales_value']/df_val['quantity']
df_val.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,...,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,avg_bill,avg_count_pursh_dep,price
0,338,41260573635,636,840173,1,1.99,369,0.0,112,92,...,,,,,,,,31.249333,17.777778,1.99
1,338,41260573635,636,1037348,1,0.89,369,-0.3,112,92,...,,,,,,,,31.249333,17.777778,0.89


In [28]:
X_train = targets_lvl_2.drop('target', axis=1)
X_train = X_train.drop('item_id_y', axis=1)
y_train = targets_lvl_2[['target']]

In [29]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9154179 entries, 0 to 9154178
Data columns (total 28 columns):
 #   Column                Dtype  
---  ------                -----  
 0   user_id               int64  
 1   item_id_x             int64  
 2   manufacturer          int64  
 3   department            object 
 4   brand                 object 
 5   commodity_desc        object 
 6   sub_commodity_desc    object 
 7   curr_size_of_product  object 
 8   age_desc              object 
 9   marital_status_code   object 
 10  income_desc           object 
 11  homeowner_desc        object 
 12  hh_comp_desc          object 
 13  household_size_desc   object 
 14  kid_category_desc     object 
 15  basket_id             int64  
 16  day                   int64  
 17  quantity              int64  
 18  sales_value           float64
 19  store_id              int64  
 20  retail_disc           float64
 21  trans_time            int64  
 22  week_no               int64  
 23  coupon_

In [30]:
df_val = df_val.rename(columns={'item_id': 'item_id_x'})
df_val.columns.tolist()

['user_id',
 'basket_id',
 'day',
 'item_id_x',
 'quantity',
 'sales_value',
 'store_id',
 'retail_disc',
 'trans_time',
 'week_no',
 'coupon_disc',
 'coupon_match_disc',
 'manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc',
 'avg_bill',
 'avg_count_pursh_dep',
 'price']

In [31]:
cat_feats =['department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc']

In [32]:
for c in cat_feats:
    
    X_train[c] = X_train[c].astype('category')

In [33]:
for c in cat_feats:
    
    df_val[c] = df_val[c].astype('category')

In [34]:
lgb = LGBMClassifier(objective='binary',
                     max_depth=8,
                     n_estimators=300,
                     learning_rate=0.05,
                     categorical_column=cat_feats)

lgb.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


LGBMClassifier(categorical_column=['department', 'brand', 'commodity_desc',
                                   'sub_commodity_desc', 'curr_size_of_product',
                                   'age_desc', 'marital_status_code',
                                   'income_desc', 'homeowner_desc',
                                   'hh_comp_desc', 'household_size_desc',
                                   'kid_category_desc'],
               learning_rate=0.05, max_depth=8, n_estimators=300,
               objective='binary')

In [35]:
train_preds = lgb.predict(X_train)

In [36]:
train_preds

array([0., 0., 0., ..., 0., 0., 0.])

In [37]:
val_preds = lgb.predict_proba(df_val)

In [38]:
val_preds

array([[0.99459395, 0.00540605],
       [0.99363108, 0.00636892],
       [0.99478106, 0.00521894],
       ...,
       [0.99173614, 0.00826386],
       [0.99072396, 0.00927604],
       [0.99072396, 0.00927604]])

In [39]:
result_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_2.columns=['user_id', 'actual']
result_lvl_2.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [40]:
pred_ds = df_val[['user_id', 'item_id_x']].copy()
pred_ds['proba'] = val_preds[:,1]
pred_ds = pred_ds.groupby(['user_id', 'item_id_x'])['proba'].mean().reset_index()
pred_s = pred_ds.groupby('user_id').apply(lambda x: x.sort_values('proba', ascending=False)['item_id_x'].tolist())

def get_LGBM_recommendations(user_id, N=5):
    recommendations = pred_s[user_id][:N]
    
    overall_top_purchases = data_val_lvl_2.groupby('item_id')['quantity'].count().reset_index()
    overall_top_purchases.sort_values('quantity', ascending=False, inplace=True)
    overall_top_purchases = overall_top_purchases[overall_top_purchases['item_id'] != 999999]
    overall_top_purchases = overall_top_purchases.item_id.tolist()
    
    if len(recommendations) < N:
            recommendations.extend(overall_top_purchases[:N])
            recommendations = recommendations[:N]
    
    return recommendations

In [42]:
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']


#только warm start
train_users = data_train_lvl_1['user_id'].unique()
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)]

K_num = 50
result_lvl_2['LGBM'] = result_lvl_2['user_id'].apply(lambda x: get_LGBM_recommendations(x, N=K_num))

In [43]:
result_lvl_2.head(2)

Unnamed: 0,user_id,actual,LGBM
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[9677939, 883616, 9527329, 9296986, 8090541, 1..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[958154, 994891, 879948, 851057, 13842214, 108..."


In [44]:
def calculate_precision_k(data, K): #data - pandas df
    for column in data.columns[2:]:
        yield column, data.apply(lambda row: precision_at_k(row[column], row['actual'], k=K), axis=1).mean()

In [45]:
# Посчитаем precision_5 LGBM  
sorted(calculate_precision_k(result_lvl_2, 5), key=lambda x: x[1],reverse=True)

[('LGBM', 0.9585700293829578)]

In [46]:
# финальный датасет
final_data = result_lvl_2.drop('actual', axis=1).rename(columns={'LGBM': 'recs'})
final_data

Unnamed: 0,user_id,recs
0,1,"[9677939, 883616, 9527329, 9296986, 8090541, 1..."
1,3,"[958154, 994891, 879948, 851057, 13842214, 108..."
2,6,"[948650, 6553035, 1024306, 12604482, 1055911, ..."
3,7,"[888835, 939900, 12647859, 875118, 14111027, 1..."
4,8,"[854405, 953561, 7147176, 939275, 9396767, 131..."
...,...,...
2037,2496,"[6534178, 1082185, 6534178, 1029743, 995242, 1..."
2038,2497,"[1103513, 1032905, 995238, 1078912, 1072870, 1..."
2039,2498,"[10121837, 978879, 1079248, 7025363, 15781095,..."
2040,2499,"[13654811, 1119993, 1114050, 5583288, 1024629,..."


In [47]:
RESULTS_FILE_PATH = 'recommendations.csv'
final_data.to_csv(RESULTS_FILE_PATH, index=False)