# ДЗ 6


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

In [4]:
# Load the transactions plus item and household (user) metadata.
data = pd.read_csv('./data/retail_train.csv')
item_features = pd.read_csv('./data/product.csv')
user_features = pd.read_csv('./data/hh_demographic.csv')

# Column processing: lower-case every column name, then align the key columns
# with the names used in the transaction data.
item_features.columns = list(map(str.lower, item_features.columns))
user_features.columns = list(map(str.lower, user_features.columns))

item_features = item_features.rename(columns={'product_id': 'item_id'})
user_features = user_features.rename(columns={'household_key': 'user_id'})


# The train/validation scheme matters!
# -- older purchases -- | -- 6 weeks -- | -- 3 weeks --
# The size of the second window (6 weeks) should be tuned with a learning curve
# (recall@k as a function of the dataset size).
val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

# Window boundaries, counted back from the last observed week.
week_numbers = data['week_no']
last_week = week_numbers.max()
lvl_2_start_week = last_week - val_lvl_2_size_weeks
lvl_1_start_week = last_week - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)
# NOTE(review): `>= lvl_2_start_week` keeps weeks last-3 .. last inclusive, i.e. up to
# four distinct week numbers for a "3 week" window — confirm this is intended.

data_train_lvl_1 = data[week_numbers < lvl_1_start_week]
data_val_lvl_1 = data[(week_numbers >= lvl_1_start_week) & (week_numbers < lvl_2_start_week)]

# Kept as a separate copy for clarity: later changes will make the two frames differ.
data_train_lvl_2 = data_val_lvl_1.copy()
data_val_lvl_2 = data[week_numbers >= lvl_2_start_week]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [5]:
# Report how many distinct items survive prefiltering of the level-1 training data.
# NOTE: data_train_lvl_1 is reassigned here, so re-running only this cell would
# report the already-filtered item count as "before".
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(data_train_lvl_1, item_features)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 3790


In [7]:
# Build the first-level recommender from the prefiltered level-1 training data and item metadata.
recommender = MainRecommender(data_train_lvl_1, item_features)



In [8]:
# Distinct item ids in the level-1 training data, with id 999999 excluded
# (presumably the placeholder item produced by prefiltering — TODO confirm).
is_real_item = data_train_lvl_1["item_id"] != 999999
item_list = data_train_lvl_1.loc[is_real_item, "item_id"].unique()
item_list[:10]

array([ 950384, 1077373,  839040,  854483,  892314,  930666,  948254,
        957411, 1007549, 1044089])

In [9]:
# Smoke check: 5 ALS recommendations for user 2375.
recommender.get_als_recommendations(2375, rec_num=5)

[899624, 972931, 1037863, 883932, 865456]

In [10]:
# Smoke check: 5 own recommendations for user 2375.
recommender.get_own_recommendations(2375, rec_num=5)

[1036501, 907099, 1085983, 1079023, 910439]

In [11]:
# Smoke check: 5 similar-item recommendations for user 2375.
recommender.get_similar_items_recommendation(2375, rec_num=5)

[9527494, 899624, 989069, 1079023, 1036501]

In [12]:
# Smoke check: 5 similar-user recommendations for user 2375.
recommender.get_similar_users_recommendation(2375, rec_num=5)

[1008074, 6533936, 981521, 867188, 916381]

In [13]:
# Ground truth for level-1 validation: one row per user with the array of
# distinct items that user bought in the validation window.
result_lvl_1 = (
    data_val_lvl_1
    .groupby('user_id')['item_id']
    .unique()
    .reset_index()
    .rename(columns={'item_id': 'actual'})
)
result_lvl_1.head()

Unnamed: 0,user_id,actual
0,1,"[853529, 865456, 867607, 872137, 874905, 87524..."
1,2,"[15830248, 838136, 839656, 861272, 866211, 870..."
2,4,"[883932, 970760, 1035676, 1055863, 1097610, 67..."
3,6,"[1024306, 1102949, 6548453, 835394, 940804, 96..."
4,7,"[836281, 843306, 845294, 914190, 920456, 93886..."


In [15]:
# Users that appear in the level-2 training window.
users_lvl_2 = pd.DataFrame({'user_id': data_train_lvl_2['user_id'].unique()})

# Warm start only for now: keep users that also appear in the level-1 training data.
train_users = data_train_lvl_1['user_id'].unique()
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the unfiltered frame (SettingWithCopyWarning).
users_lvl_2 = users_lvl_2[users_lvl_2['user_id'].isin(train_users)].copy()

# Level-1 candidates: up to 50 own recommendations per user.
users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender.get_own_recommendations(x, rec_num=50))

In [16]:
# Preview the candidate lists; a user can end up with an empty list (see user 2120 in the output).
users_lvl_2.head()

Unnamed: 0,user_id,candidates
0,2070,"[834103, 925258, 1119399, 917033, 936508, 9689..."
1,2021,"[1119454, 1019142, 871279, 1038462, 6534077, 9..."
2,1753,"[13842224, 901543, 862981, 1089066, 1037894, 8..."
3,2120,[]
4,1346,"[5574377, 903738, 5568758, 1082212, 1070538, 1..."


In [17]:
import random

In [18]:
def data_extension(data_u, x, items=None, n_candidates=50):
    """Pad a user's candidate list with random items up to ``n_candidates``.

    The list stored in ``data_u`` for user ``x`` is extended in place and also
    returned. Padding items are drawn without replacement from items that are
    not already candidates, so the padded list gains no duplicates.

    Parameters
    ----------
    data_u : pandas.DataFrame
        Frame with a ``user_id`` column and a ``candidates`` column of lists.
    x : hashable
        The user id whose candidate list should be padded.
    items : iterable, optional
        Pool of item ids to pad from. Defaults to the notebook-level ``item_list``.
    n_candidates : int, default 50
        Target length of the candidate list.

    Returns
    -------
    list
        The user's candidate list. It can be shorter than ``n_candidates`` if
        the pool has too few unused items.

    Raises
    ------
    IndexError
        If ``x`` is not present in ``data_u['user_id']``.
    """
    pool = item_list if items is None else items

    # Look the list up once; it is the very object stored in the frame,
    # so extending it updates the frame as well.
    candidates = data_u.loc[data_u["user_id"] == x, "candidates"].values[0]

    n_missing = n_candidates - len(candidates)
    if n_missing > 0:
        already_used = set(candidates)
        # dict.fromkeys drops duplicate ids from the pool while keeping their order.
        unused = [item for item in dict.fromkeys(pool) if item not in already_used]
        candidates.extend(random.sample(unused, k=min(n_missing, len(unused))))
    return candidates

In [19]:
# Pad every user's candidate list with random items (data_extension extends the
# stored lists in place and returns them, so the column is reassigned to the same lists).
users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: data_extension(users_lvl_2, x))

In [20]:
# Check that the previously empty candidate list (user 2120) is now filled.
users_lvl_2.head(10)

Unnamed: 0,user_id,candidates
0,2070,"[834103, 925258, 1119399, 917033, 936508, 9689..."
1,2021,"[1119454, 1019142, 871279, 1038462, 6534077, 9..."
2,1753,"[13842224, 901543, 862981, 1089066, 1037894, 8..."
3,2120,"[15596515, 13115493, 914318, 1070076, 949142, ..."
4,1346,"[5574377, 903738, 5568758, 1082212, 1070538, 1..."
5,2324,"[967760, 1057260, 938118, 934369, 907392, 8679..."
6,2430,"[845193, 9392700, 9803545, 881047, 989409, 835..."
7,1434,"[1121213, 888532, 8181477, 908283, 847139, 927..."
8,2181,"[1082627, 967760, 901776, 1007136, 9187298, 86..."
9,1011,"[951176, 1016539, 999563, 1083070, 953561, 111..."


In [21]:
# Flatten the per-user candidate lists into one (user_id, item_id) row per candidate.
# Each user id is repeated by the length of that user's own list, so the result
# does not depend on index label 0 existing or on all lists having equal length.
candidate_counts = users_lvl_2['candidates'].apply(len).values
df = pd.DataFrame({'user_id': np.repeat(users_lvl_2['user_id'].values, candidate_counts),
                   'item_id': np.concatenate(users_lvl_2['candidates'].values)})

In [22]:
# Positive pairs: every (user, item) bought in the level-2 training window.
# Duplicates are dropped so that repeat purchases do not multiply candidate
# rows in the left merge below.
targets_lvl_2 = (
    data_train_lvl_2[['user_id', 'item_id']]
    .drop_duplicates()
    .assign(target=1)  # purchases only
)

# Label each candidate pair: 1 if it was bought, otherwise 0.
targets_lvl_2 = df.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

# Plain reassignment instead of the chained `fillna(..., inplace=True)`, which
# pandas deprecates because it may act on a temporary copy.
targets_lvl_2['target'] = targets_lvl_2['target'].fillna(0)

In [23]:
# Inspect the raw level-2 training transactions (created above as a copy of data_val_lvl_1).
data_train_lvl_2.head(10)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2104867,2070,40618492260,594,1019940,1,1.0,311,-0.29,40,86,0.0,0.0
2107468,2021,40618753059,594,840361,1,0.99,443,0.0,101,86,0.0,0.0
2107469,2021,40618753059,594,856060,1,1.77,443,-0.09,101,86,0.0,0.0
2107470,2021,40618753059,594,869344,1,1.67,443,-0.22,101,86,0.0,0.0
2107471,2021,40618753059,594,896862,2,5.0,443,-2.98,101,86,0.0,0.0
2107472,2021,40618753059,594,951590,1,1.69,443,-0.6,101,86,0.0,0.0
2107473,2021,40618753059,594,1019142,2,5.0,443,-1.98,101,86,0.0,0.0
2107474,2021,40618753059,594,1051489,2,2.5,443,-0.68,101,86,0.0,0.0
2107475,2021,40618753059,594,1111839,1,0.59,443,0.0,101,86,0.0,0.0
2107476,2021,40618753059,594,9835223,1,9.27,443,-3.63,101,86,0.0,0.0


### Задание 1

A) Попробуйте различные варианты генерации кандидатов. Какие из них дают наибольший recall@k ?
- Пока пробуем отобрать 50 кандидатов (k=50)
- Качество измеряем на data_val_lvl_1: следующие 6 недель после трейна

Дают ли own recommendations + top-popular лучший recall?  

B)* Как зависит recall@k от k? Постройте для одной схемы генерации кандидатов эту зависимость для k = {20, 50, 100, 200, 500}  
C)* Исходя из прошлого вопроса, как вы думаете, какое значение k является наиболее разумным?


#### get_als_recommendations

In [33]:
# One row per user that appears in the level-1 validation window.
users_lvl_1_pred_als = pd.DataFrame({'user_id': data_val_lvl_1['user_id'].unique()})
users_lvl_1_pred_als.head()

Unnamed: 0,user_id
0,2070
1,2021
2,1753
3,2120
4,1346


In [37]:
# ALS recommendations for user 2120, whose own-recommendation candidate list was empty above.
recommender.get_als_recommendations(2120, rec_num=7)

[995055, 9926758, 5585510, 1110843, 1112238, 6534077, 1081189]

In [48]:
# Users seen in the level-1 training data. Kept as a plain list, so the
# `in` checks in the prediction helpers below are linear scans.
user_train_list = data_train_lvl_1["user_id"].unique().tolist()

In [49]:
# Is user 1984 part of the level-1 training data? (False in the saved output, i.e. a cold-start user.)
1984 in user_train_list

False

In [50]:
def get_pred_als(recommender, user_train_list, item_list, x, num=7):
    """Return ``num`` level-1 predictions for user ``x`` using ALS.

    Users present in ``user_train_list`` get
    ``recommender.get_als_recommendations``. Any other (cold-start) user gets a
    random sample of ``item_list`` drawn without replacement, so the fallback
    never recommends the same item twice.

    Parameters
    ----------
    recommender : object
        Must provide ``get_als_recommendations(user, rec_num=...)``.
    user_train_list : container
        User ids known to the recommender.
    item_list : iterable
        Pool of item ids for the cold-start fallback.
    x : hashable
        The user id to predict for.
    num : int, default 7
        Number of items requested.

    Returns
    -------
    list
        A new list of item ids. For cold-start users it holds at most
        ``len(item_list)`` items.
    """
    if x in user_train_list:
        return list(recommender.get_als_recommendations(x, rec_num=num))

    # Cold start: cap the sample size, since random.sample raises if k exceeds the pool.
    pool = list(item_list)
    return random.sample(pool, k=min(num, len(pool)))

In [51]:
# Warm user: the saved output matches the direct get_als_recommendations call for user 2120 above.
get_pred_als(recommender, user_train_list, item_list, 2120, num=7)

[995055, 9926758, 5585510, 1110843, 1112238, 6534077, 1081189]

In [52]:
# Cold-start user (1984 is not in user_train_list): falls back to random items from item_list.
get_pred_als(recommender, user_train_list, item_list, 1984, num=7)

[1001266, 822965, 9934800, 1027447, 973693, 935993, 883003]

In [53]:
# Seven level-1 predictions for every validation user.
users_lvl_1_pred_als['predictions'] = [
    get_pred_als(recommender, user_train_list, item_list, user_id, num=7)
    for user_id in users_lvl_1_pred_als['user_id']
]
users_lvl_1_pred_als.head()

Unnamed: 0,user_id,predictions
0,2070,"[1107553, 1055646, 879755, 9526410, 1131438, 9..."
1,2021,"[12731544, 998556, 836445, 969205, 896862, 981..."
2,1753,"[861445, 991951, 5568489, 944317, 13842224, 85..."
3,2120,"[995055, 9926758, 5585510, 1110843, 1112238, 6..."
4,1346,"[999250, 856435, 1023720, 1105488, 941361, 901..."


In [28]:
# First five validation transactions of user 2021.
data_val_lvl_1.loc[data_val_lvl_1["user_id"] == 2021].head(5)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
2107468,2021,40618753059,594,840361,1,0.99,443,0.0,101,86,0.0,0.0
2107469,2021,40618753059,594,856060,1,1.77,443,-0.09,101,86,0.0,0.0
2107470,2021,40618753059,594,869344,1,1.67,443,-0.22,101,86,0.0,0.0
2107471,2021,40618753059,594,896862,2,5.0,443,-2.98,101,86,0.0,0.0
2107472,2021,40618753059,594,951590,1,1.69,443,-0.6,101,86,0.0,0.0


In [55]:
# Predicted items for user 2021.
users_lvl_1_pred_als.loc[users_lvl_1_pred_als["user_id"] == 2021, "predictions"].values[0]

[12731544, 998556, 836445, 969205, 896862, 981521, 1037863]

In [59]:
# First seven items user 2021 actually bought in the validation window.
list(result_lvl_1.loc[result_lvl_1["user_id"] == 2021, "actual"].values[0])[:7]

[840361, 856060, 869344, 896862, 951590, 1019142, 1051489]

In [60]:
# recall@7 for the single user 2021: predictions vs. actual purchases.
predictions_2021 = users_lvl_1_pred_als.loc[users_lvl_1_pred_als["user_id"] == 2021, "predictions"].values[0]
actual_2021 = list(result_lvl_1.loc[result_lvl_1["user_id"] == 2021, "actual"].values[0])
recall_at_k(predictions_2021, actual_2021, k=7)

0.02857142857142857

In [61]:
# recall@7 per validation user.
# The actual purchases are indexed by user once, instead of running two
# boolean-mask scans over the frames for every row (quadratic in the number of users).
actual_by_user = dict(zip(result_lvl_1["user_id"], result_lvl_1["actual"]))

users_lvl_1_pred_als["rec"] = [
    recall_at_k(predictions, list(actual_by_user[user_id]), k=7)
    for user_id, predictions in zip(users_lvl_1_pred_als["user_id"],
                                    users_lvl_1_pred_als["predictions"])
]
users_lvl_1_pred_als.head()

Unnamed: 0,user_id,predictions,rec
0,2070,"[1107553, 1055646, 879755, 9526410, 1131438, 9...",0.00641
1,2021,"[12731544, 998556, 836445, 969205, 896862, 981...",0.028571
2,1753,"[861445, 991951, 5568489, 944317, 13842224, 85...",0.012195
3,2120,"[995055, 9926758, 5585510, 1110843, 1112238, 6...",0.0
4,1346,"[999250, 856435, 1023720, 1105488, 941361, 901...",0.0


In [62]:
# Mean recall@7 over all validation users.
users_lvl_1_pred_als["rec"].mean()

0.010973037391130102

In [63]:
# Candidate-list sizes k for the recall@k curve (values from Task 1B).
k_list = [20, 50, 100, 200, 500]

In [64]:
# Mean recall@k of the ALS candidate generator for every k in k_list.
recall_als_list = []

# Index the actual purchases by user once; the per-row boolean-mask lookups
# made every pass quadratic in the number of users.
actual_by_user = dict(zip(result_lvl_1["user_id"], result_lvl_1["actual"]))

for k_n in k_list:
    users_lvl_1_pred_als['predictions'] = users_lvl_1_pred_als['user_id'].apply(
        lambda x: get_pred_als(recommender, user_train_list, item_list, x, num=k_n))

    users_lvl_1_pred_als["rec"] = [
        recall_at_k(predictions, list(actual_by_user[user_id]), k=k_n)
        for user_id, predictions in zip(users_lvl_1_pred_als["user_id"],
                                        users_lvl_1_pred_als["predictions"])
    ]
    recall_als_list.append(users_lvl_1_pred_als["rec"].mean())

In [65]:
# Mean ALS recall@k for each k in k_list (the name was misspelled as `recall_ls_list`).
recall_als_list

[0.024507068681124386,
 0.0469574115355462,
 0.07332386154771282,
 0.10929288363461566,
 0.1769507080298454]

#### get_own_recommendations

In [None]:
def get_pred_own(recommender, user_train_list, item_list, x, num=7):
    """Return ``num`` level-1 predictions for user ``x`` using own recommendations.

    Users present in ``user_train_list`` get
    ``recommender.get_own_recommendations`` (the original body called the ALS
    method by mistake). Any other (cold-start) user gets a random sample of
    ``item_list`` drawn without replacement.

    Parameters
    ----------
    recommender : object
        Must provide ``get_own_recommendations(user, rec_num=...)``.
    user_train_list : container
        User ids known to the recommender.
    item_list : iterable
        Pool of item ids for the cold-start fallback.
    x : hashable
        The user id to predict for.
    num : int, default 7
        Number of items requested.

    Returns
    -------
    list
        A new list of item ids. For cold-start users it holds at most
        ``len(item_list)`` items.
    """
    if x in user_train_list:
        return list(recommender.get_own_recommendations(x, rec_num=num))

    # Cold start: cap the sample size, since random.sample raises if k exceeds the pool.
    pool = list(item_list)
    return random.sample(pool, k=min(num, len(pool)))

In [None]:
# Mean recall@k of the own-recommendations candidate generator for every k in k_list.
# Fixes three copy-paste slips: the loop called get_pred_als, appended to the
# undefined `recall_at_list`, and overwrote the ALS results frame.
recall_own_list = []

# Separate frame so the ALS predictions and recalls computed earlier stay intact.
users_lvl_1_pred_own = pd.DataFrame({'user_id': data_val_lvl_1['user_id'].unique()})

# Index the actual purchases by user once instead of scanning the frame per row.
actual_by_user = dict(zip(result_lvl_1["user_id"], result_lvl_1["actual"]))

for k_n in k_list:
    users_lvl_1_pred_own['predictions'] = users_lvl_1_pred_own['user_id'].apply(
        lambda x: get_pred_own(recommender, user_train_list, item_list, x, num=k_n))

    users_lvl_1_pred_own["rec"] = [
        recall_at_k(predictions, list(actual_by_user[user_id]), k=k_n)
        for user_id, predictions in zip(users_lvl_1_pred_own["user_id"],
                                        users_lvl_1_pred_own["predictions"])
    ]
    recall_own_list.append(users_lvl_1_pred_own["rec"].mean())

recall_own_list

#### get_similar_items_recommendation

#### get_similar_users_recommendation

### Задание 2.

Обучите модель 2-ого уровня, при этом:
    - Добавьте минимум по 2 фичи для юзера, товара и пары юзер-товар
    - Измерьте отдельно precision@5 модели 1-ого уровня и двухуровневой модели на data_val_lvl_2
    - Вырос ли precision@5 при использовании двухуровневой модели?

In [24]:
# your_code

### Финальный проект

Мы уже прошли всю необходимую теорию для финального проекта. Проект осуществляется на данных из вебинара (данные считаны в начале ДЗ).
Рекомендуем вам **начать делать проект сразу после этого домашнего задания**
- Целевая метрика - precision@5. Порог для уcпешной сдачи проекта precision@5 > 0.27%
- Будет public тестовый датасет, на котором вы сможете измерять метрику
- Также будет private тестовый датасет для измерения финального качества
- НЕ обязательно, но крайне желательно использовать двухуровневые рекомендательные системы в проекте
- Вы сдаете код проекта в виде github репозитория и csv файл с рекомендациями 