In [1]:
import sys

import pandas as pd
import numpy as np
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Детерминированные алгоритмы
from implicit.nearest_neighbours import ItemItemRecommender

# Подключение сторонних скриптов
sys.path.append('../')
import metrics as m

In [2]:
def eval_item_item(df, knn = 5, n_th = 8):
    for nn in knn:
        model = ItemItemRecommender(K=nn, num_threads=n_th)  # K - кол-во ближайших соседей

        model.fit(csr_matrix(user_item_matrix).T.tocsr(),  # На вход item-user matrix
                  show_progress=True)

        df[f'itemitem k-{nn}'] = df['user_id'].apply(lambda x:
                                                     [id_to_item_id[rec[0]] for rec in
                                                        model.recommend(userid=user_id_to_id[x],
                                                                        user_items=sparse_user_item,
                                                                        N=5,
                                                                        filter_already_liked_items=True,
                                                                        filter_items=None,
                                                                        recalculate_user=True)])
    return df

In [3]:
data = pd.read_csv('../data/retail_train.csv')
data.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


In [4]:
result = pd.read_csv('predictions_basic.csv')
result.head()

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases
0,1,[ 821867 834484 856942 865456 889248 ...,"[5586238, 1015228, 866118, 2416733, 2603573]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 878996, 826249]","[981760, 1127831, 1098066, 826249, 878996]","[999999, 1082185, 1029743, 995785, 1004906]"
1,3,[ 835476 851057 872021 878302 879948 ...,"[161354, 63027, 1027802, 12263694, 307395]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 995242, 1029743, 840361, 961554]","[981760, 1004906, 961554, 1096036, 1080414]","[981760, 1004906, 859075, 1096036, 961554]","[999999, 1082185, 1098066, 6534178, 1127831]"
2,6,[ 920308 926804 946489 1006718 1017061 ...,"[13416054, 936084, 7410040, 9527114, 377218]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1098066, 826249, 1106523, 923746, 1058997]","[1098066, 826249, 860776, 854852, 1068719]","[1098066, 826249, 860776, 1068719, 916122]","[999999, 1082185, 1029743, 6534178, 1127831]"
3,7,[ 840386 889774 898068 909714 929067 ...,"[5574336, 990072, 868548, 995880, 842226]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1098066, 840361, 883404, 916122]","[981760, 1098066, 883404, 1004906, 859075]","[981760, 883404, 1098066, 859075, 916122]","[999999, 1082185, 1029743, 1127831, 995785]"
4,8,[ 835098 872137 910439 924610 992977 ...,"[1277401, 94446, 3133282, 1925252, 855699]","[6534178, 6533889, 1029743, 6534166, 1082185]","[904360, 13115903, 13189726, 13190294, 15596515]","[904360, 5588666, 1096036, 979707, 1013321]","[904360, 1096036, 5588666, 979707, 1013321]","[999999, 1082185, 1029743, 1098066, 6534178]"


In [5]:
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

### Задание 1. Weighted Random Recommendation

Напишите код для случайных рекоммендаций, в которых вероятность рекомендовать товар прямо пропорциональна логарифму продаж
- Можно сэмплировать товары случайно, но пропорционально какому-либо весу
- Например, прямопропорционально популярности. Вес = log(sales_sum товара)

In [6]:
def weighted_random_recommendation(items_weights, n=5):
    """Случайные рекоммендации
    
    Input
    -----
    items_weights: pd.DataFrame
        Датафрейм со столбцами item_id, weight. Сумма weight по всем товарам = 1
    """
    
    # Подсказка: необходимо модифицировать функцию random_recommendation()
    recs = np.random.choice(items_weights['item_id'].values, size=n, replace=False, p=items_weights['weight'].values)

    return recs.tolist()

In [7]:
%%time
items_weights = data_train.groupby('item_id')['sales_value'].sum().reset_index()
items_weights = items_weights[items_weights['sales_value'] > 1]
items_weights['weight']= np.log(items_weights['sales_value'] + 0.001)
items_weights['weight'] = items_weights['weight'] / items_weights['weight'].sum()
items_weights.drop('sales_value', axis = 1, inplace = True)

result['wrr'] = result['user_id'].apply(lambda x: weighted_random_recommendation(items_weights, n=5))

Wall time: 2.56 s


In [8]:
result.head(2)


Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,wrr
0,1,[ 821867 834484 856942 865456 889248 ...,"[5586238, 1015228, 866118, 2416733, 2603573]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 878996, 826249]","[981760, 1127831, 1098066, 826249, 878996]","[999999, 1082185, 1029743, 995785, 1004906]","[12599639, 946881, 909052, 8358386, 915190]"
1,3,[ 835476 851057 872021 878302 879948 ...,"[161354, 63027, 1027802, 12263694, 307395]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 995242, 1029743, 840361, 961554]","[981760, 1004906, 961554, 1096036, 1080414]","[981760, 1004906, 859075, 1096036, 961554]","[999999, 1082185, 1098066, 6534178, 1127831]","[15778522, 1179810, 1105615, 12267981, 8019419]"


### Задание 2. Расчет метрик
Рассчитайте Precision@5 для каждого алгоритма с помощью функции из вебинара 1. Какой алгоритм показывает лучшее качество?

In [9]:
cols = result.columns
for col in cols[2:]:
    result[f'precision_{col}'] = result['user_id'].apply(lambda x: m.precision_at_k(result[col].values, result['actual'].values))

In [10]:
result.head()

Unnamed: 0,user_id,actual,random_recommendation,popular_recommendation,itemitem,cosine,tfidf,own_purchases,wrr,precision_random_recommendation,precision_popular_recommendation,precision_itemitem,precision_cosine,precision_tfidf,precision_own_purchases,precision_wrr
0,1,[ 821867 834484 856942 865456 889248 ...,"[5586238, 1015228, 866118, 2416733, 2603573]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 878996, 826249]","[981760, 1127831, 1098066, 826249, 878996]","[999999, 1082185, 1029743, 995785, 1004906]","[12599639, 946881, 909052, 8358386, 915190]",0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3,[ 835476 851057 872021 878302 879948 ...,"[161354, 63027, 1027802, 12263694, 307395]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 995242, 1029743, 840361, 961554]","[981760, 1004906, 961554, 1096036, 1080414]","[981760, 1004906, 859075, 1096036, 961554]","[999999, 1082185, 1098066, 6534178, 1127831]","[15778522, 1179810, 1105615, 12267981, 8019419]",0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6,[ 920308 926804 946489 1006718 1017061 ...,"[13416054, 936084, 7410040, 9527114, 377218]","[6534178, 6533889, 1029743, 6534166, 1082185]","[1098066, 826249, 1106523, 923746, 1058997]","[1098066, 826249, 860776, 854852, 1068719]","[1098066, 826249, 860776, 1068719, 916122]","[999999, 1082185, 1029743, 6534178, 1127831]","[13190074, 7168895, 984941, 2594916, 13909957]",0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,7,[ 840386 889774 898068 909714 929067 ...,"[5574336, 990072, 868548, 995880, 842226]","[6534178, 6533889, 1029743, 6534166, 1082185]","[981760, 1098066, 840361, 883404, 916122]","[981760, 1098066, 883404, 1004906, 859075]","[981760, 883404, 1098066, 859075, 916122]","[999999, 1082185, 1029743, 1127831, 995785]","[13672357, 7168205, 14111292, 6443310, 876043]",0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8,[ 835098 872137 910439 924610 992977 ...,"[1277401, 94446, 3133282, 1925252, 855699]","[6534178, 6533889, 1029743, 6534166, 1082185]","[904360, 13115903, 13189726, 13190294, 15596515]","[904360, 5588666, 1096036, 979707, 1013321]","[904360, 1096036, 5588666, 979707, 1013321]","[999999, 1082185, 1029743, 1098066, 6534178]","[10356148, 1083813, 10121619, 868075, 15927205]",0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Задание 3*. Улучшение бейзлайнов и ItemItem

- Попробуйте улучшить бейзлайны, считая их на топ-5000 товаров
- Попробуйте улучшить разные варианты ItemItemRecommender, выбирая число соседей $K$.

In [11]:
popularity = data_train.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace = True)
top_5000 = popularity.sort_values('n_sold', ascending = False).head(5000).item_id.tolist()

# top_5000

In [12]:
data_train.item_id.max()
# максимальное значение в id 17829232

data_train.loc[~data_train['item_id'].isin(top_5000), 'item_id'] = 99999999

user_item_matrix = pd.pivot_table(data_train,
                                  index = 'user_id',
                                  columns = 'item_id',
                                  values = 'quantity',
                                  aggfunc = 'count',
                                  fill_value = 0)

user_item_matrix[user_item_matrix > 0] = 1
user_item_matrix = user_item_matrix.astype(float)

sparse_user_item = csr_matrix(user_item_matrix).tocsr()

user_item_matrix.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819227,...,15831255,15926712,15926775,15926844,15926886,15927403,15927661,15927850,16809471,99999999
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [13]:
user_id = user_item_matrix.index.values
items_id = user_item_matrix.columns.values

matrix_user_id = np.arange(len(user_id))
matrix_items_id = np.arange(len(items_id))

id_to_user_id = dict(zip(matrix_user_id, user_id))
id_to_item_id = dict(zip(matrix_items_id, items_id))

item_id_to_id = dict(zip(items_id, matrix_items_id))
user_id_to_id = dict(zip(user_id, matrix_user_id))

In [14]:
item_item_df = result.copy().drop(result.columns[2:], axis=1)
k_neighb = np.arange(1, 11, 1)

In [15]:
%%time

item_item_res = eval_item_item(item_item_df, k_neighb)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))











Wall time: 6.58 s


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5001.0), HTML(value='')))

In [16]:
item_item_res.head()

Unnamed: 0,user_id,actual,itemitem k-1,itemitem k-2,itemitem k-3,itemitem k-4,itemitem k-5,itemitem k-6,itemitem k-7,itemitem k-8,itemitem k-9,itemitem k-10
0,1,[ 821867 834484 856942 865456 889248 ...,"[866227, 995785, 1029743, 1110764, 99999999]","[10455984, 12172240, 12810393, 15926844, 12324...","[981760, 6534178, 12324948, 15926844, 12810393]","[981760, 1127831, 1098066, 854852, 866211]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 826249, 878996]","[981760, 1127831, 1098066, 826249, 6534178]","[981760, 1127831, 1098066, 826249, 6534178]","[981760, 1127831, 1098066, 826249, 6534178]"
1,3,[ 835476 851057 872021 878302 879948 ...,"[1023958, 8205731, 99999999]","[12810393, 13416351, 13945244, 15596279, 13842...","[981760, 13416351, 13842214, 13945244, 15596279]","[981760, 995242, 908531, 840361, 961554]","[981760, 995242, 1029743, 840361, 961554]","[981760, 995242, 840361, 1029743, 1004906]","[981760, 995242, 840361, 1029743, 961554]","[981760, 995242, 840361, 1029743, 961554]","[981760, 995242, 840361, 1029743, 1004906]","[981760, 995242, 840361, 1029743, 1004906]"
2,6,[ 920308 926804 946489 1006718 1017061 ...,"[942525, 995785, 1034176, 1046262, 1029743]","[12731436, 12731544, 13511722, 15926712, 13003...","[12731436, 12731544, 13511722, 15926712, 13003...","[1058997, 1098066, 916122, 860776, 1005186]","[1098066, 826249, 1106523, 923746, 1058997]","[1098066, 826249, 854852, 1106523, 923746]","[1098066, 826249, 854852, 860776, 923746]","[1098066, 826249, 854852, 860776, 923746]","[1098066, 826249, 854852, 860776, 923746]","[1098066, 826249, 854852, 860776, 923746]"
3,7,[ 840386 889774 898068 909714 929067 ...,"[995785, 1046262, 1029743, 99999999]","[12384775, 12731436, 13511722, 15511891, 12810...","[981760, 12731436, 12810393, 13511722, 15511891]","[981760, 1098066, 883404, 916122, 840361]","[981760, 1098066, 840361, 883404, 916122]","[981760, 1098066, 840361, 883404, 6534178]","[981760, 1098066, 840361, 883404, 6534178]","[981760, 1098066, 840361, 883404, 6534178]","[981760, 1098066, 840361, 883404, 6534178]","[981760, 1098066, 840361, 883404, 6534178]"
4,8,[ 835098 872137 910439 924610 992977 ...,"[935393, 995785, 1034176, 1046262, 1029743]","[12810393, 13115903, 13190294, 15596515, 13189...","[12810393, 13115903, 13190294, 15596515, 13189...","[904360, 13115903, 13189726, 13190294, 15596515]","[904360, 13115903, 13189726, 13190294, 15596515]","[904360, 866211, 979707, 13190294, 15596515]","[904360, 866211, 979707, 13190294, 15596515]","[904360, 866211, 1096036, 979707, 908531]","[904360, 866211, 1096036, 979707, 908531]","[904360, 866211, 1096036, 979707, 908531]"


In [17]:
for col in item_item_res.iloc[:,2:]:
    print(col +": ","{0:.4f}".format(item_item_res.apply(lambda row: m.precision_at_k(row[col], row['actual'], k=5), axis=1).mean()))

  mask |= (ar1 == a)


itemitem k-1:  0.0000
itemitem k-2:  0.0000
itemitem k-3:  0.0000
itemitem k-4:  0.0000
itemitem k-5:  0.0000
itemitem k-6:  0.0000
itemitem k-7:  0.0000
itemitem k-8:  0.0000
itemitem k-9:  0.0000
itemitem k-10:  0.0000
