# ДЗ 4

In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import ItemItemRecommender  # нужен для одного трюка
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

# Функции из 1-ого вебинара
import os, sys

module_path = os.path.abspath(os.path.join(os.pardir))
src_path = os.path.join(module_path, "src")
if module_path not in sys.path:
    sys.path.append(module_path)
    sys.path.append(src_path)
    
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items

In [21]:
from src.recommenders import MainRecommender

In [22]:
data = pd.read_csv('./sem_2/retail_train.csv')   # '../data/transaction_data.csv'

data.columns = [col.lower() for col in data.columns]
data.rename(columns={'household_key': 'user_id',
                     'product_id': 'item_id'},
            inplace=True)


test_size_weeks = 3

data_train = data[data['week_no'] <  data['week_no'].max() - test_size_weeks]
data_test  = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [23]:
import warnings
warnings.filterwarnings("ignore")

In [24]:
recommender = MainRecommender(data_train)



---

### get_similar_items_recommendation

In [6]:
popularity = recommender.prep_data.loc[recommender.prep_data["user_id"]==2222, ['item_id', 'quantity']].\
                groupby(['item_id'])['quantity'].count().reset_index()
popularity.sort_values('quantity', ascending=False, inplace=True)
top_item_list = popularity.loc[popularity["item_id"]!=999999, "item_id"][:5].tolist()
top_item_list

[5569230, 995242, 870780, 1101010, 951590]

In [7]:
for item in top_item_list:
    print(recommender.id_to_itemid[recommender.model.similar_items(recommender.itemid_to_id[item], N=1)[0][0]])

5569230
995242
870780
1101010
951590


In [8]:
rec = set()

In [9]:
for item in top_item_list:
    rec.add(recommender.id_to_itemid[recommender.model.similar_items(recommender.itemid_to_id[item], N=1)[0][0]])

In [10]:
rec

{870780, 951590, 995242, 1101010, 5569230}

In [11]:
recommender.get_similar_items_recommendation(user=2222)

{870780, 951590, 995242, 1101010, 5569230}

---

### get_similar_users_recommendation

In [23]:
user_id = 2425

In [24]:
sim_users_list = [recommender.id_to_userid[usr] for usr, p in recommender.model.similar_users(recommender.userid_to_id[user_id], 
                                                                                              N=5)]
sim_users_list

[2425, 817, 1484, 1676, 409]

In [25]:
sim_users_list = [recommender.id_to_userid[usr] for usr, p in recommender.model.similar_users(recommender.userid_to_id[user_id], 
                                                                                              N=5+1)[1:5+1]]
sim_users_list

[817, 1484, 1676, 409, 1078]

In [14]:
rec = set()

In [15]:
for user in sim_users_list:
    rec.add(list(recommender.get_recommendations(user, rec_num=1))[0])

In [16]:
rec

{1041259, 1058997, 1082185, 1126899}

In [17]:
recommender.get_similar_users_recommendation(user_id)

{1041259, 1058997, 1082185, 1126899, 1127831}

### Фильтрация

In [25]:
# Оставим только 5000 самых популярных товаров
popularity = data.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
top_5000 = popularity.sort_values('n_sold', ascending=False).head(5000).item_id.tolist()
#добавим, чтобы не потерять юзеров
data.loc[~data['item_id'].isin(top_5000), 'item_id'] = 999999

In [26]:
popularity.sort_values('n_sold', ascending=False).head(7)

Unnamed: 0,item_id,n_sold
56233,6534178,199684264
56193,6533889,16911359
56228,6534166,12946508
56341,6544236,2578976
44111,1404121,1645146
3460,397896,1246340
44331,1426702,453293


In [27]:
item_features = pd.read_csv('./sem_2/product.csv')   # '../data/product.csv'
item_features.columns = [col.lower() for col in item_features.columns]
item_features.rename(columns={'product_id': 'item_id'}, inplace=True)

item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


#### Фильтруем 50 самых популярных

In [28]:
top_5000[:7]

[6534178, 6533889, 6534166, 6544236, 1404121, 397896, 1426702]

In [29]:
item_features.loc[item_features['item_id'].isin(top_5000[:50])]

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
2820,202291,69,MISC SALES TRAN,Private,COUPON/MISC ITEMS,GASOLINE-REG UNLEADED,
3565,397896,69,KIOSK-GAS,Private,COUPON/MISC ITEMS,GASOLINE-REG UNLEADED,
3738,420647,69,MISC SALES TRAN,Private,COUPON/MISC ITEMS,GASOLINE-REG UNLEADED,
4058,480014,69,KIOSK-GAS,Private,COUPON/MISC ITEMS,GASOLINE-REG UNLEADED,
4416,545926,69,MISC SALES TRAN,Private,COUPON/MISC ITEMS,GASOLINE-REG UNLEADED,
5246,707683,69,MISC SALES TRAN,Private,COUPON/MISC ITEMS,GASOLINE-REG UNLEADED,
5358,731106,69,KIOSK-GAS,Private,FUEL,GASOLINE-REG UNLEADED,
6677,826249,69,GROCERY,Private,BAKED BREAD/BUNS/ROLLS,HAMBURGER BUNS,12 OZ
8281,840361,69,GROCERY,Private,EGGS,EGGS - LARGE,1 DZ
10630,860776,2,PRODUCE,National,VEGETABLES - ALL OTHERS,CUCUMBERS,36 CT


In [30]:
data.loc[data['item_id'].isin(top_5000[:50]), 'item_id'] = 999999

#### Фильтруем самые непопулярные

In [31]:
popularity.sort_values('n_sold', ascending=False).tail(7)

Unnamed: 0,item_id,n_sold
19729,943787,0
26977,1009825,0
56196,6533903,0
37484,1104446,0
39415,1122192,0
77273,12731302,0
12412,878926,0


In [32]:
popularity.loc[popularity['n_sold']==0, "n_sold"].count()

308

In [33]:
popularity_pu = data_train.groupby('item_id')['user_id'].nunique().reset_index() / data_train['user_id'].nunique()
popularity_pu.rename(columns={'user_id': 'share_unique_users'}, inplace=True)

In [45]:
popularity_pu = data_train.groupby('item_id')['user_id'].nunique().reset_index()
popularity_pu.rename(columns={'user_id': 'share_unique_users'}, inplace=True)
popularity_pu['share_unique_users'] = popularity_pu['share_unique_users'] / data_train['user_id'].nunique()
popularity_pu.sort_values('share_unique_users', ascending=False, inplace=True)
popularity_pu.head()

Unnamed: 0,item_id,share_unique_users
2381,999999,0.9996
3408,1082185,0.806323
2148,981760,0.623449
2307,995242,0.561024
2757,1029743,0.527011


In [46]:
popularity_pu.tail()

Unnamed: 0,item_id,share_unique_users
4110,2848087,0.0004
4,545926,0.0004
4256,5747233,0.0004
4259,5850988,0.0004
4258,5845857,0.0004


In [48]:
popularity_pu[popularity_pu['share_unique_users'] < 0.01]

Unnamed: 0,item_id,share_unique_users
822,878442,0.009604
836,879393,0.009604
4794,10204735,0.009604
2647,1020729,0.009604
2665,1022066,0.009604
...,...,...
4110,2848087,0.000400
4,545926,0.000400
4256,5747233,0.000400
4259,5850988,0.000400


In [52]:
unpop_list = popularity_pu[popularity_pu['share_unique_users'] < 0.01].item_id.tolist()
unpop_list[:7], unpop_list[-7:]

([878442, 879393, 10204735, 1020729, 1022066, 12188126, 911017],
 [1388206, 2690723, 2848087, 545926, 5747233, 5850988, 5845857])

In [53]:
data.loc[data['item_id'].isin(unpop_list), 'item_id'] = 999999