## Курсовой проект

**Основное**
- Дедлайн - 21 июня 23:59
- Целевая метрика — precision@5. Порог для успешной сдачи проекта: precision@5 > 25%
- Бейзлайн решения - [MainRecommender](https://github.com/geangohn/recsys-tutorial/blob/master/src/recommenders.py)
- Сдаем ссылку на github с решением. На github должен быть файл recommendations.csv (user_id | [rec_1, rec_2, ...]) с рекомендациями. rec_i - реальные id item-ов (из retail_train.csv)

- Будет public тестовый датасет, на котором вы сможете измерять метрику
- Также будет private тестовый датасет для измерения финального качества
- НЕ обязательно, но крайне желательно использовать двухуровневые рекомендательные системы в проекте

**Hints:** 

Сначала просто попробуйте разные параметры MainRecommender:  
- N в топ-N товарах при формировании user-item матрицы (сейчас топ-5000)  
- Различные веса в user-item матрице (0/1, кол-во покупок, log(кол-во покупок + 1), сумма покупки, ...)  
- Разные взвешивания матрицы (TF-IDF, BM25 - у него есть параметры)  
- Разные смешивания рекомендаций (обратите внимание на бейзлайн - прошлые покупки юзера)  

Сделайте MVP - минимально рабочий продукт - (пусть даже top-popular), а потом его улучшайте

Если вы делаете двухуровневую модель - следите за валидацией 

In [128]:
%matplotlib inline

import importlib

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix
from implicit import als
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

import src

In [125]:
# Directory containing the gzipped input files read below.
DATAROOT = './data/'

# Transactions and the test file are read with gzip compression stated explicitly.
data = pd.read_csv(f'{DATAROOT}transactions.csv.gz', compression='gzip')
test = pd.read_csv(f'{DATAROOT}test.csv.gz', compression='gzip')

# Product table: lower-case every column label, then expose the key as `item_id`.
items = (
    pd.read_csv(f'{DATAROOT}product.csv.gz')
    .rename(columns=str.lower)
    .rename(columns={'product_id': 'item_id'})
)

# Demographic table: same header normalisation, key exposed as `user_id`.
users = (
    pd.read_csv(f'{DATAROOT}demographic.csv.gz')
    .rename(columns=str.lower)
    .rename(columns={'household_key': 'user_id'})
)

## work

In [787]:
# Re-execute the project modules so edits made on disk are picked up without
# restarting the kernel; the names are imported afterwards so they are bound
# to the freshly reloaded definitions.
for _module in (src.utils, src.recommenders, src.metrics):
    importlib.reload(_module)

from src.metrics import total_precision_at_N
from src.utils import (
    prefilter_items,
    prefilter_items2,
    postfilter_items,
    split_train_val,
)
from src.recommenders import (
    RecommenderDataset,
    BaseRecommender,
    OwnRecommender,
    AlsRecommender,
)

In [788]:
# Reload the helpers first so the calls below use the current src.utils code.
importlib.reload(src.utils)
from src.utils import (
    prefilter_items,
    prefilter_items2,
    postfilter_items,
    split_train_val,
)

# Split into train / validate1 / validate2.
train, val1, val2 = split_train_val(data, 6, 3)

# Filter out unneeded items and keep only 5000 (+1 for everything else).
was = train['item_id'].nunique()
train = prefilter_items(
    train,
    price=(None, None),
    popular=(None, None),
    products=items,
    top_n=5000,
)

print(f'Decreased amount of items from {was} to {train.item_id.nunique()}')

Decreased amount of items from 83685 to 5001


In [789]:
# Wrap the prefiltered train frame in the project's dataset object; the
# recommenders in the following cell are constructed from it.
ds = RecommenderDataset(train)

In [790]:
# Build and fit one instance of each recommender on the same dataset; the
# fitted objects are read as module-level names by `estimate`.
# NOTE(review): `als` rebinds the name introduced by `from implicit import als`
# in the import cell, so the implicit module is no longer reachable under
# that name once this line has run.
base = BaseRecommender(ds).fit()
own = OwnRecommender(ds).fit()
als = AlsRecommender(ds).fit()

def estimate(res, N=5):
    """Score every fitted recommender with precision@N and summarise the result.

    Reads the module-level recommenders `base`, `own` and `als`.

    Args:
        res: frame with a `user_id` column. It is modified in place: one
            `recommend_*` column is added per recommender variant.
            Presumably it must also carry the `actual` column built by the
            calling cells - verify against `total_precision_at_N`.
        N: number of recommendations requested per user.

    Returns:
        The transposed `describe()` statistics of the precision frame, with
        the `user_id` column and the `count` statistic dropped, sorted by
        `mean` in descending order.
    """
    # (target column, recommender, extra keyword arguments), in evaluation order.
    variants = (
        ('recommend_base', base, {}),
        ('recommend_own', own, {}),
        ('recommend_als', als, {}),
        ('recommend_als_similar_items', als, {'by': 'similarItems'}),
        ('recommend_als_similar_users', als, {'by': 'similarUsers'}),
    )
    for column, model, extra in variants:
        res[column] = model.recommend(res.user_id, N, **extra)

    stats = total_precision_at_N(res, N).describe()
    stats = stats.drop('user_id', axis=1).T
    return stats.drop('count', axis=1).sort_values('mean', ascending=False)

HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




#### подсчет моделей на Train

In [791]:
# One row per user with the array of distinct train items as `actual`,
# then score the recommenders against it.
res = (
    train.groupby('user_id')['item_id']
    .unique()
    .reset_index(name='actual')
)
estimate(res)

Unnamed: 0,mean,std,min,25%,50%,75%,max
precision_base,0.985829,0.110951,0.0,1.0,1.0,1.0,1.0
precision_als_similar_users,0.546117,0.254072,0.0,0.4,0.6,0.8,1.0
precision_als_similar_items,0.411689,0.277568,0.0,0.2,0.4,0.6,1.0
precision_als,0.256845,0.273344,0.0,0.0,0.2,0.4,1.0
precision_own,0.038191,0.090733,0.0,0.0,0.0,0.0,0.6


#### валидация моделей на Val-1

In [792]:
# One row per user with the array of distinct validation-1 items as `actual`,
# then score the recommenders against it.
res = (
    val1.groupby('user_id')['item_id']
    .unique()
    .reset_index(name='actual')
)
estimate(res)

Unnamed: 0,mean,std,min,25%,50%,75%,max
precision_base,0.396657,0.321787,0.0,0.2,0.4,0.6,1.0
precision_als_similar_users,0.22507,0.200147,0.0,0.0,0.2,0.4,1.0
precision_als_similar_items,0.10585,0.159694,0.0,0.0,0.0,0.2,0.8
precision_als,0.067967,0.13433,0.0,0.0,0.0,0.2,1.0
precision_own,0.006407,0.037771,0.0,0.0,0.0,0.0,0.6


In [771]:
# Split into train / validate1 / validate2.
# NOTE(review): 6 and 3 are presumably the sizes of the two validation
# windows - confirm against split_train_val in src.utils.
train, val1, val2 = split_train_val(data, 6, 3)

In [786]:
# Reload the helpers first so the calls below use the current src.utils code.
importlib.reload(src.utils)
from src.utils import (
    prefilter_items,
    prefilter_items2,
    postfilter_items,
    split_train_val,
)

print('prefilter')
df = prefilter_items(
    train,
    price=(None, None),
    popular=(None, None),
    products=items,
    top_n=5000,
)
# df = prefilter_items2(train)

# Per-item summary of the filtered frame, highest mean weight first.
print(
    df.groupby('item_id')
    .agg({'user_id': 'nunique', 'price': 'median', 'quantity': 'count', 'weight': 'mean'})
    .reset_index()
    .sort_values('weight', ascending=False)
)


print('dataset')
# Build the dataset object from the prefiltered frame; the commented line
# below is an alternative construction kept for experiments.
ds = RecommenderDataset(df)
# ds = RecommenderDataset(df, values='quantity', aggfunc='count')

print('fit')
# Fit one instance of each recommender on the same dataset.
# NOTE(review): `own` is fitted here although the `estimate` defined in this
# cell has its `own` line commented out. `als` also rebinds the name
# introduced by `from implicit import als` in the import cell.
base = BaseRecommender(ds).fit()
own = OwnRecommender(ds).fit()
als = AlsRecommender(ds).fit()

def estimate(res, N=5):
    """Score the enabled recommender variants with precision@N and summarise.

    Reads the module-level recommenders `base` and `als`. Only the baseline
    and the ALS similar-users variant are enabled; the other variants are
    kept as commented-out entries so they can be switched back on.

    Args:
        res: frame with a `user_id` column. It is modified in place: one
            `recommend_*` column is added per enabled variant. Presumably it
            must also carry the `actual` column built by the calling cells -
            verify against `total_precision_at_N`.
        N: number of recommendations requested per user.

    Returns:
        The transposed `describe()` statistics of the precision frame, with
        the `user_id` column and the `count` statistic dropped, sorted by
        `mean` in descending order.
    """
    # (target column, recommender, extra keyword arguments), in evaluation order.
    variants = (
        ('recommend_base', base, {}),
        # ('recommend_own', own, {}),
        # ('recommend_als', als, {}),
        # ('recommend_als_similar_items', als, {'by': 'similarItems'}),
        ('recommend_als_similar_users', als, {'by': 'similarUsers'}),
    )
    for column, model, extra in variants:
        res[column] = model.recommend(res.user_id, N, **extra)

    stats = total_precision_at_N(res, N).describe()
    stats = stats.drop('user_id', axis=1).T
    return stats.drop('count', axis=1).sort_values('mean', ascending=False)

print('predict')
# One row per user with the array of distinct validation-1 items as `actual`,
# then score the recommenders against it.
res = (
    val1.groupby('user_id')['item_id']
    .unique()
    .reset_index(name='actual')
)
estimate(res)

prefilter
      item_id  user_id  price  quantity     weight
2209   995242     1361   1.00     10226  36.126671
2633  1029743     1285   2.49     11661  33.128850
3418  1098066     1255   0.99      4456  30.008918
90     826249     1236   0.99      4764  29.582790
271    840361     1234   1.00      4179  29.167029
...       ...      ...    ...       ...        ...
1930   970890       45   0.62        65   1.000896
3922  5565719       44   2.00        62   1.000310
906    889989       43   2.00        88   1.000137
2579  1024875       44   1.75        71   1.000000
4652  9999999     2497   2.00   1060108   0.000000

[5001 rows x 5 columns]
dataset
fit


HBox(children=(FloatProgress(value=0.0, max=5001.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))


predict


Unnamed: 0,mean,std,min,25%,50%,75%,max
precision_base,0.396657,0.321787,0.0,0.2,0.4,0.6,1.0
precision_als_similar_users,0.22507,0.200147,0.0,0.0,0.2,0.4,1.0


In [576]:
# One row per user with the array of distinct validation-1 items as `actual`,
# then score the recommenders against it.
res = (
    val1.groupby('user_id')['item_id']
    .unique()
    .reset_index(name='actual')
)
estimate(res)

Unnamed: 0,mean,std,min,25%,50%,75%,max
precision_base,0.353668,0.309965,0.0,0.0,0.4,0.6,1.0
precision_als_similar_users,0.169359,0.181619,0.0,0.0,0.2,0.2,1.0
precision_own,0.118663,0.171426,0.0,0.0,0.0,0.2,0.8
precision_als_similar_items,0.1,0.156156,0.0,0.0,0.0,0.2,1.0
precision_als,0.021541,0.067469,0.0,0.0,0.0,0.0,0.6
