# Import libs

In [840]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from metrics import precision_at_k, recall_at_k
from utils import prefilter_items
from recommenders import MainRecommender

# Read data

In [788]:
data = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')
df_test = pd.read_csv('retail_test1.csv')

# Set global const

In [789]:
ITEM_COL = 'item_id'
USER_COL = 'user_id'
ACTUAL_COL = 'actual'
N_PREDICT =50
TOPK_RECALL = 50
TOPK_PRECISION = 5

# Process features dataset

In [790]:
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': ITEM_COL}, inplace=True)
user_features.rename(columns={'household_key': USER_COL }, inplace=True)

# Split dataset for train, eval, test

In [791]:
VAL_MATCHER_WEEKS = 10
VAL_RANKER_WEEKS = 2

In [792]:
# берем данные для тренировки matching модели
data_train_matcher = data[data['week_no'] < data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)]

# берем данные для валидации matching модели
data_val_matcher = data[(data['week_no'] >= data['week_no'].max() - (VAL_MATCHER_WEEKS + VAL_RANKER_WEEKS)) &
                      (data['week_no'] < data['week_no'].max() - (VAL_RANKER_WEEKS))]

# берем данные для тренировки ranking модели
data_train_ranker = data_val_matcher.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться

# берем данные для теста ranking, matching модели
data_val_ranker = data[data['week_no'] >= data['week_no'].max() - VAL_RANKER_WEEKS]



In [793]:
df_join_train_matcher = pd.concat([data_train_matcher, data_val_matcher])

# Prefilter items

In [794]:
n_items_before = data_train_matcher['item_id'].nunique()

data_train_matcher = prefilter_items(data_train_matcher, item_features=item_features, take_n_popular=100)

n_items_after = data_train_matcher['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 82059 to 101


# Make cold-start to warm-start

In [795]:
common_users = list(set(data_train_matcher.user_id.values)&(set(data_val_matcher.user_id.values))&set(data_val_ranker.user_id.values)&set(df_test.user_id.values))

data_train_matcher = data_train_matcher[data_train_matcher.user_id.isin(common_users)]
data_val_matcher = data_val_matcher[data_val_matcher.user_id.isin(common_users)]
data_train_ranker = data_train_ranker[data_train_ranker.user_id.isin(common_users)]
data_val_ranker = data_val_ranker[data_val_ranker.user_id.isin(common_users)]

# Init/train recommender

In [796]:
recommender = MainRecommender(data_train_matcher)

100%|██████████| 15/15 [00:00<00:00, 107.60it/s]
100%|██████████| 101/101 [00:00<00:00, 38786.37it/s]


# Train evaluation

In [797]:
result_eval_matcher = data_val_matcher.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_matcher.columns=[USER_COL, ACTUAL_COL]

In [798]:
result_eval_matcher['own_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))
result_eval_matcher['sim_item_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_similar_items_recommendation(x, N=N_PREDICT))
result_eval_matcher['als_rec'] = result_eval_matcher[USER_COL].apply(lambda x: recommender.get_als_recommendations(x, N=N_PREDICT))

In [799]:
def calc_precision(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: precision_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

In [800]:
def calc_recall(df_data, top_k):
    for col_name in df_data.columns[2:]:
        yield col_name, df_data.apply(lambda row: recall_at_k(row[col_name], row[ACTUAL_COL], k=top_k), axis=1).mean()

In [801]:
sorted(calc_recall(result_eval_matcher, TOPK_RECALL), key=lambda x: x[1],reverse=True)

[('own_rec', 0.041199012782315185),
 ('als_rec', 0.04092736277832301),
 ('sim_item_rec', 0.03833348243070481)]

In [803]:
sorted(calc_precision(result_eval_matcher, TOPK_PRECISION), key=lambda x: x[1],reverse=True)

[('als_rec', 0.26033728919425053),
 ('own_rec', 0.23947532792004747),
 ('sim_item_rec', 0.140911930043722)]

baseline MainRecommender со стандартными значениями был

('own_rec', 0.1462140992167092)

('reranked_own_rec', 0.1311749347258475)

# Test evaluation

In [804]:
df_test = df_test.loc[df_test['user_id'].isin(common_users)]
df_transactions = pd.read_csv('retail_train.csv')
result_test = df_test.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_test.columns=[USER_COL, ACTUAL_COL]

In [805]:
result_test['own_rec'] = result_test[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))
result_test['sim_item_rec'] = result_test[USER_COL].apply(lambda x: recommender.get_similar_items_recommendation(x, N=N_PREDICT))
result_test['als_rec'] = result_test[USER_COL].apply(lambda x: recommender.get_als_recommendations(x, N=N_PREDICT))

print(*sorted(calc_precision(result_test, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('als_rec', 0.13191755153029297)
('own_rec', 0.11755153029356616)
('sim_item_rec', 0.07008119925046899)


In [806]:
df_match_candidates = pd.DataFrame(data_train_ranker[USER_COL].unique())
df_match_candidates.columns = [USER_COL]

In [807]:
# собираем кандитатов с первого этапа (matcher)
df_match_candidates['candidates'] = df_match_candidates[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))

In [808]:
df_items = df_match_candidates.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
df_items.name = 'item_id'

In [809]:
df_match_candidates = df_match_candidates.drop('candidates', axis=1).join(df_items)

In [810]:
df_ranker_train = data_train_ranker[[USER_COL, ITEM_COL]].copy()
df_ranker_train['target'] = 1  # тут только покупки 

df_ranker_train = df_match_candidates.merge(df_ranker_train, on=[USER_COL, ITEM_COL], how='left')

df_ranker_train['target'].fillna(0, inplace= True)

In [811]:
df_ranker_train = df_ranker_train.merge(item_features, on='item_id', how='left')
df_ranker_train = df_ranker_train.merge(user_features, on='user_id', how='left')


In [812]:
# средний чек
df_ranker_train = df_ranker_train.merge(data[['user_id', 'sales_value']].groupby(['user_id',]).mean(), on='user_id', copy='False')


# количество купленных товаров за год в пересчете на месяц
df_ranker_train = df_ranker_train.merge(data[['week_no', 'basket_id', 'user_id', 'item_id', 'quantity']], on=['user_id', 'item_id'])
df_ranker_train['items_per_month'] = df_ranker_train['item_id'].map(df_ranker_train.loc[df_ranker_train['week_no'] > df_ranker_train['week_no'].max()-52].groupby('item_id')['quantity'].agg('sum').to_dict())
df_ranker_train['items_per_month'] = df_ranker_train['items_per_month'] / 4.33


# колчество покупок товаров в перерасчете на неделю с начала отсчета
df_ranker_train['items_sold_per_week'] = df_ranker_train['item_id'].map((df_ranker_train.groupby(['item_id'])['quantity'].sum()/df_ranker_train['week_no'].max()).to_dict())


# # средняя цена товара в категории
df_ranker_train['avg_price'] = df_ranker_train['department'].map(df_ranker_train.groupby(['department'])['sales_value'].agg('mean').to_dict())


# количество продаж по категориям в перерасчете на неделю с начала отсчета
df_ranker_train['cat_sold_per_week'] = df_ranker_train['department'].map((df_ranker_train.groupby(['department'])['quantity'].sum()/df_ranker_train['week_no'].max()).to_dict())

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('sales_value').sum().rename('total_item_sales_value'), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('total_quantity_value'), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq'), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq'), how='left',on=USER_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('sales_value').sum().rename('total_user_sales_value'), how='left',on=USER_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_week')/df_join_train_matcher.week_no.nunique(), how='left',on=USER_COL)


df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg('quantity').sum().rename('item_quantity_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg('quantity').sum().rename('user_quantity_per_baskter')/df_join_train_matcher.basket_id.nunique(), how='left',on=USER_COL)


df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=ITEM_COL).agg(USER_COL).count().rename('item_freq_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=ITEM_COL)

df_ranker_train = df_ranker_train.merge(df_join_train_matcher.groupby(by=USER_COL).agg(USER_COL).count().rename('user_freq_per_basket')/df_join_train_matcher.basket_id.nunique(), how='left',on=USER_COL)


In [813]:
df_ranker_train.head()

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,...,total_quantity_value,item_freq,user_freq,total_user_sales_value,item_quantity_per_week,user_quantity_per_week,item_quantity_per_basket,user_quantity_per_baskter,item_freq_per_basket,user_freq_per_basket
0,790,989101,0.0,69,GROCERY,Private,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,GAL,45-54,...,2000,1097,818,2484.88,21.73913,240.23913,0.008046,0.088916,0.004413,0.003291
1,790,989101,0.0,69,GROCERY,Private,WATER - CARBONATED/FLVRD DRINK,NON-CRBNTD DRNKING/MNERAL WATE,GAL,45-54,...,2000,1097,818,2484.88,21.73913,240.23913,0.008046,0.088916,0.004413,0.003291
2,790,848268,0.0,1251,GROCERY,National,MEAT - SHELF STABLE,PASTA: CANNED,15 OZ,45-54,...,459,239,818,2484.88,4.98913,240.23913,0.001847,0.088916,0.000961,0.003291
3,790,947068,0.0,1251,GROCERY,National,MEAT - SHELF STABLE,PASTA: CANNED,14.75 OZ,45-54,...,689,308,818,2484.88,7.48913,240.23913,0.002772,0.088916,0.001239,0.003291
4,790,1050229,0.0,1251,GROCERY,National,SOUP,CONDENSED SOUP,10.7OZ,45-54,...,2459,1086,818,2484.88,26.728261,240.23913,0.009893,0.088916,0.004369,0.003291


In [814]:
df_ranker_train = pd.get_dummies(df_ranker_train)

In [841]:
X_train, X_test, y_train, y_test = train_test_split(df_ranker_train.drop('target', axis=1), df_ranker_train['target'], test_size=0.33, random_state=42)

In [815]:
# X_train = df_ranker_train.drop('target', axis=1)
# y_train = df_ranker_train['target']

In [816]:
from sklearn.ensemble import RandomForestClassifier

In [826]:
clf = RandomForestClassifier(criterion='entropy', 
                            random_state=42,
                            )

In [842]:
clf.fit(X_train, y_train)

train_preds = clf.predict_proba(X_test)

In [843]:
df_ranker_predict = X_test.copy()
df_ranker_predict['proba_item_purchase'] = train_preds[:,1]

In [844]:
result_eval_ranker = data_val_ranker.groupby(USER_COL)[ITEM_COL].unique().reset_index()
result_eval_ranker.columns=[USER_COL, ACTUAL_COL]

In [845]:
result_eval_ranker['own_rec'] = result_eval_ranker[USER_COL].apply(lambda x: recommender.get_own_recommendations(x, N=N_PREDICT))
result_eval_ranker['sim_item_rec'] = result_eval_ranker[USER_COL].apply(lambda x: recommender.get_similar_items_recommendation(x, N=N_PREDICT))
result_eval_ranker['als_rec'] = result_eval_ranker[USER_COL].apply(lambda x: recommender.get_als_recommendations(x, N=N_PREDICT))

In [846]:
def rerank(user_id):
    return df_ranker_predict[df_ranker_predict[USER_COL]==user_id].sort_values('proba_item_purchase', ascending=False).head(5).item_id.tolist()

In [847]:
result_eval_ranker['reranked_own_rec'] = result_eval_ranker[USER_COL].apply(lambda user_id: rerank(user_id))

In [848]:
print(*sorted(calc_precision(result_eval_ranker, TOPK_PRECISION), key=lambda x: x[1], reverse=True), sep='\n')

('reranked_own_rec', 0.15259987317691706)
('als_rec', 0.13628981886321007)
('own_rec', 0.12104934415989936)
('sim_item_rec', 0.07282948157401678)


  return flags.sum() / len(recommended_list)


Лучшим методом получился als, с помощью ранжирования получилось незначительно увеличить метрику на валидационной выборке.