# Двухуровневая модель рекомендаций товаров для пользователя


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm
tqdm.pandas()
from sklearn.model_selection import train_test_split

# Для работы с матрицами
from scipy.sparse import csr_matrix


# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, money_precision_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender
from src.preprocessing import new_features, train_test_preprocessing, get_important_features, get_final_recomendation

  from pandas import Panel


In [2]:
pd.set_option('display.max_columns', None)

In [3]:
data = pd.read_csv('../data/retail_train.csv')
item_features = pd.read_csv('../data/product.csv')
user_features = pd.read_csv('../data/hh_demographic.csv')
TEST_1_PATH = '../data/retail_test1.csv'

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)
test_1 = pd.read_csv(TEST_1_PATH)

N=150 # Количество рекомендаций

VAL_SIZE = 5

train_1 = data[data['week_no'] < data['week_no'].max() - (VAL_SIZE)]
val = data[data['week_no'] >= data['week_no'].max() - (VAL_SIZE)]

train_2 = val.copy()

In [4]:
n_items_before = train_1['item_id'].nunique()
train_1 = prefilter_items(train_1, item_features=item_features, take_n_popular= 3000)
n_items_after = train_1['item_id'].nunique()

print(f'Decreased # items from {n_items_before} to {n_items_after}')

Decreased # items from 85828 to 3001


In [5]:
recommender = MainRecommender(train_1)



HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3001.0), HTML(value='')))




In [6]:
items_emb_df = recommender.items_emb_df
users_emb_df = recommender.users_emb_df

In [7]:
%%time
train = new_features(train_2, train_1, recommender, item_features, user_features, items_emb_df, users_emb_df, N)
train.head(2)

Wall time: 17.1 s


Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc_x,coupon_match_disc,price,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,8_x,9_x,10_x,11_x,12_x,13_x,14_x,15_x,16_x,17_x,18_x,19_x,coupon_disc_y,quantity_of_sales,quantity_of_sales_per_week,qnt_of_sales_per_dep,qnt_of_sales_per_item_per_dep_per_week,qnt_of_sales_per_sub_commodity_desc,qnt_of_sales_per_item_per_sub_commodity_desc_per_week,marital_status_code,homeowner_desc,hh_comp_desc,household_size_desc,0_y,1_y,2_y,3_y,4_y,5_y,6_y,7_y,8_y,9_y,10_y,11_y,12_y,13_y,14_y,15_y,16_y,17_y,18_y,19_y,mean_time,age,income,kids,average_basket,sum_per_week,count_purchases_week_mean,sum_purchases_week_mean,target
0,843,40955282722,622,845193,3,5.37,364,-1.5,19,90,0.0,0.0,1.79,999999999,GROCERY,National,FROZEN PIZZA,SNACKS/APPETIZERS,10.5 OZ,-0.001092,0.007983,0.012715,0.00443,0.001487,0.015562,0.010285,0.00271,0.005241,0.001169,-0.001434,0.007415,0.004361,0.00396,-0.003372,-0.011868,-0.00395,0.005651,0.016036,0.009001,0.0,52,8.666667,112255,0.137313,893,0.146202,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002468,0.013529,1.0
1,843,40955282722,622,845193,3,5.37,364,-1.5,19,90,0.0,0.0,1.79,999999999,GROCERY,National,FROZEN PIZZA,SNACKS/APPETIZERS,10.5 OZ,-0.001092,0.007983,0.012715,0.00443,0.001487,0.015562,0.010285,0.00271,0.005241,0.001169,-0.001434,0.007415,0.004361,0.00396,-0.003372,-0.011868,-0.00395,0.005651,0.016036,0.009001,0.0,52,8.666667,112255,0.137313,893,0.146202,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002468,0.013529,1.0


In [8]:
X_train = train.drop(['target'], axis=1)
y_train = train[['target']]

In [9]:
cat_feats=[]
for y in X_train.columns:
    if(X_train[y].dtype == np.object):
          cat_feats.append(y)
            
X_train[cat_feats + ['user_id', 'item_id']] = X_train[cat_feats + ['user_id', 'item_id']].astype('category')

In [10]:
test = new_features(data, train_1, recommender, item_features, user_features, items_emb_df, users_emb_df, N)
X_test = test.drop(['target'], axis=1)
y_test = test[['target']]
X_test[cat_feats + ['user_id', 'item_id']] = X_test[cat_feats + ['user_id', 'item_id']].astype('category')

In [11]:
%%time
lgb = LGBMClassifier(objective='binary', max_depth=7, categorical_column=cat_feats)
basic_feats = get_important_features(lgb, X_train, y_train)

  return f(**kwargs)


Wall time: 3.38 s


In [12]:
%%time
lgb = LGBMClassifier(
    objective='binary',
    max_depth=7,
    categorical_feature=cat_feats
)
lgb.fit(X_train[basic_feats], y_train)

Please use categorical_feature argument of the Dataset constructor to pass this parameter.
  .format(key))


Wall time: 3.12 s


LGBMClassifier(categorical_feature=['department', 'brand', 'commodity_desc',
                                    'sub_commodity_desc',
                                    'curr_size_of_product',
                                    'marital_status_code', 'homeowner_desc',
                                    'hh_comp_desc', 'household_size_desc'],
               max_depth=7, objective='binary')

In [13]:
%%time
preds = lgb.predict(X_test[basic_feats])
test_preds_proba = lgb.predict_proba(X_test[basic_feats])[:, 1]

Wall time: 55 s


In [14]:
result_train = get_final_recomendation(X_test, test_preds_proba, data, train_1, item_features)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2499/2499 [14:57<00:00,  2.78it/s]


In [15]:
df_price = train_1.groupby('item_id')['price'].mean().reset_index()

In [16]:
result_train.apply(lambda row: money_precision_at_k(row['recomendations'], row['actual'], df_price), axis=1).mean()

0.8985969187675071

In [17]:
test_2 = new_features(test_1, train_1, recommender, item_features, user_features, items_emb_df, users_emb_df, N)
X_test_2 = test_2.drop(['target'], axis=1)
y_test_2 = test_2[['target']]
X_test_2[cat_feats + ['user_id', 'item_id']] = X_test_2[cat_feats + ['user_id', 'item_id']].astype('category')

In [None]:
test_preds_proba = lgb.predict_proba(X_test_2[basic_feats])[:, 1]
result = get_final_recomendation(X_test_2, test_preds_proba, test_1, train_1, item_features)

 80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                     | 1514/1885 [04:01<01:01,  6.06it/s]

In [None]:
result.apply(lambda row: money_precision_at_k(row['recomendations'], row['actual'], df_price), axis=1).mean()

In [None]:
result.drop('actual', axis=1, inplace=True)

In [None]:
result.to_csv('recommendations.csv', index=False)