In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k, money_precision_at_k
from src.utils import prefilter_items, postfilter_items, prepare_data
from src.utils import split_data_2_lvl, add_new_features_for_lvl_2, get_items_popular_sorted
from src.recommenders import MainRecommender

from tqdm import tqdm
tqdm.pandas()

  from pandas import Panel


In [7]:
data, item_features, user_features, test_data = prepare_data()
data = prefilter_items(data,take_n_popular=3000, item_features=item_features)
data_train_lvl_1, data_val_lvl_1, data_train_lvl_2, data_val_lvl_2 = split_data_2_lvl(data)
recommender_lvl_1 = MainRecommender(data_train_lvl_1, item_features)



HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))




In [8]:
# Генерация кандидатов для всех юзеров
N = 300
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']

users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: recommender_lvl_1.get_own_recommendations(x, N=N))
# [ (user_id, [recommendations]) ]

s = users_lvl_2.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'item_id'

users_lvl_2 = users_lvl_2.drop('candidates', axis=1).join(s)
users_lvl_2['drop'] = 1  # фиктивная пересенная

targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  # тут только покупки 

targets_lvl_2 = users_lvl_2.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)
targets_lvl_2.drop('drop', axis=1, inplace=True)

In [9]:
data_train_lvl_2, user_features, item_features = add_new_features_for_lvl_2(data_train_lvl_2, user_features, item_features, recommender_lvl_1.items_emb_df, recommender_lvl_1.users_emb_df)

In [10]:
targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left')
targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left')
targets_lvl_2 = targets_lvl_2.merge(data_train_lvl_2, on=['user_id', 'item_id'], how='left')

In [11]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2[['target']]

cat_feats = X_train.columns[2:].tolist()
X_train[cat_feats + ['user_id', 'item_id']] = X_train[cat_feats + ['user_id', 'item_id']].astype('category')

In [12]:
%%time

lgb = LGBMClassifier(objective='binary', max_depth=7, categorical_column=cat_feats)
lgb.fit(X_train, y_train)

train_preds = lgb.predict(X_train)
train_preds_proba = lgb.predict_proba(X_train)[:, 1]

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Wall time: 46.8 s


In [13]:
X_train['predict_proba'] = train_preds_proba
X_train.sort_values(['user_id', 'predict_proba'], ascending=False, inplace=True)

recs = X_train.groupby('user_id')['item_id']
recomendations = []
for user, preds in recs:
    recomendations.append({'user_id': user, 'recomendations': preds.tolist()})
recomendations = pd.DataFrame(recomendations)

result_2 = data.groupby('user_id')['item_id'].unique().reset_index()
result_2.columns = ['user_id', 'actual']

result = result_2.merge(recomendations, how='left')
result['recomendations'] = result['recomendations'].fillna(0)

In [14]:
overall_popular = get_items_popular_sorted(data)

data['price'] = data['sales_value'] / (np.maximum(data['quantity'], 1))
df_price = data.groupby('item_id')['price'].mean().reset_index()

result['recomendations'] = result.progress_apply \
        (lambda x: postfilter_items(x['user_id'],                                    
                                    recommendations=result.loc[result['user_id'] == x['user_id']]['recomendations'].to_list()[0],
                                    popular_recs=overall_popular,
                                    data=data,                                    
                                    df_price=df_price,
                                    item_features=item_features,
                                    N=5),
         axis=1)

100%|██████████████████████████████████████| 2499/2499 [38:05<00:00,  1.09it/s]


In [15]:
result.apply(lambda row: money_precision_at_k(row['recomendations'], row['actual'], df_price), axis=1).mean()

0.5900579844913847

In [16]:
result_test = test_data.groupby('user_id')['item_id'].unique().reset_index()
result_test.columns = ['user_id', 'actual']

result_test = result_test.merge(result[['user_id', 'recomendations']], on=['user_id'], how='left')
result_test['recomendations'] = result_test['recomendations'].fillna(0)

In [17]:
def fill_na_by_popular(row):
    if row['recomendations'] == 0:
        row['recomendations'] = postfilter_items(row['user_id'],                                    
                                    0,
                                    popular_recs=overall_popular,
                                    data=data,                                    
                                    df_price=df_price,
                                    item_features=item_features,
                                    N=5)
    return row

In [18]:
result_test = result_test.apply(fill_na_by_popular, axis=1)

In [19]:
result_test.apply(lambda row: money_precision_at_k(row['recomendations'], row['actual'], df_price), axis=1).mean()

0.0989276762425071

In [21]:
result[['user_id', 'recomendations']].to_csv('./recomendations.csv')