In [1]:
import pandas as pd
import numpy as np
from joblib import dump, load

from numba import jit, typeof, typed, types, prange

from implicit.gpu.als import AlternatingLeastSquares
from implicit.nearest_neighbours import ItemItemRecommender
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

from scipy.sparse import csr_matrix, coo_matrix

from rec_lib.utils import prefilter_items, rec_sort, rec_len_check, recall, recall_at_k, isin
from rec_lib.utils import get_recommendations, precision_at_k, ap_k, get_sim_users

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

In [2]:
# Enable the IPython autoreload extension in mode 2 (reload all modules before executing code),
# so that edits to the imported helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Получение рекомендаций

In [3]:
# Load the transactions table from the local parquet archive.
df_transactions = pd.read_parquet('archive/transactions_train_for_power_bi.parquet')

### Разделим данные на train/test и validation

In [4]:
# Number of distinct transaction dates in each week — shows which weeks are incomplete.
df_transactions.groupby('week_number')['t_dat'].nunique()

week_number
1      5
2      7
3      7
4      7
5      7
      ..
101    7
102    7
103    7
104    7
105    2
Name: t_dat, Length: 105, dtype: int64

In [5]:
# Baseline split:
#   * validation - week 104 (week 38 of the year), because week 105 is incomplete;
#   * test       - week 103 (week 37 of the year);
#   * train      - the whole preceding period.
validation = df_transactions[df_transactions['week_number'].eq(104)]
test = df_transactions[df_transactions['week_number'].eq(103)]
train = df_transactions[df_transactions['week_number'].lt(103)]

In [6]:
%%time
# train = prefilter_items(train, n_popular_limit=500, item_features=None)
train = train.sort_values(['customer_id_short', 'article_id_short']).reset_index(drop = True)

CPU times: user 12.7 s, sys: 1.11 s, total: 13.8 s
Wall time: 13.8 s


In [7]:
# Check how many customers from the test week never appear in train.
print(
    f"В трейне - {len(set(train['customer_id_short']))} покупателей\n"
    f"В тесте - {len(set(test['customer_id_short']))} покупателей\n"
    f"Покупателей из теста нет в трейне - "
    f"{len(set(test['customer_id_short']).difference(train['customer_id_short']))}"
)

В трейне - 1343966 покупателей
В тесте - 76528 покупателей
Покупателей из теста нет в трейне - 5870


In [8]:
# Ground-truth frame: one row per test customer with the unique articles actually bought.
# Prediction columns are added to this frame later on.
result = (
    test.groupby('customer_id_short')['article_id_short']
    .unique()
    .rename('actual_article_id_short')
    .reset_index()
)

In [9]:
# Total of the `values` column per article over the whole training period, largest first.
popularity = (
    train.groupby('article_id_short')['values']
    .sum()
    .reset_index()
    .sort_values('values', ascending=False)
)

In [10]:
# Same per-article ranking, restricted to training rows whose week_number_of_year is 36.
sim_weeks_articles = (
    train.loc[train['week_number_of_year'].isin([36])]
    .groupby('article_id_short')['values']
    .sum()
    .reset_index()
    .sort_values('values', ascending=False)
)

In [11]:
# Number of top articles taken from the similar-weeks ranking
K = 1300

# top_articles = popularity.iloc[:N]['article_id_short'].values
# top_articles_set = set(popularity.iloc[:K]['article_id_short'].values)
# Deduplicate the ids of the first K rows of sim_weeks_articles and wrap them in a numba typed list.
# NOTE(review): presumably consumed by numba-compiled code inside get_recommendations — confirm.
top_sim_weeks_articles_nb = typed.List(set(sim_weeks_articles.iloc[:K]['article_id_short'].values))

In [12]:
%%time
# создаем массив использованных article_id
used_article_id_short = sorted(np.array(train['article_id_short'].unique()))
article_id_for_dict = np.arange(0,len(used_article_id_short))

# Создаем справочники users и items для более быстрой работы кода
used_itemid_to_id = dict(zip(used_article_id_short, article_id_for_dict))
id_to_used_itemid = dict(zip(article_id_for_dict, used_article_id_short))


# создаем массив использованных customer_id_short
used_user_id_short = sorted(np.array(train['customer_id_short'].unique()))
user_id_for_dict = np.arange(0,len(used_user_id_short))

# Создаем справочники users и items для более быстрой работы кода
used_userid_to_id = dict(zip(used_user_id_short, user_id_for_dict))
id_to_used_userid = dict(zip(user_id_for_dict, used_user_id_short))

CPU times: user 669 ms, sys: 79.7 ms, total: 749 ms
Wall time: 748 ms


In [13]:
%%time
# numba dict for recommeddation
id_to_used_itemid_nb = typed.Dict.empty(types.int64,types.int64)

for k, v in id_to_used_itemid.items():
    id_to_used_itemid_nb[k] = v

CPU times: user 864 ms, sys: 8.06 ms, total: 872 ms
Wall time: 876 ms


In [14]:
%%time
# получаем массивы для построения разряженной матрицы по координатам ненулевых элементов
user_id_short_arr_for_matrix = train.customer_id_short.values
user_id_short_arr_for_matrix = np.array([used_userid_to_id[el] for el in user_id_short_arr_for_matrix])

article_id_short_arr_for_matrix = train.article_id_short.values
article_id_short_arr_for_matrix = np.array([used_itemid_to_id[el] for el in article_id_short_arr_for_matrix])

article_user_counter_for_matrix = train['values'].values.astype(np.float64)

CPU times: user 9.9 s, sys: 251 ms, total: 10.2 s
Wall time: 10.2 s


In [15]:
%%time
# Создаем разряженную матриуц по координатам ненулевых элементов
coo = coo_matrix((article_user_counter_for_matrix, (user_id_short_arr_for_matrix, article_id_short_arr_for_matrix)))

CPU times: user 249 ms, sys: 9.72 ms, total: 259 ms
Wall time: 258 ms


In [16]:
# Convert the matrix to CSR, the format the models are fitted on, and build a BM25-weighted copy.
custom_sparse_user_item = coo.tocsr()
custom_bm25_user_item_matrix = bm25_weight(custom_sparse_user_item.T).T.tocsr()

In [17]:
%%time

# GPU ALS
als = AlternatingLeastSquares(factors=60,
                regularization=0.8,
                iterations=3,
                calculate_training_loss=True,
                random_state=42)

als.fit(custom_bm25_user_item_matrix, show_progress=False)

CPU times: user 2min 14s, sys: 504 ms, total: 2min 14s
Wall time: 2min 16s


In [18]:
%%time

# Количество потоков процессора для обучения
NUM_THREADS = 16

# Own recommender
own_recommender = ItemItemRecommender(K=1, num_threads=NUM_THREADS)
own_recommender.fit(custom_sparse_user_item, show_progress=False)

CPU times: user 46.2 s, sys: 5.65 s, total: 51.8 s
Wall time: 5.59 s


### Найдем похожих покупателей

In [19]:
%%time
# количество похожих покупателей
N_USERS = 500

result[f'sim_users'] = result['customer_id_short'].map(lambda x: get_sim_users(x, used_userid_to_id, id_to_used_userid, als, N_USERS))

CPU times: user 7min 48s, sys: 5.41 s, total: 7min 54s
Wall time: 7min 53s


In [39]:
%%time
# rec before ranking
N = 500

result[f'own_rec'] = result.apply(lambda row: get_recommendations(row['customer_id_short'], row['sim_users'], als, own_recommender, used_userid_to_id, used_itemid_to_id, custom_sparse_user_item, id_to_used_itemid_nb, top_sim_weeks_articles_nb, N), axis=1)
# result_list = [get_recommendations(result.iloc[i]['customer_id_short'], result.iloc[i]['sim_users'], als, own_recommender, used_userid_to_id, used_itemid_to_id, custom_sparse_user_item, id_to_used_itemid_nb, top_sim_weeks_articles_nb, N) for i in range(1000)]

CPU times: user 3h 23min 57s, sys: 4min 23s, total: 3h 28min 21s
Wall time: 3h 28min 12s


In [40]:
# Mean per-customer recall of the candidate lists against the actual test purchases
print(
    'Recall_own_rec: ',
    result.apply(lambda row: recall(row['own_rec'], row['actual_article_id_short']), axis=1).mean(),
)

Recall_own_rec:  0.18546809308615347


In [47]:
# result_list = np.array(result[f'own_rec'])

In [48]:
# %%time
# for i, el in enumerate(result_list):
#     if len(el) != 500:
#         print(i, len(el))

In [44]:
def col_convert(val):
    """Return ``val`` as a plain ``list``.

    A value whose exact type is ``list`` is returned unchanged (same object);
    any other iterable is converted with ``list(val)``.
    """
    if type(val) is list:
        return val
    return list(val)

In [45]:
%%time
result[f'actual_article_id_short'] = result.apply(lambda row: col_convert(row['actual_article_id_short']), axis=1)
result[f'sim_users'] = result.apply(lambda row: col_convert(row['sim_users']), axis=1)
result[f'own_rec'] = result.apply(lambda row: col_convert(row['own_rec']), axis=1)

CPU times: user 13.8 s, sys: 1.43 s, total: 15.3 s
Wall time: 15.3 s


In [46]:
# Persist the result frame (actual purchases, similar users, candidate recommendations) to parquet.
result.to_parquet('archive/result.parquet')

In [107]:
# ALL articles, N=500 ownrec_+_als_user-user_(500)_[:300] usr in top weeks recs
# [36] top week all recs user+item rec [:500]:                          0.18546809308615347  3h 35min 47s

# Everything below is actually about 2% lower than the recorded values: there was a bug in ... (the original note is unfinished — TODO: record where the bug was)

# ALL articles, N=500 ownrec_+_als_user-user_(1000)_[:300] usr in top weeks recs
# [36] top week all recs user+item rec [:300]:                          0.19477580725279156

# ALL articles, N=500 ownrec_+_als_user-user_(1500)_[:300] usr in top weeks recs
# [36] top week all recs user+item rec [:300]:                          0.19468697652055272


# ALL articles, N=500 ownrec_+_als_user-user_(1000)_[:200] usr in top weeks recs
# [36] top week all recs user+item rec [:200]:                          0.18547566396237963  3,5 часа
# [36] top week all recs user+item rec [:250]:                          0.1871061748466698   2 часа
# [36] top week all recs user+item rec [:300]:                          0.18786509431469386  1,5 часа


# ALL articles, N=500 ownrec_+_als_user-user_(1000)_[:200] usr in top weeks recs
# [36] top week all recs:                                               0.17699052881027386
# [36] top week non rec:                                                0.15856317784885243
# [36]:                                                                 0.14282189518238486

# ALL articles, N=500 ownrec_+_als_user-user_(50)_[:200] usr in top weeks recs          
# [36]:                                                                 0.136161043703649
# [39, 47, 43, 44, 41, 40, 37, 36, 42, 38, 35, 46, 45, 50, 49, 1, 52]:  0.12959007290888985
# [39, 47, 43, 44, 41, 40, 37, 36, 42, 38, 35, 46, 45, 49]:             0.12964804101548066
# [39, 41, 40, 37, 36, 42, 38, 35]:                                     0.13142969085634304
# [39, 37, 36, 38, 35]:                                                 0.13335686448557282
# [39, 40, 37, 36, 38]:                                                 0.130681402495449
# [37, 38]:                                                             0.12909899728601051
# [37, 36]:                                                             0.13437953871833408
# [37]:                                                                 0.1291321164660187
# [39]:                                                                 0.12827095089728174