# Домашнее задание №7

1. Выбрать по 6 предсказаний для пользователей
2. Удалить фейковый элемент
3. Оставить 5 остортированных предсказаний для каждого пользователя
4. Посчитать метрику (map@5, precision@5)

Определите сильно ли отличается качество ALS из implicit и pyspark, сравнивайте по метрикам map@5, precision@5

In [1]:
import pandas as pd
import numpy as np

from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType
import pyspark.sql.functions as sf

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight, tfidf_weight


# Функции из 1-ого вебинара
import os, sys

module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from best_rec_lib.metrics import precision_at_k, ap_k, recall_at_k
from best_rec_lib.utils import prefilter_items

import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv('../retail_train.csv')
item_features = pd.read_csv('../product.csv')
user_features = pd.read_csv('../hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

# train test split
test_size_weeks = 3

data_train = data[data['week_no'] < data['week_no'].max() - test_size_weeks]
data_test = data[data['week_no'] >= data['week_no'].max() - test_size_weeks]

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [3]:
n_items_before = data_train['item_id'].nunique()

data_train = prefilter_items(data_train, 5000, item_features)

n_items_after = data_train['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 86865 to 5001


In [4]:
user_item_matrix = pd.pivot_table(data_train, 
                                  index='user_id', columns='item_id', 
                                  values='quantity', # Можно пробоват ьдругие варианты
                                  aggfunc='count', 
                                  fill_value=0
                                 )

user_item_matrix = user_item_matrix.astype(float) # необходимый тип матрицы для implicit

# переведем в формат saprse matrix
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

user_item_matrix.head(2)

item_id,117847,279994,818981,819255,819308,819400,819487,819590,819594,819840,...,15926775,15926844,15926886,15972074,15972298,15972565,15972790,16100266,16729299,16729415
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
data_test = data_test[data_test['item_id'].isin(data_train['item_id'].unique())]

In [6]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[856942, 865456, 951954, 971585, 979707, 99065..."
1,3,[920626]


In [7]:
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

matrix_userids = np.arange(len(userids))
matrix_itemids = np.arange(len(itemids))

id_to_itemid = dict(zip(matrix_itemids, itemids))
id_to_userid = dict(zip(matrix_userids, userids))

itemid_to_id = dict(zip(itemids, matrix_itemids))
userid_to_id = dict(zip(userids, matrix_userids))

##### Spark

In [8]:
session = (SparkSession.builder.config('spark.executor.memory', "1500mb")
    .config("spark.sql.shuffle.partitions", "100")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .config("spark.driver.host", "localhost")
    .config('spark.executor.instances', 4)
    .config('spark.executor.cores', 4)
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .master("local[*]")
    .enableHiveSupport()
    .getOrCreate()
    )

In [9]:
session

In [11]:
df_to_spark = data_train[["user_id", "item_id", "quantity"]]

In [12]:
spark_data_train = session.createDataFrame(df_to_spark)

In [13]:
spark_data_train = spark_data_train.withColumnRenamed("quantity", "relevance")

In [14]:
def get_recommendations_spark(df, N=5):
    spam = list(df.item_id)
    if 999999 in spam:
        spam.remove(999999)
    
    return spam[:N]

In [15]:
def als_spark(prec_met, map_met, result, spark_data_train, factors, reg_st, iterations, alpha, N=5):
    i = 1
    for fact in factors:
        for reg in reg_st:
            for itrn in iterations:
                for alp in alpha:
                    
                    print(f'{i}/{len(factors)*len(reg_st)*len(iterations)*len(alpha)}', fact, reg, itrn, alp)

                    model = ALS(rank=fact, 
                                userCol="user_id",
                                itemCol="item_id", 
                                ratingCol="relevance", 
                                implicitPrefs=True, # тип ALS
                                coldStartStrategy="drop",
                                maxIter=itrn, 
                                alpha=alp, 
                                regParam=reg,
                                seed=42
                    ).fit(spark_data_train)

                    # предсказания для всех пользователей
                    recs_als = model.recommendForAllUsers(6)

                    # Разворачиваем рекомендации через функцию explode
                    recs_als = (recs_als
                                .withColumn("recommendations", sf.explode("recommendations"))
                                .withColumn("item_id", sf.col("recommendations.item_id"))
                                .withColumn("relevance", sf.col("recommendations.rating").cast(DoubleType()),)
                                .select("user_id", "item_id", "relevance")
                        )

                    recs_als = recs_als.toPandas()
                    users = list(set(recs_als.user_id))
                    new_test_users = set(result['user_id']) - set(users)
                    
                    if new_test_users:
                        result = result[~result['user_id'].isin(new_test_users)]


                    result[f'als_spark_{fact}_{reg}_{alp}_{itrn}'] = result['user_id'].map(lambda x: get_recommendations_spark(recs_als.loc[recs_als['user_id'] == x], N))
                    prec_met[f'als_spark_{fact}_{reg}_{alp}_{itrn}'] = result.apply(lambda row: precision_at_k(row[f'als_spark_{fact}_{reg}_{alp}_{itrn}'], row['actual']), axis=1).mean()
                    map_met[f'als_spark_{fact}_{reg}_{alp}_{itrn}'] = result.apply(lambda row: ap_k(row[f'als_spark_{fact}_{reg}_{alp}_{itrn}'], row['actual']), axis=1).mean()
                    
                    i += 1
                    
    return result, prec_met, map_met

In [16]:
# Словари метрик
prec_met = dict()
map_met = dict()

# Списки гиперпараметров для моделей
factors = [50, 150, 200, 250, 300, 350]
reg_st = [0.005, 0.01, 0.03, 0.05]
alpha = [0.25, 0.5, 1.0, 1.25, 1.5]
iterations = [1, 2, 3, 5, 8, 10]

In [17]:
%%time
result, prec_met, map_met = als_spark(prec_met, map_met, result, spark_data_train, factors, reg_st, iterations, alpha, N=5)

1/720 50 0.005 1 0.25
2/720 50 0.005 1 0.5
3/720 50 0.005 1 1.0
4/720 50 0.005 1 1.25
5/720 50 0.005 1 1.5
6/720 50 0.005 2 0.25
7/720 50 0.005 2 0.5
8/720 50 0.005 2 1.0
9/720 50 0.005 2 1.25
10/720 50 0.005 2 1.5
11/720 50 0.005 3 0.25
12/720 50 0.005 3 0.5
13/720 50 0.005 3 1.0
14/720 50 0.005 3 1.25
15/720 50 0.005 3 1.5
16/720 50 0.005 5 0.25
17/720 50 0.005 5 0.5
18/720 50 0.005 5 1.0
19/720 50 0.005 5 1.25
20/720 50 0.005 5 1.5
21/720 50 0.005 8 0.25
22/720 50 0.005 8 0.5
23/720 50 0.005 8 1.0
24/720 50 0.005 8 1.25
25/720 50 0.005 8 1.5
26/720 50 0.005 10 0.25
27/720 50 0.005 10 0.5
28/720 50 0.005 10 1.0
29/720 50 0.005 10 1.25
30/720 50 0.005 10 1.5
31/720 50 0.01 1 0.25
32/720 50 0.01 1 0.5
33/720 50 0.01 1 1.0
34/720 50 0.01 1 1.25
35/720 50 0.01 1 1.5
36/720 50 0.01 2 0.25
37/720 50 0.01 2 0.5
38/720 50 0.01 2 1.0
39/720 50 0.01 2 1.25
40/720 50 0.01 2 1.5
41/720 50 0.01 3 0.25
42/720 50 0.01 3 0.5
43/720 50 0.01 3 1.0
44/720 50 0.01 3 1.25
45/720 50 0.01 3 1.5
46/720 50 0

355/720 200 0.05 8 1.5
356/720 200 0.05 10 0.25
357/720 200 0.05 10 0.5
358/720 200 0.05 10 1.0
359/720 200 0.05 10 1.25
360/720 200 0.05 10 1.5
361/720 250 0.005 1 0.25
362/720 250 0.005 1 0.5
363/720 250 0.005 1 1.0
364/720 250 0.005 1 1.25
365/720 250 0.005 1 1.5
366/720 250 0.005 2 0.25
367/720 250 0.005 2 0.5
368/720 250 0.005 2 1.0
369/720 250 0.005 2 1.25
370/720 250 0.005 2 1.5
371/720 250 0.005 3 0.25
372/720 250 0.005 3 0.5
373/720 250 0.005 3 1.0
374/720 250 0.005 3 1.25
375/720 250 0.005 3 1.5
376/720 250 0.005 5 0.25
377/720 250 0.005 5 0.5
378/720 250 0.005 5 1.0
379/720 250 0.005 5 1.25
380/720 250 0.005 5 1.5
381/720 250 0.005 8 0.25
382/720 250 0.005 8 0.5
383/720 250 0.005 8 1.0
384/720 250 0.005 8 1.25
385/720 250 0.005 8 1.5
386/720 250 0.005 10 0.25
387/720 250 0.005 10 0.5
388/720 250 0.005 10 1.0
389/720 250 0.005 10 1.25
390/720 250 0.005 10 1.5
391/720 250 0.01 1 0.25
392/720 250 0.01 1 0.5
393/720 250 0.01 1 1.0
394/720 250 0.01 1 1.25
395/720 250 0.01 1 1.5
3

699/720 350 0.05 2 1.25
700/720 350 0.05 2 1.5
701/720 350 0.05 3 0.25
702/720 350 0.05 3 0.5
703/720 350 0.05 3 1.0
704/720 350 0.05 3 1.25
705/720 350 0.05 3 1.5
706/720 350 0.05 5 0.25
707/720 350 0.05 5 0.5
708/720 350 0.05 5 1.0
709/720 350 0.05 5 1.25
710/720 350 0.05 5 1.5
711/720 350 0.05 8 0.25
712/720 350 0.05 8 0.5
713/720 350 0.05 8 1.0
714/720 350 0.05 8 1.25
715/720 350 0.05 8 1.5
716/720 350 0.05 10 0.25
717/720 350 0.05 10 0.5
718/720 350 0.05 10 1.0
719/720 350 0.05 10 1.25
720/720 350 0.05 10 1.5
CPU times: total: 18min 18s
Wall time: 15h 22min 28s


In [18]:
data = [prec_met, map_met]
data = pd.DataFrame(data, index =['Precision@k', 'MAP@k'])
data = data.T

In [19]:
data.sort_values('Precision@k', ascending=False)

Unnamed: 0,Precision@k,MAP@k
als_spark_300_0.05_10,0.286907,0.220063
als_spark_350_0.05_10,0.286802,0.219798
als_spark_350_0.05_8,0.286489,0.219002
als_spark_350_0.05_5,0.285968,0.218560
als_spark_300_0.03_8,0.285133,0.219111
...,...,...
als_spark_50_0.005_2,0.180386,0.118409
als_spark_50_0.01_1,0.142514,0.083462
als_spark_50_0.005_1,0.142201,0.083295
als_spark_50_0.03_1,0.140219,0.083685


In [20]:
data.sort_values('MAP@k', ascending=False)

Unnamed: 0,Precision@k,MAP@k
als_spark_300_0.05_10,0.286907,0.220063
als_spark_350_0.05_10,0.286802,0.219798
als_spark_300_0.03_10,0.284298,0.219298
als_spark_300_0.03_8,0.285133,0.219111
als_spark_350_0.05_8,0.286489,0.219002
...,...,...
als_spark_50_0.005_2,0.180386,0.118409
als_spark_50_0.05_1,0.136463,0.084017
als_spark_50_0.03_1,0.140219,0.083685
als_spark_50_0.01_1,0.142514,0.083462


In [21]:
# Лучшие параметры по метрике Precision@5
data[data['Precision@k'] == data['Precision@k'].max()]

Unnamed: 0,Precision@k,MAP@k
als_spark_300_0.05_10,0.286907,0.220063


In [22]:
# Лучшие параметры по метрике MAP@5
data[data['MAP@k'] == data['MAP@k'].max()]

Unnamed: 0,Precision@k,MAP@k
als_spark_300_0.05_10,0.286907,0.220063


Проведем дополнительный перебор параметров, т.к. некоторые приняли максимальные указанные их значения в списках.

In [16]:
# Словари метрик
prec_met = dict()
map_met = dict()

# Списки гиперпараметров для моделей
factors = [300,]
reg_st = [0.01, 0.25, 0.05,]
alpha = [0.05, 0.1, 0.05, 0.5, 1.0, 1.5]
iterations = [15, 20] # 20 max, 25 вылетает

In [17]:
%%time
result, prec_met, map_met = als_spark(prec_met, map_met, result, spark_data_train, factors, reg_st, iterations, alpha, N=5)

1/36 300 0.01 15 0.05
2/36 300 0.01 15 0.1
3/36 300 0.01 15 0.05
4/36 300 0.01 15 0.5
5/36 300 0.01 15 1.0
6/36 300 0.01 15 1.5
7/36 300 0.01 20 0.05
8/36 300 0.01 20 0.1
9/36 300 0.01 20 0.05
10/36 300 0.01 20 0.5
11/36 300 0.01 20 1.0
12/36 300 0.01 20 1.5
13/36 300 0.25 15 0.05
14/36 300 0.25 15 0.1
15/36 300 0.25 15 0.05
16/36 300 0.25 15 0.5
17/36 300 0.25 15 1.0
18/36 300 0.25 15 1.5
19/36 300 0.25 20 0.05
20/36 300 0.25 20 0.1
21/36 300 0.25 20 0.05
22/36 300 0.25 20 0.5
23/36 300 0.25 20 1.0
24/36 300 0.25 20 1.5
25/36 300 0.05 15 0.05
26/36 300 0.05 15 0.1
27/36 300 0.05 15 0.05
28/36 300 0.05 15 0.5
29/36 300 0.05 15 1.0
30/36 300 0.05 15 1.5
31/36 300 0.05 20 0.05
32/36 300 0.05 20 0.1
33/36 300 0.05 20 0.05
34/36 300 0.05 20 0.5
35/36 300 0.05 20 1.0
36/36 300 0.05 20 1.5
CPU times: total: 56.2 s
Wall time: 4h 11min 14s


In [19]:
data = [prec_met, map_met]
data = pd.DataFrame(data, index =['Precision@k', 'MAP@k'])
data = data.T

In [20]:
data.sort_values('Precision@k', ascending=False)

Unnamed: 0,Precision@k,MAP@k
als_spark_300_0.01_0.05_20,0.295149,0.23602
als_spark_300_0.01_0.05_15,0.295044,0.236039
als_spark_300_0.01_0.1_20,0.294627,0.233865
als_spark_300_0.01_0.1_15,0.294418,0.233692
als_spark_300_0.05_0.05_20,0.291706,0.232495
als_spark_300_0.05_0.05_15,0.291601,0.232323
als_spark_300_0.05_0.1_20,0.290037,0.231297
als_spark_300_0.05_0.5_20,0.289411,0.227232
als_spark_300_0.05_0.1_15,0.289098,0.230696
als_spark_300_0.01_0.5_15,0.288889,0.224448


In [21]:
data.sort_values('MAP@k', ascending=False)

Unnamed: 0,Precision@k,MAP@k
als_spark_300_0.01_0.05_15,0.295044,0.236039
als_spark_300_0.01_0.05_20,0.295149,0.23602
als_spark_300_0.01_0.1_20,0.294627,0.233865
als_spark_300_0.01_0.1_15,0.294418,0.233692
als_spark_300_0.05_0.05_20,0.291706,0.232495
als_spark_300_0.05_0.05_15,0.291601,0.232323
als_spark_300_0.05_0.1_20,0.290037,0.231297
als_spark_300_0.05_0.1_15,0.289098,0.230696
als_spark_300_0.05_0.5_20,0.289411,0.227232
als_spark_300_0.05_0.5_15,0.288576,0.226648


In [22]:
# Лучшие параметры по метрике Precision@5
data[data['Precision@k'] == data['Precision@k'].max()]

Unnamed: 0,Precision@k,MAP@k
als_spark_300_0.01_0.05_20,0.295149,0.23602


In [23]:
# Лучшие параметры по метрике MAP@5
data[data['MAP@k'] == data['MAP@k'].max()]

Unnamed: 0,Precision@k,MAP@k
als_spark_300_0.01_0.05_15,0.295044,0.236039


In [24]:
session.stop()

### Сравним метрики с ALS моделью с лучшими найденными параметрами

In [10]:
def get_recommendations(user, model, sparse_user_item, N=5):
    res = [id_to_itemid[rec] for rec in 
                    model.recommend(userid=userid_to_id[user], # можно вставить несколько пользователей
                                    user_items=sparse_user_item[userid_to_id[user]],   # на вход user-item matrix
                                    N=N, 
                                    filter_already_liked_items=False, 
                                    filter_items=[itemid_to_id[999999]], 
                                    recalculate_user=True)[0]] # recalculate_user=True это для перерасчета для новых users
    return res

In [13]:
%%time
bm25_user_item_matrix = bm25_weight(user_item_matrix.T).T.tocsr()

model = AlternatingLeastSquares(factors=350, 
                regularization=0.05,
                iterations=1,
                calculate_training_loss=True, 
                random_state=42)

model.fit(bm25_user_item_matrix, show_progress=False)

CPU times: total: 8.19 s
Wall time: 851 ms


In [16]:
test_users = result.shape[0]
new_test_users = len(set(data_test['user_id']) - set(data_train['user_id']))

print('В тестовом дата сете {} юзеров'.format(test_users))
print('В тестовом дата сете {} новых юзеров'.format(new_test_users))

В тестовом дата сете 1919 юзеров
В тестовом дата сете 2 новых юзеров


In [17]:
# уберем пользователей, которых нет в трейне
new_test_users = set(data_test['user_id']) - set(data_train['user_id'])
result = result[~result['user_id'].isin(new_test_users)]

In [20]:
result['als_bm25_T'] = result['user_id'].map(lambda x: get_recommendations(x, model, sparse_user_item, 5))
print('Precision@k: ', result.apply(lambda row: precision_at_k(row[f'als_bm25_T'], row['actual']), axis=1).mean())
print('MAP@k: ', result.apply(lambda row: ap_k(row[f'als_bm25_T'], row['actual']), axis=1).mean())

Precision@k:  0.2372456964006229
MAP@k:  0.17222396105025153


**Выводы:** модель ALS Spark показала значительно лучшие результаты