# Курсовая работа "Рекомендательные системы"


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als


import os, sys
# Put the parent of the current working directory on sys.path (once), so that
# packages living one level above the notebook can be imported.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import money_precision_at_k
from src.utils import prefilter_items, get_prices, postfilter_items
from src.recommenders import MainRecommender

In [2]:
# Read the three input tables in one pass: training transactions, the product
# catalogue and the hold-out test transactions (in that order).
data, item_features, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Рекомендательные системы/retail_train.csv',
        '../Рекомендательные системы/product.csv',
        '../Рекомендательные системы/retail_test1.csv',
    )
)

In [3]:
# Column processing, step 1: lower-case every header of both frames.
item_features.columns = item_features.columns.str.lower()
data.columns = data.columns.str.lower()

# Step 2: map the raw key columns onto the user_id / item_id names that the
# rest of the notebook works with (rename is a no-op if a key is absent).
item_features = item_features.rename(columns={'product_id': 'item_id'})
data = data.rename(columns={'household_key': 'user_id', 'product_id': 'item_id'})

In [4]:
# Report how many distinct items survive the project's prefilter, and join the
# product attributes onto the surviving transactions.
# NOTE(review): this cell rebinds `data`, so running it a second time filters
# and merges the already-merged frame again -- re-run from the load cell instead.
n_items_before = data['item_id'].nunique()
data = (
    prefilter_items(data, take_n_popular=20000, item_features=item_features)
    .merge(item_features, on='item_id', how='left')
)
n_items_after = data['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 89051 to 20001


In [5]:
# Run the test transactions through the same prefilter call and product join
# that were applied to the training data.
test = (
    prefilter_items(test, take_n_popular=20000, item_features=item_features)
    .merge(item_features, on='item_id', how='left')
)

In [6]:
# Time-based hold-out: everything strictly before the cutoff week is used for
# training, the cutoff week and everything after it for validation.
# NOTE(review): because the boundary is inclusive (>=), the validation part
# spans test_size_weeks + 1 distinct week numbers when no week is missing.
test_size_weeks = 3
split_week = data['week_no'].max() - test_size_weeks

data_train = data.loc[data['week_no'] < split_week]
data_valid = data.loc[data['week_no'] >= split_week]

In [7]:
# Build the project's MainRecommender from the training split only; data_valid
# is kept out so it can serve as ground truth in the evaluation cells.
recommender = MainRecommender(data_train)



HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=19998.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=19998.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=19998.0), HTML(value='')))




In [8]:
# Ground truth for validation: one row per user, with 'actual' holding the
# array of distinct item_ids that user has in data_valid.
result = (
    data_valid
    .groupby('user_id')['item_id']
    .unique()
    .reset_index()
    .rename(columns={'item_id': 'actual'})
)
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[856942, 865456, 889248, 907957, 914190, 94331..."
1,3,"[835476, 872021, 909638, 920626, 958154, 10536..."


In [9]:
def _bm25_candidates(user_id):
    """Ask the fitted recommender for one user's BM25 list, requested with N=200."""
    return recommender.get_bm25_recommendations(user_id, N=200)


# Candidate generation for every validation user.
result['bm25'] = result['user_id'].apply(_bm25_candidates)

In [10]:
# Preview: 'bm25' should now sit next to 'actual' with one candidate list per user.
result.head()

Unnamed: 0,user_id,actual,bm25
0,1,"[856942, 865456, 889248, 907957, 914190, 94331...","[5577022, 9297615, 856942, 1049998, 1074612, 1..."
1,3,"[835476, 872021, 909638, 920626, 958154, 10536...","[1044078, 835476, 9337581, 998206, 9337369, 10..."
2,6,"[1006718, 1104227, 1108624, 1110392, 825541, 8...","[13003092, 1082185, 878996, 1041259, 1037863, ..."
3,7,"[840386, 898068, 909714, 993838, 1003188, 1056...","[6534406, 1086732, 6533878, 909611, 1072483, 1..."
4,8,"[835098, 872137, 910439, 924610, 992977, 55692...","[8181377, 1044078, 926808, 1105433, 1013503, 9..."


In [11]:
# Reduce each user's BM25 candidate list to the final recommendations.
# The helper imported from src.utils is `postfilter_items` (the same one the
# test-set cell calls); the bare name `postfilter` is never defined in this
# notebook and fails with NameError on a fresh kernel.
result['final'] = result.apply(lambda row: postfilter_items(row['bm25'], data), axis=1)

In [12]:
# Preview: 'final' holds the post-filtered list derived from each 'bm25' entry.
result.head()

Unnamed: 0,user_id,actual,bm25,final
0,1,"[856942, 865456, 889248, 907957, 914190, 94331...","[5577022, 9297615, 856942, 1049998, 1074612, 1...","[5577022, 856942, 1049998, 1074612, 1075074]"
1,3,"[835476, 872021, 909638, 920626, 958154, 10536...","[1044078, 835476, 9337581, 998206, 9337369, 10...","[1044078, 835476, 9337581, 998206, 9337369]"
2,6,"[1006718, 1104227, 1108624, 1110392, 825541, 8...","[13003092, 1082185, 878996, 1041259, 1037863, ...","[13003092, 1082185, 878996, 1041259, 1037863]"
3,7,"[840386, 898068, 909714, 993838, 1003188, 1056...","[6534406, 1086732, 6533878, 909611, 1072483, 1...","[6534406, 1086732, 1072483, 989221, 7147142]"
4,8,"[835098, 872137, 910439, 924610, 992977, 55692...","[8181377, 1044078, 926808, 1105433, 1013503, 9...","[8181377, 1044078, 926808, 1105433, 1013503]"


In [13]:
def _prices_for(item_ids):
    """Look up prices for one list of recommended items via get_prices and `data`."""
    return get_prices(item_ids, data)


# Attach a price list to every user's final recommendations.
result['price'] = result['final'].apply(_prices_for)

In [14]:
# Preview: 'price' holds the get_prices() result for each user's 'final' list.
result.head()

Unnamed: 0,user_id,actual,bm25,final,price
0,1,"[856942, 865456, 889248, 907957, 914190, 94331...","[5577022, 9297615, 856942, 1049998, 1074612, 1...","[5577022, 856942, 1049998, 1074612, 1075074]","[2.7554571428571433, 2.7999644128113887, 1.843..."
1,3,"[835476, 872021, 909638, 920626, 958154, 10536...","[1044078, 835476, 9337581, 998206, 9337369, 10...","[1044078, 835476, 9337581, 998206, 9337369]","[3.244310160832738, 1.794201388888889, 1.08942..."
2,6,"[1006718, 1104227, 1108624, 1110392, 825541, 8...","[13003092, 1082185, 878996, 1041259, 1037863, ...","[13003092, 1082185, 878996, 1041259, 1037863]","[4.659296482412061, 1.3360895793355156, 2.9515..."
3,7,"[840386, 898068, 909714, 993838, 1003188, 1056...","[6534406, 1086732, 6533878, 909611, 1072483, 1...","[6534406, 1086732, 1072483, 989221, 7147142]","[2.0150526315789477, 1.828918918918919, 1.2356..."
4,8,"[835098, 872137, 910439, 924610, 992977, 55692...","[8181377, 1044078, 926808, 1105433, 1013503, 9...","[8181377, 1044078, 926808, 1105433, 1013503]","[1.2386746987951807, 3.244310160832738, 2.6381..."


In [15]:
# Score every validation user with money_precision_at_k (k=5) and display the
# mean over users as the cell output.
per_user_money_precision = result.apply(
    lambda user_row: money_precision_at_k(
        user_row['final'], user_row['actual'], user_row['price'], 5
    ),
    axis=1,
)
per_user_money_precision.mean()

0.207271332869401

In [16]:
# Same layout as the validation table, built from the test transactions:
# one row per user with the array of distinct item_ids in 'actual'.
test_items_by_user = test.groupby('user_id')['item_id'].unique()
result_final = test_items_by_user.rename('actual').reset_index()
result_final.head(2)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,2,"[820291, 826784, 826835, 829009, 866211, 87060..."


In [17]:
# Candidate generation for the test users, requested with N=200 from the
# recommender built on the training split.
test_user_ids = result_final['user_id']
result_final['bm25'] = test_user_ids.apply(
    lambda user_id: recommender.get_bm25_recommendations(user_id, N=200)
)

In [18]:
# Preview the candidate lists generated for the first two test users.
result_final.head(2)

Unnamed: 0,user_id,actual,bm25
0,1,"[880007, 883616, 931136, 938004, 940947, 94726...","[5577022, 9297615, 856942, 1049998, 1074612, 1..."
1,2,"[820291, 826784, 826835, 829009, 866211, 87060...","[978332, 1113780, 826784, 1103898, 831125, 970..."


In [19]:
# Post-filter every test user's candidate list into the 'rec' column.
result_final['rec'] = result_final.apply(
    lambda user_row: postfilter_items(user_row['bm25'], data),
    axis=1,
)

In [20]:
# Preview: 'rec' holds the post-filtered recommendations for the test users.
result_final.head(2)

Unnamed: 0,user_id,actual,bm25,rec
0,1,"[880007, 883616, 931136, 938004, 940947, 94726...","[5577022, 9297615, 856942, 1049998, 1074612, 1...","[5577022, 856942, 1049998, 1074612, 1075074]"
1,2,"[820291, 826784, 826835, 829009, 866211, 87060...","[978332, 1113780, 826784, 1103898, 831125, 970...","[978332, 1113780, 826784, 1103898, 831125]"


In [21]:
# Keep only the two columns that go into the exported file.
prod = result_final.loc[:, ['user_id', 'rec']]

In [22]:
# Export the per-user recommendations to the notebook's working directory;
# index=False keeps the DataFrame index out of the CSV.
prod.to_csv('recommendations.csv', index=False)