In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import os, sys
# Make the parent directory importable so the project's `src` package resolves.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

# Functions written for this project
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

In [4]:
# Load the raw transaction log and normalise its column names in one chain:
# lower-case every column, then rename the keys to user_id / item_id.
data_full = (
    pd.read_csv('./data/retail_train.csv')
    .rename(columns=str.lower)
    .rename(columns={'household_key': 'user_id', 'product_id': 'item_id'})
)

data_full.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [5]:
# Load the item and household feature tables with lower-cased column names
# and their join keys renamed to item_id / user_id.
item_features = (
    pd.read_csv('./data/product.csv')
    .rename(columns=str.lower)
    .rename(columns={'product_id': 'item_id'})
)
user_features = (
    pd.read_csv('./data/hh_demographic.csv')
    .rename(columns=str.lower)
    .rename(columns={'household_key': 'user_id'})
)

item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [7]:
# Count distinct items before filtering, for the before/after report below.
n_items_before = data_full['item_id'].nunique()

# Filter the transaction log down to a reduced item set.
# NOTE(review): take_n_popular=2400 presumably keeps the 2400 most popular items
# (the printed item count matches) — confirm the exact rules in src.utils.prefilter_items.
data_train = prefilter_items(data_full, item_features=item_features, take_n_popular=2400)

# Report how many distinct items survived and preview the filtered frame.
n_items_after = data_train['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))
data_train.head(5)

Decreased # items from 89051 to 2400


Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price
6,2375,26984851516,1,1043142,1,1.57,364,-0.68,1642,1,0.0,0.0,1.57
7,2375,26984851516,1,1085983,1,2.99,364,-0.4,1642,1,0.0,0.0,2.99
11,1364,26984896261,1,842930,1,2.19,31742,0.0,1520,1,0.0,0.0,2.19
19,1130,26984905972,1,1048462,1,1.19,31642,-0.8,1340,1,0.0,0.0,1.19
25,98,26984951769,1,965138,2,3.0,337,-0.08,1937,1,0.0,0.0,1.5


In [8]:
# Build the recommender from the filtered interactions (data_train) and the full log (data_full).
# NOTE(review): the constructor displays two progress bars, so it presumably fits its
# models on construction — confirm in src.recommenders.MainRecommender.
recommender = MainRecommender(data_train, data_full)



  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/2474 [00:00<?, ?it/s]

In [9]:
# Request 5 first-level ("own") recommendations; the displayed value is a dict
# mapping user_id -> list of 5 item_ids.
# NOTE(review): no user argument is passed, so this presumably covers every user
# the model knows about — confirm against get_own_recommendations.
res = recommender.get_own_recommendations(N=5)
res 

{1: [1031697, 977545, 1056775, 5577022, 856942],
 2: [1136486, 925437, 9707240, 961756, 9365106],
 3: [5564906, 904435, 1075979, 1136486, 1013528],
 4: [951821, 883932, 5568447, 1121367, 877523],
 5: [968992, 1120009, 1056775, 987044, 1131312],
 6: [8203834, 913406, 996269, 1132911, 1042021],
 7: [853354, 879689, 12731436, 988208, 932503],
 8: [5566809, 981660, 7144131, 1013503, 1021715],
 9: [980860, 9297474, 1068855, 1128244, 9707240],
 10: [1060872, 863762, 15596488, 1121059, 918846],
 12: [881706, 939323, 1026984, 1104414, 887618],
 13: [1011089, 950439, 9677874, 1070105, 862070],
 14: [878445, 1082310, 13876901, 1127838, 818981],
 15: [931757, 1042616, 1132911, 1082310, 910439],
 16: [1100533, 923149, 1084551, 9835695, 1035843],
 17: [1099164, 9707240, 12731432, 896085, 858373],
 18: [1000237, 938118, 9837501, 9677886, 870929],
 19: [905087, 837751, 1000236, 1038746, 858373],
 20: [999090, 945611, 993441, 953675, 821556],
 21: [934697, 912817, 1113381, 986327, 849297],
 22: [99581

In [31]:
# One row per distinct user in the full log, ordered by user_id, with a fresh 0..n-1 index.
result = (
    data_full[['user_id']]
    .drop_duplicates()
    .sort_values(by='user_id')
    .reset_index(drop=True)
)
result

Unnamed: 0,user_id
0,1
1,2
2,3
3,4
4,5
...,...
2494,2496
2495,2497
2496,2498
2497,2499


In [32]:
# Attach the first-level ("own") recommendations to every user in `result`.
# `res` is a dict keyed by user_id; users without an entry fall back to the
# top-popular items so that every row gets exactly one recommendation list.
own_recommendations = []
for user_id in result['user_id'].values:
    try:
        own_recommendations.append(res[user_id])
    except KeyError:
        # Only a missing user is an expected failure here; any other error
        # (or a KeyboardInterrupt) should surface instead of being swallowed.
        own_recommendations.append(
            recommender._extend_with_top_popular(recommendations=[], N=5)
        )

result['own_recommendations'] = own_recommendations
result

Unnamed: 0,user_id,own_recommendations
0,1,"[1031697, 977545, 1056775, 5577022, 856942]"
1,2,"[1136486, 925437, 9707240, 961756, 9365106]"
2,3,"[5564906, 904435, 1075979, 1136486, 1013528]"
3,4,"[951821, 883932, 5568447, 1121367, 877523]"
4,5,"[968992, 1120009, 1056775, 987044, 1131312]"
...,...,...
2494,2496,"[1013503, 996269, 5591083, 7441210, 957741]"
2495,2497,"[1127838, 908181, 1004436, 1119089, 961747]"
2496,2498,"[884039, 991747, 987044, 12949855, 858649]"
2497,2499,"[933303, 953675, 1131312, 1104414, 986021]"


In [33]:
# Run the recommender's second-level fit, passing the item and user feature tables.
# NOTE(review): the call prints a single float (0.5978...) — presumably a validation
# score; confirm which metric it is in src.recommenders.
recommender.fit_2_level(item_features_st2=item_features, user_features_st2=user_features)

0.5978461419590676


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [34]:
# Second-level predictions (N=5) requested for every user present in the filtered data.
stg_2_ = recommender.predict_2_level(
    data_train['user_id'].unique(),
    item_features_st2=item_features,
    user_features_st2=user_features,
    N=5,
)

In [35]:
# Attach the second-level recommendations to every user in `result`.
# Predictions were requested only for users in data_train, so any user missing
# from `stg_2_` falls back to the top-popular items.
two_stage_recommendations = []
for user_id in result['user_id'].values:
    try:
        two_stage_recommendations.append(stg_2_[user_id])
    except LookupError:
        # LookupError covers KeyError and IndexError, i.e. "no entry for this user"
        # whatever container predict_2_level returns; other errors must surface.
        two_stage_recommendations.append(
            recommender._extend_with_top_popular(recommendations=[], N=5)
        )

result['2stg_own'] = two_stage_recommendations
result

Unnamed: 0,user_id,own_recommendations,2stg_own
0,1,"[1031697, 977545, 1056775, 5577022, 856942]","[854496, 934369, 971585, 920200, 924423]"
1,2,"[1136486, 925437, 9707240, 961756, 9365106]","[961756, 850133, 927360, 9803601, 5568489]"
2,3,"[5564906, 904435, 1075979, 1136486, 1013528]","[1057113, 1089025, 1076161, 15741861, 1015597]"
3,4,"[951821, 883932, 5568447, 1121367, 877523]","[902172, 7025275, 10149640, 839747, 877523]"
4,5,"[968992, 1120009, 1056775, 987044, 1131312]","[987044, 9487553, 1081710, 935393, 1131321]"
...,...,...,...
2494,2496,"[1013503, 996269, 5591083, 7441210, 957741]","[880150, 1020823, 1120928, 972931, 991546]"
2495,2497,"[1127838, 908181, 1004436, 1119089, 961747]","[897125, 908181, 7166791, 1127838, 905087]"
2496,2498,"[884039, 991747, 987044, 12949855, 858649]","[15926927, 991747, 940090, 1031697, 5577022]"
2497,2499,"[933303, 953675, 1131312, 1104414, 986021]","[941797, 822346, 920145, 1060872, 1042021]"


In [36]:
# Baseline column: the same top-popular list for every user.
# The helper is called once per row so each row holds its own list object.
result['top_pop'] = [
    recommender._extend_with_top_popular(recommendations=[], N=5)
    for _ in range(len(result))
]
result

Unnamed: 0,user_id,own_recommendations,2stg_own,top_pop
0,1,"[1031697, 977545, 1056775, 5577022, 856942]","[854496, 934369, 971585, 920200, 924423]","[1082185, 6534178, 1029743, 995242, 1106523]"
1,2,"[1136486, 925437, 9707240, 961756, 9365106]","[961756, 850133, 927360, 9803601, 5568489]","[1082185, 6534178, 1029743, 995242, 1106523]"
2,3,"[5564906, 904435, 1075979, 1136486, 1013528]","[1057113, 1089025, 1076161, 15741861, 1015597]","[1082185, 6534178, 1029743, 995242, 1106523]"
3,4,"[951821, 883932, 5568447, 1121367, 877523]","[902172, 7025275, 10149640, 839747, 877523]","[1082185, 6534178, 1029743, 995242, 1106523]"
4,5,"[968992, 1120009, 1056775, 987044, 1131312]","[987044, 9487553, 1081710, 935393, 1131321]","[1082185, 6534178, 1029743, 995242, 1106523]"
...,...,...,...,...
2494,2496,"[1013503, 996269, 5591083, 7441210, 957741]","[880150, 1020823, 1120928, 972931, 991546]","[1082185, 6534178, 1029743, 995242, 1106523]"
2495,2497,"[1127838, 908181, 1004436, 1119089, 961747]","[897125, 908181, 7166791, 1127838, 905087]","[1082185, 6534178, 1029743, 995242, 1106523]"
2496,2498,"[884039, 991747, 987044, 12949855, 858649]","[15926927, 991747, 940090, 1031697, 5577022]","[1082185, 6534178, 1029743, 995242, 1106523]"
2497,2499,"[933303, 953675, 1131312, 1104414, 986021]","[941797, 822346, 920145, 1060872, 1042021]","[1082185, 6534178, 1029743, 995242, 1106523]"


In [19]:
# Tabulate the metric values stored in dict_loss, one column per recommendation column.
# NOTE(review): dict_loss is not defined in any cell visible here and this cell's
# execution count (19) is out of sequence — it will fail on Restart & Run All unless
# the metric computation (presumably via precision_at_k / recall_at_k) is restored above.
pd.DataFrame.from_dict(dict_loss) 

Unnamed: 0,own_recommendations,2stg_own,top_pop
recall_at_k,0.022274,0.01765,0.009126
precision_at_k,0.981032,0.932453,0.568467


In [38]:
# Keep only the user id and the two-stage recommendations for the final output.
final_result = result.loc[:, ['user_id', '2stg_own']]
final_result

Unnamed: 0,user_id,2stg_own
0,1,"[854496, 934369, 971585, 920200, 924423]"
1,2,"[961756, 850133, 927360, 9803601, 5568489]"
2,3,"[1057113, 1089025, 1076161, 15741861, 1015597]"
3,4,"[902172, 7025275, 10149640, 839747, 877523]"
4,5,"[987044, 9487553, 1081710, 935393, 1131321]"
...,...,...
2494,2496,"[880150, 1020823, 1120928, 972931, 991546]"
2495,2497,"[897125, 908181, 7166791, 1127838, 905087]"
2496,2498,"[15926927, 991747, 940090, 1031697, 5577022]"
2497,2499,"[941797, 822346, 920145, 1060872, 1042021]"


In [39]:
# Write the final recommendations to final_result.csv in the working directory.
# NOTE(review): to_csv writes the row index as an unnamed first column by default and
# serialises each list as its string repr — confirm the consumer expects that format.
final_result.to_csv('final_result.csv')