### USER-USER 協同過濾：轉換為最相似的顧客族群(USER-USER Similarity Matrix)，查看他們經常購買的商品，推薦給目前鎖定的顧客。 
- https://medium.datadriveninvestor.com/how-to-build-a-recommendation-system-for-purchase-data-step-by-step-d6d7a78800b6

In [1]:
import pandas as pd
import numpy as np
import time
import turicreate as tc
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('./data/transaction_data_v2.csv')
product_df = pd.read_csv('./data/product_v2.csv')
product_df1 = product_df[['PRODUCT_ID','COMMODITY_DESC']]
df_merge = df.merge(product_df1,on='PRODUCT_ID')
df = df_merge.copy()

In [3]:
household_product_count  = df.groupby(['COMMODITY_DESC','household_key']).QUANTITY.sum().reset_index().sort_values('COMMODITY_DESC',ascending=False) # Group together
household_count = df.groupby('household_key').BASKET_ID.nunique()
household_count = pd.DataFrame(household_count)
ratings = household_product_count.merge(household_count,on='household_key')
ratings['rating'] = (ratings['QUANTITY']/ratings['BASKET_ID'])

bins = [0,0.005,0.008,0.01,0.02,999]
def rating(value, bins):
    for i in range(len(bins) - 1):
        if value >= bins[i] and value <= bins[i+1]:
            return i+1
    return 1

ratings['rating_label'] = ratings.rating.apply(lambda i: rating(i, bins))
ratings

Unnamed: 0,COMMODITY_DESC,household_key,QUANTITY,BASKET_ID,rating,rating_label
0,YOGURT,2500,118,100,1.18,5
1,WATER - CARBONATED/FLVRD DRINK,2500,8,100,0.08,5
2,WAREHOUSE SNACKS,2500,31,100,0.31,5
3,VITAMINS,2500,6,100,0.06,5
4,VEGETABLES SALAD,2500,1,100,0.01,3
...,...,...,...,...,...,...
284167,COOKIES/CONES,1897,1,1,1.00,5
284168,CITRUS,1897,1,1,1.00,5
284169,BEERS/ALES,1897,1,1,1.00,5
284170,BAKING NEEDS,1897,1,1,1.00,5


In [4]:
data = ratings[['household_key', 'COMMODITY_DESC', 'rating_label']]
data.columns = ['customerId', 'productId', 'rating_label']

In [5]:
data.head()

Unnamed: 0,customerId,productId,rating_label
0,2500,YOGURT,5
1,2500,WATER - CARBONATED/FLVRD DRINK,5
2,2500,WAREHOUSE SNACKS,5
3,2500,VITAMINS,5
4,2500,VEGETABLES SALAD,3


In [6]:
df_matrix = pd.pivot_table(data, values='rating_label', index='customerId', columns='productId')

In [7]:
df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())

In [8]:
# create a table for input to the modeling  
d = df_matrix_norm.reset_index() 
d.index.names = ['scaled_purchase_freq'] 
data_norm = pd.melt(d, id_vars=['customerId'], value_name='scaled_purchase_freq').dropna()
print(data_norm.shape)
data_norm.head()

(284168, 3)


Unnamed: 0,customerId,productId,scaled_purchase_freq
1,2,ADULT INCONTINENCE,1.0
18,19,ADULT INCONTINENCE,0.0
25,26,ADULT INCONTINENCE,0.5
48,49,ADULT INCONTINENCE,1.0
51,52,ADULT INCONTINENCE,0.75


In [9]:
d

productId,customerId,ADULT INCONTINENCE,AIR CARE,ANALGESICS,ANTACIDS,APPAREL,APPLES,AUDIO/VIDEO PRODUCTS,AUTOMOTIVE PRODUCTS,BABY FOODS,...,VEAL,VEGETABLES - ALL OTHERS,VEGETABLES - SHELF STABLE,VEGETABLES SALAD,VITAMINS,WAREHOUSE SNACKS,WATCHES/CALCULATORS/LOBBY,WATER,WATER - CARBONATED/FLVRD DRINK,YOGURT
scaled_purchase_freq,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,,1.00,1.0,,,1.00,,,,...,,1.00,1.0,1.00,1.0,1.00,,,,
1,2,1.0,1.00,1.0,,,1.00,,,1.0,...,,1.00,1.0,,,1.00,,,1.0,1.0
2,3,,1.00,,,,,,,1.0,...,,,1.0,1.00,,1.00,,,1.0,1.0
3,4,,1.00,1.0,,,,,,,...,,,1.0,,,1.00,,,,
4,5,,1.00,,1.00,,,,,,...,,,1.0,,,,,1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,2496,,1.00,1.0,,,1.00,,,1.0,...,,1.00,1.0,1.00,,1.00,,,1.0,1.0
2496,2497,,1.00,1.0,0.00,,1.00,,0.75,1.0,...,,1.00,1.0,1.00,,0.00,,,1.0,1.0
2497,2498,,1.00,1.0,1.00,0.25,0.75,,,,...,,1.00,1.0,,,0.25,,,1.0,1.0
2498,2499,1.0,0.75,,0.75,,1.00,0.75,0.75,,...,,0.75,1.0,0.75,,1.00,,,1.0,1.0


In [10]:
def split_data(data):
    '''
    Splits dataset into training and test set.
    
    Args:
        data (pandas.DataFrame)
        
    Returns
        train_data (tc.SFrame)
        test_data (tc.SFrame)
    '''
    train, test = train_test_split(data, test_size = .2)
    train_data = tc.SFrame(train)
    test_data = tc.SFrame(test)
    return train_data, test_data

In [11]:
train_data, test_data = split_data(data)
# train_data_dummy, test_data_dummy = split_data(data_dummy)
train_data_norm, test_data_norm = split_data(data_norm)

In [12]:
customers = df['household_key'].reset_index()
customers = customers.drop('index',axis=1)
customers = customers.groupby('household_key').sum()
customers = customers.reset_index()
customers

Unnamed: 0,household_key
0,1
1,2
2,3
3,4
4,5
...,...
2495,2496
2496,2497
2497,2498
2498,2499


In [13]:
# constant variables to define field names include:
user_id = 'customerId'
item_id = 'productId'
users_to_recommend = list(customers['household_key'])
n_rec = 10 # number of items to recommend
n_display = 30 # to display the first few rows in an output dataset

In [14]:
def model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display):
    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target)
    elif name == 'cosine':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='cosine')
    elif name == 'pearson':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='pearson')
        
    recom = model.recommend(users=users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return model

### Popularity Model as Baseline
- The popularity model takes the most popular items for recommendation. These items are products with the highest number of sells across customers.
- Training data is used for model selection

#### Using purchase count

In [15]:
name = 'popularity'
target = 'rating_label'
popularity = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+---------------------------+--------------------+------+
| customerId |         productId         |       score        | rank |
+------------+---------------------------+--------------------+------+
|     1      |    FROZEN PACKAGE MEAT    |        5.0         |  1   |
|     1      |    FLUID MILK PRODUCTS    |  4.99120082815735  |  2   |
|     1      |        FROZEN PIZZA       | 4.928485576923077  |  3   |
|     1      |    CRACKERS/MISC BKD FD   |  4.91965811965812  |  4   |
|     1      |   ICE CREAM/MILK/SHERBTS  | 4.912968299711816  |  5   |
|     1      |     CONDIMENTS/SAUCES     | 4.912817551963048  |  6   |
|     1      |           YOGURT          | 4.895104895104895  |  7   |
|     1      |            CORN           | 4.893098782138025  |  8   |
|     1      |    MEAT - SHELF STABLE    | 4.885887913571911  |  9   |
|     1      |       TROPICAL FRUIT      | 4.8817080207732255 |  10  |
|     2      |    FROZEN PACKAGE MEAT    |        5.0         |  1   |
|     

#### Using purchase dummy

In [16]:
# name = 'popularity'
# target = 'purchase_dummy'
# pop_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

#### Using scaled purchase count

In [17]:
name = 'popularity'
target = 'scaled_purchase_freq'
pop_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+--------------------------------+--------------------+------+
| customerId |           productId            |       score        | rank |
+------------+--------------------------------+--------------------+------+
|     1      |             CHEESE             | 0.9965425531914893 |  1   |
|     1      |          FROZEN PIZZA          | 0.9808282208588958 |  2   |
|     1      |     SALD DRSNG/SNDWCH SPRD     | 0.9800693240901213 |  3   |
|     1      |             YOGURT             | 0.9748811948404617 |  4   |
|     1      |      MEAT - SHELF STABLE       | 0.9689793195463643 |  5   |
|     1      |              CORN              | 0.9688365650969529 |  6   |
|     1      |            CHICKEN             | 0.9622302158273381 |  7   |
|     1      |         DINNER MXS:DRY         | 0.9621418826739427 |  8   |
|     1      |   BEANS - CANNED GLASS & MW    | 0.9543808834178131 |  9   |
|     1      | WATER - CARBONATED/FLVRD DRINK | 0.9498680738786279 |  10  |
|     2     

### Cosine similarity

#### Using purchase count

In [18]:
name = 'cosine'
target = 'rating_label'
cos = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+--------------------------------+--------------------+------+
| customerId |           productId            |       score        | rank |
+------------+--------------------------------+--------------------+------+
|     1      |         TROPICAL FRUIT         | 2.8428714584845762 |  1   |
|     1      |       CONDIMENTS/SAUCES        | 2.841829036290829  |  2   |
|     1      |      FLUID MILK PRODUCTS       | 2.8319414481520653 |  3   |
|     1      |      CRACKERS/MISC BKD FD      | 2.7615692873413744 |  4   |
|     1      |            CHICKEN             | 2.7357007912718334 |  5   |
|     1      |          FROZEN PIZZA          | 2.7334124978918295 |  6   |
|     1      |     ICE CREAM/MILK/SHERBTS     | 2.704537934408738  |  7   |
|     1      |     DRY BN/VEG/POTATO/RICE     | 2.6994249717547345 |  8   |
|     1      | WATER - CARBONATED/FLVRD DRINK | 2.536541764552777  |  9   |
|     1      |             YOGURT             | 2.535817356636891  |  10  |
|     2     

#### Using purchase dummy

In [19]:
# name = 'cosine'
# target = 'purchase_dummy'
# cos_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

#### Using scaled purchase count

In [20]:
name = 'cosine' 
target = 'scaled_purchase_freq' 
cos_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+--------------------------------+---------------------+------+
| customerId |           productId            |        score        | rank |
+------------+--------------------------------+---------------------+------+
|     1      |             CHEESE             |  0.5523866529416557 |  1   |
|     1      |     SALD DRSNG/SNDWCH SPRD     |  0.5508193277349376 |  2   |
|     1      |          FROZEN PIZZA          |  0.5402008584051421 |  3   |
|     1      |             YOGURT             |  0.536537619552227  |  4   |
|     1      |            CHICKEN             |  0.5235590392893011 |  5   |
|     1      |         DINNER MXS:DRY         |  0.5075852473576864 |  6   |
|     1      | WATER - CARBONATED/FLVRD DRINK | 0.49479356558635984 |  7   |
|     1      |      MEAT - SHELF STABLE       | 0.47689048388991695 |  8   |
|     1      |   BEANS - CANNED GLASS & MW    |  0.4752017632879392 |  9   |
|     1      |            TOMATOES            | 0.46980964716034707 |  10  |

### Pearson

#### Using purchase count

In [21]:
name = 'pearson'
target = 'rating_label'
pear = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+---------------------------+--------------------+------+
| customerId |         productId         |       score        | rank |
+------------+---------------------------+--------------------+------+
|     1      |    FROZEN PACKAGE MEAT    |        5.0         |  1   |
|     1      |    FLUID MILK PRODUCTS    | 4.990846012023186  |  2   |
|     1      |        FROZEN PIZZA       |  4.92831855668472  |  3   |
|     1      |    CRACKERS/MISC BKD FD   | 4.922238245975771  |  4   |
|     1      |     CONDIMENTS/SAUCES     | 4.9193577042789585 |  5   |
|     1      |   ICE CREAM/MILK/SHERBTS  | 4.915592671015762  |  6   |
|     1      |           YOGURT          | 4.898867255480554  |  7   |
|     1      |            CORN           | 4.898155670003097  |  8   |
|     1      |       TROPICAL FRUIT      | 4.890460306061152  |  9   |
|     1      |    MEAT - SHELF STABLE    | 4.888505079018196  |  10  |
|     2      |    FROZEN PACKAGE MEAT    |        5.0         |  1   |
|     

#### Using purchase dummy

In [22]:
# name = 'pearson'
# target = 'purchase_dummy'
# pear_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

#### Using scaled purchase count

In [23]:
name = 'pearson'
target = 'scaled_purchase_freq'
pear_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+--------------------------------+--------------------+------+
| customerId |           productId            |       score        | rank |
+------------+--------------------------------+--------------------+------+
|     1      |             CHEESE             | 0.996637246307957  |  1   |
|     1      |     SALD DRSNG/SNDWCH SPRD     | 0.9816025596544989 |  2   |
|     1      |          FROZEN PIZZA          | 0.9810123353921374 |  3   |
|     1      |             YOGURT             | 0.9758493711475617 |  4   |
|     1      |      MEAT - SHELF STABLE       | 0.9701598732234702 |  5   |
|     1      |              CORN              | 0.9694146566990869 |  6   |
|     1      |         DINNER MXS:DRY         | 0.9643330607983018 |  7   |
|     1      |            CHICKEN             | 0.9634875493179212 |  8   |
|     1      |   BEANS - CANNED GLASS & MW    | 0.9555942003833974 |  9   |
|     1      | WATER - CARBONATED/FLVRD DRINK | 0.9511514889168037 |  10  |
|     2     

### Model Evaluation

In [24]:
models_w_counts = [popularity, cos, pear]
# models_w_dummy = [pop_dummy, cos_dummy, pear_dummy]
models_w_norm = [pop_norm, cos_norm, pear_norm]
names_w_counts = ['Popularity Model on rating_label', 'Cosine Similarity on rating_label', 'Pearson Similarity on rating_label']
# names_w_dummy = ['Popularity Model on Purchase Dummy', 'Cosine Similarity on Purchase Dummy', 'Pearson Similarity on Purchase Dummy']
names_w_norm = ['Popularity Model on Scaled rating_label', 'Cosine Similarity on Scaled rating_label', 'Pearson Similarity on Scaled rating_label']

In [25]:
eval_counts = tc.recommender.util.compare_models(test_data, models_w_counts, model_names=names_w_counts)
# eval_dummy = tc.recommender.util.compare_models(test_data_dummy, models_w_dummy, model_names=names_w_dummy)
eval_norm = tc.recommender.util.compare_models(test_data_norm, models_w_norm, model_names=names_w_norm)

PROGRESS: Evaluate model Popularity Model on rating_label



Precision and recall summary statistics by cutoff
+--------+-----------------------+------------------------+
| cutoff |     mean_precision    |      mean_recall       |
+--------+-----------------------+------------------------+
|   1    | 0.0004004805766920302 | 4.0048057669203065e-05 |
|   2    |   0.4185022026431718  |  0.041249083554066146  |
|   3    |   0.527966893605661   |  0.07721324450058477   |
|   4    |   0.5653784541449746  |  0.10768869726951155   |
|   5    |   0.5722066479775735  |   0.1338021652283932   |
|   6    |   0.5716860232278733  |   0.1591690045187902   |
|   7    |   0.5658218433548828  |  0.18204935405003594   |
|   8    |   0.5600220264317178  |  0.20406836883604398   |
|   9    |   0.5569794864949054  |   0.2271839480639546   |
|   10   |   0.5509010812975565  |  0.24824825004751613   |
+--------+-----------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 0.8858243572950354

Per User RMSE (best)
+------------+----------------


Precision and recall summary statistics by cutoff
+--------+--------------------+---------------------+
| cutoff |   mean_precision   |     mean_recall     |
+--------+--------------------+---------------------+
|   1    | 0.8185822987585104 | 0.04027900496891527 |
|   2    | 0.7827392871445735 | 0.07518207140215849 |
|   3    | 0.7586437057802692 | 0.10769211522605464 |
|   4    | 0.7329795754905888 | 0.13729781748232883 |
|   5    | 0.7102122547056471 | 0.16369244965919583 |
|   6    | 0.6895608063008944 | 0.18888669251458634 |
|   7    | 0.6709766004920185 | 0.21251350358169582 |
|   8    | 0.654435322386864  | 0.23558954907743324 |
|   9    | 0.6386775241400785 | 0.25737354267968526 |
|   10   | 0.6256708049659586 | 0.27851687863798796 |
+--------+--------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 3.617606163231985

Per User RMSE (best)
+------------+--------------------+-------+
| customerId |        rmse        | count |
+------------+-------------


Precision and recall summary statistics by cutoff
+--------+------------------------+----------------------+
| cutoff |     mean_precision     |     mean_recall      |
+--------+------------------------+----------------------+
|   1    | 0.00040048057669203027 | 4.00480576692031e-05 |
|   2    |   0.4169002803364037   | 0.04116994673569538  |
|   3    |   0.5285008677079165   | 0.07714509164358767  |
|   4    |   0.5638766519823791   | 0.10759033532270383  |
|   5    |   0.5729275130156184   |  0.1342445144933855  |
|   6    |   0.5698838606327588   |  0.1585184664267835  |
|   7    |   0.5649064591795863   | 0.18199687928327513  |
|   8    |   0.5591209451341609   | 0.20356427496167104  |
|   9    |   0.5571574778623241   | 0.22647221808541004  |
|   10   |   0.5515418502202647   | 0.24839130199960813  |
+--------+------------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.8774369208055748

Per User RMSE (best)
+------------+----------------------+-------


Precision and recall summary statistics by cutoff
+--------+--------------------+---------------------+
| cutoff |   mean_precision   |     mean_recall     |
+--------+--------------------+---------------------+
|   1    | 0.8385416666666671 | 0.04192960837987082 |
|   2    | 0.7942708333333334 | 0.07763313906272733 |
|   3    | 0.7522702991452993 | 0.10781860154790006 |
|   4    | 0.719050480769231  |  0.1349202348189318 |
|   5    | 0.6901442307692305 |  0.1599670641553645 |
|   6    | 0.6676014957264961 | 0.18428788256371573 |
|   7    | 0.6468635531135529 | 0.20691424635129854 |
|   8    | 0.6305588942307694 |  0.2292962859859617 |
|   9    | 0.6150730056980055 |  0.2502451519024278 |
|   10   | 0.6032451923076929 |  0.2710056921118312 |
+--------+--------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 0.2253293330215758

Per User RMSE (best)
+------------+----------------------+-------+
| customerId |         rmse         | count |
+------------+--------


Precision and recall summary statistics by cutoff
+--------+--------------------+---------------------+
| cutoff |   mean_precision   |     mean_recall     |
+--------+--------------------+---------------------+
|   1    | 0.7796474358974359 | 0.03692896434429936 |
|   2    | 0.7526041666666667 | 0.07016895928739753 |
|   3    | 0.7361111111111108 | 0.10167691920627539 |
|   4    | 0.7187500000000007 | 0.13156457837173915 |
|   5    | 0.7084935897435897 | 0.16135005873106964 |
|   6    | 0.6926415598290584 | 0.18805566110092675 |
|   7    | 0.6800595238095245 | 0.21453085431056879 |
|   8    | 0.665564903846154  | 0.23865414134361254 |
|   9    | 0.6528222934472929 | 0.26224122376490666 |
|   10   | 0.6403445512820513 |  0.2853330549016568 |
+--------+--------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 0.7126898986096364

Per User RMSE (best)
+------------+--------------------+-------+
| customerId |        rmse        | count |
+------------+------------


Precision and recall summary statistics by cutoff
+--------+--------------------+---------------------+
| cutoff |   mean_precision   |     mean_recall     |
+--------+--------------------+---------------------+
|   1    | 0.8365384615384613 | 0.04152113395168606 |
|   2    | 0.7930689102564101 | 0.07740018914270937 |
|   3    | 0.7508012820512819 |  0.1073109118613088 |
|   4    | 0.7195512820512822 |  0.1352722620696937 |
|   5    | 0.6906250000000002 | 0.16031012861379282 |
|   6    | 0.6672008547008541 | 0.18396167165738062 |
|   7    | 0.6484088827838824 | 0.20767864790474064 |
|   8    | 0.6308593749999997 | 0.22953558216703138 |
|   9    | 0.6164529914529916 | 0.25096936404335524 |
|   10   | 0.6040464743589745 |  0.2715917660733454 |
+--------+--------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 0.22315770605837484

Per User RMSE (best)
+------------+-----------------------+-------+
| customerId |          rmse         | count |
+------------+-----

In [26]:
# target='purchase_dummy' replace to target= None

final_model = tc.item_similarity_recommender.create(tc.SFrame(data_norm), 
                                            user_id=user_id, 
                                            item_id=item_id, 
                                            similarity_type="pearson")
recom = final_model.recommend(users=users_to_recommend, k=n_rec)
recom.print_rows(n_display)

+------------+--------------------------------+--------------------+------+
| customerId |           productId            |       score        | rank |
+------------+--------------------------------+--------------------+------+
|     1      |          FROZEN PIZZA          | 0.847044233154196  |  1   |
|     1      |            CHICKEN             | 0.7800744109136006 |  2   |
|     1      | WATER - CARBONATED/FLVRD DRINK | 0.7792476229847044 |  3   |
|     1      |             YOGURT             | 0.7333608929309632 |  4   |
|     1      |         DINNER SAUSAGE         | 0.7201322860686235 |  5   |
|     1      |  BREAKFAST SAUSAGE/SANDWICHES  | 0.6978090119884249 |  6   |
|     1      |          MEAT - MISC           | 0.6647374948325755 |  7   |
|     1      | GREETING CARDS/WRAP/PARTY SPLY | 0.6622571310458867 |  8   |
|     1      |           BEERS/ALES           | 0.6622571310458867 |  9   |
|     1      |          PAPER TOWELS          | 0.6606035551880942 |  10  |
|     2     

In [27]:
df_rec = recom.to_dataframe()
print(df_rec.shape)
df_rec

(25000, 4)


Unnamed: 0,customerId,productId,score,rank
0,1,FROZEN PIZZA,0.847044,1
1,1,CHICKEN,0.780074,2
2,1,WATER - CARBONATED/FLVRD DRINK,0.779248,3
3,1,YOGURT,0.733361,4
4,1,DINNER SAUSAGE,0.720132,5
...,...,...,...,...
24995,2500,SALAD BAR,0.474576,6
24996,2500,AIR CARE,0.460934,7
24997,2500,PREPARED FOOD,0.406366,8
24998,2500,CIGARETTES,0.403886,9


In [28]:
def create_output(model, users_to_recommend, n_rec, print_csv=True):
    recomendation = model.recommend(users=users_to_recommend, k=n_rec)
    df_rec = recomendation.to_dataframe()
    df_rec['recommendedProducts'] = df_rec.groupby([user_id])[item_id] \
        .transform(lambda x: ','.join(x.astype(str)))
    df_output = df_rec[['customerId', 'recommendedProducts']].drop_duplicates() \
        .sort_values('customerId').set_index('customerId')
    if print_csv:
        df_output.to_csv('./data/option1_recommendation.csv')
        print("An output file can be found in 'output' folder with name 'option1_recommendation.csv'")
    return df_output

In [29]:
# pear_norm replace to final_model 

df_output = create_output(final_model, users_to_recommend, n_rec, print_csv=True)
print(df_output.shape)
df_output.head()

An output file can be found in 'output' folder with name 'option1_recommendation.csv'
(2500, 1)


Unnamed: 0_level_0,recommendedProducts
customerId,Unnamed: 1_level_1
1,"FROZEN PIZZA,CHICKEN,WATER - CARBONATED/FLVRD ..."
2,"MEAT - SHELF STABLE,DINNER SAUSAGE,BEANS - CAN..."
3,"EGGS,VEGETABLES - ALL OTHERS,SALAD MIX,APPLES,..."
4,"SOUP,TROPICAL FRUIT,CANNED JUICES,POTATOES,ONI..."
5,"SOFT DRINKS,EGGS,TROPICAL FRUIT,ICE CREAM/MILK..."


In [30]:
def customer_recomendation(customer_id):
    if customer_id not in df_output.index:
        print('Customer not found.')
        return customer_id
    return df_output.loc[customer_id]

In [31]:
customer_recomendation(1)

recommendedProducts    FROZEN PIZZA,CHICKEN,WATER - CARBONATED/FLVRD ...
Name: 1, dtype: object