In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import time
import turicreate as tc
from sklearn.model_selection import train_test_split

import sys
sys.path.append("..")
#sudo apt-get install libatlas-base-dev

In [2]:
customers = pd.read_csv('data/recommend_1.csv')
transactions = pd.read_csv('data/trx_data.csv')

In [3]:
customers.head()

Unnamed: 0,customerId
0,1553
1,20400
2,19750
3,6334
4,27773


In [4]:
customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 1 columns):
customerId    1000 non-null int64
dtypes: int64(1)
memory usage: 7.9 KB


In [5]:
transactions.head(20)

Unnamed: 0,customerId,products
0,0,20
1,1,2|2|23|68|68|111|29|86|107|152
2,2,111|107|29|11|11|11|33|23
3,3,164|227
4,5,2|2
5,6,144|144|55|266
6,7,135|206|259
7,8,79|8|8|48
8,9,102|2|2|297
9,10,84|77|290|260


In [6]:
transactions['products'] = transactions['products'].apply(lambda x: [int(i) for i in x.split('|')])

In [7]:
transactions.head()

Unnamed: 0,customerId,products
0,0,[20]
1,1,"[2, 2, 23, 68, 68, 111, 29, 86, 107, 152]"
2,2,"[111, 107, 29, 11, 11, 11, 33, 23]"
3,3,"[164, 227]"
4,5,"[2, 2]"


In [8]:
#Data Preperation
data = pd.melt(transactions.set_index('customerId')['products'].apply(pd.Series).reset_index(), 
             id_vars=['customerId'],
             value_name='products')\
    .dropna().drop(['variable'], axis=1) \
    .groupby(['customerId', 'products']) \
    .agg({'products': 'count'}) \
    .rename(columns={'products': 'purchase_count'}) \
    .reset_index() \
    .rename(columns={'products': 'productId'})
data['productId'] = data['productId'].astype(np.int64)
data.head()

Unnamed: 0,customerId,productId,purchase_count
0,0,1,2
1,0,13,1
2,0,19,3
3,0,20,1
4,0,31,2


In [9]:
def create_data_dummy(data):
    data_dummy = data.copy()
    data_dummy['purchase_dummy'] = 1
    return data_dummy

data_dummy = create_data_dummy(data)
data_dummy.head()

Unnamed: 0,customerId,productId,purchase_count,purchase_dummy
0,0,1,2,1
1,0,13,1,1
2,0,19,3,1
3,0,20,1,1
4,0,31,2,1


In [10]:
#Normalize
df_matrix = pd.pivot_table(data, values = 'purchase_count', index='customerId', columns='productId')
df_matrix.head()

productId,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
customerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,2.0,,,,,,,,,...,,,,,,,,,,
1,,,6.0,,,,,,,,...,,,,1.0,,,1.0,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [11]:
df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())
df_matrix_norm

productId,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
customerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,0.1,,,,,,,,,...,,,,,,,,,,
1,,,0.166667,,,,,,,,...,,,,0.0,,,0.000000,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,0.033333,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,0.166667,0.000000,...,,,,,,,,,,
9,0.133333,0.1,0.166667,,,,,,,,...,,,,,,,,0.2,,


In [12]:
#create a table for input modelling

d = df_matrix_norm.reset_index()
d.index_names = ['scaled_purchase_freq']
data_norm = pd.melt(d, id_vars=['customerId'], value_name='scaled_purchase_freq').dropna()

  after removing the cwd from sys.path.


In [13]:
data_norm.head(20)

Unnamed: 0,customerId,productId,scaled_purchase_freq
9,9,0,0.133333
25,25,0,0.133333
32,33,0,0.133333
35,36,0,0.133333
43,44,0,0.133333
55,56,0,0.133333
59,60,0,0.0
71,72,0,0.333333
75,76,0,0.133333
90,91,0,0.0


In [14]:
#  Above step can be combined to a function 

def normalize_data(data):
    df_matrix = pd.pivot_table(data, values='purchase_count', index='customerId', columns='productId')
    df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())
    d = df_matrix_norm.reset_index()
    d.index.names = ['scaled_purchase_freq']
    return pd.melt(d, id_vars=['customerId'], value_name='scaled_purchase_freq').dropna()

In [15]:
def split_data(data):
    train, test = train_test_split(data, test_size=0.2)
    train_data = tc.SFrame(train)
    test_data = tc.SFrame(test)
    return train_data, test_data

train_data, test_data = split_data(data)
train_data_dummy, test_data_dummy = split_data(data_dummy)
train_data_norm, test_data_norm = split_data(data_norm)

In [27]:
train, test = train_test_split(data, test_size = .2)
print(train.shape, test.shape)

(106868, 3) (26717, 3)


In [28]:
# Using turicreate library, we convert dataframe to SFrame - this will be useful in the modeling part

train_data = tc.SFrame(train)
test_data = tc.SFrame(test)

In [29]:
train_data

customerId,productId,purchase_count
18552,1,2
6710,2,2
1069,162,1
5547,15,1
241,210,1
19263,189,2
8862,16,1
13353,115,3
16121,86,1
1830,117,3


In [30]:
test_data

customerId,productId,purchase_count
17526,273,1
12795,10,3
15348,208,1
9225,147,1
4562,147,1
13107,29,1
22502,148,1
591,241,1
1823,15,2
11246,77,1


In [19]:
#constant variables to define field names include

user_id = 'customerId'
item_id = 'productId'

users_to_recommend = list(customers[user_id])
n_rec=10
n_display = 30

In [17]:
def model(train_data, name, user_id, item_id, target, users_recommend, n_rec, n_display):
    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data, user_id = user_id, item_id = item_id, target= target)
    elif name == 'cosine':
        model = tc.item_similarity_recommender.create(train_data, user_id = user_id, item_id = item_id, target = target, similarity_type = 'cosine')
    elif name == 'pearson':
        model = tc.item_similarity_recommender.create(train_data, user_id = user_id, item_id = item_id, target = target, similarity_type = 'pearson')
    recom = model.recommend(users = users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return model

In [None]:
#medium/items-recommender/scripts/data_layer.py - python script for above functions

In [42]:
#i. Using popularity count
name = 'popularity'
target = 'purchase_count'
popularity_model = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)
popularity_model

+------------+-----------+--------------------+------+
| customerId | productId |       score        | rank |
+------------+-----------+--------------------+------+
|    1553    |    132    | 3.2542372881355934 |  1   |
|    1553    |    248    | 3.1777777777777776 |  2   |
|    1553    |     34    | 3.045801526717557  |  3   |
|    1553    |     37    | 3.037037037037037  |  4   |
|    1553    |     0     |  2.98567335243553  |  5   |
|    1553    |     3     | 2.8282208588957056 |  6   |
|    1553    |     27    | 2.738095238095238  |  7   |
|    1553    |     32    | 2.625592417061611  |  8   |
|    1553    |    110    | 2.623529411764706  |  9   |
|    1553    |    230    | 2.612676056338028  |  10  |
|   20400    |    132    | 3.2542372881355934 |  1   |
|   20400    |    248    | 3.1777777777777776 |  2   |
|   20400    |     34    | 3.045801526717557  |  3   |
|   20400    |     37    | 3.037037037037037  |  4   |
|   20400    |     0     |  2.98567335243553  |  5   |
|   20400 

Class                            : PopularityRecommender

Schema
------
User ID                          : customerId
Item ID                          : productId
Target                           : purchase_count
Additional observation features  : 0
User side features               : []
Item side features               : []

Statistics
----------
Number of observations           : 106868
Number of users                  : 23327
Number of items                  : 300

Training summary
----------------
Training time                    : 0.0192

Model Parameters
----------------
Model class                      : PopularityRecommender

In [21]:
#ii. Using purchase dummy
name = 'popularity'
target = 'purchase_dummy'
pop_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)
pop_dummy

+------------+-----------+-------+------+
| customerId | productId | score | rank |
+------------+-----------+-------+------+
|    1553    |     16    |  1.0  |  1   |
|    1553    |     43    |  1.0  |  2   |
|    1553    |     37    |  1.0  |  3   |
|    1553    |    284    |  1.0  |  4   |
|    1553    |     1     |  1.0  |  5   |
|    1553    |     10    |  1.0  |  6   |
|    1553    |     25    |  1.0  |  7   |
|    1553    |    231    |  1.0  |  8   |
|    1553    |    172    |  1.0  |  9   |
|    1553    |     81    |  1.0  |  10  |
|   20400    |     16    |  1.0  |  1   |
|   20400    |     43    |  1.0  |  2   |
|   20400    |     37    |  1.0  |  3   |
|   20400    |    284    |  1.0  |  4   |
|   20400    |     1     |  1.0  |  5   |
|   20400    |     10    |  1.0  |  6   |
|   20400    |     25    |  1.0  |  7   |
|   20400    |    231    |  1.0  |  8   |
|   20400    |    172    |  1.0  |  9   |
|   20400    |     81    |  1.0  |  10  |
|   19750    |     16    |  1.0  |

Class                            : PopularityRecommender

Schema
------
User ID                          : customerId
Item ID                          : productId
Target                           : purchase_dummy
Additional observation features  : 0
User side features               : []
Item side features               : []

Statistics
----------
Number of observations           : 106868
Number of users                  : 23294
Number of items                  : 300

Training summary
----------------
Training time                    : 0.0079

Model Parameters
----------------
Model class                      : PopularityRecommender

In [22]:
#iii. Scaled purchase count
name = 'popularity'
target = 'scaled_purchase_freq'
pop_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)
pop_norm

+------------+-----------+---------------------+------+
| customerId | productId |        score        | rank |
+------------+-----------+---------------------+------+
|    1553    |    226    |  0.7894736842105263 |  1   |
|    1553    |    247    |  0.3408521303258146 |  2   |
|    1553    |    230    |  0.327407407407407  |  3   |
|    1553    |    125    | 0.26428571428571396 |  4   |
|    1553    |    248    |         0.25        |  5   |
|    1553    |    204    | 0.23478260869565204 |  6   |
|    1553    |    276    | 0.23412698412698413 |  7   |
|    1553    |    294    | 0.22635658914728654 |  8   |
|    1553    |    155    | 0.22181818181818175 |  9   |
|    1553    |    129    |  0.2163265306122449 |  10  |
|   20400    |    226    |  0.7894736842105263 |  1   |
|   20400    |    247    |  0.3408521303258146 |  2   |
|   20400    |    230    |  0.327407407407407  |  3   |
|   20400    |    125    | 0.26428571428571396 |  4   |
|   20400    |    248    |         0.25        |

Class                            : PopularityRecommender

Schema
------
User ID                          : customerId
Item ID                          : productId
Target                           : scaled_purchase_freq
Additional observation features  : 0
User side features               : []
Item side features               : []

Statistics
----------
Number of observations           : 106868
Number of users                  : 23305
Number of items                  : 300

Training summary
----------------
Training time                    : 0.0087

Model Parameters
----------------
Model class                      : PopularityRecommender

In [25]:
train_data.head()

customerId,productId,purchase_count
8416,150,1
16767,147,1
4614,195,1
712,61,1
13140,49,3
299,44,1
11508,1,1
26010,54,2
1667,160,1
880,66,1


In [34]:
train.groupby(by=item_id)['purchase_count'].mean().sort_values(ascending=False).head(20)

productId
132    3.254237
248    3.177778
34     3.045802
37     3.037037
0      2.985673
3      2.828221
27     2.738095
32     2.625592
110    2.623529
230    2.612676
10     2.610465
226    2.600000
129    2.457627
87     2.429150
58     2.379310
54     2.375451
68     2.372881
91     2.362500
6      2.354452
294    2.346457
Name: purchase_count, dtype: float64

In [None]:
# create a item-to-item similarity matrix - cosine similarity

In [35]:
#i. Purchase count

name = 'cosine'
target = 'purchase_count'
cos = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+----------------------+------+
| customerId | productId |        score         | rank |
+------------+-----------+----------------------+------+
|    1553    |     35    | 0.11329883337020874  |  1   |
|    1553    |     1     | 0.05576533079147339  |  2   |
|    1553    |     5     | 0.05559852719306946  |  3   |
|    1553    |    148    | 0.05437970161437988  |  4   |
|    1553    |     17    | 0.05338183045387268  |  5   |
|    1553    |     55    | 0.05144381523132324  |  6   |
|    1553    |     2     | 0.050361692905426025 |  7   |
|    1553    |    166    | 0.049895524978637695 |  8   |
|    1553    |     15    | 0.04759383201599121  |  9   |
|    1553    |     12    | 0.04614603519439697  |  10  |
|   20400    |    280    | 0.07918918132781982  |  1   |
|   20400    |    122    | 0.050938189029693604 |  2   |
|   20400    |    246    | 0.04454457759857178  |  3   |
|   20400    |     1     | 0.04306638240814209  |  4   |
|   20400    |    182    | 0.03

In [36]:
#i. Purchase dummy

name = 'cosine'
target = 'purchase_dummy'
cos_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+----------------------+------+
| customerId | productId |        score         | rank |
+------------+-----------+----------------------+------+
|    1553    |     2     | 0.09529401063919067  |  1   |
|    1553    |     35    | 0.08295763731002807  |  2   |
|    1553    |     1     | 0.08063027858734131  |  3   |
|    1553    |     5     | 0.07500816583633423  |  4   |
|    1553    |     17    | 0.06457124948501587  |  5   |
|    1553    |     21    | 0.060996949672698975 |  6   |
|    1553    |     8     | 0.05455324649810791  |  7   |
|    1553    |     33    |  0.049656081199646   |  8   |
|    1553    |     61    | 0.049227237701416016 |  9   |
|    1553    |     20    | 0.04849923849105835  |  10  |
|   20400    |     26    | 0.05866742134094238  |  1   |
|   20400    |    215    | 0.04435950517654419  |  2   |
|   20400    |     1     |  0.0440828800201416  |  3   |
|   20400    |    122    | 0.041327714920043945 |  4   |
|   20400    |    273    | 0.04

In [37]:
#i. Purchase count
name = 'cosine'
target = 'scaled_purchase_freq'
cos_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-----------------------+------+
| customerId | productId |         score         | rank |
+------------+-----------+-----------------------+------+
|    1553    |     31    |          0.0          |  1   |
|    1553    |    227    |          0.0          |  2   |
|    1553    |    103    |          0.0          |  3   |
|    1553    |    221    |          0.0          |  4   |
|    1553    |     57    |          0.0          |  5   |
|    1553    |     17    |          0.0          |  6   |
|    1553    |     16    |          0.0          |  7   |
|    1553    |     40    |          0.0          |  8   |
|    1553    |     41    |          0.0          |  9   |
|    1553    |     39    |          0.0          |  10  |
|   20400    |     2     |  0.003693275451660156 |  1   |
|   20400    |     1     | 0.0036584246158599854 |  2   |
|   20400    |     8     | 0.0024842584133148195 |  3   |
|   20400    |     0     | 0.0019620835781097414 |  4   |
|   20400    |

In [38]:
# these variables will change accordingly
name = 'pearson'
target = 'purchase_count'
pear = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+--------------------+------+
| customerId | productId |       score        | rank |
+------------+-----------+--------------------+------+
|    1553    |    132    | 3.254237288135593  |  1   |
|    1553    |    248    | 3.177777777777778  |  2   |
|    1553    |     34    | 3.0458015267175567 |  3   |
|    1553    |     37    | 3.0347237807733047 |  4   |
|    1553    |     0     | 2.9825299524816904 |  5   |
|    1553    |     3     | 2.8282208588957074 |  6   |
|    1553    |     27    | 2.738095238095239  |  7   |
|    1553    |     32    | 2.6255924170616103 |  8   |
|    1553    |    230    | 2.6126760563380294 |  9   |
|    1553    |     10    | 2.6104651162790704 |  10  |
|   20400    |    132    | 3.244058960575168  |  1   |
|   20400    |    248    | 3.177777777777778  |  2   |
|   20400    |     34    | 3.0458015267175567 |  3   |
|   20400    |     37    | 3.0370370370370376 |  4   |
|   20400    |     0     | 2.9842336618456278 |  5   |
|   20400 

In [39]:

# these variables will change accordingly
name = 'pearson'
target = 'purchase_dummy'
pear_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-------+------+
| customerId | productId | score | rank |
+------------+-----------+-------+------+
|    1553    |     16    |  0.0  |  1   |
|    1553    |     43    |  0.0  |  2   |
|    1553    |     37    |  0.0  |  3   |
|    1553    |    284    |  0.0  |  4   |
|    1553    |     1     |  0.0  |  5   |
|    1553    |     10    |  0.0  |  6   |
|    1553    |     25    |  0.0  |  7   |
|    1553    |    231    |  0.0  |  8   |
|    1553    |    172    |  0.0  |  9   |
|    1553    |     81    |  0.0  |  10  |
|   20400    |     16    |  0.0  |  1   |
|   20400    |     43    |  0.0  |  2   |
|   20400    |     37    |  0.0  |  3   |
|   20400    |    284    |  0.0  |  4   |
|   20400    |     1     |  0.0  |  5   |
|   20400    |     10    |  0.0  |  6   |
|   20400    |     25    |  0.0  |  7   |
|   20400    |    231    |  0.0  |  8   |
|   20400    |    172    |  0.0  |  9   |
|   20400    |     81    |  0.0  |  10  |
|   19750    |     16    |  0.0  |

In [40]:
name = 'pearson'
target = 'scaled_purchase_freq'
pear_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+---------------------+------+
| customerId | productId |        score        | rank |
+------------+-----------+---------------------+------+
|    1553    |    226    |  0.7894736842105263 |  1   |
|    1553    |    247    | 0.34085213032581446 |  2   |
|    1553    |    230    | 0.32705511415446253 |  3   |
|    1553    |    125    |  0.2642857142857142 |  4   |
|    1553    |    248    |  0.249865010380745  |  5   |
|    1553    |    204    |  0.2347826086956522 |  6   |
|    1553    |    276    | 0.23375756943982748 |  7   |
|    1553    |    294    | 0.22623185152693304 |  8   |
|    1553    |    155    | 0.22181818181818183 |  9   |
|    1553    |    129    | 0.21627133671118282 |  10  |
|   20400    |    226    |  0.7894544664809579 |  1   |
|   20400    |    247    | 0.34084404197551843 |  2   |
|   20400    |    230    | 0.32739911401713345 |  3   |
|   20400    |    125    | 0.26427062477384283 |  4   |
|   20400    |    248    |  0.2499819219112397 |

In [43]:
# create initial callable variables

models_w_counts = [popularity_model, cos, pear]
models_w_dummy = [pop_dummy, cos_dummy, pear_dummy]
models_w_norm = [pop_norm, cos_norm, pear_norm]

names_w_counts = ['Popularity Model on Purchase Counts', 'Cosine Similarity on Purchase Counts', 'Pearson Similarity on Purchase Counts']
names_w_dummy = ['Popularity Model on Purchase Dummy', 'Cosine Similarity on Purchase Dummy', 'Pearson Similarity on Purchase Dummy']
names_w_norm = ['Popularity Model on Scaled Purchase Counts', 'Cosine Similarity on Scaled Purchase Counts', 'Pearson Similarity on Scaled Purchase Counts']

In [44]:
eval_counts = tc.recommender.util.compare_models(test_data, models_w_counts, model_names=names_w_counts)

eval_dummy = tc.recommender.util.compare_models(test_data_dummy, models_w_dummy, model_names=names_w_dummy)

eval_norm = tc.recommender.util.compare_models(test_data_norm, models_w_norm, model_names=names_w_norm)

PROGRESS: Evaluate model Popularity Model on Purchase Counts



Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    | 0.0010815487778498831 | 0.0006509321348170557 |
|   2    | 0.0008652390222799057 | 0.0009429503048365211 |
|   3    | 0.0024995793976975078 |  0.003752917034336046 |
|   4    |  0.003100439829836327 |  0.006304256266402051 |
|   5    |  0.00635950681375731  |  0.017245448229116785 |
|   6    |  0.006441223832528172 |  0.020737734897912457 |
|   7    |  0.005881565258593161 |  0.02216858387364542  |
|   8    |  0.005542937486480655 |  0.024391111988451376 |
|   9    |  0.005263537385536082 |  0.025901760952052884 |
|   10   |  0.005061648280337423 |  0.02801026604563266  |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 1.0692825796766454

Per User RMSE (best)
+------------+----------------------+-------


Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.02465931213497732  | 0.012972414959760856 |
|   2    | 0.033491960487417954 | 0.03697013293865986  |
|   3    | 0.04357439853870726  | 0.07055415903025618  |
|   4    | 0.040774388924940536 |  0.0881041360518974  |
|   5    | 0.03655634869132593  | 0.09874673335295424  |
|   6    | 0.03370827024298785  |  0.1083373964858947  |
|   7    | 0.03212714893441721  |  0.1197786859068508  |
|   8    | 0.030607830413151627 | 0.13107029655223648  |
|   9    | 0.028969484301519865 | 0.13962805198992842  |
|   10   | 0.027644386761842936 | 0.14730493619720383  |
+--------+----------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 1.908120951701505

Per User RMSE (best)
+------------+---------------------+-------+
| customerId |         rmse 


Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    |  0.00108154877784988  | 0.0006509321348170608 |
|   2    | 0.0008652390222799076 | 0.0009429503048365308 |
|   3    | 0.0024995793976974995 |  0.003752917034336007 |
|   4    |  0.003100439829836335 |  0.006304256266402053 |
|   5    |  0.006359506813757289 |  0.01724544822911682  |
|   6    |  0.006441223832528169 |  0.020737734897912277 |
|   7    |  0.005881565258593183 |  0.022168583873645464 |
|   8    |  0.005542937486480653 |  0.024391111988451224 |
|   9    |  0.005263537385536111 |  0.025901760952052932 |
|   10   |  0.00506164828033743  |  0.028010266045632684 |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 1.066462453747817

Per User RMSE (best)
+------------+-----------------------+-------


Precision and recall summary statistics by cutoff
+--------+----------------------+-----------------------+
| cutoff |    mean_precision    |      mean_recall      |
+--------+----------------------+-----------------------+
|   1    | 0.005751671579552813 | 0.0027748391757306803 |
|   2    | 0.00772880868502413  |  0.008126406988654463 |
|   3    | 0.006087185755026719 |  0.009325271033517475 |
|   4    | 0.005374218132144675 |  0.01108880834058773  |
|   5    | 0.006384355453303639 |  0.01657281555009649  |
|   6    | 0.006075203105902647 |  0.018904039937183873 |
|   7    | 0.006337109579614448 |  0.02344721100530931  |
|   8    | 0.00618304694801926  |  0.02594606928190134  |
|   9    | 0.005919428667289758 |  0.028106021670842255 |
|   10   | 0.005629448558487301 |  0.02954747989388072  |
+--------+----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.0

Per User RMSE (best)
+------------+------+-------+
| customerId | rmse | count |
+------------


Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.07664102379754116  | 0.041771746453311075 |
|   2    | 0.06078797900639832  | 0.06504827656788074  |
|   3    | 0.04737939463656641  | 0.07438578211359707  |
|   4    | 0.04072902437270822  | 0.08493422046237996  |
|   5    | 0.03627866848802942  | 0.09460039963326058  |
|   6    | 0.03316797277542113  | 0.10388298131213264  |
|   7    | 0.03136715179277535  | 0.11473932835458159  |
|   8    | 0.029513264792580315 | 0.12323959140226047  |
|   9    | 0.02818319073980873  | 0.13212252320272783  |
|   10   | 0.027111941908117082 |  0.1410217288260186  |
+--------+----------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.9693189080108991

Per User RMSE (best)
+------------+--------------------+-------+
| customerId |        rmse  


Precision and recall summary statistics by cutoff
+--------+-----------------------+----------------------+
| cutoff |     mean_precision    |     mean_recall      |
+--------+-----------------------+----------------------+
|   1    |  0.005751671579552822 | 0.002774839175730674 |
|   2    |  0.007728808685024107 | 0.008126406988654479 |
|   3    |  0.006087185755026701 | 0.009325271033517487 |
|   4    |  0.00537421813214468  | 0.011088808340587803 |
|   5    |  0.006384355453303609 | 0.016572815550096404 |
|   6    | 0.0060752031059026265 | 0.018904039937183915 |
|   7    |  0.006337109579614409 | 0.023447211005309252 |
|   8    |  0.006183046948019293 | 0.02594606928190138  |
|   9    |  0.005919428667289786 | 0.02810602167084215  |
|   10   |  0.005629448558487305 | 0.02954747989388072  |
+--------+-----------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 1.0

Per User RMSE (best)
+------------+------+-------+
| customerId | rmse | count |
+------------


Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    |  0.002013229795800978 | 0.0010533505895887285 |
|   2    | 0.0019413287316652297 | 0.0019437565598048798 |
|   3    |  0.002085130859936725 |  0.003673661924309035 |
|   4    | 0.0020671555939027926 |  0.00484899012724724  |
|   5    | 0.0017975266033937308 |  0.005177338320133821 |
|   6    | 0.0017495925606365603 |  0.00579240188278149  |
|   7    | 0.0018796992481202879 |  0.007075641340093466 |
|   8    | 0.0019413287316652266 |  0.008520339150192454 |
|   9    | 0.0018454606461508918 |  0.009171842681333623 |
|   10   |  0.002063560540695997 |  0.011153197655570346 |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.13173506196453924

Per User RMSE (best)
+------------+------+-------+
| customerId 


Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.009634742594190386 | 0.00524812144203862  |
|   2    | 0.009131435145240134 | 0.00953030504672665  |
|   3    | 0.009179369187997352 | 0.01436869779733453  |
|   4    | 0.009886396318665487 | 0.02067408392368697  |
|   5    | 0.010152430255967836 | 0.02625095121323753  |
|   6    | 0.011048796855526793 |  0.0346921914113804  |
|   7    | 0.012007477710670138 | 0.043789891452277185 |
|   8    | 0.013319672131147596 | 0.05535807622742922  |
|   9    | 0.01568242098871953  | 0.07492121269630338  |
|   10   | 0.01724187517975278  |  0.0934667823042114  |
+--------+----------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.15837525490811263

Per User RMSE (best)
+------------+------+-------+
| customerId | rmse | count |
+----------


Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    |  0.002013229795800984 | 0.0010533505895887287 |
|   2    | 0.0019413287316652319 | 0.0019437565598048802 |
|   3    | 0.0020851308599367205 |  0.00367366192430902  |
|   4    | 0.0020671555939028047 |  0.004848990127247209 |
|   5    | 0.0017975266033937235 |  0.005177338320133822 |
|   6    | 0.0017495925606365614 |  0.005792401882781512 |
|   7    | 0.0018796992481203054 |  0.007075641340093444 |
|   8    | 0.0019413287316652436 |  0.008520339150192471 |
|   9    | 0.0018454606461508853 |  0.009171842681333626 |
|   10   | 0.0020563704342824282 |  0.011101269109250153 |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.13143999183892585

Per User RMSE (best)
+------------+-----------------------+-----

Based on RMSE
1. Popularity on purchase counts: 1.1111750034210488
2. Cosine similarity on purchase counts: 1.9230643981653215
3. Pearson similarity on purchase counts: 1.9231102838192284

4. Popularity on purchase dummy: 0.9697374361161925
5. Cosine similarity on purchase dummy: 0.9697509978436404
6. Pearson similarity on purchase dummy: 0.9697745320187097

7. Popularity on scaled purchase counts: 0.16230660626840343
8. Cosine similarity on scaled purchase counts: 0.16229800354111104
9. Pearson similarity on scaled purchase counts: 0.1622982668334026
    
Based on Precision and Recall 
Notes
Popularity v. Collaborative Filtering: We can see that the collaborative filtering algorithms work better than popularity model for purchase counts. Indeed, popularity model doesn’t give any personalizations as it only gives the same list of recommended items to every user.
Precision and recall: Looking at the summary above, we see that the precision and recall for Purchase Counts > Purchase Dummy > Normalized Purchase Counts. However, because the recommendation scores for the normalized purchase data is zero and constant, we choose the dummy. In fact, the RMSE isn’t much different between models on the dummy and those on the normalized data.
RMSE: Since RMSE is higher using pearson distance thancosine, we would choose model the smaller mean squared errors, which in this case would be cosine. Therefore, we select the Cosine similarity on Purchase Dummy approach as our final model.

In [46]:
final_model = tc.item_similarity_recommender.create(tc.SFrame(data_dummy), 
                                                    user_id = user_id, 
                                                    item_id = item_id, 
                                                    target = 'purchase_dummy', 
                                                    similarity_type='cosine')
recom = final_model.recommend(users = users_to_recommend, k=n_rec)
recom.print_rows(n_display)

+------------+-----------+----------------------+------+
| customerId | productId |        score         | rank |
+------------+-----------+----------------------+------+
|    1553    |     1     | 0.10348175764083863  |  1   |
|    1553    |     2     |  0.0934672474861145  |  2   |
|    1553    |     35    |  0.0845762014389038  |  3   |
|    1553    |     33    |  0.0668614387512207  |  4   |
|    1553    |     61    | 0.06512556076049805  |  5   |
|    1553    |     15    | 0.06476415395736694  |  6   |
|    1553    |     11    | 0.05467898845672607  |  7   |
|    1553    |     5     | 0.05406981706619263  |  8   |
|    1553    |     36    | 0.05048650503158569  |  9   |
|    1553    |     13    | 0.04985467195510864  |  10  |
|   20400    |     26    | 0.05812269449234009  |  1   |
|   20400    |     6     | 0.05361741781234741  |  2   |
|   20400    |    113    | 0.05312788486480713  |  3   |
|   20400    |     1     | 0.05210459232330322  |  4   |
|   20400    |     15    | 0.04

In [47]:
df_rec = recom.to_dataframe()
df_rec.shape

(10000, 4)

In [48]:
df_rec['recommendedProducts'] = df_rec.groupby([user_id])[item_id].transform(lambda x: '|'.join(x.astype(str)))
df_output = df_rec[['customerId', 'recommendedProducts']].drop_duplicates().sort_values('customerId').set_index('customerId')

In [51]:

def create_output(model, users_to_recommend, n_rec, print_csv=True):
    recomendation = model.recommend(users=users_to_recommend, k=n_rec)
    df_rec = recomendation.to_dataframe()
    df_rec['recommendedProducts'] = df_rec.groupby([user_id])[item_id] \
        .transform(lambda x: '|'.join(x.astype(str)))
    df_output = df_rec[['customerId', 'recommendedProducts']].drop_duplicates() \
        .sort_values('customerId').set_index('customerId')
    if print_csv:
        df_output.to_csv('../option1_recommendation.csv')
        print("An output file can be found in 'output' folder with name 'option1_recommendation.csv'")
    return df_output

In [52]:

df_output = create_output(pear_norm, users_to_recommend, n_rec, print_csv=True)
print(df_output.shape)
df_output.head()

An output file can be found in 'output' folder with name 'option1_recommendation.csv'
(1000, 1)


Unnamed: 0_level_0,recommendedProducts
customerId,Unnamed: 1_level_1
4,226|247|230|125|248|204|276|294|155|129
11,226|247|230|125|248|204|276|294|155|129
12,226|247|230|125|248|204|276|294|155|129
16,226|247|230|125|248|204|276|294|155|129
21,226|247|230|125|248|204|276|294|155|129


In [53]:
def customer_recomendation(customer_id):
    if customer_id not in df_output.index:
        print('Customer not found.')
        return customer_id
    return df_output.loc[customer_id]

In [54]:
customer_recomendation(4)

recommendedProducts    226|247|230|125|248|204|276|294|155|129
Name: 4, dtype: object

In [55]:
customer_recomendation(21)

recommendedProducts    226|247|230|125|248|204|276|294|155|129
Name: 21, dtype: object

Summary

In this exercise, we were able to traverse a step-by-step process for making recommendations to customers. We used Collaborative Filtering approaches with cosine and pearson measure and compare the models with our baseline popularity model. We also prepared three sets of data that include regular buying count, buying dummy, as well as normalized purchase frequency as our target variable. Using RMSE, precision and recall, we evaluated our models and observed the impact of personalization. Finally, we selected the Cosine approach in dummy purchase data.