In [98]:
import sys
import time

import numpy as np
import pandas as pd
import turicreate as tc

try:
    # train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
    # the sklearn.cross_validation module was removed in 0.20.
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

sys.path.append("..")

In [99]:
# Load the customer list and the transaction data from CSV files in the working directory.
customers = pd.read_csv('recommend_1.csv') 
transactions = pd.read_csv('trx_data.csv')

In [100]:
# Preview the first rows of the customer table.
customers.head()

Unnamed: 0,customerId
0,1553
1,20400
2,19750
3,6334
4,27773


In [101]:
# Preview the raw transactions; `products` is a pipe-delimited string at this point.
transactions.head()

Unnamed: 0,customerId,products
0,0,20
1,1,2|2|23|68|68|111|29|86|107|152
2,2,111|107|29|11|11|11|33|23
3,3,164|227
4,5,2|2


In [102]:
# Parse the pipe-delimited product string into a list of integer product ids.
# Values that are already lists are passed through unchanged so the cell can be
# re-run safely (the column is overwritten in place, and a second run would
# otherwise call .split on a list and fail).
transactions['products'] = transactions['products'].apply(
    lambda x: x if isinstance(x, list) else [int(i) for i in x.split('|')]
)
# Show the first two transactions spread over one column per purchased item.
transactions.head(2).set_index('customerId')['products'].apply(pd.Series).reset_index()

Unnamed: 0,customerId,0,1,2,3,4,5,6,7,8,9
0,0,20.0,,,,,,,,,
1,1,2.0,2.0,23.0,68.0,68.0,111.0,29.0,86.0,107.0,152.0


In [103]:
# Demonstration on the first two transactions: reshape to one row per
# (customer, product) pair and count how often each pair occurs.
(
    pd.melt(
        transactions.head(2).set_index('customerId')['products'].apply(pd.Series).reset_index(),
        id_vars=['customerId'],
        value_name='products',
    )
    .dropna()
    .drop(['variable'], axis=1)
    .groupby(['customerId', 'products'])
    .agg({'products': 'count'})
    .rename(columns={'products': 'purchase_count'})
    .reset_index()
    .rename(columns={'products': 'productId'})
)

Unnamed: 0,customerId,productId,purchase_count
0,0,20.0,1
1,1,2.0,2
2,1,23.0,1
3,1,29.0,1
4,1,68.0,2
5,1,86.0,1
6,1,107.0,1
7,1,111.0,1
8,1,152.0,1


In [104]:
s = time.time()

# Build the long (customerId, productId) table directly from the parsed product
# lists. This avoids materialising a wide, NaN-padded frame via
# apply(pd.Series) + melt + dropna, and keeps product ids as integers instead
# of round-tripping them through float.
pairs = [
    (customer, product)
    for customer, products in zip(transactions['customerId'], transactions['products'])
    for product in products
]

# One row per (customer, product) with the number of times the pair occurs.
data = (
    pd.DataFrame(pairs, columns=['customerId', 'productId'])
    .groupby(['customerId', 'productId'])
    .size()
    .reset_index(name='purchase_count')
)
data['productId'] = data['productId'].astype(np.int64)

print("Execution time:", round((time.time()-s)/60,2), "minutes")

Execution time: 0.23 minutes


In [105]:
# Preview the per-customer, per-product purchase counts.
data.head()

Unnamed: 0,customerId,productId,purchase_count
0,0,1,2
1,0,13,1
2,0,19,3
3,0,20,1
4,0,31,2


In [106]:
def create_data_dummy(data):
    '''
    Returns a copy of `data` with a constant `purchase_dummy` column.

    Args:
        data (pandas.DataFrame): input table; it is not modified.

    Returns
        pandas.DataFrame: copy of `data` where `purchase_dummy` is 1 on every row.
    '''
    return data.assign(purchase_dummy=1)

In [107]:
# Variant of `data` with an extra constant purchase_dummy column (1 for every purchased pair).
data_dummy = create_data_dummy(data)

In [109]:
# Preview the table with the added purchase_dummy column.
data_dummy.head()

Unnamed: 0,customerId,productId,purchase_count,purchase_dummy
0,0,1,2,1
1,0,13,1,1
2,0,19,3,1
3,0,20,1,1
4,0,31,2,1


In [112]:
# Pivot to a customer x product matrix of purchase counts; pairs that do not
# appear in `data` show up as NaN.
df_matrix = pd.pivot_table(data, values='purchase_count', index='customerId', columns='productId')
df_matrix.head()

productId,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
customerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,2.0,,,,,,,,,...,,,,,,,,,,
1,,,6.0,,,,,,,,...,,,,1.0,,,1.0,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [113]:
# Min-max scale each product column to [0, 1].
# A column whose max equals its min has a range of 0, which would give 0/0 = NaN
# and make every purchase of that product disappear in the later dropna().
# Replacing a zero range with 1 maps those entries to 0 instead.
df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min()).replace(0, 1)
print(df_matrix_norm.shape)
df_matrix_norm.head()

(24429, 300)


productId,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
customerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,0.1,,,,,,,,,...,,,,,,,,,,
1,,,0.166667,,,,,,,,...,,,,0.0,,,0.0,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [114]:
# create a table for input to the modeling
# (melt the scaled matrix back to one row per customer/product pair and drop
# the NaN entries)

d = df_matrix_norm.reset_index()
# NOTE(review): this only names the row index of `d`; the melt below does not
# appear to use the index, so this line looks like a no-op -- confirm.
d.index.names = ['scaled_purchase_freq']
data_norm = pd.melt(d, id_vars=['customerId'], value_name='scaled_purchase_freq').dropna()
print(data_norm.shape)
data_norm.head()

(133585, 3)


Unnamed: 0,customerId,productId,scaled_purchase_freq
9,9,0,0.133333
25,25,0,0.133333
32,33,0,0.133333
35,36,0,0.133333
43,44,0,0.133333


In [115]:
def normalize_data(data):
    '''
    Min-max scales purchase counts per product and returns them in long form.

    Args:
        data (pandas.DataFrame): must contain the columns `customerId`,
            `productId` and `purchase_count`.

    Returns
        pandas.DataFrame: one row per (customerId, productId) pair present in
        `data`, with the scaled value in `scaled_purchase_freq`.
    '''
    df_matrix = pd.pivot_table(data, values='purchase_count', index='customerId', columns='productId')
    col_min = df_matrix.min()
    # A product whose max equals its min has range 0; dividing by it would give
    # NaN and the dropna() below would silently remove every purchase of that
    # product. Use a range of 1 there so those entries scale to 0 instead.
    col_range = (df_matrix.max() - col_min).replace(0, 1)
    df_matrix_norm = (df_matrix - col_min) / col_range
    # Pairs absent from `data` are NaN in the pivot and are dropped here.
    return pd.melt(
        df_matrix_norm.reset_index(),
        id_vars=['customerId'],
        value_name='scaled_purchase_freq',
    ).dropna()

In [116]:
# Hold out 20% of the rows for evaluation. The seed is fixed so that the split,
# and every metric computed from it, is the same on each Restart & Run All.
train, test = train_test_split(data, test_size = .2, random_state=42)
print(train.shape, test.shape)

(106868, 3) (26717, 3)


In [44]:
# Convert the pandas train/test splits to turicreate SFrames.
train_data = tc.SFrame(train)
test_data = tc.SFrame(test)

In [117]:
# Display the training SFrame.
train_data

customerId,productId,purchase_count
19083,228,1
21172,29,3
15880,16,1
4154,225,1
826,19,3
18854,47,1
10515,212,1
12213,126,1
4871,213,2
6816,19,2


In [118]:
# Display the test SFrame.
test_data

customerId,productId,purchase_count
21073,59,1
1916,25,1
18948,19,1
13584,9,1
7501,59,3
4687,186,2
24312,292,1
10987,49,1
870,224,3
14989,166,1


In [119]:
def split_data(data, test_size=.2, random_state=None):
    '''
    Splits dataset into training and test set.
    
    Args:
        data (pandas.DataFrame)
        test_size (float): share of rows placed in the test set, passed to
            train_test_split. Defaults to .2, the value used previously.
        random_state (int or None): seed passed to train_test_split. The
            default None keeps the previous unseeded behaviour; pass an
            integer to get the same split on every run.
        
    Returns
        train_data (tc.SFrame)
        test_data (tc.SFrame)
    '''
    train, test = train_test_split(data, test_size=test_size, random_state=random_state)
    return tc.SFrame(train), tc.SFrame(test)

In [120]:
# Split the dummy and the scaled variants; each call performs its own
# train/test split, separate from the one used for the raw counts.
train_data_dummy, test_data_dummy = split_data(data_dummy)
train_data_norm, test_data_norm = split_data(data_norm)

In [124]:
# Column names and recommendation settings used by the model cells below.
user_id = 'customerId'
item_id = 'productId'
target = 'purchase_count'
users_to_recommend = list(transactions[user_id])  # one entry per transaction row
n_rec = 10 # number of items to recommend
n_display = 30  # number of recommendation rows passed to print_rows

In [125]:
# Popularity baseline trained on the raw purchase counts.
popularity_model = tc.popularity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target)

In [126]:
# Top n_rec items for each requested user; only the first n_display rows are printed.
popularity_recomm = popularity_model.recommend(users=users_to_recommend, k=n_rec)
popularity_recomm.print_rows(n_display)

+------------+-----------+--------------------+------+
| customerId | productId |       score        | rank |
+------------+-----------+--------------------+------+
|     0      |    132    | 3.3962264150943398 |  1   |
|     0      |     37    | 3.0669291338582676 |  2   |
|     0      |     0     | 2.983527131782946  |  3   |
|     0      |     34    | 2.9568627450980394 |  4   |
|     0      |    248    |  2.86046511627907  |  5   |
|     0      |     3     | 2.852320675105485  |  6   |
|     0      |     27    | 2.8106060606060606 |  7   |
|     0      |     32    | 2.695876288659794  |  8   |
|     0      |    110    | 2.691860465116279  |  9   |
|     0      |     10    | 2.611940298507463  |  10  |
|     1      |    132    | 3.3962264150943398 |  1   |
|     1      |     37    | 3.0669291338582676 |  2   |
|     1      |     0     | 2.983527131782946  |  3   |
|     1      |     34    | 2.9568627450980394 |  4   |
|     1      |    248    |  2.86046511627907  |  5   |
|     1   

In [127]:
def model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display):
    '''
    Trains a turicreate recommender, prints a sample of its recommendations
    and returns the trained model.

    Args:
        train_data (tc.SFrame): training interactions.
        name (str): 'popularity', 'cosine' or 'pearson'. The last two train an
            item-similarity recommender with that similarity type.
        user_id (str): name of the user column.
        item_id (str): name of the item column.
        target (str): name of the column used as the training target.
        users_to_recommend (list): users to generate recommendations for.
        n_rec (int): number of items to recommend per user.
        n_display (int): number of recommendation rows to print.

    Returns
        the trained turicreate recommender

    Raises
        ValueError: if `name` is not one of the supported model names.
    '''
    # The local is deliberately not called `model`, which would shadow this function.
    if name == 'popularity':
        recommender = tc.popularity_recommender.create(train_data,
                                                       user_id=user_id,
                                                       item_id=item_id,
                                                       target=target)
    elif name in ('cosine', 'pearson'):
        # Both variants differ only in the similarity type, which equals `name`.
        recommender = tc.item_similarity_recommender.create(train_data,
                                                            user_id=user_id,
                                                            item_id=item_id,
                                                            target=target,
                                                            similarity_type=name)
    else:
        # Previously an unknown name fell through and failed later with an
        # UnboundLocalError; fail early with a clear message instead.
        raise ValueError(
            "Unknown model name {!r}; expected 'popularity', 'cosine' or 'pearson'".format(name)
        )

    recom = recommender.recommend(users=users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return recommender

In [128]:
# Same settings as before, except that recommendations are now requested for
# the ids in the `customers` table rather than for every transaction row.
user_id = 'customerId'
item_id = 'productId'
users_to_recommend = list(customers[user_id])
n_rec = 10 # number of items to recommend
n_display = 30 # to print the head / first few rows in a defined dataset

In [129]:
# these variables will change accordingly
# Popularity model trained on the binary purchase_dummy target.
name = 'popularity'
target = 'purchase_dummy'
pop_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-------+------+
| customerId | productId | score | rank |
+------------+-----------+-------+------+
|    1553    |     31    |  1.0  |  1   |
|    1553    |    276    |  1.0  |  2   |
|    1553    |     17    |  1.0  |  3   |
|    1553    |     33    |  1.0  |  4   |
|    1553    |     5     |  1.0  |  5   |
|    1553    |    264    |  1.0  |  6   |
|    1553    |    171    |  1.0  |  7   |
|    1553    |     20    |  1.0  |  8   |
|    1553    |     97    |  1.0  |  9   |
|    1553    |    157    |  1.0  |  10  |
|   20400    |     31    |  1.0  |  1   |
|   20400    |    276    |  1.0  |  2   |
|   20400    |     17    |  1.0  |  3   |
|   20400    |     33    |  1.0  |  4   |
|   20400    |     5     |  1.0  |  5   |
|   20400    |    264    |  1.0  |  6   |
|   20400    |    171    |  1.0  |  7   |
|   20400    |     20    |  1.0  |  8   |
|   20400    |     97    |  1.0  |  9   |
|   20400    |    157    |  1.0  |  10  |
|   19750    |     31    |  1.0  |

In [55]:
# Popularity model trained on the min-max scaled purchase counts.
name = 'popularity'
target = 'scaled_purchase_freq'
pop_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+---------------------+------+
| customerId | productId |        score        | rank |
+------------+-----------+---------------------+------+
|    1553    |    226    |  0.7847222222222222 |  1   |
|    1553    |    247    |  0.3624338624338625 |  2   |
|    1553    |    230    |  0.3328467153284666 |  3   |
|    1553    |    294    | 0.26323529411764685 |  4   |
|    1553    |    125    |  0.2585714285714282 |  5   |
|    1553    |    248    | 0.24722222222222223 |  6   |
|    1553    |    155    | 0.24313725490196078 |  7   |
|    1553    |    204    |  0.2390804597701148 |  8   |
|    1553    |    276    | 0.23387096774193547 |  9   |
|    1553    |     72    | 0.22690763052208834 |  10  |
|   20400    |    226    |  0.7847222222222222 |  1   |
|   20400    |    247    |  0.3624338624338625 |  2   |
|   20400    |    230    |  0.3328467153284666 |  3   |
|   20400    |    294    | 0.26323529411764685 |  4   |
|   20400    |    125    |  0.2585714285714282 |

In [56]:
# Sanity check: mean purchase_count per product in the pandas training split,
# highest first, for comparison with the popularity model scores on raw counts.
train.groupby(by=item_id)['purchase_count'].mean().sort_values(ascending=False).head(20)

productId
132    3.396226
37     3.066929
0      2.983527
34     2.956863
248    2.860465
3      2.852321
27     2.810606
32     2.695876
110    2.691860
10     2.611940
230    2.604317
82     2.557940
226    2.541667
83     2.459016
58     2.443182
129    2.441341
87     2.426295
68     2.397490
252    2.396226
54     2.385965
Name: purchase_count, dtype: float64

In [57]:
# Collaborative Filtering Model

In [130]:
# Item-similarity model (cosine) trained on the raw purchase counts.
name = 'cosine'
target = 'purchase_count'
cos = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+----------------------+------+
| customerId | productId |        score         | rank |
+------------+-----------+----------------------+------+
|    1553    |     2     | 0.12819908062616983  |  1   |
|    1553    |     5     | 0.09454451004664104  |  2   |
|    1553    |     17    | 0.08060463269551595  |  3   |
|    1553    |     1     | 0.08033778270085652  |  4   |
|    1553    |     33    |  0.0745270848274231  |  5   |
|    1553    |     61    |  0.0636434555053711  |  6   |
|    1553    |     21    | 0.06020273764928182  |  7   |
|    1553    |     47    | 0.059981723626454674 |  8   |
|    1553    |    269    | 0.05944593747456869  |  9   |
|    1553    |     76    | 0.05806044737497965  |  10  |
|   20400    |    280    | 0.09019196033477783  |  1   |
|   20400    |    182    | 0.04392021894454956  |  2   |
|   20400    |    265    | 0.04303497076034546  |  3   |
|   20400    |     56    | 0.04117715358734131  |  4   |
|   20400    |    122    | 0.04

In [66]:
# these variables will change accordingly
# Item-similarity model (cosine) trained on the binary purchase_dummy target.
name = 'cosine'
target = 'purchase_dummy'
cos_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+----------------------+------+
| customerId | productId |        score         | rank |
+------------+-----------+----------------------+------+
|    1553    |     2     | 0.09817783832550049  |  1   |
|    1553    |     35    | 0.07496769428253174  |  2   |
|    1553    |     1     | 0.06144163608551025  |  3   |
|    1553    |     21    | 0.059239089488983154 |  4   |
|    1553    |     8     | 0.05472544431686401  |  5   |
|    1553    |     33    | 0.04865926504135132  |  6   |
|    1553    |     17    | 0.044033932685852054 |  7   |
|    1553    |     61    | 0.04375450611114502  |  8   |
|    1553    |     5     | 0.043473684787750246 |  9   |
|    1553    |     49    | 0.04078166484832764  |  10  |
|   20400    |     6     | 0.04596734046936035  |  1   |
|   20400    |     15    | 0.04551219940185547  |  2   |
|   20400    |    273    | 0.04489678144454956  |  3   |
|   20400    |    246    | 0.04376423358917236  |  4   |
|   20400    |     26    | 0.04

In [131]:
# Item-similarity model (cosine) trained on the min-max scaled purchase counts.
name = 'cosine'
target = 'scaled_purchase_freq'
cos_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-----------------------+------+
| customerId | productId |         score         | rank |
+------------+-----------+-----------------------+------+
|    1553    |     0     |          0.0          |  1   |
|    1553    |    250    |          0.0          |  2   |
|    1553    |    119    |          0.0          |  3   |
|    1553    |     52    |          0.0          |  4   |
|    1553    |    115    |          0.0          |  5   |
|    1553    |     6     |          0.0          |  6   |
|    1553    |     46    |          0.0          |  7   |
|    1553    |     34    |          0.0          |  8   |
|    1553    |     3     |          0.0          |  9   |
|    1553    |    129    |          0.0          |  10  |
|   20400    |     0     |          0.0          |  1   |
|   20400    |    250    |          0.0          |  2   |
|   20400    |    119    |          0.0          |  3   |
|   20400    |     52    |          0.0          |  4   |
|   20400    |

In [68]:
# these variables will change accordingly
# Item-similarity model (pearson) trained on the raw purchase counts.
name = 'pearson'
target = 'purchase_count'
pear = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+--------------------+------+
| customerId | productId |       score        | rank |
+------------+-----------+--------------------+------+
|    1553    |    132    | 3.3962264150943393 |  1   |
|    1553    |     37    | 3.066929133858268  |  2   |
|    1553    |     0     | 2.9835271317829477 |  3   |
|    1553    |     34    | 2.9568627450980394 |  4   |
|    1553    |    248    | 2.8558464119600693 |  5   |
|    1553    |     3     | 2.8523206751054846 |  6   |
|    1553    |     27    |  2.81060606060606  |  7   |
|    1553    |     32    | 2.6958762886597927 |  8   |
|    1553    |    110    | 2.6918604651162776 |  9   |
|    1553    |     10    | 2.611940298507464  |  10  |
|   20400    |    132    | 3.3873258323039646 |  1   |
|   20400    |     37    | 3.066929133858268  |  2   |
|   20400    |     0     | 2.9835271317829477 |  3   |
|   20400    |     34    | 2.9568627450980394 |  4   |
|   20400    |    248    | 2.8604651162790695 |  5   |
|   20400 

In [69]:
# these variables will change accordingly
# Item-similarity model (pearson) trained on the binary purchase_dummy target.
name = 'pearson'
target = 'purchase_dummy'
pear_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-------+------+
| customerId | productId | score | rank |
+------------+-----------+-------+------+
|    1553    |     37    |  0.0  |  1   |
|    1553    |     59    |  0.0  |  2   |
|    1553    |     1     |  0.0  |  3   |
|    1553    |     79    |  0.0  |  4   |
|    1553    |    219    |  0.0  |  5   |
|    1553    |    278    |  0.0  |  6   |
|    1553    |    265    |  0.0  |  7   |
|    1553    |    123    |  0.0  |  8   |
|    1553    |    170    |  0.0  |  9   |
|    1553    |     21    |  0.0  |  10  |
|   20400    |     37    |  0.0  |  1   |
|   20400    |     59    |  0.0  |  2   |
|   20400    |     1     |  0.0  |  3   |
|   20400    |     79    |  0.0  |  4   |
|   20400    |    219    |  0.0  |  5   |
|   20400    |    278    |  0.0  |  6   |
|   20400    |    265    |  0.0  |  7   |
|   20400    |    123    |  0.0  |  8   |
|   20400    |    170    |  0.0  |  9   |
|   20400    |     21    |  0.0  |  10  |
|   19750    |     37    |  0.0  |

In [132]:
# Item-similarity model (pearson) trained on the min-max scaled purchase counts.
name = 'pearson'
target = 'scaled_purchase_freq'
pear_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+---------------------+------+
| customerId | productId |        score        | rank |
+------------+-----------+---------------------+------+
|    1553    |    226    |  0.7807314741785508 |  1   |
|    1553    |    247    | 0.32798048853874184 |  2   |
|    1553    |    230    | 0.32264589208822986 |  3   |
|    1553    |    248    | 0.28744950307452166 |  4   |
|    1553    |    125    |  0.2562815622143122 |  5   |
|    1553    |    294    |  0.2507936507936506 |  6   |
|    1553    |    204    | 0.23488372093023258 |  7   |
|    1553    |    213    | 0.22892030586556691 |  8   |
|    1553    |    276    | 0.22855964049379876 |  9   |
|    1553    |     72    | 0.22343561961266148 |  10  |
|   20400    |    226    |  0.7810218978102189 |  1   |
|   20400    |    247    |  0.3281249999999998 |  2   |
|   20400    |    230    |  0.3230769230769231 |  3   |
|   20400    |    248    |  0.2880434782608696 |  4   |
|   20400    |    125    | 0.25612620270770514 |

In [133]:
# Trained models grouped by the target they were fitted on, each with a
# parallel list of display names in the same order.
models_w_counts = [
    popularity_model,
    cos,
    pear,
]
models_w_dummy = [
    pop_dummy,
    cos_dummy,
    pear_dummy,
]
models_w_norm = [
    pop_norm,
    cos_norm,
    pear_norm,
]

names_w_counts = [
    'Popularity Model on Purchase Counts',
    'Cosine Similarity on Purchase Counts',
    'Pearson Similarity on Purchase Counts',
]
names_w_dummy = [
    'Popularity Model on Purchase Dummy',
    'Cosine Similarity on Purchase Dummy',
    'Pearson Similarity on Purchase Dummy',
]
names_w_norm = [
    'Popularity Model on Scaled Purchase Counts',
    'Cosine Similarity on Scaled Purchase Counts',
    'Pearson Similarity on Scaled Purchase Counts',
]

In [72]:
# Compare the three models trained on raw purchase counts against the count test set.
eval_counts = tc.recommender.util.compare_models(test_data, models_w_counts, model_names=names_w_counts)

PROGRESS: Evaluate model Popularity Model on Purchase Counts



Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    | 0.0013577247391739337 | 0.0009170596922490583 |
|   2    | 0.0032156638559382596 |  0.003343269564507245 |
|   3    |  0.008956219332094749 |   0.0152169222485072  |
|   4    |  0.007753322852651116 |  0.01726348877806647  |
|   5    | 0.0065456624267543565 |  0.018269222837251905 |
|   6    |  0.006740984231337223 |  0.02205568120603134  |
|   7    |  0.006216950121480653 |  0.023841685732180786 |
|   8    |  0.005922180934686297 |  0.026685364769228313 |
|   9    |  0.005669096279357821 |   0.0285754038109038  |
|   10   |  0.005630984707731895 |  0.031632666447271855 |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 1.0355834520015166

Per User RMSE (best)
+------------+------+-------+
| customerId |


Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.07038730884664811  | 0.041781824841885765 |
|   2    | 0.04612691153351424  | 0.05279067337583911  |
|   3    | 0.037230241532085366 | 0.06339388878111243  |
|   4    | 0.03324639131056186  |  0.0747788940938726  |
|   5    | 0.030041446334143306 | 0.08393037455091253  |
|   6    | 0.027773807822400114 | 0.09283717091304698  |
|   7    | 0.026296984421895132 | 0.10230244758927155  |
|   8    | 0.025117907674717794 | 0.11131446270399582  |
|   9    | 0.024359646197576777 | 0.12094413598594826  |
|   10   | 0.023610118622266602 | 0.12983164144172016  |
+--------+----------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 1.8732347767102313

Per User RMSE (best)
+------------+---------------------+-------+
| customerId |         rmse


Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    | 0.0013577247391739363 | 0.0009170596922490604 |
|   2    | 0.0032156638559382535 | 0.0033432695645072403 |
|   3    |  0.008956219332094709 |  0.015216922248507155 |
|   4    |  0.00775332285265114  |  0.017263488778066424 |
|   5    |  0.006545662426754344 |  0.018269222837252005 |
|   6    |  0.006740984231337232 |  0.022055681206031352 |
|   7    |  0.006216950121480635 |  0.02384168573218078  |
|   8    | 0.0059221809346862615 |  0.02668536476922838  |
|   9    |  0.005669096279357826 |  0.028575403810903743 |
|   10   | 0.0056309847077318795 |  0.03163266644727202  |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 1.0322890490262644

Per User RMSE (best)
+------------+-----------------------+------

In [134]:
# Compare the three models trained on the purchase dummy against the dummy test set.
# NOTE(review): no matching comparison for models_w_norm / test_data_norm is
# visible in this notebook -- confirm whether that is intentional.
eval_dummy = tc.recommender.util.compare_models(test_data_dummy, models_w_dummy, model_names=names_w_dummy)

PROGRESS: Evaluate model Popularity Model on Purchase Dummy



Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    | 0.0037453183520599403 |  0.001643971114807202 |
|   2    |  0.004177470469605307 | 0.0040666238885835645 |
|   3    |  0.004009411312782101 |  0.006162047191872041 |
|   4    |  0.004249495822529536 |  0.008677077937239925 |
|   5    |  0.004134255257850762 |  0.010560712405144026 |
|   6    | 0.0036372803226735867 |  0.011138801606828657 |
|   7    | 0.0036630036630036756 |  0.013634841114991145 |
|   8    |  0.003637280322673582 |  0.015435098981584445 |
|   9    |  0.003481225391337741 |  0.016451749279464106 |
|   10   |  0.003673292999135684 |  0.01953540615525574  |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.0

Per User RMSE (best)
+------------+------+-------+
| customerId | rmse | count |


Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    |  0.006266205704407937 |  0.002826257180189753 |
|   2    |  0.004645635263612806 | 0.0043295291890797566 |
|   3    |  0.004273504273504273 |  0.006082889228956635 |
|   4    |  0.003925381734370492 |  0.007529398400184933 |
|   5    | 0.0037165082108902266 |  0.008870301566930772 |
|   6    |  0.003625276097186218 |  0.010246271622676122 |
|   7    | 0.0035601103016833506 |  0.011896283595721771 |
|   8    | 0.0036102708153270035 |  0.01387012124371671  |
|   9    | 0.0036492845481609576 |  0.01594853856089816  |
|   10   |  0.003601267646211472 |  0.01803695939931894  |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.9743970483172888

Per User RMSE (best)
+------------+--------------------+-------+



Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    | 0.0020887352348026465 | 0.0013138624795928163 |
|   2    | 0.0014044943820224753 | 0.0017220061461634612 |
|   3    | 0.0013924901565351008 | 0.0023680049663195716 |
|   4    | 0.0014585133967156445 |  0.00333503107379512  |
|   5    | 0.0013684817055603688 | 0.0037588659779671063 |
|   6    | 0.0014525112839719593 |  0.005078730570303614 |
|   7    | 0.0014199283862205237 |  0.005663336351538589 |
|   8    | 0.0014495102276001212 | 0.0064298061489072526 |
|   9    | 0.0013924901565350995 |  0.006760608305552141 |
|   10   | 0.0014333045231921646 |  0.007594901976924464 |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 1.0

Per User RMSE (best)
+------------+------+-------+
| customerId | rmse | count |

In [74]:
# Final model: cosine item similarity on the purchase dummy, trained on the
# full data_dummy table (not only the training split), then used to recommend
# n_rec items for every id in the `customers` table.
users_to_recommend = list(customers[user_id])

final_model = tc.item_similarity_recommender.create(tc.SFrame(data_dummy), 
                                            user_id=user_id, 
                                            item_id=item_id, 
                                            target='purchase_dummy', 
                                            similarity_type='cosine')

recom = final_model.recommend(users=users_to_recommend, k=n_rec)
recom.print_rows(n_display)

+------------+-----------+----------------------+------+
| customerId | productId |        score         | rank |
+------------+-----------+----------------------+------+
|    1553    |     2     | 0.12324784994125366  |  1   |
|    1553    |     1     | 0.10348175764083863  |  2   |
|    1553    |     35    |  0.0845762014389038  |  3   |
|    1553    |     33    |  0.0668614387512207  |  4   |
|    1553    |     5     | 0.06496070623397827  |  5   |
|    1553    |     61    | 0.060317397117614746 |  6   |
|    1553    |     15    | 0.05949603319168091  |  7   |
|    1553    |     21    | 0.052197158336639404 |  8   |
|    1553    |     17    |  0.0519999623298645  |  9   |
|    1553    |     11    | 0.050322222709655764 |  10  |
|   20400    |     26    | 0.05812269449234009  |  1   |
|   20400    |     6     | 0.05361741781234741  |  2   |
|   20400    |    113    | 0.05312788486480713  |  3   |
|   20400    |     1     | 0.05210459232330322  |  4   |
|   20400    |     15    | 0.04

In [135]:
# Convert the recommendations to a pandas DataFrame for post-processing.
df_rec = recom.to_dataframe()
print(df_rec.shape)
df_rec.head()

(10000, 4)


Unnamed: 0,customerId,productId,score,rank
0,1553,2,0.123248,1
1,1553,1,0.103482,2
2,1553,35,0.084576,3
3,1553,33,0.066861,4
4,1553,5,0.064961,5


In [138]:
# Attach to every row the pipe-joined list of that customer's recommended
# products, in the order the rows appear in df_rec.
df_rec['recommendedProducts'] = (
    df_rec
    .groupby([user_id])[item_id]
    .transform(lambda x: '|'.join(x.astype(str)))
)
# Keep one row per customer, sorted and indexed by customerId.
df_output = (
    df_rec[['customerId', 'recommendedProducts']]
    .drop_duplicates()
    .sort_values('customerId')
    .set_index('customerId')
)

In [145]:
def create_output(model, users_to_recommend, n_rec, print_csv=True,
                  output_path='../output/option1_recommendation.csv',
                  user_col=None, item_col=None):
    '''
    Builds one row per user with that user's recommended items joined by '|'.

    Args:
        model: object with a `recommend(users=..., k=...)` method whose result
            has a `to_dataframe()` method (e.g. a trained turicreate recommender).
        users_to_recommend (list): users to generate recommendations for.
        n_rec (int): number of items to recommend per user.
        print_csv (bool): when True, also write the table to `output_path`.
        output_path (str): CSV destination; the parent folder is created if
            it does not exist.
        user_col (str or None): name of the user column; None falls back to
            the notebook-level `user_id`.
        item_col (str or None): name of the item column; None falls back to
            the notebook-level `item_id`.

    Returns
        pandas.DataFrame: indexed by user (sorted), with a single
        `recommendedProducts` column holding the items in the order returned
        by the model.
    '''
    import os

    # Use the same column names for grouping and for the output index; the
    # previous version mixed the `user_id` global with a hard-coded 'customerId'.
    if user_col is None:
        user_col = user_id
    if item_col is None:
        item_col = item_id

    recommendation = model.recommend(users=users_to_recommend, k=n_rec)
    df_rec = recommendation.to_dataframe()

    # groupby sorts by user and keeps the row order within each user, so the
    # joined string follows the order of the model output.
    df_output = (
        df_rec.groupby(user_col)[item_col]
        .agg(lambda items: '|'.join(items.astype(str)))
        .to_frame('recommendedProducts')
    )

    if print_csv:
        out_dir = os.path.dirname(output_path)
        if out_dir:
            # to_csv does not create missing folders.
            os.makedirs(out_dir, exist_ok=True)
        df_output.to_csv(output_path)
        print("An output file can be found at '{}'".format(output_path))
    return df_output

In [146]:
# users_to_recommend = list(customers[user_id])
# df_output = create_output(pear_norm, users_to_recommend, n_rec, print_csv=True)
# print(df_output.shape)
# df_output.head()

In [142]:
def customer_recomendation(customer_id):
    '''
    Looks up the recommendations for one customer in `df_output`.

    Args:
        customer_id: index label to look up in `df_output`.

    Returns
        The `df_output` row for the customer. When the id is not in the
        index, prints a message and returns `customer_id` unchanged.
    '''
    if customer_id in df_output.index:
        return df_output.loc[customer_id]
    print('Customer not found.')
    return customer_id

In [143]:
# Example lookup: recommendations for customer 4.
customer_recomendation(4)

recommendedProducts    2|1|36|13|216|61|20|33|25|157
Name: 4, dtype: object

In [147]:
# Example lookup: recommendations for customer 21.
customer_recomendation(21)

recommendedProducts    38|36|48|79|2|1|15|13|44|5
Name: 21, dtype: object