In [1]:
# import libraries
import pandas as pd
import numpy as np
from math import sqrt
import matplotlib.pyplot as plt
import turicreate as tc
import time
from sklearn.model_selection import train_test_split

In [3]:
# reading Trimmed data
df_customers = pd.read_csv("customers_data2.csv")
df_articles = pd.read_csv("articles_data2.csv")
df_transactions = pd.read_csv("transactions_data2.csv")

# full article catalog
df_all_articles = pd.read_csv("articles.csv")

In [59]:
# dummy user
def create_dummy(data):
    data_dummy = data.copy()
    data_dummy['purchase_dummy'] = 1
    return data_dummy

data_dummy = create_dummy(df_transactions)

In [5]:
# normalize items across users
df_matrix = pd.pivot_table(df_transactions, values='purchase_count', index='customer_id', columns='article_id')
df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())

In [6]:
# create table for input modeling
d = df_matrix_norm.reset_index() 
d.index.names = ['scaled_purchase_freq'] 
data_norm = pd.melt(d, id_vars=['customer_id'], value_name='scaled_purchase_freq').dropna()

In [8]:
# split the data into testing and training sets
def split_data(data):
    train, test = train_test_split(data, test_size = 0.2)
    train_data = tc.SFrame(train)
    test_data = tc.SFrame(test)
    return train_data, test_data

train_data, test_data = split_data(df_transactions)
train_data_dummy, test_data_dummy = split_data(data_dummy)
train_data_norm, test_data_norm = split_data(data_norm)

In [9]:
# defining constant variables for field names
customer_id = 'customer_id'
article_id = 'article_id'
similar_users = list(df_customers[customer_id])
n_rec = 10 # number of reccomendations
n_show = 20 # rows to print from dataset

In [21]:
# define functions for different modeling techniques
def model(train_data, name, customer_id, article_id, target, similar_users, n_rec, n_show):
    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data, user_id = customer_id, item_id = article_id, target = target)
    elif name == 'cosine':
        model = tc.item_similarity_recommender.create(train_data, user_id = customer_id, item_id = article_id, target = target, similarity_type = 'cosine')
    elif name == 'pearson':
        model = tc.item_similarity_recommender.create(train_data, user_id = customer_id, item_id = article_id, target = target, similarity_type = 'pearson')
    recom = model.recommend(users = similar_users, k = n_rec)
    recom.print_rows(n_show)
    return model

In [23]:
# popularity model using purchase count
name = 'popularity'
target = 'purchase_count'
popularity = model(train_data, name, customer_id, article_id, target, similar_users, n_rec, n_show)

+-------------------------------+------------+--------------------+------+
|          customer_id          | article_id |       score        | rank |
+-------------------------------+------------+--------------------+------+
| 00006413d8573cd20ed7128e53... | 717464001  | 2.168888888888889  |  1   |
| 00006413d8573cd20ed7128e53... | 779554002  | 2.084848484848485  |  2   |
| 00006413d8573cd20ed7128e53... | 156231001  | 2.0587301587301585 |  3   |
| 00006413d8573cd20ed7128e53... | 228257001  | 2.027251732101617  |  4   |
| 00006413d8573cd20ed7128e53... | 368979001  | 2.0148026315789473 |  5   |
| 00006413d8573cd20ed7128e53... | 158340001  | 1.820475319926874  |  6   |
| 00006413d8573cd20ed7128e53... | 111593001  | 1.8169257340241796 |  7   |
| 00006413d8573cd20ed7128e53... | 570003002  | 1.8051546391752578 |  8   |
| 00006413d8573cd20ed7128e53... | 160442007  | 1.7966643009226402 |  9   |
| 00006413d8573cd20ed7128e53... | 160442010  |       1.7852       |  10  |
| 00007d2de826758b65a93dd

In [24]:
# popularity model using scaled purchase count
name = 'popularity'
target = 'scaled_purchase_freq'
pop_norm = model(train_data_norm, name, customer_id, article_id, target, similar_users, n_rec, n_show)

+-------------------------------+------------+---------------------+------+
|          customer_id          | article_id |        score        | rank |
+-------------------------------+------------+---------------------+------+
| 00006413d8573cd20ed7128e53... | 699424003  | 0.11365678346810412 |  1   |
| 00006413d8573cd20ed7128e53... | 843872006  |  0.0977182539682541 |  2   |
| 00006413d8573cd20ed7128e53... | 806225008  | 0.09657701711491441 |  3   |
| 00006413d8573cd20ed7128e53... | 777043002  | 0.09451901565995541 |  4   |
| 00006413d8573cd20ed7128e53... | 559630009  | 0.09433085501858736 |  5   |
| 00006413d8573cd20ed7128e53... | 656719015  | 0.09333333333333352 |  6   |
| 00006413d8573cd20ed7128e53... | 542464004  | 0.09069293478260869 |  7   |
| 00006413d8573cd20ed7128e53... | 776237021  | 0.09036939313984169 |  8   |
| 00006413d8573cd20ed7128e53... | 712924012  | 0.09021956087824351 |  9   |
| 00006413d8573cd20ed7128e53... | 832331004  |  0.0899007589025103 |  10  |
| 00007d2de8

In [25]:
# popularity model using dummy user
name = 'popularity'
target = 'purchase_dummy'
pop_dummy = model(train_data_dummy, name, customer_id, article_id, target, similar_users, n_rec, n_show)

+-------------------------------+------------+-------+------+
|          customer_id          | article_id | score | rank |
+-------------------------------+------------+-------+------+
| 00006413d8573cd20ed7128e53... | 706016025  |  1.0  |  1   |
| 00006413d8573cd20ed7128e53... | 788261004  |  1.0  |  2   |
| 00006413d8573cd20ed7128e53... | 687034023  |  1.0  |  3   |
| 00006413d8573cd20ed7128e53... | 408875001  |  1.0  |  4   |
| 00006413d8573cd20ed7128e53... | 777043002  |  1.0  |  5   |
| 00006413d8573cd20ed7128e53... | 554450036  |  1.0  |  6   |
| 00006413d8573cd20ed7128e53... | 589599047  |  1.0  |  7   |
| 00006413d8573cd20ed7128e53... | 679853020  |  1.0  |  8   |
| 00006413d8573cd20ed7128e53... | 661306008  |  1.0  |  9   |
| 00006413d8573cd20ed7128e53... | 688558001  |  1.0  |  10  |
| 00007d2de826758b65a93dd24c... | 706016025  |  1.0  |  1   |
| 00007d2de826758b65a93dd24c... | 788261004  |  1.0  |  2   |
| 00007d2de826758b65a93dd24c... | 687034023  |  1.0  |  3   |
| 00007d

In [26]:
# collaborative filtering model via cosine using purchase count
name = 'cosine'
target = 'purchase_count'
cos = model(train_data, name, customer_id, article_id, target, similar_users, n_rec, n_show)

+-------------------------------+------------+----------------------+------+
|          customer_id          | article_id |        score         | rank |
+-------------------------------+------------+----------------------+------+
| 00006413d8573cd20ed7128e53... | 692721005  | 0.19074706236521402  |  1   |
| 00006413d8573cd20ed7128e53... | 727948001  | 0.024825175603230793 |  2   |
| 00006413d8573cd20ed7128e53... | 852584001  | 0.02137303352355957  |  3   |
| 00006413d8573cd20ed7128e53... | 590928013  | 0.021264334519704182 |  4   |
| 00006413d8573cd20ed7128e53... | 590928019  | 0.019364456335703533 |  5   |
| 00006413d8573cd20ed7128e53... | 629758005  | 0.018316805362701416 |  6   |
| 00006413d8573cd20ed7128e53... | 730683001  | 0.017627994219462078 |  7   |
| 00006413d8573cd20ed7128e53... | 717370001  | 0.017570972442626953 |  8   |
| 00006413d8573cd20ed7128e53... | 918292001  | 0.017426252365112305 |  9   |
| 00006413d8573cd20ed7128e53... | 791587009  | 0.017339070638020832 |  10  |

In [27]:
# collaborative filtering model via cosine using scaled purchase count
name = 'cosine'
target = 'scaled_purchase_freq'
cos_norm = model(train_data_norm, name, customer_id, article_id, target, similar_users, n_rec, n_show)

+-------------------------------+------------+-------+------+
|          customer_id          | article_id | score | rank |
+-------------------------------+------------+-------+------+
| 00006413d8573cd20ed7128e53... | 688537010  |  0.0  |  1   |
| 00006413d8573cd20ed7128e53... | 660599002  |  0.0  |  2   |
| 00006413d8573cd20ed7128e53... | 817401004  |  0.0  |  3   |
| 00006413d8573cd20ed7128e53... | 811927004  |  0.0  |  4   |
| 00006413d8573cd20ed7128e53... | 832361001  |  0.0  |  5   |
| 00006413d8573cd20ed7128e53... | 706016001  |  0.0  |  6   |
| 00006413d8573cd20ed7128e53... | 819147001  |  0.0  |  7   |
| 00006413d8573cd20ed7128e53... | 562245006  |  0.0  |  8   |
| 00006413d8573cd20ed7128e53... | 554772003  |  0.0  |  9   |
| 00006413d8573cd20ed7128e53... | 706016003  |  0.0  |  10  |
| 00007d2de826758b65a93dd24c... | 688537010  |  0.0  |  1   |
| 00007d2de826758b65a93dd24c... | 660599002  |  0.0  |  2   |
| 00007d2de826758b65a93dd24c... | 817401004  |  0.0  |  3   |
| 00007d

In [28]:
# collaborative filtering model via cosine using dummy user
name = 'cosine'
target = 'purchase_dummy'
cos_dummy = model(train_data_dummy, name, customer_id, article_id, target, similar_users, n_rec, n_show)

+-------------------------------+------------+----------------------+------+
|          customer_id          | article_id |        score         | rank |
+-------------------------------+------------+----------------------+------+
| 00006413d8573cd20ed7128e53... | 692721005  | 0.20349427064259848  |  1   |
| 00006413d8573cd20ed7128e53... | 852584001  | 0.02202818791071574  |  2   |
| 00006413d8573cd20ed7128e53... | 918292001  | 0.01737523078918457  |  3   |
| 00006413d8573cd20ed7128e53... | 590928013  | 0.01732941468556722  |  4   |
| 00006413d8573cd20ed7128e53... | 791587009  | 0.01587877670923869  |  5   |
| 00006413d8573cd20ed7128e53... | 791587001  | 0.015266756216684977 |  6   |
| 00006413d8573cd20ed7128e53... | 351484002  | 0.014259437719980875 |  7   |
| 00006413d8573cd20ed7128e53... | 801512004  | 0.014198819796244303 |  8   |
| 00006413d8573cd20ed7128e53... | 712924003  | 0.014109710852305094 |  9   |
| 00006413d8573cd20ed7128e53... | 723529001  | 0.013825674851735434 |  10  |

In [29]:
# collaborative filtering model via pearson using purchase count
name = 'pearson'
target = 'purchase_count'
pear = model(train_data, name, customer_id, article_id, target, similar_users, n_rec, n_show)

+-------------------------------+------------+--------------------+------+
|          customer_id          | article_id |       score        | rank |
+-------------------------------+------------+--------------------+------+
| 00006413d8573cd20ed7128e53... | 717464001  | 2.1688888888888886 |  1   |
| 00006413d8573cd20ed7128e53... | 779554002  | 2.0848484848484863 |  2   |
| 00006413d8573cd20ed7128e53... | 156231001  | 2.058730158730161  |  3   |
| 00006413d8573cd20ed7128e53... | 228257001  | 2.0272517321016226 |  4   |
| 00006413d8573cd20ed7128e53... | 368979001  | 2.0148026315789482 |  5   |
| 00006413d8573cd20ed7128e53... | 158340001  | 1.8204753199268742 |  6   |
| 00006413d8573cd20ed7128e53... | 111593001  | 1.8169257340241771 |  7   |
| 00006413d8573cd20ed7128e53... | 570003002  | 1.8051546391752586 |  8   |
| 00006413d8573cd20ed7128e53... | 160442007  | 1.7966643009226408 |  9   |
| 00006413d8573cd20ed7128e53... | 160442010  | 1.785200000000003  |  10  |
| 00007d2de826758b65a93dd

In [30]:
# collaborative filtering model via pearson using scaled purchase count
name = 'pearson'
target = 'scaled_purchase_freq'
pear_norm = model(train_data_norm, name, customer_id, article_id, target, similar_users, n_rec, n_show)

+-------------------------------+------------+---------------------+------+
|          customer_id          | article_id |        score        | rank |
+-------------------------------+------------+---------------------+------+
| 00006413d8573cd20ed7128e53... | 699424003  | 0.11365678346810436 |  1   |
| 00006413d8573cd20ed7128e53... | 843872006  | 0.09771825396825383 |  2   |
| 00006413d8573cd20ed7128e53... | 806225008  | 0.09640183906391948 |  3   |
| 00006413d8573cd20ed7128e53... | 777043002  | 0.09441554199662518 |  4   |
| 00006413d8573cd20ed7128e53... | 559630009  | 0.09433085501858735 |  5   |
| 00006413d8573cd20ed7128e53... | 656719015  | 0.09333333333333334 |  6   |
| 00006413d8573cd20ed7128e53... | 542464004  | 0.09069293478260856 |  7   |
| 00006413d8573cd20ed7128e53... | 776237021  | 0.09036939313984163 |  8   |
| 00006413d8573cd20ed7128e53... | 712924012  |  0.0902195608782435 |  9   |
| 00006413d8573cd20ed7128e53... | 832331004  |  0.0899007589025102 |  10  |
| 00007d2de8

In [31]:
# collaborative filtering model via pearson using dummy user
name = 'pearson'
target = 'purchase_dummy'
pear_dummy = model(train_data_dummy, name, customer_id, article_id, target, similar_users, n_rec, n_show)

+-------------------------------+------------+-------+------+
|          customer_id          | article_id | score | rank |
+-------------------------------+------------+-------+------+
| 00006413d8573cd20ed7128e53... | 706016025  |  0.0  |  1   |
| 00006413d8573cd20ed7128e53... | 788261004  |  0.0  |  2   |
| 00006413d8573cd20ed7128e53... | 687034023  |  0.0  |  3   |
| 00006413d8573cd20ed7128e53... | 408875001  |  0.0  |  4   |
| 00006413d8573cd20ed7128e53... | 777043002  |  0.0  |  5   |
| 00006413d8573cd20ed7128e53... | 554450036  |  0.0  |  6   |
| 00006413d8573cd20ed7128e53... | 589599047  |  0.0  |  7   |
| 00006413d8573cd20ed7128e53... | 679853020  |  0.0  |  8   |
| 00006413d8573cd20ed7128e53... | 661306008  |  0.0  |  9   |
| 00006413d8573cd20ed7128e53... | 688558001  |  0.0  |  10  |
| 00007d2de826758b65a93dd24c... | 706016025  |  0.0  |  1   |
| 00007d2de826758b65a93dd24c... | 788261004  |  0.0  |  2   |
| 00007d2de826758b65a93dd24c... | 687034023  |  0.0  |  3   |
| 00007d

In [33]:
# creating initial callable variables for model evaluation
count_models = [popularity, cos, pear]
norm_models = [pop_norm, cos_norm, pear_norm]
dummy_models = [pop_dummy, cos_dummy, pear_dummy]

count_names = ['Popularity - Purchase Counts', 'Cosine Similarity - Purchase Counts', 'Pearson Similarity - Purchase Counts']
norm_names = ['Popularity - Scaled Purchase Counts', 'Cosine Similarity - Scaled Purchase Counts', 'Pearson Similarity - Scaled Purchase Counts']
dummy_names = ['Popularity - Purchase Dummy', 'Cosine Similarity - Purchase Dummy', 'Pearson Similarity - Purchase Dummy']

In [34]:
# Evaluate results using Root Mean Square Errors to calculate precision and recall
eval_counts = tc.recommender.util.compare_models(test_data, count_models, model_names=count_names)
eval_norm = tc.recommender.util.compare_models(test_data_norm, norm_models, model_names=norm_names)
eval_dummy = tc.recommender.util.compare_models(test_data_dummy, dummy_models, model_names=dummy_names)

PROGRESS: Evaluate model Popularity - Purchase Counts



Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    |  0.001340609127471159 | 0.0005871077637121909 |
|   2    | 0.0009676932082098116 | 0.0009221236405531425 |
|   3    | 0.0019967523271841544 | 0.0027948807065128318 |
|   4    | 0.0021206642623817482 | 0.0038462536406009148 |
|   5    | 0.0019004550518305248 |  0.004371251629652653 |
|   6    |  0.002192651196163275 |  0.005867264767563453 |
|   7    | 0.0021781526568068806 |  0.006787517951843876 |
|   8    | 0.0021029625573535434 |  0.00745058184750152  |
|   9    | 0.0023670457716263153 |  0.009326267608870445 |
|   10   | 0.0024282018844056685 |  0.010659735470184329 |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.7101205440840891

Per User RMSE (best)
+-------------------------------+-----------


Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.11781782821321354  | 0.060858784829394184 |
|   2    | 0.08895460810785309  | 0.08880375241947411  |
|   3    | 0.07326775048306255  | 0.10724076154376326  |
|   4    | 0.06340160684277098  | 0.12199241367444581  |
|   5    | 0.056225335624328535 | 0.13394873939175378  |
|   6    | 0.05089358206982468  | 0.14416008107169132  |
|   7    |  0.0466610201037405  | 0.15304227152440925  |
|   8    | 0.04320219123505968  | 0.16082077738798334  |
|   9    | 0.04026967433059354  | 0.16774383120506817  |
|   10   | 0.037839636713809056 | 0.17429648968673872  |
+--------+----------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 1.4071330108608728

Per User RMSE (best)
+-------------------------------+-----------------------+-------+
|     


Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    | 0.0014491795849776257 | 0.0006714465538911242 |
|   2    | 0.0010337795736485374 | 0.0010164540597924585 |
|   3    | 0.0020502508134916477 | 0.0029070439545213237 |
|   4    | 0.0021749494911350014 |  0.003988077078812283 |
|   5    |  0.001946715507637636 |  0.004524482833326589 |
|   6    | 0.0023161697601380327 |  0.006339621355859141 |
|   7    | 0.0022826770103069045 |  0.007232915210816224 |
|   8    | 0.0022074026170200794 |  0.007929803412049971 |
|   9    |  0.002397466479526671 |  0.009492139141172043 |
|   10   | 0.0024650214308643793 |  0.010904307471803999 |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.707371509110132

Per User RMSE (best)
+-------------------------------+------------


Precision and recall summary statistics by cutoff
+--------+-----------------------+------------------------+
| cutoff |     mean_precision    |      mean_recall       |
+--------+-----------------------+------------------------+
|   1    |  0.000855520946083274 | 0.00027418159426264595 |
|   2    | 0.0008838807564506758 | 0.0006412811115722556  |
|   3    | 0.0009169672018793153 | 0.0010238882419467228  |
|   4    |  0.000886244073981292 | 0.0013458344319867146  |
|   5    | 0.0008413410408995658 |  0.001588713103200139  |
|   6    | 0.0008413410408995676 |  0.001926984365495887  |
|   7    | 0.0008514695446022054 | 0.0022538118988348387  |
|   8    | 0.0008578842636138917 | 0.0025773159590917892  |
|   9    |  0.000874427486328227 |  0.002933718654959888  |
|   10   | 0.0008649742162057149 |  0.003245918542746187  |
+--------+-----------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 0.08781360362191774

Per User RMSE (best)
+----------------------------


Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.03969900787930111  | 0.017091687695358532 |
|   2    | 0.029253144393974497 | 0.024580128780694314 |
|   3    | 0.02363159976114738  | 0.02930920555825289  |
|   4    | 0.02017800507640604  | 0.03298246642432981  |
|   5    | 0.017752295962978976 | 0.03599011244424692  |
|   6    | 0.015972087644417522 | 0.03862763769910461  |
|   7    | 0.014551283652799219 | 0.04083391246422823  |
|   8    | 0.013486271488464638 | 0.04305698650016803  |
|   9    | 0.012602259436596179 | 0.04509509249689306  |
|   10   | 0.011825568259698268 | 0.04673478691206543  |
+--------+----------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.09382085212740801

Per User RMSE (best)
+-------------------------------+------+-------+
|          customer_id


Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    | 0.0011769321302471606 | 0.0005250871387631365 |
|   2    |  0.00114857231987975  | 0.0010711010470334447 |
|   3    | 0.0011548744999614136 | 0.0015400480445264247 |
|   4    | 0.0010812177702571794 | 0.0018817148064052278 |
|   5    |  0.001026625135299917 | 0.0022154761662126895 |
|   6    | 0.0010288308983284916 | 0.0026492048246081943 |
|   7    | 0.0010128503702643186 | 0.0029548303120909797 |
|   8    | 0.0009459178416293661 | 0.0030907562455859757 |
|   9    | 0.0009195931102466829 |  0.003332392792272388 |
|   10   | 0.0008980606616343468 | 0.0035693016590506903 |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.08760440871966008

Per User RMSE (best)
+-------------------------------+----------


Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    | 0.0011463507833397151 | 0.0004697975846632127 |
|   2    |  0.001158144515678584 |  0.000979803108694258 |
|   3    | 0.0011825182291789731 | 0.0014615518819529468 |
|   4    | 0.0012442387617524714 | 0.0020863621662490102 |
|   5    | 0.0012407006420507767 |  0.002706641649158932 |
|   6    | 0.0012603568626155764 |  0.003270793032874129 |
|   7    |  0.001288549498968556 |  0.003871442139375293 |
|   8    | 0.0013020280502129991 |  0.004448988326999505 |
|   9    | 0.0013130355337292946 |  0.005000057004896005 |
|   10   | 0.0013204262726616211 |  0.005567426019423064 |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.0

Per User RMSE (best)
+-------------------------------+------+-------+
|         


Precision and recall summary statistics by cutoff
+--------+---------------------+---------------------+
| cutoff |    mean_precision   |     mean_recall     |
+--------+---------------------+---------------------+
|   1    |  0.1233860277294236 | 0.06444081715723951 |
|   2    | 0.09256664638144696 | 0.09305199486357142 |
|   3    | 0.07569374664861694 | 0.11177429115106025 |
|   4    | 0.06483840227949256 | 0.12587175175802043 |
|   5    |  0.0573250871556832 |  0.1376093215597892 |
|   6    | 0.05160308272438379 | 0.14747863214067974 |
|   7    | 0.04711427587494079 | 0.15592018335407823 |
|   8    | 0.04357548224571519 | 0.16383666612800507 |
|   9    | 0.04061551751160277 | 0.17089347122781473 |
|   10   | 0.03809658595036351 | 0.17727797463941317 |
+--------+---------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 0.9895176526156739

Per User RMSE (best)
+-------------------------------+---------------------+-------+
|          customer_id          |   


Precision and recall summary statistics by cutoff
+--------+-----------------------+------------------------+
| cutoff |     mean_precision    |      mean_recall       |
+--------+-----------------------+------------------------+
|   1    | 0.0011463507833397132 | 0.00046979758466321193 |
|   2    |  0.001158144515678595 | 0.0009798031086942496  |
|   3    |  0.001182518229178972 | 0.0014615518819529488  |
|   4    |  0.001244238761752444 | 0.0020863621662490263  |
|   5    | 0.0012407006420507571 |  0.00270664164915894   |
|   6    | 0.0012603568626155682 |  0.003270793032874132  |
|   7    | 0.0012885494989685416 |  0.003871442139375286  |
|   8    | 0.0013020280502130035 |  0.004448988326999485  |
|   9    | 0.0013130355337292953 |  0.005000057004896024  |
|   10   | 0.0013204262726616309 | 0.0055674260194230624  |
+--------+-----------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 1.0

Per User RMSE (best)
+-------------------------------+------+-----

In [68]:
# final calculation using most accurate model
final_model = tc.item_similarity_recommender.create(tc.SFrame(data_dummy), user_id=customer_id, item_id=article_id, 
                                            target='purchase_dummy', similarity_type='cosine')
recom = final_model.recommend(users=similar_users, k=n_rec)
recom.print_rows(n_show)

+-------------------------------+------------+----------------------+------+
|          customer_id          | article_id |        score         | rank |
+-------------------------------+------------+----------------------+------+
| 00006413d8573cd20ed7128e53... | 692721005  |  0.252916157245636   |  1   |
| 00006413d8573cd20ed7128e53... | 852584001  | 0.028556068738301594 |  2   |
| 00006413d8573cd20ed7128e53... | 590928013  | 0.022866229216257732 |  3   |
| 00006413d8573cd20ed7128e53... | 791587009  | 0.01999024550120036  |  4   |
| 00006413d8573cd20ed7128e53... | 918292001  | 0.019870142141977947 |  5   |
| 00006413d8573cd20ed7128e53... | 712924003  | 0.019553224245707195 |  6   |
| 00006413d8573cd20ed7128e53... | 723529001  | 0.01744234561920166  |  7   |
| 00006413d8573cd20ed7128e53... | 791587001  | 0.016432384649912517 |  8   |
| 00006413d8573cd20ed7128e53... | 351484002  | 0.01633914311726888  |  9   |
| 00006413d8573cd20ed7128e53... | 801512004  | 0.016118963559468586 |  10  |

In [74]:
# sending results to a dataframe
df_rec = recom.to_dataframe()
print(df_rec.shape)
print(df_rec.head())

(3497680, 4)
                                         customer_id  article_id     score  \
0  00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...   692721005  0.252916   
1  00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...   852584001  0.028556   
2  00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...   590928013  0.022866   
3  00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...   791587009  0.019990   
4  00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...   918292001  0.019870   

   rank  
0     1  
1     2  
2     3  
3     4  
4     5  


In [93]:
# customer recommendation function to show recommendations for single customer
def customer_recomendation (customer_id):
    print("Recommended items for user", customer_id, "\n\n")
    for i in range(len(df_rec.customer_id)):
        if customer_id == df_rec.customer_id[i]:
            rec_article = df_rec.article_id[i]
            info = df_all_articles.loc[df_all_articles['article_id'] == rec_article]
            print(info.iloc[0,2], 
                  "\nSection:\t", info.iloc[0,21], 
                  "\nProduct Type:\t" ,info.iloc[0,4], 
                  "\nColor:\t\t",info.iloc[0,9], 
                  "\nDescription:\t", info.iloc[0,24], "\n")
        
customer_recomendation ('00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe7fc0f26dd8d65a85a')

Recommended items for user 00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe7fc0f26dd8d65a85a 


Baby Lock Me Up Tie Tanga 
Section:	 Womens Swimwear, beachwear 
Product Type:	 Swimwear bottom 
Color:		 Light Blue 
Description:	 Fully lined bikini bottoms with a low waist, frill trims and ties at the sides with round, flat pendants at the ends. 

SUPREME RW tights 
Section:	 Ladies H&M Sport 
Product Type:	 Unknown 
Color:		 Black 
Description:	 High-waisted, ankle-length sports tights in fast-drying functional fabric. Wide panel to hold in and shape the waist. Concealed key pocket in the waistband. 

New Girl Push Top 
Section:	 Womens Swimwear, beachwear 
Product Type:	 Bikini top 
Color:		 Blue 
Description:	 Lined, push-up triangle bikini top with moulded, padded cups for a larger bust and fuller cleavage. Shoulder straps that join and tie at the back. 

Speedy Tee 
Section:	 Ladies H&M Sport 
Product Type:	 T-shirt 
Color:		 Greenish Khaki 
Description:	 Straight-cut sports top in f