# Purchase-based Recommendation System
## Based on Moorissa Tjokro tutorial
### Coded by Rebeca Bivar - DB: Armazem Paraíba

### Imports and reading file (data here has only clients, products and purchase count)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import turicreate as tc
from sklearn.model_selection import train_test_split


In [2]:
#READING A FILE WITH ONLY CLIENTS, PRODUCTS BOUGHT AND QTDE
buyers = pd.read_csv('purchase_limpa.csv', sep=';')
buyers.head()

Unnamed: 0.1,Unnamed: 0,COD_CLIENTE,COD_PRODUTO,NOME_PRODUTO,QUANTIDADE,CANAL
0,0,5190001,25172,SMARTPHONE LG K9 TV LM-X210BMW PRETO,1,VENDAWEB
1,1,37578201,25367,TABLET NB729 MINI MS40G BRANCO,1,VENDAWEB
2,2,93168801,25354,"TELEVISOR SMART 32"" UN32J4290 SAMSUNG",1,VENDAWEB
3,3,62982901,26584,SMARTPHONE MOTOROLA MOTO G8 PLUS 64GB CEREJA,1,VENDAWEB
4,4,54509401,21647,DVD D-15 KARAOKE MONDIAL,1,VENDAWEB


##  Data preparation
### Creating dummy table to check if the client has bought a product or not

In [3]:
def create_data_dummy(db):
    data_dummy = db.copy()
    data_dummy['purchase_dummy'] = 1
    return data_dummy

data_dummy = create_data_dummy(buyers)
data_dummy


Unnamed: 0.1,Unnamed: 0,COD_CLIENTE,COD_PRODUTO,NOME_PRODUTO,QUANTIDADE,CANAL,purchase_dummy
0,0,05190001,25172,SMARTPHONE LG K9 TV LM-X210BMW PRETO,1,VENDAWEB,1
1,1,37578201,25367,TABLET NB729 MINI MS40G BRANCO,1,VENDAWEB,1
2,2,93168801,25354,"TELEVISOR SMART 32"" UN32J4290 SAMSUNG",1,VENDAWEB,1
3,3,62982901,26584,SMARTPHONE MOTOROLA MOTO G8 PLUS 64GB CEREJA,1,VENDAWEB,1
4,4,54509401,21647,DVD D-15 KARAOKE MONDIAL,1,VENDAWEB,1
...,...,...,...,...,...,...,...
291400,291400,9OI04701,24716,RECEPTOR MIDIA BOX HDTV B3,1,VENDAWEB,1
291401,291401,99FZPR01,25981,SMARTPHONE SAMSUNG GAL A10 A105 32GB PRETO,1,VENDAWEB,1
291402,291402,9NYVEB01,25981,SMARTPHONE SAMSUNG GAL A10 A105 32GB PRETO,1,VENDAWEB,1
291403,291403,99K0N601,25727,SMARTPHONE SAMSUNG GAL J2 CORE 16GB PRATA,1,VENDAWEB,1


### Normalizing purchase frequency of each item across users 

In [4]:
#Dummy for marking whether a customer bought that item or not

df_matrix = pd.pivot_table(buyers, values = 'QUANTIDADE', index = 'COD_CLIENTE', columns = 'COD_PRODUTO')


df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())

# create a table for input to the modeling  
data_input = df_matrix_norm.reset_index()
data_input.index.names = ['FREQ_COMPRAS']
data_norm = pd.melt(data_input, id_vars=['COD_CLIENTE'], 
                    value_name='FREQ_COMPRAS')

#print(data_norm.shape)
#data_norm.head()

In [5]:
#Just cleaning useless values 
data_norm = data_norm.dropna()
data_norm

Unnamed: 0,COD_CLIENTE,COD_PRODUTO,FREQ_COMPRAS
1085593,00017101,11332,0.0
1088205,00449301,11332,0.0
1088477,00846001,11332,0.0
1089371,02130901,11332,0.0
1090227,03152901,11332,0.0
...,...,...,...
152631320,YZ0WFN01,26635,0.0
152631324,YZ0WH901,26635,0.0
152631337,YZ0WM301,26635,0.0
152631435,YZ0XPA01,26635,0.0


### Split data into trainning and testing (80/20)

In [6]:

# Returns train and test datasets as scalable dfs
def split_data(data):
    train, test = train_test_split(data, test_size = .2)
    train_data = tc.SFrame(train)
    test_data = tc.SFrame(test)
    return train_data, test_data

# Now actually splitting purchase_counts, purchase_dummy and purchase_counts_norm
train_data, test_data = split_data(buyers)
train_data_dummy, test_data_dummy = split_data(data_dummy)
train_data_norm, test_data_norm = split_data(data_norm)

## Baseline model to compare and evaluate models


In [7]:
# variables to define field names: 

user_id = 'COD_CLIENTE'
item_id = 'COD_PRODUTO'
item_name = 'NOME_PRODUTO'
users_to_recommend = list(buyers[user_id])
n_recommendation = 10 # itens to recommend
n_display = 30 # display the first few rows in an output dataset

# Function for all models using turicreate
def model(train_data, name, user_id, item_id, target, 
          users_to_recommend, n_rec, n_display):
    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data, 
                                                    user_id = user_id, 
                                                    item_id = item_id,
                                                    target = target)
    elif name == 'cosine':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='cosine')
    elif name == 'pearson':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='pearson')
        
    recom = model.recommend(users=users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return model

### Popularity Model
   Takes the most popular items for recommendation, which are the products with the highest number of sells across customers.

In [18]:
name = 'popularity'
target = 'QUANTIDADE'
popularity, recom_pop = model(train_data, name, user_id, item_id, target, 
                   users_to_recommend, n_recommendation, n_display)
popularity, recom_pop

+-------------+-------------+--------------------+------+
| COD_CLIENTE | COD_PRODUTO |       score        | rank |
+-------------+-------------+--------------------+------+
|   05190001  |    20249    |        5.0         |  1   |
|   05190001  |    16232    |        4.0         |  2   |
|   05190001  |    14314    |        2.0         |  3   |
|   05190001  |    20250    |        2.0         |  4   |
|   05190001  |    18662    |        2.0         |  5   |
|   05190001  |    17372    |        2.0         |  6   |
|   05190001  |    17196    |        2.0         |  7   |
|   05190001  |    17868    | 1.5294117647058822 |  8   |
|   05190001  |    24598    | 1.5280898876404494 |  9   |
|   05190001  |    23193    | 1.3333333333333333 |  10  |
|   37578201  |    20249    |        5.0         |  1   |
|   37578201  |    16232    |        4.0         |  2   |
|   37578201  |    14314    |        2.0         |  3   |
|   37578201  |    20250    |        2.0         |  4   |
|   37578201  

(Class                            : PopularityRecommender
 
 Schema
 ------
 User ID                          : COD_CLIENTE
 Item ID                          : COD_PRODUTO
 Target                           : QUANTIDADE
 Additional observation features  : 0
 User side features               : []
 Item side features               : []
 
 Statistics
 ----------
 Number of observations           : 233124
 Number of users                  : 182327
 Number of items                  : 675
 
 Training summary
 ----------------
 Training time                    : 0.0079
 
 Model Parameters
 ----------------
 Model class                      : PopularityRecommender,
 Columns:
 	COD_CLIENTE	str
 	COD_PRODUTO	int
 	score	float
 	rank	int
 
 Rows: 2914050
 
 Data:
 +-------------+-------------+--------------------+------+
 | COD_CLIENTE | COD_PRODUTO |       score        | rank |
 +-------------+-------------+--------------------+------+
 |   05190001  |    20249    |        5.0         |  1   |
 |

In [9]:
name = 'popularity'
target = 'purchase_dummy'
pop_dummy, recom_pop_dummy = model(train_data_dummy, name, user_id, item_id, target, 
                   users_to_recommend, n_recommendation, n_display)

+-------------+-------------+-------+------+
| COD_CLIENTE | COD_PRODUTO | score | rank |
+-------------+-------------+-------+------+
|   05190001  |    25241    |  1.0  |  1   |
|   05190001  |    25981    |  1.0  |  2   |
|   05190001  |    21653    |  1.0  |  3   |
|   05190001  |    26631    |  1.0  |  4   |
|   05190001  |    26423    |  1.0  |  5   |
|   05190001  |    26006    |  1.0  |  6   |
|   05190001  |    26005    |  1.0  |  7   |
|   05190001  |    19525    |  1.0  |  8   |
|   05190001  |    19729    |  1.0  |  9   |
|   05190001  |    25173    |  1.0  |  10  |
|   37578201  |    25241    |  1.0  |  1   |
|   37578201  |    25981    |  1.0  |  2   |
|   37578201  |    21653    |  1.0  |  3   |
|   37578201  |    26631    |  1.0  |  4   |
|   37578201  |    26423    |  1.0  |  5   |
|   37578201  |    26006    |  1.0  |  6   |
|   37578201  |    26005    |  1.0  |  7   |
|   37578201  |    19525    |  1.0  |  8   |
|   37578201  |    19729    |  1.0  |  9   |
|   375782

### Using scaled purchase count 

In [10]:
name = 'popularity'
target = 'FREQ_COMPRAS'
pop_norm, recom_pop_norm = model(train_data_norm, name, user_id, item_id, target, 
                 users_to_recommend, n_recommendation, n_display)

+-------------+-------------+---------------------+------+
| COD_CLIENTE | COD_PRODUTO |        score        | rank |
+-------------+-------------+---------------------+------+
|   05190001  |    17372    |         0.5         |  1   |
|   05190001  |    23193    |  0.3333333333333333 |  2   |
|   05190001  |    20503    |  0.3333333333333333 |  3   |
|   05190001  |    23575    | 0.14285714285714285 |  4   |
|   05190001  |    22976    | 0.14285714285714285 |  5   |
|   05190001  |    22950    |        0.125        |  6   |
|   05190001  |    24597    | 0.10357142857142858 |  7   |
|   05190001  |    20490    |         0.1         |  8   |
|   05190001  |    20796    | 0.08333333333333333 |  9   |
|   05190001  |    24369    | 0.08333333333333333 |  10  |
|   37578201  |    17372    |         0.5         |  1   |
|   37578201  |    23193    |  0.3333333333333333 |  2   |
|   37578201  |    20503    |  0.3333333333333333 |  3   |
|   37578201  |    23575    | 0.14285714285714285 |  4  

### Collaborative Filtering Model

   Recommends items based on how similar clients purchase items. Meaning: if customer 1 and customer 2 bought similar items, for example, 1 bought X, Y, Z and 2 bought X, Y, we would recommend an item Z to customer 2.
    
   - Lets say X and Y have been rated by costumers 1 and 2. 
   - We then create two item-vectors for both items, then we find the **cosine** or **pearson** distance between these vectors. If the **cosine** value is 1, means total similarity, if it is 0, means no similarity.
   - In this case, we will check the similarity between the target item and other items the customer already bought - using the client's purchase count to items already bought by him as weighing factor (some sort of simulated rating). 

### Using purchase count and purchase frequency
### Cosine

In [11]:
name = 'cosine'
target = 'QUANTIDADE'
cos, cos_recom = model(train_data, name, user_id, item_id, target, 
            users_to_recommend, n_recommendation, n_display)

+-------------+-------------+----------------------+------+
| COD_CLIENTE | COD_PRODUTO |        score         | rank |
+-------------+-------------+----------------------+------+
|   05190001  |    26245    | 0.08027732372283936  |  1   |
|   05190001  |    25173    | 0.04052096605300903  |  2   |
|   05190001  |    25481    | 0.038444697856903076 |  3   |
|   05190001  |    26586    | 0.03501749038696289  |  4   |
|   05190001  |    25933    | 0.018199920654296875 |  5   |
|   05190001  |    22442    | 0.016484498977661133 |  6   |
|   05190001  |    24316    | 0.016484498977661133 |  7   |
|   05190001  |    25726    | 0.014179825782775879 |  8   |
|   05190001  |    21653    | 0.014024078845977783 |  9   |
|   05190001  |    25727    | 0.013837754726409912 |  10  |
|   37578201  |    25366    | 0.07233434915542603  |  1   |
|   37578201  |    22442    | 0.03616034984588623  |  2   |
|   37578201  |    19196    | 0.020877182483673096 |  3   |
|   37578201  |    26177    | 0.01422715

In [12]:
name = 'cosine'
target = 'purchase_dummy'
cos_dummy, cos_recom_dummy = model(train_data_dummy, name, user_id, item_id, target,
                  users_to_recommend, n_recommendation, n_display)

+-------------+-------------+----------------------+------+
| COD_CLIENTE | COD_PRODUTO |        score         | rank |
+-------------+-------------+----------------------+------+
|   05190001  |    26245    | 0.09286093711853027  |  1   |
|   05190001  |    25173    | 0.04136532545089722  |  2   |
|   05190001  |    25481    | 0.03923690319061279  |  3   |
|   05190001  |    23810    | 0.017888545989990234 |  4   |
|   05190001  |    24089    | 0.017888545989990234 |  5   |
|   05190001  |    22442    | 0.017888545989990234 |  6   |
|   05190001  |    24316    | 0.017888545989990234 |  7   |
|   05190001  |    26247    | 0.014258146286010742 |  8   |
|   05190001  |    25726    | 0.01263505220413208  |  9   |
|   05190001  |    25727    | 0.011046767234802246 |  10  |
|   37578201  |    25366    | 0.05705153942108154  |  1   |
|   37578201  |    22442    | 0.03636962175369263  |  2   |
|   37578201  |    24315    | 0.01626497507095337  |  3   |
|   37578201  |    19196    | 0.01626497

In [13]:
name = 'cosine'
target = 'FREQ_COMPRAS'
cos_norm, cos_recom_norm = model(train_data_norm, name, user_id, item_id, target, 
                   users_to_recommend, n_recommendation, n_display)


+-------------+-------------+-------+------+
| COD_CLIENTE | COD_PRODUTO | score | rank |
+-------------+-------------+-------+------+
|   05190001  |    21653    |  0.0  |  1   |
|   05190001  |    26569    |  0.0  |  2   |
|   05190001  |    26222    |  0.0  |  3   |
|   05190001  |    21855    |  0.0  |  4   |
|   05190001  |    26245    |  0.0  |  5   |
|   05190001  |    25354    |  0.0  |  6   |
|   05190001  |    26243    |  0.0  |  7   |
|   05190001  |    25481    |  0.0  |  8   |
|   05190001  |    21647    |  0.0  |  9   |
|   05190001  |    26578    |  0.0  |  10  |
|   37578201  |    26569    |  0.0  |  1   |
|   37578201  |    26222    |  0.0  |  2   |
|   37578201  |    21855    |  0.0  |  3   |
|   37578201  |    26245    |  0.0  |  4   |
|   37578201  |    25354    |  0.0  |  5   |
|   37578201  |    26243    |  0.0  |  6   |
|   37578201  |    25172    |  0.0  |  7   |
|   37578201  |    25481    |  0.0  |  8   |
|   37578201  |    21647    |  0.0  |  9   |
|   375782

### Using purchase count and purchase frequency
### Pearson

In [14]:
# PURCHASE COUNT
name = 'pearson'
target = 'QUANTIDADE'
pear, pear_recom = model(train_data, name, user_id, item_id, target,
             users_to_recommend, n_recommendation, n_display)

+-------------+-------------+--------------------+------+
| COD_CLIENTE | COD_PRODUTO |       score        | rank |
+-------------+-------------+--------------------+------+
|   05190001  |    20249    |        5.0         |  1   |
|   05190001  |    16232    |        4.0         |  2   |
|   05190001  |    14314    |        2.0         |  3   |
|   05190001  |    20250    |        2.0         |  4   |
|   05190001  |    18662    |        2.0         |  5   |
|   05190001  |    17372    |        2.0         |  6   |
|   05190001  |    17196    |        2.0         |  7   |
|   05190001  |    24598    | 1.4818840579710144 |  8   |
|   05190001  |    24597    | 1.338709677419355  |  9   |
|   05190001  |    23193    | 1.3333333333333333 |  10  |
|   37578201  |    20249    |        5.0         |  1   |
|   37578201  |    16232    |        4.0         |  2   |
|   37578201  |    14314    |        2.0         |  3   |
|   37578201  |    20250    |        2.0         |  4   |
|   37578201  

In [15]:
# PURCHASE DUMMY
name = 'pearson'
target = 'purchase_dummy'
pear_dummy, pear_recom_dummy = model(train_data_dummy, name, user_id, item_id, target,
                   users_to_recommend, n_recommendation, n_display)

+-------------+-------------+-------+------+
| COD_CLIENTE | COD_PRODUTO | score | rank |
+-------------+-------------+-------+------+
|   05190001  |    25241    |  0.0  |  1   |
|   05190001  |    25981    |  0.0  |  2   |
|   05190001  |    21653    |  0.0  |  3   |
|   05190001  |    26631    |  0.0  |  4   |
|   05190001  |    26423    |  0.0  |  5   |
|   05190001  |    26006    |  0.0  |  6   |
|   05190001  |    26005    |  0.0  |  7   |
|   05190001  |    19525    |  0.0  |  8   |
|   05190001  |    19729    |  0.0  |  9   |
|   05190001  |    25173    |  0.0  |  10  |
|   37578201  |    25241    |  0.0  |  1   |
|   37578201  |    25981    |  0.0  |  2   |
|   37578201  |    21653    |  0.0  |  3   |
|   37578201  |    26631    |  0.0  |  4   |
|   37578201  |    26423    |  0.0  |  5   |
|   37578201  |    26006    |  0.0  |  6   |
|   37578201  |    26005    |  0.0  |  7   |
|   37578201  |    19525    |  0.0  |  8   |
|   37578201  |    19729    |  0.0  |  9   |
|   375782

In [16]:
# PURCHASE FREQUENCY
name = 'pearson'
target = 'FREQ_COMPRAS'
pear_norm, pear_recom_norm = model(train_data_norm, name, user_id, item_id, target,
                  users_to_recommend, n_recommendation, n_display)

+-------------+-------------+---------------------+------+
| COD_CLIENTE | COD_PRODUTO |        score        | rank |
+-------------+-------------+---------------------+------+
|   05190001  |    17372    |         0.5         |  1   |
|   05190001  |    20503    | 0.33333333333333337 |  2   |
|   05190001  |    23193    |  0.3333333333333333 |  3   |
|   05190001  |    22976    | 0.14285714285714288 |  4   |
|   05190001  |    23575    | 0.14285714285714285 |  5   |
|   05190001  |    22950    | 0.12499994039535522 |  6   |
|   05190001  |    24597    | 0.10357142857142858 |  7   |
|   05190001  |    20490    |         0.1         |  8   |
|   05190001  |    20796    | 0.08333333333333333 |  9   |
|   05190001  |    24369    | 0.08333333333333331 |  10  |
|   37578201  |    17372    |         0.5         |  1   |
|   37578201  |    20503    | 0.33333333333333337 |  2   |
|   37578201  |    23193    |  0.3333333333333333 |  3   |
|   37578201  |    22976    | 0.14285714285714288 |  4  

## Model Evaluation 

### RMSE - Root Mean Squared Errors

   - Measures the error of predicted values
   - Lesser the RMSE values, better the recommendations
   
### Precision-Recall

   - Recall: Percentage of products that a customer buys that are actually recommended. 
   - Precision: How many itens the customer liked out of the recommended?
   - The idea is to optimze both recall and precision to be close as 1 as possible


In [19]:
# Variables for model evaluation

models_counts = [popularity, cos, pear]
models_dummy = [pop_dummy, cos_dummy, pear_dummy]
models_norm = [pop_norm, cos_norm, pear_norm]

names_counts = ['Popularity Model on Purchase Counts', 
                  'Cosine Similarity on Purchase Counts', 
                  'Pearson Similarity on Purchase Counts']
names_dummy = ['Popularity Model on Purchase Dummy', 
                 'Cosine Similarity on Purchase Dummy', 
                 'Pearson Similarity on Purchase Dummy']
names_norm = ['Popularity Model on Scaled Purchase Counts', 
                'Cosine Similarity on Scaled Purchase Counts', 
                'Pearson Similarity on Scaled Purchase Counts']


eval_counts = tc.recommender.util.compare_models(test_data, models_counts, 
                                                 model_names=names_counts)
eval_dummy = tc.recommender.util.compare_models(test_data_dummy, models_dummy,
                                                model_names=names_dummy)
eval_norm = tc.recommender.util.compare_models(test_data_norm, models_norm, 
                                               model_names=names_norm)

PROGRESS: Evaluate model Popularity Model on Purchase Counts



Precision and recall summary statistics by cutoff
+--------+------------------------+------------------------+
| cutoff |     mean_precision     |      mean_recall       |
+--------+------------------------+------------------------+
|   1    |          0.0           |          0.0           |
|   2    |          0.0           |          0.0           |
|   3    |          0.0           |          0.0           |
|   4    |          0.0           |          0.0           |
|   5    |          0.0           |          0.0           |
|   6    | 3.0815121596469818e-06 | 1.8489072957881888e-05 |
|   7    | 2.6412961368402736e-06 | 1.8489072957881868e-05 |
|   8    | 1.3866804718411384e-05 | 0.00010168990126835003 |
|   9    | 3.9032487355528216e-05 | 0.0003281810450024031  |
|   10   | 4.622268239470468e-05  | 0.0004083003611532251  |
+--------+------------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 0.4505909329593613

Per User RMSE (best)
+-------------+-


Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.04100876382058199  | 0.038409186155830416 |
|   2    | 0.029610250342047846 | 0.05537398912394288  |
|   3    | 0.031875161779387894 | 0.08914089121813853  |
|   4    | 0.03134360093184928  | 0.11658637926803418  |
|   5    | 0.026413489627631556 | 0.12254263412141597  |
|   6    | 0.02359822011857672  | 0.13121788323966327  |
|   7    | 0.02114357557540611  | 0.13695488850288665  |
|   8    | 0.020925008320082907 | 0.15502025353881613  |
|   9    | 0.020013394306185785 |  0.1667839482190699  |
|   10   | 0.018688754945828025 | 0.17298456698671066  |
+--------+----------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 1.1161338349609993

Per User RMSE (best)
+-------------+---------------------+-------+
| COD_CLIENTE |         rm


Precision and recall summary statistics by cutoff
+--------+------------------------+------------------------+
| cutoff |     mean_precision     |      mean_recall       |
+--------+------------------------+------------------------+
|   1    |          0.0           |          0.0           |
|   2    |          0.0           |          0.0           |
|   3    | 6.163024319293969e-06  | 1.8489072957881895e-05 |
|   4    | 9.244536478940929e-06  | 3.6978145915763715e-05 |
|   5    |  7.39562918315276e-06  | 3.697814591576376e-05  |
|   6    | 6.163024319293941e-06  | 3.6978145915763736e-05 |
|   7    |  5.28259227368054e-06  | 3.697814591576362e-05  |
|   8    |  3.46670117960283e-05  | 0.0002634692896498177  |
|   9    | 4.314117023505773e-05  |  0.000343588605800638  |
|   10   | 3.8827053211551945e-05 | 0.00034358860580063793 |
+--------+------------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 0.4372152136368149

Per User RMSE (best)
+-------------+-


Precision and recall summary statistics by cutoff
+--------+------------------------+------------------------+
| cutoff |     mean_precision     |      mean_recall       |
+--------+------------------------+------------------------+
|   1    |          0.0           |          0.0           |
|   2    | 9.257373497991066e-06  | 1.8514746995982132e-05 |
|   3    | 6.1715823319941015e-06 | 1.851474699598225e-05  |
|   4    | 4.628686748995574e-06  | 1.8514746995982295e-05 |
|   5    | 2.5920645794375186e-05 | 0.00012034585547388491 |
|   6    | 2.160053816197934e-05  | 0.00012034585547388508 |
|   7    | 2.1159710852551222e-05 | 0.00013886060246986724 |
|   8    | 2.082909037048007e-05  | 0.00015737534946584942 |
|   9    | 1.8514746995982305e-05 | 0.0001573753494658498  |
|   10   | 1.6663272296384076e-05 | 0.00015737534946584942 |
+--------+------------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 0.0

Per User RMSE (best)
+-------------+------+-------+



Precision and recall summary statistics by cutoff
+--------+----------------------+---------------------+
| cutoff |    mean_precision    |     mean_recall     |
+--------+----------------------+---------------------+
|   1    | 0.03812186406472747  | 0.03559444681021758 |
|   2    | 0.030336412952916985 | 0.05665240002550924 |
|   3    | 0.02394573944813719  | 0.06687624331669081 |
|   4    | 0.020208846346114653 | 0.07495615237695578 |
|   5    | 0.01746681231600991  |  0.0806625737320412 |
|   6    | 0.015654218585102894 | 0.08659685872336965 |
|   7    | 0.014301319572468262 | 0.09229940079813262 |
|   8    | 0.01311075521652996  | 0.09657167866745525 |
|   9    | 0.01215184561169614  |  0.1005349365928692 |
|   10   | 0.01141804447242242  | 0.10481863188950617 |
+--------+----------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 0.9945969034812345

Per User RMSE (best)
+-------------+--------------------+-------+
| COD_CLIENTE |        rmse        | coun


Precision and recall summary statistics by cutoff
+--------+------------------------+------------------------+
| cutoff |     mean_precision     |      mean_recall       |
+--------+------------------------+------------------------+
|   1    |          0.0           |          0.0           |
|   2    |          0.0           |          0.0           |
|   3    | 6.171582331994096e-06  | 1.8514746995982305e-05 |
|   4    | 1.851474699598232e-05  | 7.405898798392928e-05  |
|   5    | 1.8514746995982332e-05 | 9.257373497991091e-05  |
|   6    | 1.5428955829985214e-05 | 9.257373497991141e-05  |
|   7    | 1.851474699598233e-05  | 0.00012960322897187645 |
|   8    | 1.6200403621484465e-05 | 0.00012960322897187572 |
|   9    | 1.851474699598218e-05  | 0.00016663272296384097 |
|   10   | 1.6663272296384035e-05 | 0.00016663272296384057 |
+--------+------------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 1.0

Per User RMSE (best)
+-------------+------+-------+



Precision and recall summary statistics by cutoff
+--------+------------------------+------------------------+
| cutoff |     mean_precision     |      mean_recall       |
+--------+------------------------+------------------------+
|   1    |          0.0           |          0.0           |
|   2    |          0.0           |          0.0           |
|   3    |          0.0           |          0.0           |
|   4    | 1.9916747993387658e-05 | 7.966699197355063e-05  |
|   5    | 5.178354478280798e-05  | 0.00024895934991734443 |
|   6    | 4.9791869983469025e-05 | 0.0002887928459041202  |
|   7    | 5.1214494840139454e-05 | 0.00034854308988428376 |
|   8    | 4.481268298512219e-05  | 0.00034854308988428425 |
|   9    | 4.425943998530586e-05  | 0.00037841821187436516 |
|   10   | 3.983349598677532e-05  | 0.00037841821187436424 |
+--------+------------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 0.052547743783315025

Per User RMSE (best)
+-------------


Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    | 0.0019916747993387737 | 0.0017725905714115022 |
|   2    |  0.00171284032743134  |  0.003043942984989403 |
|   3    | 0.0016862846634401536 |  0.004554296374487954 |
|   4    |  0.001404130733533831 |  0.005067152635317705 |
|   5    | 0.0015296062458921786 |  0.006986463250280496 |
|   6    | 0.0014705198935117847 |  0.008012839663539743 |
|   7    | 0.0015108275977841247 |  0.009636054625000843 |
|   8    | 0.0015410583759883668 |  0.011245991754466302 |
|   9    | 0.0014981820435026212 |  0.012266725089127437 |
|   10   |  0.002007608197733449 |  0.01861684824101917  |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.05259305907165062

Per User RMSE (best)
+-------------+------+-------+
| COD_CLIENT


Precision and recall summary statistics by cutoff
+--------+-----------------------+------------------------+
| cutoff |     mean_precision    |      mean_recall       |
+--------+-----------------------+------------------------+
|   1    |          0.0          |          0.0           |
|   2    |          0.0          |          0.0           |
|   3    |          0.0          |          0.0           |
|   4    | 5.477105698181566e-05 | 0.00018920910593718204 |
|   5    | 5.975024398016292e-05 | 0.00026887609791073397 |
|   6    | 5.643078598126479e-05 | 0.0003087095938975075  |
|   7    | 5.690499426682184e-05 | 0.0003684598378776719  |
|   8    | 4.979186998346907e-05 | 0.0003684598378776714  |
|   9    | 4.647241198457114e-05 | 0.00037841821187436635 |
|   10   | 4.381684558545269e-05 | 0.00039833495986775307 |
+--------+-----------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 0.05246546764609195

Per User RMSE (best)
+-------------+------+-------

## Final Model

### Cosine Similarity - Purchase Dummy

In [29]:
final_model = tc.item_similarity_recommender.create(tc.SFrame(data_dummy),
                                                   user_id = user_id,
                                                   item_id = item_id,
                                                   target = 'purchase_dummy',
                                                   similarity_type='cosine')
recom = final_model.recommend(users=users_to_recommend, k=10)
recom.print_rows(n_display)

+-------------+-------------+----------------------+------+
| COD_CLIENTE | COD_PRODUTO |        score         | rank |
+-------------+-------------+----------------------+------+
|   05190001  |    26245    | 0.11014425754547119  |  1   |
|   05190001  |    25481    | 0.050211191177368164 |  2   |
|   05190001  |    25173    | 0.04718977212905884  |  3   |
|   05190001  |    24089    | 0.016027212142944336 |  4   |
|   05190001  |    24316    | 0.016027212142944336 |  5   |
|   05190001  |    22442    | 0.016027212142944336 |  6   |
|   05190001  |    25726    | 0.014931023120880127 |  7   |
|   05190001  |    25727    | 0.014776825904846191 |  8   |
|   05190001  |    26247    | 0.014118075370788574 |  9   |
|   05190001  |    25729    | 0.011492371559143066 |  10  |
|   37578201  |    25366    | 0.07738715410232544  |  1   |
|   37578201  |    22442    | 0.03291541337966919  |  2   |
|   37578201  |    19196    | 0.01472020149230957  |  3   |
|   37578201  |    25870    | 0.01328217

In [30]:
df_rec = recom.to_dataframe()
print(df_rec.shape)
df_rec.head()

(2914050, 4)


Unnamed: 0,COD_CLIENTE,COD_PRODUTO,score,rank
0,5190001,26245,0.110144,1
1,5190001,25481,0.050211,2
2,5190001,25173,0.04719,3
3,5190001,24089,0.016027,4
4,5190001,24316,0.016027,5


## Output Dataframe

    Contains all recommendations

In [38]:
def create_output(model, users_to_recommend, n_rec, print_csv=True):
    recomendation = model.recommend(users=users_to_recommend, k=n_rec)
    df_rec = recomendation.to_dataframe()
    df_rec['recommendedProducts'] = df_rec.groupby([user_id])[item_id] \
        .transform(lambda x: '|'.join(x.astype(str)))
    df_output = df_rec[['COD_CLIENTE', 'recommendedProducts']].drop_duplicates() \
        .sort_values('COD_CLIENTE').set_index('COD_CLIENTE')
    if print_csv:
        df_output.to_csv('../output/option1_recommendation.csv', sep=';')
        print("An output file can be found in 'output' folder with name 'option1_recommendation.csv'")
    return df_output

In [37]:
df_output = create_output(final_model, users_to_recommend, 10, print_csv=True)
print(df_output.shape)
df_output.head()

An output file can be found in 'output' folder with name 'option1_recommendation.csv'
(217115, 1)


Unnamed: 0_level_0,recommendedProducts
COD_CLIENTE,Unnamed: 1_level_1
301,15558|14537|19729|20966|24654|21823|21821|2524...
1501,25727|25729|25728|25983|25981|25982|25172|2165...
2701,26243|21653|24716|25727|25546|25726|25997|1780...
3101,25726|21256|25729|25982|25983|25728|21854|2548...
4001,25982|25983|25729|25727|25726|26003|25728|2535...


## Customer Recommendation

### Given a 'Client Code', shows all suggested products

In [41]:
def customer_recomendation(customer_id):
    if customer_id not in df_output.index:
        print('Customer not found.')
        return customer_id
    return df_output.loc[customer_id]

In [45]:
a = customer_recomendation('00003101')

In [46]:
a

recommendedProducts    25726|21256|25729|25982|25983|25728|21854|2548...
Name: 00003101, dtype: object