# Product-based Recommendation System
## Based on Moorissa Tjokro tutorial
### Coded by Rebeca Bivar - DB: Armazem Paraíba

### Imports and reading file (data here has only clients, products and purchase count)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import turicreate as tc
from sklearn.model_selection import train_test_split


In [2]:
#READING A FILE WITH ONLY CLIENTS, PRODUCTS BOUGHT AND QTDE
buyers = pd.read_csv('teste.csv', sep=';')
buyers.head()

Unnamed: 0,COD_CLIENTE,COD_PRODUTO,QUANTIDADE
0,301,25243,1
1,1501,25726,1
2,2001,21653,1
3,2701,25303,1
4,3101,21255,1


##  Data preparation
### Creating dummy table to check if the client has bought a product or not

In [3]:
def create_data_dummy(db):
    data_dummy = db.copy()
    data_dummy['purchase_dummy'] = '1'
    return data_dummy

data_dummy = create_data_dummy(buyers)
data_dummy


Unnamed: 0,COD_CLIENTE,COD_PRODUTO,QUANTIDADE,purchase_dummy
0,00000301,25243,1,1
1,00001501,25726,1,1
2,00002001,21653,1,1
3,00002701,25303,1,1
4,00003101,21255,1,1
...,...,...,...,...
311736,YZ399601,25996,1,1
311737,YZ399801,26327,1,1
311738,YZ400601,26432,1,1
311739,YZ401401,26432,1,1


### Normalizing purchase frequency of each item across users 

In [4]:
#Dummy for marking whether a customer bought that item or not

df_matrix = pd.pivot_table(buyers, values = 'QUANTIDADE', index = 'COD_CLIENTE', columns = 'COD_PRODUTO')

df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())
#df_matrix_norm

# create a table for input to the modeling  
data_input = df_matrix_norm.reset_index()
data_input.index.names = ['FREQ_COMPRAS']
data_norm = pd.melt(data_input, id_vars=['COD_CLIENTE'], 
                    value_name='FREQ_COMPRAS')

#print(data_norm.shape)
#data_norm.head()

In [9]:
#Just cleaning useless values 
data_norm = data_norm.dropna()
data_norm.head()

Unnamed: 0,COD_CLIENTE,COD_PRODUTO,FREQ_COMPRAS
1181239,17101,11332,0.25
1184028,449301,11332,1.0
1184343,846001,11332,0.0
1185377,2130901,11332,0.0
1186365,3152901,11332,0.0


### Split data into trainning and testing (80/20)

In [12]:

# Returns train and test datasets as scalable dfs
def split_data(data):
    train, test = train_test_split(data, test_size = .2)
    train_data = tc.SFrame(train)
    test_data = tc.SFrame(test)
    return train_data, test_data

# Now actually splitting purchase_counts, purchase_dummy and purchase_counts_norm
train_data, test_data = split_data(buyers)
train_data_dummy, test_data_dummy = split_data(data_dummy)
train_data_norm, test_data_norm = split_data(data_norm)

## Baseline model to compare and evaluate models
### Popularity Model
   Takes the most popular items for recommendation, which are the products with the highest number of sells across customers.


In [25]:
# variables to define field names: 

user_id = 'COD_CLIENTE'
item_id = 'COD_PRODUTO'
users_to_recommend = list(buyers[user_id])
n_recommendation = 10 # itens to recommend
n_display = 30 # display the first few rows in an output dataset

# Function for all models using turicreate
def model(train_data, name, user_id, item_id, target, 
          users_to_recommend, n_rec, n_display):
    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data, 
                                                    user_id = user_id, 
                                                    item_id = item_id,
                                                    target = target)
    elif name == 'cosine':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='cosine')
    elif name == 'pearson':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='pearson')
        
    recom = model.recommend(users=users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return model

### Using purchase count - Buyers

In [27]:
name = 'popularity'
target = 'QUANTIDADE'
popularity = model(train_data, name, user_id, item_id, target, 
                   users_to_recommend, n_recommendation, n_display)
popularity

+-------------+-------------+--------------------+------+
| COD_CLIENTE | COD_PRODUTO |       score        | rank |
+-------------+-------------+--------------------+------+
|   00000301  |    25099    |        4.0         |  1   |
|   00000301  |    21601    |        2.0         |  2   |
|   00000301  |    21126    |        2.0         |  3   |
|   00000301  |    18941    |        1.5         |  4   |
|   00000301  |    21602    | 1.3333333333333333 |  5   |
|   00000301  |    22313    | 1.3333333333333333 |  6   |
|   00000301  |    20503    | 1.3333333333333333 |  7   |
|   00000301  |    23813    | 1.263157894736842  |  8   |
|   00000301  |    24598    |        1.25        |  9   |
|   00000301  |    19729    | 1.1617999042604117 |  10  |
|   00001501  |    25099    |        4.0         |  1   |
|   00001501  |    21601    |        2.0         |  2   |
|   00001501  |    21126    |        2.0         |  3   |
|   00001501  |    18941    |        1.5         |  4   |
|   00001501  

Class                            : PopularityRecommender

Schema
------
User ID                          : COD_CLIENTE
Item ID                          : COD_PRODUTO
Target                           : QUANTIDADE
Additional observation features  : 0
User side features               : []
Item side features               : []

Statistics
----------
Number of observations           : 249392
Number of users                  : 198072
Number of items                  : 670

Training summary
----------------
Training time                    : 0.0083

Model Parameters
----------------
Model class                      : PopularityRecommender

### Using scaled purchase count 

In [29]:
name = 'popularity'
target = 'FREQ_COMPRAS'
pop_norm = model(train_data_norm, name, user_id, item_id, target, 
                 users_to_recommend, n_recommendation, n_display)

+-------------+-------------+---------------------+------+
| COD_CLIENTE | COD_PRODUTO |        score        | rank |
+-------------+-------------+---------------------+------+
|   00000301  |    18941    |         0.5         |  1   |
|   00000301  |    21126    |  0.3333333333333333 |  2   |
|   00000301  |    21602    |  0.3333333333333333 |  3   |
|   00000301  |    20796    |  0.2727272727272727 |  4   |
|   00000301  |    24619    |         0.2         |  5   |
|   00000301  |    22313    | 0.16666666666666666 |  6   |
|   00000301  |    25418    | 0.14285714285714285 |  7   |
|   00000301  |    23575    |        0.125        |  8   |
|   00000301  |    23814    | 0.10526315789473684 |  9   |
|   00000301  |    24392    |         0.1         |  10  |
|   00001501  |    18941    |         0.5         |  1   |
|   00001501  |    21126    |  0.3333333333333333 |  2   |
|   00001501  |    21602    |  0.3333333333333333 |  3   |
|   00001501  |    20796    |  0.2727272727272727 |  4  

### Collaborative Filtering Model

   Recommends items based on how similar clients purchase items. Meaning: if customer 1 and customer 2 bought similar items, for example, 1 bought X, Y, Z and 2 bought X, Y, we would recommend an item Z to customer 2.
    
   - Lets say X and Y have been rated by costumers 1 and 2. 
   - We then create two item-vectors for both items, then we find the **cosine** or **pearson** distance between these vectors. If the **cosine** value is 1, means total similarity, if it is 0, means no similarity.
   - In this case, we will check the similarity between the target item and other items the customer already bought - using the client's purchase count to items already bought by him as weighing factor (some sort of simulated rating). 

### Using purchase count - Buyers

In [30]:
name = 'cosine'
target = 'QUANTIDADE'
cos = model(train_data, name, user_id, item_id, target, 
            users_to_recommend, n_recommendation, n_display)

+-------------+-------------+----------------------+------+
| COD_CLIENTE | COD_PRODUTO |        score         | rank |
+-------------+-------------+----------------------+------+
|   00000301  |    19729    | 0.10099858045578003  |  1   |
|   00000301  |    15558    | 0.07256853580474854  |  2   |
|   00000301  |    14537    | 0.053633272647857666 |  3   |
|   00000301  |    20966    | 0.023092269897460938 |  4   |
|   00000301  |    21821    | 0.021824538707733154 |  5   |
|   00000301  |    21823    | 0.018758118152618408 |  6   |
|   00000301  |    25240    | 0.01746886968612671  |  7   |
|   00000301  |    21515    | 0.013010084629058838 |  8   |
|   00000301  |    21653    |  0.0129014253616333  |  9   |
|   00000301  |    25241    | 0.009778738021850586 |  10  |
|   00001501  |    25727    | 0.07368814945220947  |  1   |
|   00001501  |    25729    | 0.022751033306121826 |  2   |
|   00001501  |    25728    | 0.021451711654663086 |  3   |
|   00001501  |    25996    | 0.02052903