In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import time
import turicreate as tc
from sklearn.model_selection import train_test_split

import sys
sys.path.append("..")

In [3]:
transactions = pd.read_csv('kado.csv', nrows=10000) 
print(transactions)

      TICKET_ID  MOIS_VENTE  PRIX_NET          FAMILLE  \
0      35592159          10      1.67          HYGIENE   
1      35592159          10      1.66          HYGIENE   
2      35592159          10      7.45  SOINS DU VISAGE   
3      35592159          10      5.95  SOINS DU VISAGE   
4      35592159          10      1.67          HYGIENE   
...         ...         ...       ...              ...   
9995   33934312           4      6.95          HYGIENE   
9996   33988514           4      4.75       MAQUILLAGE   
9997   33988514           4      7.90       MAQUILLAGE   
9998   34302529           6     13.90          HYGIENE   
9999   34302529           6      6.95          HYGIENE   

                          UNIVERS                MAILLE  \
0          HYG_DOUCHE JARDINMONDE               HYG_JDM   
1          HYG_DOUCHE JARDINMONDE               HYG_JDM   
2     VIS_CJOUR Jeunes Specifique    VIS_JEUNE_ET_LEVRE   
3                  VIS_DEMAQ AAAR  VIS_AAAR_DEMAQLOTION   
4       

In [4]:
data = pd.melt(transactions.set_index('CLI_ID')['TICKET_ID'].apply(pd.Series).reset_index(), 
             id_vars=['CLI_ID'],
             value_name='TICKET_ID') \
    .dropna().drop(['variable'], axis=1) \
    .groupby(['CLI_ID', 'TICKET_ID']) \
    .agg({'TICKET_ID': 'count'}) \
    .rename(columns={'TICKET_ID': 'purchase_count'}) \
    .reset_index() \
    .rename(columns={'TICKET_ID': 'TICKET_ID'})
data['TICKET_ID'] = data['TICKET_ID'].astype(np.int64)

In [5]:
#Data preparation 
#liste des articles de la colonne produits en lignes et compte le nombre de produits achetés par un utilisateur  
print(data.shape)
data.head

(4051, 3)


<bound method NDFrame.head of          CLI_ID  TICKET_ID  purchase_count
0       1490281   35592159               5
1      13290776   35509899               2
2      13290776   36417517               7
3      20163348   33002894               1
4      20200041   33064616               2
...         ...        ...             ...
4046  903871816   33710338               1
4047  903871816   33819548               1
4048  903871816   33934312               2
4049  903871816   33988514               2
4050  903871816   34302529               2

[4051 rows x 3 columns]>

In [6]:
def create_data_dummy(data):
    data_dummy = data.copy()
    data_dummy['purchase_dummy'] = 1
    return data_dummy
data_dummy = create_data_dummy(data)
print(data_dummy)

         CLI_ID  TICKET_ID  purchase_count  purchase_dummy
0       1490281   35592159               5               1
1      13290776   35509899               2               1
2      13290776   36417517               7               1
3      20163348   33002894               1               1
4      20200041   33064616               2               1
...         ...        ...             ...             ...
4046  903871816   33710338               1               1
4047  903871816   33819548               1               1
4048  903871816   33934312               2               1
4049  903871816   33988514               2               1
4050  903871816   34302529               2               1

[4051 rows x 4 columns]


In [7]:
df_matrix = pd.pivot_table(data, values='purchase_count', index='CLI_ID', columns='TICKET_ID')

In [8]:
df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())

In [9]:
print(df_matrix_norm)

TICKET_ID  32932270  32932561  32935308  32935359  32935579  32935930  \
CLI_ID                                                                  
1490281         NaN       NaN       NaN       NaN       NaN       NaN   
13290776        NaN       NaN       NaN       NaN       NaN       NaN   
20163348        NaN       NaN       NaN       NaN       NaN       NaN   
20200041        NaN       NaN       NaN       NaN       NaN       NaN   
20561854        NaN       NaN       NaN       NaN       NaN       NaN   
...             ...       ...       ...       ...       ...       ...   
903808381       NaN       NaN       NaN       NaN       NaN       NaN   
903813373       NaN       NaN       NaN       NaN       NaN       NaN   
903816590       NaN       NaN       NaN       NaN       NaN       NaN   
903864631       NaN       NaN       NaN       NaN       NaN       NaN   
903871816       NaN       NaN       NaN       NaN       NaN       NaN   

TICKET_ID  32936527  32937458  32938156  32941252 

In [12]:
# create a table for input to the modeling  
d = df_matrix_norm.reset_index() 
d.index.names = ['scaled_purchase_freq'] 
data_norm = pd.melt(d, id_vars=['CLI_ID'], value_name='scaled_purchase_freq')
print(data_norm.shape)
data_norm.head()

(3378534, 3)


Unnamed: 0,CLI_ID,TICKET_ID,scaled_purchase_freq
0,1490281,32932270,
1,13290776,32932270,
2,20163348,32932270,
3,20200041,32932270,
4,20561854,32932270,


In [18]:
def normalize_data(data):
    df_matrix = pd.pivot_table(data, values='purchase_count', index='CLI_ID', columns='TICKET_ID')
    df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())
    d = df_matrix_norm.reset_index()
    d.index.names = ['scaled_purchase_freq']
    return pd.melt(d, id_vars=['CLI_ID'], value_name='scaled_purchase_freq').dropna()

In [19]:
def split_data(data):
    '''
    Splits dataset into training and test set.
    
    Args:
        data (pandas.DataFrame)
        
    Returns
        train_data (tc.SFrame)
        test_data (tc.SFrame)
    '''
    train, test = train_test_split(data, test_size = .2)
    train_data = tc.SFrame(train)
    test_data = tc.SFrame(test)
    return train_data, test_data

In [20]:
train_data, test_data = split_data(data)
train_data_dummy, test_data_dummy = split_data(data_dummy)
train_data_norm, test_data_norm = split_data(data_norm)

In [22]:
# constant variables to define field names include:
user_id = 'CLI_ID'
item_id = 'TICKET_ID'
users_to_recommend = list(transactions[user_id])
n_rec = 10 # number of items to recommend
n_display = 30 # to display the first few rows in an output dataset

In [27]:
def model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display):
    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data, user_id=user_id,item_id=item_id, target=target)
    elif name == 'cosine':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='cosine')
    elif name == 'pearson':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='pearson')
    recom = model.recommend(users=users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return model

In [28]:
name = 'popularity'
target = 'purchase_count'
popularity = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+---------+-----------+-------+------+
|  CLI_ID | TICKET_ID | score | rank |
+---------+-----------+-------+------+
| 1490281 |  34430302 |  22.0 |  1   |
| 1490281 |  35812103 |  18.0 |  2   |
| 1490281 |  33170055 |  16.0 |  3   |
| 1490281 |  32974762 |  15.0 |  4   |
| 1490281 |  33807357 |  14.0 |  5   |
| 1490281 |  34773724 |  14.0 |  6   |
| 1490281 |  35922164 |  14.0 |  7   |
| 1490281 |  33930382 |  14.0 |  8   |
| 1490281 |  35968166 |  13.0 |  9   |
| 1490281 |  36260094 |  13.0 |  10  |
| 1490281 |  34430302 |  22.0 |  1   |
| 1490281 |  35812103 |  18.0 |  2   |
| 1490281 |  33170055 |  16.0 |  3   |
| 1490281 |  32974762 |  15.0 |  4   |
| 1490281 |  33807357 |  14.0 |  5   |
| 1490281 |  34773724 |  14.0 |  6   |
| 1490281 |  35922164 |  14.0 |  7   |
| 1490281 |  33930382 |  14.0 |  8   |
| 1490281 |  35968166 |  13.0 |  9   |
| 1490281 |  36260094 |  13.0 |  10  |
| 1490281 |  34430302 |  22.0 |  1   |
| 1490281 |  35812103 |  18.0 |  2   |
| 1490281 |  33170055 |  

In [29]:
name = 'popularity'
target = 'purchase_dummy'
pop_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+---------+-----------+-------+------+
|  CLI_ID | TICKET_ID | score | rank |
+---------+-----------+-------+------+
| 1490281 |  34575885 |  1.0  |  1   |
| 1490281 |  34545883 |  1.0  |  2   |
| 1490281 |  34713684 |  1.0  |  3   |
| 1490281 |  33068440 |  1.0  |  4   |
| 1490281 |  34792117 |  1.0  |  5   |
| 1490281 |  33326041 |  1.0  |  6   |
| 1490281 |  33780617 |  1.0  |  7   |
| 1490281 |  35832190 |  1.0  |  8   |
| 1490281 |  33988514 |  1.0  |  9   |
| 1490281 |  35937981 |  1.0  |  10  |
| 1490281 |  34575885 |  1.0  |  1   |
| 1490281 |  34545883 |  1.0  |  2   |
| 1490281 |  34713684 |  1.0  |  3   |
| 1490281 |  33068440 |  1.0  |  4   |
| 1490281 |  34792117 |  1.0  |  5   |
| 1490281 |  33326041 |  1.0  |  6   |
| 1490281 |  33780617 |  1.0  |  7   |
| 1490281 |  35832190 |  1.0  |  8   |
| 1490281 |  33988514 |  1.0  |  9   |
| 1490281 |  35937981 |  1.0  |  10  |
| 1490281 |  34575885 |  1.0  |  1   |
| 1490281 |  34545883 |  1.0  |  2   |
| 1490281 |  34713684 |  

In [30]:
name = 'popularity'
target = 'scaled_purchase_freq'
pop_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+---------+-----------+-------+------+
|  CLI_ID | TICKET_ID | score | rank |
+---------+-----------+-------+------+
| 1490281 |  35929105 |  nan  |  1   |
| 1490281 |  33512354 |  nan  |  2   |
| 1490281 |  34164655 |  nan  |  3   |
| 1490281 |  34943586 |  nan  |  4   |
| 1490281 |  36390834 |  nan  |  5   |
| 1490281 |  35812103 |  nan  |  6   |
| 1490281 |  33716134 |  nan  |  7   |
| 1490281 |  33339960 |  nan  |  8   |
| 1490281 |  34185155 |  nan  |  9   |
| 1490281 |  34755041 |  nan  |  10  |
| 1490281 |  35929105 |  nan  |  1   |
| 1490281 |  33512354 |  nan  |  2   |
| 1490281 |  34164655 |  nan  |  3   |
| 1490281 |  34943586 |  nan  |  4   |
| 1490281 |  36390834 |  nan  |  5   |
| 1490281 |  35812103 |  nan  |  6   |
| 1490281 |  33716134 |  nan  |  7   |
| 1490281 |  33339960 |  nan  |  8   |
| 1490281 |  34185155 |  nan  |  9   |
| 1490281 |  34755041 |  nan  |  10  |
| 1490281 |  35929105 |  nan  |  1   |
| 1490281 |  33512354 |  nan  |  2   |
| 1490281 |  34164655 |  

In [31]:
#Collaborative Filtering Model
name = 'cosine'
target = 'purchase_count'
cos = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+---------+-----------+----------------------+------+
|  CLI_ID | TICKET_ID |        score         | rank |
+---------+-----------+----------------------+------+
| 1490281 |  35248566 | 0.010476189851760864 |  1   |
| 1490281 |  33834593 | 0.010476189851760864 |  2   |
| 1490281 |  35968166 | 0.005714285373687744 |  3   |
| 1490281 |  34869237 | 0.005714285373687744 |  4   |
| 1490281 |  34053370 | 0.005714285373687744 |  5   |
| 1490281 |  33363224 | 0.005714285373687744 |  6   |
| 1490281 |  34450331 | 0.005714285373687744 |  7   |
| 1490281 |  33403520 | 0.005714285373687744 |  8   |
| 1490281 |  33920967 | 0.005714285373687744 |  9   |
| 1490281 |  34096507 | 0.005714285373687744 |  10  |
| 1490281 |  35248566 | 0.010476189851760864 |  1   |
| 1490281 |  33834593 | 0.010476189851760864 |  2   |
| 1490281 |  35968166 | 0.005714285373687744 |  3   |
| 1490281 |  34869237 | 0.005714285373687744 |  4   |
| 1490281 |  34053370 | 0.005714285373687744 |  5   |
| 1490281 |  33363224 | 0.00

In [33]:
#person similarity
name = 'pearson'
target = 'purchase_count'
pear = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+---------+-----------+--------------------+------+
|  CLI_ID | TICKET_ID |       score        | rank |
+---------+-----------+--------------------+------+
| 1490281 |  34430302 |        1.0         |  1   |
| 1490281 |  35812103 | 0.8095238095238095 |  2   |
| 1490281 |  33170055 | 0.7142857142857143 |  3   |
| 1490281 |  32974762 | 0.6666666666666666 |  4   |
| 1490281 |  33807357 | 0.6190476190476191 |  5   |
| 1490281 |  34773724 | 0.6190476190476191 |  6   |
| 1490281 |  35922164 | 0.6190476190476191 |  7   |
| 1490281 |  33930382 | 0.6190476190476191 |  8   |
| 1490281 |  35968166 | 0.5714285714285714 |  9   |
| 1490281 |  36260094 | 0.5714285714285714 |  10  |
| 1490281 |  34430302 |        1.0         |  1   |
| 1490281 |  35812103 | 0.8095238095238095 |  2   |
| 1490281 |  33170055 | 0.7142857142857143 |  3   |
| 1490281 |  32974762 | 0.6666666666666666 |  4   |
| 1490281 |  33807357 | 0.6190476190476191 |  5   |
| 1490281 |  34773724 | 0.6190476190476191 |  6   |
| 1490281 | 

In [34]:
name = 'pearson'
target = 'purchase_dummy'
pear_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+---------+-----------+-------+------+
|  CLI_ID | TICKET_ID | score | rank |
+---------+-----------+-------+------+
| 1490281 |  34575885 |  0.0  |  1   |
| 1490281 |  34545883 |  0.0  |  2   |
| 1490281 |  34713684 |  0.0  |  3   |
| 1490281 |  33068440 |  0.0  |  4   |
| 1490281 |  34792117 |  0.0  |  5   |
| 1490281 |  33326041 |  0.0  |  6   |
| 1490281 |  33780617 |  0.0  |  7   |
| 1490281 |  35832190 |  0.0  |  8   |
| 1490281 |  33988514 |  0.0  |  9   |
| 1490281 |  35937981 |  0.0  |  10  |
| 1490281 |  34575885 |  0.0  |  1   |
| 1490281 |  34545883 |  0.0  |  2   |
| 1490281 |  34713684 |  0.0  |  3   |
| 1490281 |  33068440 |  0.0  |  4   |
| 1490281 |  34792117 |  0.0  |  5   |
| 1490281 |  33326041 |  0.0  |  6   |
| 1490281 |  33780617 |  0.0  |  7   |
| 1490281 |  35832190 |  0.0  |  8   |
| 1490281 |  33988514 |  0.0  |  9   |
| 1490281 |  35937981 |  0.0  |  10  |
| 1490281 |  34575885 |  0.0  |  1   |
| 1490281 |  34545883 |  0.0  |  2   |
| 1490281 |  34713684 |  

In [None]:
#Model Evaluation
