## Imports

In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [9]:
from sklearn.feature_selection import r_regression as pearson_correlation
from sklearn.decomposition import NMF

## Non-negative matrix factorization recommendation model

In [10]:
# One-shot NMF recommender: fits the factorization and ranks items for a single user.
def person_based_recommandation_nnf(data, utilisateur, n_categories=5, n_recommandations=5):
    """Fit an NMF on ``data`` and return the best-predicted unrated items for one user.

    Parameters
    ----------
    data : pandas.DataFrame
        Score matrix, one row per user and one column per item. A value of 0
        is treated as "not rated".
    utilisateur : hashable
        Row label of the user to recommend for.
    n_categories : int, default 5
        Number of NMF components.
    n_recommandations : int, default 5
        Maximum number of item labels returned.

    Returns
    -------
    pandas.Index
        Item labels sorted by decreasing reconstructed score, limited to the
        items whose original score for the user is 0.
    """
    nmf = NMF(n_components=n_categories, init='random', max_iter=2000, random_state=0)
    user_factors = nmf.fit_transform(data)
    item_factors = nmf.components_

    # Low-rank reconstruction of the full matrix, labelled like the input.
    reconstruction = pd.DataFrame(
        np.matmul(user_factors, item_factors),
        index=data.index,
        columns=data.columns,
    )

    # Column vector of the user's original scores, plus a flag for rated items.
    observed = data.filter(items=[utilisateur], axis=0).transpose()
    observed['has_rated'] = observed[utilisateur] != 0

    # Column vector of the user's reconstructed scores.
    predicted = reconstruction.filter(items=[utilisateur], axis=0).transpose()

    # Only items the user has not rated yet are candidates; best prediction first.
    candidates = predicted[~observed['has_rated']]
    ranked = candidates.sort_values(utilisateur, ascending=False)
    return ranked[:n_recommandations].index


In [11]:
# Training and recommendation are split in two so the factorization is fitted only once.
def person_based_recommandation_nnf_train(data, n_categories=5):
    """Fit an NMF on ``data`` and return the reconstructed score matrix.

    Parameters
    ----------
    data : pandas.DataFrame
        Score matrix, one row per user and one column per item.
    n_categories : int, default 5
        Number of NMF components.

    Returns
    -------
    pandas.DataFrame
        Product of the two NMF factors, with the same index and columns as
        ``data``. Note that this is the reconstruction, not the fitted estimator.
    """
    nmf = NMF(n_components=n_categories, init='random', max_iter=2000, random_state=0)
    user_factors = nmf.fit_transform(data)
    reconstruction = np.matmul(user_factors, nmf.components_)
    return pd.DataFrame(reconstruction, index=data.index, columns=data.columns)

def person_based_recommandation_nnf_apply(data, approximated_matrix_df, utilisateur, n_recommandations=5):
    """Rank the items a user has not rated by their reconstructed score.

    Parameters
    ----------
    data : pandas.DataFrame
        Original score matrix (users x items); 0 is treated as "not rated".
    approximated_matrix_df : pandas.DataFrame
        Reconstructed score matrix with the same labels as ``data``.
    utilisateur : hashable
        Row label of the user to recommend for.
    n_recommandations : int, default 5
        Maximum number of item labels returned.

    Returns
    -------
    pandas.Index
        Item labels sorted by decreasing reconstructed score, limited to the
        items whose original score for the user is 0.
    """
    # Column vector of the user's original scores, plus a flag for rated items.
    observed = data.filter(items=[utilisateur], axis=0).transpose()
    observed['has_rated'] = observed[utilisateur] != 0

    # Column vector of the user's reconstructed scores.
    predicted = approximated_matrix_df.filter(items=[utilisateur], axis=0).transpose()

    # Keep unrated items only and put the highest predicted score first.
    candidates = predicted[~observed['has_rated']]
    ranked = candidates.sort_values(utilisateur, ascending=False)
    return ranked[:n_recommandations].index


In [12]:
# Reconstruction error (MAE) of an NMF approximation.
def nnf_mae(data, approximated_matrix_df):
    """Return the mean absolute error between ``data`` and its approximation.

    The absolute differences are averaged per column first, then the column
    means are averaged.
    """
    residuals = data - approximated_matrix_df
    per_item_mae = residuals.abs().mean()
    return per_item_mae.mean()

In [13]:
# User-based recommender built on Pearson correlation between users.
def person_based_recommandation(data, utilisateur, n_proches=5, n_recommandations=5):
    """Recommend items rated by the users most correlated with ``utilisateur``.

    Parameters
    ----------
    data : pandas.DataFrame
        Score matrix, one row per user and one column per item. A value of 0
        is treated as "not rated".
    utilisateur : hashable
        Row label of the user to recommend for.
    n_proches : int, default 5
        Number of most-correlated users used as neighbours.
    n_recommandations : int, default 5
        Maximum number of item labels returned.

    Returns
    -------
    pandas.Index
        Labels of items the user has not rated, sorted first by how many
        neighbours rated them, then by the neighbours' mean score (both
        descending).

    Raises
    ------
    KeyError
        If ``utilisateur`` is not a row label of ``data``.
    """
    # The correlation helper works column-wise, so users must become columns.
    data_transposed = data.transpose()

    # Correlation of every user with the target user.
    correlations = pearson_correlation(data_transposed, data_transposed[utilisateur])
    correlations_col = pd.DataFrame({'correlations': correlations}, index=data.index)
    sorted_correlations = correlations_col.sort_values('correlations', ascending=False)

    # Remove the target user by label instead of skipping the first row: when
    # another user ties at the top correlation the target is not guaranteed to
    # sort first, and would otherwise end up in its own neighbourhood.
    utilisateurs_proches = sorted_correlations.drop(index=utilisateur).head(n_proches).index

    # Scores of the neighbours and of the target user, items as rows.
    data_utilisateurs_proches = data.filter(items=utilisateurs_proches, axis=0).transpose()
    data_utilisateur = data.filter(items=[utilisateur], axis=0).transpose()

    utilisateur_has_rated = data_utilisateur[utilisateur] != 0

    # Number of neighbours who rated each item.
    data_utilisateur['n_ratings'] = (data_utilisateurs_proches != 0).sum(axis=1)
    # Mean neighbour score per item; unrated entries (0) are included in the mean.
    data_utilisateur['mean_ratings'] = data_utilisateurs_proches.mean(axis=1)
    data_utilisateur['has_rated'] = utilisateur_has_rated

    candidates = data_utilisateur[~data_utilisateur['has_rated']]
    items_sorted = candidates.sort_values(['n_ratings', 'mean_ratings'], ascending=False)

    return items_sorted[:n_recommandations].index

## Chargement des données

In [14]:
# Load the order scoring data from a JSON Lines file (one record per line).
orders = pd.read_json("../raw_data/order_scoring.json",lines=True)

In [15]:
# Number of non-null `LO` entries per SKU, most frequent first.
counted_sku_df = orders.groupby('sku').count()['LO'].sort_values(ascending=False)

# Keep only the `sku_lenght` most frequent SKUs.
sku_lenght = 500
short_sku_list = list(counted_sku_df.head(sku_lenght).index)

# Take an explicit copy: `orders_short` receives new columns later on, and
# assigning to a boolean slice of `orders` raises SettingWithCopyWarning.
orders_short = orders[orders['sku'].isin(short_sku_list)].copy()

In [16]:
# Score contributed by one order line, keyed by its `LO` value.
coef = {
    1: 0.4,
    2: 0.3,
    3: 0.2,
    4: 0.1,
}

# Build the column with assign() so a new frame is produced instead of writing
# into what may be a slice of `orders` (avoids SettingWithCopyWarning).
# `LO` values missing from `coef` map to NaN.
orders_short = orders_short.assign(score=orders_short['LO'].map(coef))

# Total score per (customer, sku). Selecting 'score' before summing avoids
# aggregating every other column of the frame.
table_scoring = orders_short.groupby(by=['customer_id', 'sku'])[['score']].sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  orders_short['score'] = orders_short['LO'].map(coef)


In [17]:
# Scaling factor applied to the penalty term below.
penality = 0.5

# Count of non-null `score` entries per (customer, sku) for each `LO` value;
# combinations that never occur count as 0.
table_penality = pd.pivot_table(
    orders_short,
    values=['score'],
    index=['customer_id', 'sku'],
    columns=['LO'],
    aggfunc='count',
    fill_value=0,
)

# penalty = max(count at LO 4, count at LO 3)
#           * (1 - count at LO 2) * (1 - count at LO 1) * penality
# NOTE(review): the (1 - count) factors are only 0/1 switches when the counts
# are at most 1; a count above 1 makes a factor negative. Confirm this is intended.
table_penality['penality'] = (
    table_penality[[('score', 4), ('score', 3)]].max(axis=1)
    * (1 - table_penality[('score', 2)])
    * (1 - table_penality[('score', 1)])
    * penality
)

# Final score per (customer, sku): summed score minus the penalty.
final_table = pd.DataFrame(
    table_scoring['score'] - table_penality['penality'],
    columns=["score"],
)

In [18]:
# Customer x SKU matrix of final scores; pairs without a score become 0.
matrix = pd.pivot_table(
    final_table,
    values=['score'],
    index=['customer_id'],
    columns=['sku'],
    aggfunc='sum',
).fillna(0)

In [19]:
# Drop the outer column level so columns are plain SKU labels.
# The guard keeps this cell re-runnable: once the level is gone, calling
# droplevel again on a single-level index would raise.
if isinstance(matrix.columns, pd.MultiIndex):
    matrix.columns = matrix.columns.droplevel(0)
# `data` is an alias of `matrix` (same object), not a copy.
data = matrix

In [20]:
data

sku,1-ABB-101,1-ABO-103,1-ABO-105,1-ACN-106,1-ACN-107,1-ACN-108,1-ACN-113,1-ACN-116,1-ACP-101,1-ACP-102,...,8-GRI-104,8-ILB-103,8-ILB-106,8-LCM-119,8-RES-102,8-RES-106,8-VLF-104,8-VLF-105,8-VLF-107,8-VLF-108
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00HCaCqtG9aMGVCDyBUbvzIzY073,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00KfpQoUiRTi8RfGFcdjFM80s5s1,0.0,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.3,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00RZel3ipuTJMrADRnZ3Lr1C1XX2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00V8dG1xBHM84aVChJyEgIL21XK2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00zoJSpuLHUBtthekFUQ3sjnJy63,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zzqXh3hccWTjkxco7Qdz1iEy7bu1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zzqaeuDHkFcnofZkDUHPJ7fbCS33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zzrZzibp5CbzXtjld4reTvX7bdR2,0.0,0.7,0.0,0.0,0.0,0.0,0.0,0.0,0.9,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zzxqL7l0M2fuSto96UNjWi3pg9b2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.4,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Same customer x SKU pivot as `matrix`, but missing pairs are left as NaN here
# so they can be told apart from real scores in the next cell.
matrix_positive = pd.pivot_table(final_table, values=['score'], index=['customer_id'],
                    columns=['sku'],aggfunc='sum')

In [22]:
# Shift every observed score by +1, then mark missing pairs with 0, so that 0
# means "no score" only. Presumably the shift is there to make the matrix
# non-negative for NMF (scores can be negative) - TODO confirm that no
# observed score is below -1.
# NOTE(review): this cell is not idempotent; running it twice shifts the
# values again and then fails on droplevel.
matrix_positive = (matrix_positive + 1).fillna(0)
matrix_positive.columns = matrix_positive.columns.droplevel(0)
# `data_positive` is an alias of `matrix_positive` (same object), not a copy.
data_positive = matrix_positive

### Tests de recommandations

In [23]:
person_based_recommandation(data, '00V8dG1xBHM84aVChJyEgIL21XK2')

Index(['1-TAH-102', '3-PAP-101', '6-COT-105', '4-BOF-144', '6-BOI-101'], dtype='object', name='sku')

In [24]:
person_based_recommandation_nnf(data_positive, '00V8dG1xBHM84aVChJyEgIL21XK2')

Index(['4-GRA-105', '1-SNT-103', '3-PAP-101', '6-GAB-101', '6-CAS-105'], dtype='object', name='sku')

In [25]:
# Fit the 25-component NMF once and reuse it for every recommendation below.
# NOTE: despite its name, `nnf_model` holds the reconstructed score DataFrame
# returned by the train helper, not the fitted estimator.
nnf_model = person_based_recommandation_nnf_train(data_positive, n_categories=25)

In [26]:
nnf_mae(data_positive, nnf_model)

0.08600592979140803

In [27]:
person_based_recommandation_nnf_apply(data_positive, nnf_model, '00V8dG1xBHM84aVChJyEgIL21XK2')

Index(['6-CAS-105', '6-COT-105', '3-PAP-101', '1-SNT-103', '1-BID-121'], dtype='object', name='sku')

## Tests

In [28]:
customer_id = "h6IvomPZxOTKuKGuvS8b3nZrzVk2"

In [29]:

person_based_recommandation(data, customer_id, n_recommandations=3)

Index(['1-PHL-160', '1-VIT-115', '6-RIS-103'], dtype='object', name='sku')

In [30]:

# https://lafourche.fr/products/philia-bouillon-de-legumes-en-poudre-bio-125g-bio
# https://lafourche.fr/products/lot-de-12-compotes-cool-fruits-pomme-acerola
# https://lafourche.fr/products/la-fourche-tomates-concassees-bio-0-4kg

In [31]:
person_based_recommandation_nnf_apply(data_positive, nnf_model, customer_id, n_recommandations=3)

Index(['1-ELB-100', '4-LAF-117', '3-PAP-101'], dtype='object', name='sku')

In [32]:
# https://lafourche.fr/products/elibio-mais-doux-bio-300g
# https://lafourche.fr/products/la-fourche-1kg-de-pates-cocciolette-blanches-bio-en-vrac
# https://lafourche.fr/products/papeco-essuie-tout-blanc-100prct-recycle-origine-france-200-feuilles-ecologique

In [33]:
customer_id = "3630465646727"

In [34]:

person_based_recommandation(data, customer_id, n_recommandations=3)

Index(['6-COT-103', '4-BOF-130', '6-RIS-103'], dtype='object', name='sku')

In [35]:
#https://lafourche.fr/products/la-fourche-puree-pommes-mangues-bio-0-915kg
#https://lafourche.fr/products/vrac-1kg-de-sucre-de-coco-en-vrac-bio
#https://lafourche.fr/products/la-fourche-tomates-concassees-bio-0-4kg

In [36]:
person_based_recommandation_nnf_apply(data_positive, nnf_model, customer_id, n_recommandations=3)

Index(['6-COT-103', '6-COT-102', '1-CNT-133'], dtype='object', name='sku')

In [37]:
#https://lafourche.fr/products/elibio-mais-doux-bio-300g
#https://lafourche.fr/products/la-fourche-puree-100-cacahuetes-bio-0-5kg
#https://lafourche.fr/products/la-fourche-1kg-de-pates-cocciolette-blanches-bio-en-vrac

In [38]:
customer_id = "2074102857788"

In [39]:

person_based_recommandation(data, customer_id, n_recommandations=3)

Index(['1-ELB-105', '4-COS-141', '1-ELB-114'], dtype='object', name='sku')

In [40]:
person_based_recommandation_nnf_apply(data_positive, nnf_model, customer_id, n_recommandations=3)

Index(['1-ELB-100', '6-SOF-104', '4-LAF-117'], dtype='object', name='sku')

### Un estimator qui peut rentrer dans un pipeline

In [62]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [52]:
# Hold out 30% of the rows (customers) as a test set.
# NOTE(review): no random_state is passed, so the split changes on every run.
# Later cells look up a hard-coded customer id in `test_data` and therefore
# only work when that customer happens to land in the test split.
train_data, test_data = train_test_split(data_positive, test_size=0.3)

In [69]:
class NMFPredictor(NMF):
    """NMF extended with a ``predict`` method.

    ``predict`` lets the factorization produce predictions directly, so it can
    be used as the final step of a pipeline.
    """

    def predict(self, test_data):
        """Return the reconstruction of ``test_data`` from the fitted components.

        ``test_data`` is projected onto the fitted components with
        ``transform`` and multiplied back by ``components_``.
        """
        activations = self.transform(test_data)
        return np.matmul(activations, self.components_)
    
def train_test_1(train_data, test_data, n_categories=15):
    """Fit an ``NMFPredictor`` on ``train_data`` and reconstruct ``test_data``.

    Parameters
    ----------
    train_data : array-like
        Score matrix used to fit the factorization.
    test_data : array-like
        Score matrix to reconstruct with the fitted components.
    n_categories : int, default 15
        Number of NMF components.

    Returns
    -------
    The value of ``NMFPredictor.predict(test_data)``.
    """
    predictor = NMFPredictor(
        n_components=n_categories,
        init='random',
        max_iter=2000,
        random_state=0,
    )
    return predictor.fit(train_data).predict(test_data)
    
# Reconstructed scores for the held-out customers, using the default 15 components.
predicted_scores = train_test_1(train_data, test_data)

In [61]:
mean_squared_error(test_data, predicted_scores)

0.05885597665615573

In [63]:
mean_absolute_error(test_data, predicted_scores)

0.09094355924549226

In [67]:
predicted_scores

array([[0.03082638, 0.04234494, 0.03370959, ..., 0.00172931, 0.00189051,
        0.00196528],
       [0.02027558, 0.00156275, 0.00778404, ..., 0.00053767, 0.00110217,
        0.00060208],
       [0.01467177, 0.04197304, 0.01990994, ..., 0.08618027, 0.15861105,
        0.11225453],
       ...,
       [0.01293661, 0.0417947 , 0.02306146, ..., 0.05392925, 0.0965279 ,
        0.06452812],
       [0.00455697, 0.006457  , 0.00595532, ..., 0.01738455, 0.02795246,
        0.02021635],
       [0.02370042, 0.01570524, 0.01648801, ..., 0.00086113, 0.00263834,
        0.00455016]])

In [66]:
# Final step after the NMF has predicted the scores: takes the predicted scores
# of every item and returns the top items for one user.
def predictions_from_scores(test_data, predictions, utilisateur, n_recommandations=3):
    """Return the best-predicted items that ``utilisateur`` has not rated.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Score matrix (users x items) the predictions were computed from; 0 is
        treated as "not rated".
    predictions : array-like
        Predicted scores, same shape and row/column order as ``test_data``.
    utilisateur : hashable
        Row label of the user to recommend for; must be present in ``test_data``.
    n_recommandations : int, default 3
        Maximum number of item labels returned.

    Returns
    -------
    pandas.Index
        Item labels sorted by decreasing predicted score, limited to the items
        whose score for the user in ``test_data`` is 0.
    """
    approximated_matrix_df = pd.DataFrame(predictions, index=test_data.index, columns=test_data.columns)

    # Read the user's known scores from `test_data` (the function argument)
    # rather than from a notebook-level global, so the result depends only on
    # the inputs and matches the matrix the predictions were built on.
    data_utilisateur = test_data.filter(items=[utilisateur], axis=0).transpose()
    has_rated = data_utilisateur[utilisateur] != 0

    predicted_ratings_utilisateur = approximated_matrix_df.filter(items=[utilisateur], axis=0).transpose()

    # Keep unrated items only and put the highest predicted score first.
    items_sorted = predicted_ratings_utilisateur[~has_rated].sort_values(utilisateur, ascending=False)
    return items_sorted[:n_recommandations].index

# Top-3 recommendations for one customer.
# NOTE(review): this customer id must be a row of `test_data`; verify it after
# re-running the train/test split.
predictions_from_scores(test_data, predicted_scores, "2112572620860")

Index(['4-BOF-125', '4-GRA-105', '4-CLN-112'], dtype='object', name='sku')

## Grid search on NMF

In [41]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

In [None]:
# Numbers of NMF components compared by the grid search.
param_grid = {'nmf__n_components': [5, 15, 25, 50, 75, 100, 200, 300]}

# Single-step pipeline; the step name 'nmf' is the prefix used in `param_grid`.
pipe = Pipeline(steps=[
    ('nmf', NMFPredictor(init='random', max_iter=2000, random_state=0)),
])

# 5-fold search scored by negative MSE. The matrix is passed as both X and y,
# so each candidate is scored on how well it reconstructs the held-out rows.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=3,
)
grid_search.fit(data_positive, data_positive)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END ..............nmf__n_components=5;, score=-0.061 total time=   4.2s
[CV 2/5] END ..............nmf__n_components=5;, score=-0.062 total time=   3.6s
[CV 3/5] END ..............nmf__n_components=5;, score=-0.065 total time=   4.8s
[CV 4/5] END ..............nmf__n_components=5;, score=-0.065 total time=   4.9s
[CV 5/5] END ..............nmf__n_components=5;, score=-0.066 total time=   5.5s
[CV 1/5] END .............nmf__n_components=15;, score=-0.057 total time=  19.0s
[CV 2/5] END .............nmf__n_components=15;, score=-0.058 total time=   9.0s
[CV 3/5] END .............nmf__n_components=15;, score=-0.060 total time=   6.5s
[CV 4/5] END .............nmf__n_components=15;, score=-0.060 total time=   8.6s
[CV 5/5] END .............nmf__n_components=15;, score=-0.061 total time=   9.2s
[CV 1/5] END .............nmf__n_components=25;, score=-0.054 total time=  31.4s
[CV 2/5] END .............nmf__n_components=25;, 

In [None]:
grid_search.best_estimator_