In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from fpgrowth_py import fpgrowth
import time

In [2]:
data = pd.read_csv('ecomdata.csv', encoding = 'unicode_escape')
data['GroupPrice']=data['Quantity']*data['UnitPrice']
data=data.dropna()
print('The dimensions of the dataset are : ', data.shape)
print('---------')
print(data.head(5))

The dimensions of the dataset are :  (406829, 9)
---------
  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

      InvoiceDate  UnitPrice  CustomerID         Country  GroupPrice  
0  12/1/2010 8:26       2.55     17850.0  United Kingdom       15.30  
1  12/1/2010 8:26       3.39     17850.0  United Kingdom       20.34  
2  12/1/2010 8:26       2.75     17850.0  United Kingdom       22.00  
3  12/1/2010 8:26       3.39     17850.0  United Kingdom       20.34  
4  12/1/2010 8:26       3.39     17850.0  United Kingdom       20.34  


In [3]:
liste= data['StockCode'].unique()
stock_to_del=[]
for el in liste:
    if el[0] not in ['1','2','3','4','5','6','7','8','9','10']: # products corresponding to gifts.
        stock_to_del.append(el)

data=data[data['StockCode'].map(lambda x: x not in stock_to_del)] # delete these products

basket = data.groupby(['InvoiceNo','CustomerID']).agg({'StockCode': lambda s: list(set(s))}) # grouping product from the same invoice.

print('Dimension of the new grouped dataset : ', basket.shape)
print('----------')
print(basket.head(5))


Dimension of the new grouped dataset :  (21788, 1)
----------
                                                              StockCode
InvoiceNo CustomerID                                                   
536365    17850.0     [84406B, 71053, 21730, 84029E, 84029G, 22752, ...
536366    17850.0                                        [22633, 22632]
536367    13047.0     [21755, 22748, 22622, 22749, 84879, 84969, 217...
536368    13047.0                          [22960, 22913, 22912, 22914]
536369    13047.0                                               [21756]


In [4]:
a=time.time()
freqItemSet, rules = fpgrowth(basket['StockCode'].values, minSupRatio=0.005, minConf=0.3)
b=time.time()
print('time to execute in seconds : ',b-a, ' s.')
print('Number of rules generated : ', len(rules))

association=pd.DataFrame(rules,columns =['basket','next_product','proba'])
association=association.sort_values(by='proba',ascending=False)
print('Dimensions of the association table are : ', association.shape)
print(association.head(5))

time to execute in seconds :  345.8799262046814  s.
Number of rules generated :  4958
Dimensions of the association table are :  (4958, 3)
                                  basket next_product     proba
202  {22919, 22921, 22920, 22917, 22916}      {22918}  0.992537
424         {22917, 22919, 22921, 22916}      {22918}  0.986014
306         {22917, 22921, 22918, 22920}      {22916}  0.985714
310         {22917, 22921, 22916, 22920}      {22918}  0.985714
96          {22917, 22919, 22921, 22920}      {22918}  0.985401


In [5]:
def compute_next_best_product(basket_el):
    """
    parameter : basket_el = list of consumer basket elements
    return : next_pdt, proba = next product to recommend, buying probability. Or (0,0) if no product is found.


    Description : from the basket of a user, returns the product to recommend if it was not found
    in the list of associations of the table associated with the FP Growth model.
    To do this, we search in the table of associations for the product to recommend from each
    individual product in the consumer's basket.

    """

    for k in basket_el:  # for each element in the consumer basket
        k = {k}
        if len(association[association[
                               'basket'] == k].values) != 0:  # if we find a corresponding association in the fp growth table
            next_pdt = list(association[association['basket'] == k]['next_product'].values[0])[
                0]  # we take the consequent product
            if next_pdt not in basket_el:  # We verify that the customer has not previously purchased the product
                proba = association[association['basket'] == k]['proba'].values[0]  # Find the associated probability.
                return (next_pdt, proba)

            return (0, 0)  # return (0,0) if no product was found.

In [6]:
def find_next_product(basket):
    """
    Parameter : basket = consumer basket dataframe
    Return : list_next_pdt, list_proba = list of next elements to recommend and the buying probabilities associated.

    description : Main function that uses the one above. For each client in the dataset we look for a corresponding
    association in the Fp Growth model table. If no association is found, we call the compute_next_best_product
    function which searches for individual product associations.
    If no individual ssociations are found, the function returns (0,0).

    """
    n = basket.shape[0]
    list_next_pdt = []
    list_proba = []
    for i in range(n):  # for each customer
        el = set(basket['StockCode'][i])  # customer's basket
        if len(association[association[
                               'basket'] == el].values) != 0:  # if we find a association in the fp growth table corresponding to all the customer's basket.
            next_pdt = list(association[association['basket'] == el]['next_product'].values[0])[
                0]  # We take the consequent product
            proba = association[association['basket'] == el]['proba'].values[0]  # Probability as sociated in the table
            list_next_pdt.append(next_pdt)
            list_proba.append(proba)


        elif len(association[association['basket']==el].values) ==0: # If no antecedent to all the basket was found in the table
            next_pdt,proba= compute_next_best_product(basket['StockCode'][i]) # previous function
            list_next_pdt.append(next_pdt)
            list_proba.append(proba)

    return(list_next_pdt, list_proba)

In [7]:
a=time.time()
list_next_pdt, list_proba = find_next_product(basket)
b=time.time()
print(b-a)
basket['Recommended Product']= list_next_pdt # Set of recommended products
basket['Probability']= list_proba # Set of rprobabilities associated
print(basket.head(5))

TypeError: cannot unpack non-iterable NoneType object