In [1]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth
from mlxtend.preprocessing import TransactionEncoder


In [2]:
# Location of the groceries CSV. Set the GROCERIES_CSV environment variable to
# point at the file on another machine; the fallback is the original local
# Windows path, so existing runs keep working unchanged.
DATA_PATH = os.environ.get('GROCERIES_CSV', r'C:\Users\omnan\Documents\Groceries data.csv')
df = pd.read_csv(DATA_PATH)
print(df)

       Member_number        Date        itemDescription  year  month  day  \
0               1808  2015-07-21         tropical fruit  2015      7   21   
1               2552  2015-05-01             whole milk  2015      5    1   
2               2300  2015-09-19              pip fruit  2015      9   19   
3               1187  2015-12-12       other vegetables  2015     12   12   
4               3037  2015-01-02             whole milk  2015      1    2   
...              ...         ...                    ...   ...    ...  ...   
38760           4471  2014-08-10          sliced cheese  2014      8   10   
38761           2022  2014-02-23                  candy  2014      2   23   
38762           1097  2014-04-16               cake bar  2014      4   16   
38763           1510  2014-03-12  fruit/vegetable juice  2014      3   12   
38764           1521  2014-12-26               cat food  2014     12   26   

       day_of_week  
0                1  
1                4  
2           

In [3]:
# How many distinct members and distinct products does the data contain?
# dropna=False keeps a missing value counted as its own entry.
print(df['Member_number'].nunique(dropna=False))
print(df['itemDescription'].nunique(dropna=False))

3898
167


In [4]:
# Review how often each product appears (value_counts sorts most frequent first).
freq_items = df['itemDescription'].value_counts()
# Tabular form of the counts; the column names produced by reset_index()
# differ between pandas versions, so nothing below relies on them.
item = freq_items.reset_index()
# The product names are the index of the counts Series; reading them from
# there avoids the version-dependent 'index' column lookup.
top_itm = freq_items.index[:30].to_list()
print(top_itm)
freq_items.head(10)

['whole milk', 'other vegetables', 'rolls/buns', 'soda', 'yogurt', 'root vegetables', 'tropical fruit', 'bottled water', 'sausage', 'citrus fruit', 'pastry', 'pip fruit', 'shopping bags', 'canned beer', 'bottled beer', 'whipped/sour cream', 'newspapers', 'frankfurter', 'brown bread', 'pork', 'domestic eggs', 'butter', 'fruit/vegetable juice', 'beef', 'curd', 'margarine', 'coffee', 'frozen vegetables', 'chicken', 'white bread']


whole milk          2502
other vegetables    1898
rolls/buns          1716
soda                1514
yogurt              1334
root vegetables     1071
tropical fruit      1032
bottled water        933
sausage              924
citrus fruit         812
Name: itemDescription, dtype: int64

In [5]:
# List the purchased items per member ID.
# user_id holds the member IDs in order of first appearance in df.
user_id = df['Member_number'].unique()
# One pass over the frame: group rows by member (sort=False keeps the groups in
# first-appearance order and rows keep their original order inside each group)
# instead of re-filtering the whole frame once per member.
baskets = df.groupby('Member_number', sort=False)['itemDescription'].agg(list)
# reindex makes the alignment with user_id explicit: items[n] belongs to user_id[n].
items = baskets.reindex(user_id).tolist()
print(items[0])

['tropical fruit', 'long life bakery product', 'meat', 'sugar', 'rolls/buns', 'semi-finished bread', 'whole milk', 'citrus fruit', 'candy', 'napkins']


In [6]:
# Build the boolean member-by-item matrix: one row per basket in `items`,
# one column per distinct item learned by the encoder.
TE = TransactionEncoder()
item_transformed = TE.fit(items).transform(items)
item_matrix = pd.DataFrame(data=item_transformed, columns=TE.columns_)
item_matrix

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3893,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3894,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
3895,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3896,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [7]:
# Mine frequent itemsets of at most two items with Apriori (minimum support 0.01)
# and show them from most to least supported.
freq_items = apriori(item_matrix, use_colnames=True, max_len=2, min_support=0.01)
freq_items.sort_values("support", ascending=False)

Unnamed: 0,support,itemsets
113,0.458184,(whole milk)
69,0.376603,(other vegetables)
84,0.349666,(rolls/buns)
94,0.313494,(soda)
114,0.282966,(yogurt)
...,...,...
414,0.010005,"(butter milk, frankfurter)"
705,0.010005,"(whole milk, dog food)"
882,0.010005,"(newspapers, hygiene articles)"
1106,0.010005,"(waffles, pork)"


In [8]:
# Derive association rules (support, confidence, lift, ...) from the Apriori
# itemsets; a confidence threshold of 0 keeps every rule.
rules = association_rules(freq_items, min_threshold=0, metric="confidence")
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(UHT-milk),(beef),0.078502,0.119548,0.010518,0.133987,1.120775,0.001133,1.016672
1,(beef),(UHT-milk),0.119548,0.078502,0.010518,0.087983,1.120775,0.001133,1.010396
2,(bottled beer),(UHT-milk),0.158799,0.078502,0.014879,0.093700,1.193597,0.002413,1.016769
3,(UHT-milk),(bottled beer),0.078502,0.158799,0.014879,0.189542,1.193597,0.002413,1.037933
4,(UHT-milk),(bottled water),0.078502,0.213699,0.021293,0.271242,1.269268,0.004517,1.078960
...,...,...,...,...,...,...,...,...,...
2247,(whole milk),(white wine),0.458184,0.044125,0.023602,0.051512,1.167400,0.003384,1.007788
2248,(white wine),(yogurt),0.044125,0.282966,0.016419,0.372093,1.314976,0.003933,1.141944
2249,(yogurt),(white wine),0.282966,0.044125,0.016419,0.058024,1.314976,0.003933,1.014754
2250,(whole milk),(yogurt),0.458184,0.282966,0.150590,0.328667,1.161510,0.020940,1.068076


In [9]:
# Re-run Apriori allowing itemsets of up to five items (same 0.01 minimum support).
# Note: this overwrites `freq_items`; `rules` above was built from the
# max_len=2 result and is not recomputed here.
freq_items = apriori(item_matrix, use_colnames=True, max_len=5, min_support=0.01)
freq_items.sort_values("support", ascending=False)

Unnamed: 0,support,itemsets
113,0.458184,(whole milk)
69,0.376603,(other vegetables)
84,0.349666,(rolls/buns)
94,0.313494,(soda)
114,0.282966,(yogurt)
...,...,...
2269,0.010005,"(whole milk, sausage, ice cream)"
2266,0.010005,"(rolls/buns, other vegetables, ice cream)"
2263,0.010005,"(whole milk, herbs, other vegetables)"
956,0.010005,"(pork, misc. beverages)"


In [10]:
# Mine frequent itemsets of at most two items with FP-Growth (minimum support 0.01)
# and show them from most to least supported.
freq_itemsfp = fpgrowth(item_matrix, use_colnames=True, max_len=2, min_support=0.01)
freq_itemsfp.sort_values("support", ascending=False)

Unnamed: 0,support,itemsets
0,0.458184,(whole milk)
10,0.376603,(other vegetables)
1,0.349666,(rolls/buns)
26,0.313494,(soda)
32,0.282966,(yogurt)
...,...,...
238,0.010005,"(semi-finished bread, root vegetables)"
540,0.010005,"(butter, ham)"
531,0.010005,"(fruit/vegetable juice, ham)"
338,0.010005,"(hygiene articles, shopping bags)"


In [11]:
# Derive association rules (support, confidence, lift, ...) from the FP-Growth
# itemsets; a confidence threshold of 0 keeps every rule.
rulesfp = association_rules(freq_itemsfp, min_threshold=0, metric="confidence")
rulesfp

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(whole milk),(rolls/buns),0.458184,0.349666,0.178553,0.389698,1.114484,0.018342,1.065592
1,(rolls/buns),(whole milk),0.349666,0.458184,0.178553,0.510638,1.114484,0.018342,1.107190
2,(rolls/buns),(other vegetables),0.349666,0.376603,0.146742,0.419663,1.114335,0.015056,1.074197
3,(other vegetables),(rolls/buns),0.376603,0.349666,0.146742,0.389646,1.114335,0.015056,1.065502
4,(whole milk),(tropical fruit),0.458184,0.233710,0.116470,0.254199,1.087672,0.009388,1.027473
...,...,...,...,...,...,...,...,...,...
2247,(dog food),(whole milk),0.017188,0.458184,0.010005,0.582090,1.270428,0.002130,1.296489
2248,(rolls/buns),(pet care),0.349666,0.021806,0.010262,0.029347,1.345820,0.002637,1.007769
2249,(pet care),(rolls/buns),0.021806,0.349666,0.010262,0.470588,1.345820,0.002637,1.228408
2250,(other vegetables),(pet care),0.376603,0.021806,0.010262,0.027248,1.249559,0.002049,1.005594


In [12]:
# Whole milk has the highest support, so use it as the antecedent for the
# basket analysis (FP-Growth rules), ranked by confidence.
has_whole_milk = rulesfp["antecedents"].map(lambda itemset: "whole milk" in itemset)
rules_fpgrowth = rulesfp[has_whole_milk]
rules_fpgrowth.sort_values(by='confidence', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
248,(whole milk),(other vegetables),0.458184,0.376603,0.191380,0.417693,1.109106,0.018827,1.070564
0,(whole milk),(rolls/buns),0.458184,0.349666,0.178553,0.389698,1.114484,0.018342,1.065592
662,(whole milk),(soda),0.458184,0.313494,0.151103,0.329787,1.051973,0.007465,1.024310
854,(whole milk),(yogurt),0.458184,0.282966,0.150590,0.328667,1.161510,0.020940,1.068076
4,(whole milk),(tropical fruit),0.458184,0.233710,0.116470,0.254199,1.087672,0.009388,1.027473
...,...,...,...,...,...,...,...,...,...
2228,(whole milk),(frozen dessert),0.458184,0.023089,0.012057,0.026316,1.139766,0.001479,1.003314
2024,(whole milk),(canned vegetables),0.458184,0.020523,0.011544,0.025196,1.227674,0.002141,1.004793
2244,(whole milk),(salt),0.458184,0.022832,0.011288,0.024636,1.079004,0.000826,1.001849
2150,(whole milk),(dish cleaner),0.458184,0.018728,0.010005,0.021837,1.166010,0.001424,1.003178


In [13]:
def _itemset_to_text(itemset):
    """Render one rule side (a collection of item names) as 'a, b, c'.

    Items are sorted so the text is deterministic; a value that is already a
    string is returned unchanged so the conversion is safe to apply twice.
    """
    if isinstance(itemset, str):
        return itemset
    return ', '.join(sorted(itemset))


# Flatten the antecedent/consequent itemsets into plain strings so rules can be
# filtered with simple equality checks. assign() builds a new frame, so `rules`
# keeps its original itemset objects and this cell can be re-run safely
# (joining an already-joined string would otherwise split it into characters).
rules_eda = rules.assign(
    antecedents=rules['antecedents'].apply(_itemset_to_text),
    consequents=rules['consequents'].apply(_itemset_to_text),
)

rules_eda.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,UHT-milk,beef,0.078502,0.119548,0.010518,0.133987,1.120775,0.001133,1.016672
1,beef,UHT-milk,0.119548,0.078502,0.010518,0.087983,1.120775,0.001133,1.010396
2,bottled beer,UHT-milk,0.158799,0.078502,0.014879,0.0937,1.193597,0.002413,1.016769
3,UHT-milk,bottled beer,0.078502,0.158799,0.014879,0.189542,1.193597,0.002413,1.037933
4,UHT-milk,bottled water,0.078502,0.213699,0.021293,0.271242,1.269268,0.004517,1.07896


In [14]:
# Add two combined scores to the rules and rank by them.
# rules1 starts out as the same object as rules_eda, so the two new columns
# are added to that frame as well; sort_values then rebinds rules1 to a new,
# sorted frame.
rules1 = rules_eda
# scorelftlvg: equal-weight blend of lift and leverage, times 100.
rules1['scorelftlvg'] = rules1['lift'].mul(0.5).add(rules1['leverage'].mul(0.5)).mul(100)
# scoreconcnf: equal-weight blend of confidence and conviction, times 100.
rules1['scoreconcnf'] = rules1['confidence'].mul(0.5).add(rules1['conviction'].mul(0.5)).mul(100)
rules1 = rules1.sort_values(by=['scorelftlvg', 'scoreconcnf'], ascending=False)
rules1

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,scorelftlvg,scoreconcnf
228,beverages,white bread,0.062083,0.088763,0.010518,0.169421,1.908685,0.005008,1.097111,95.684618,63.326607
229,white bread,beverages,0.088763,0.062083,0.010518,0.118497,1.908685,0.005008,1.063998,95.684618,59.124731
828,waffles,chicken,0.069010,0.100564,0.011801,0.171004,1.700440,0.004861,1.084969,85.265051,62.798656
829,chicken,waffles,0.100564,0.069010,0.011801,0.117347,1.700440,0.004861,1.054764,85.265051,58.605523
1748,specialty bar,newspapers,0.052591,0.139815,0.012314,0.234146,1.674683,0.004961,1.123171,83.982217,67.865873
...,...,...,...,...,...,...,...,...,...,...,...
1001,pork,coffee,0.132376,0.114931,0.013597,0.102713,0.893696,-0.001617,0.986384,44.603953,54.454853
1122,dessert,domestic eggs,0.086455,0.133145,0.010262,0.118694,0.891466,-0.001249,0.983603,44.510811,55.114864
1123,domestic eggs,dessert,0.133145,0.086455,0.010262,0.077071,0.891466,-0.001249,0.989833,44.510811,53.345221
884,cream cheese,citrus fruit,0.088507,0.185480,0.014110,0.159420,0.859502,-0.002306,0.968998,42.859802,56.420927


In [15]:
def suggestitem(k, top_n=3):
    """Print and return the items most strongly associated with item ``k``.

    Reads the module-level ``rules1`` frame, keeps only rules with lift > 1 and
    leverage > 0 whose antecedent equals ``k``, and suggests the union of the
    ``top_n`` consequents by leverage and the ``top_n`` consequents by lift.

    Parameters
    ----------
    k : str
        Antecedent item name, compared for exact equality with the
        'antecedents' column of ``rules1``.
    top_n : int, default 3
        How many consequents to take from each ranking (leverage and lift).

    Returns
    -------
    list
        Suggested consequent items, leverage picks first and then any extra
        lift picks, without duplicates. Empty when no rule matches ``k``
        (nothing is printed in that case).
    """
    positive = rules1[(rules1['lift'] > 1) & (rules1['leverage'] > 0)]
    matches = positive[positive['antecedents'] == k]
    if matches.empty:
        return []

    by_leverage = matches.sort_values('leverage', ascending=False)['consequents'].head(top_n)
    by_lift = matches.sort_values('lift', ascending=False)['consequents'].head(top_n)
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # printed order is the same on every run (a set would not guarantee that).
    itmlst = list(dict.fromkeys(list(by_leverage) + list(by_lift)))

    # Score lookup by consequent for the matching rules.
    score_of = dict(zip(matches['consequents'], matches['scorelftlvg']))
    for itm in itmlst:
        print(k, "is frequently bought with ", itm, " with score ", "{:.2f}".format(score_of[itm]))
    return itmlst
           
# Example: print the suggestions for a single antecedent item.
suggestitem('beef')

beef is frequently bought with  whole milk  with score  59.01
beef is frequently bought with  butter milk  with score  69.58
beef is frequently bought with  butter  with score  68.14
beef is frequently bought with  other vegetables  with score  56.70
beef is frequently bought with  citrus fruit  with score  65.71
beef is frequently bought with  chocolate  with score  70.96


In [16]:
# Products to generate suggestions for.
# NOTE(review): 'cream cheese ' carries a trailing space; presumably this matches
# the raw item label in the data, since suggestitem compares names exactly. Confirm.
grp1 = ['coffee', 'frozen vegetables', 'chicken', 'white bread',
        'cream cheese ', 'chocolate', 'dessert', 'napkins', 'berries',
        'hamburger meat', 'UHT-milk', 'onions', 'salty snack', 'waffles',
        'long life bakery product', 'sugar', 'butter milk', 'ham', 'meat']
itmlst = []
# Iterate over the products directly; suggestitem prints its suggestions, and
# itmlst ends up holding whatever the last call returned.
for product in grp1:
    itmlst = suggestitem(product)

coffee is frequently bought with  UHT-milk  with score  71.28
coffee is frequently bought with  misc. beverages  with score  73.93
coffee is frequently bought with  domestic eggs  with score  69.02
coffee is frequently bought with  other vegetables  with score  57.20
coffee is frequently bought with  soda  with score  60.16
frozen vegetables is frequently bought with  UHT-milk  with score  70.22
frozen vegetables is frequently bought with  canned beer  with score  71.48
frozen vegetables is frequently bought with  whole milk  with score  59.06
frozen vegetables is frequently bought with  napkins  with score  72.43
frozen vegetables is frequently bought with  other vegetables  with score  59.43
chicken is frequently bought with  butter  with score  74.94
chicken is frequently bought with  waffles  with score  85.27
chicken is frequently bought with  rolls/buns  with score  60.55
chicken is frequently bought with  fruit/vegetable juice  with score  72.77
white bread is frequently bought 