#### Load data

In [147]:
from mlxtend.frequent_patterns import apriori, association_rules
import os
import pandas as pd
cwd= os.getcwd() # current working directory
path = os.path.join(cwd,'data')
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

Groceries = pd.read_csv('Groceries_dataset.csv')

#Renaming the columns to simple words
Groceries.rename(columns = {'Member_number':'Id','itemDescription':'Item'}, inplace = True)

Groceries.head()

Unnamed: 0,Id,Date,Item
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


In [148]:
Groceries.shape

(38765, 3)

In [149]:
Groceries.isnull().any()

Id      False
Date    False
Item    False
dtype: bool

In [150]:
print("Total number of unique products are:", len(Groceries['Item'].unique()))

Total number of unique products are: 167


In [151]:
#Top 10 frequently sold products
print("Top 10 frequently sold products(Tabular Representation)")
x = Groceries['Item'].value_counts().sort_values(ascending=False)[:10]
x

Top 10 frequently sold products(Tabular Representation)


whole milk          2502
other vegetables    1898
rolls/buns          1716
soda                1514
yogurt              1334
root vegetables     1071
tropical fruit      1032
bottled water        933
sausage              924
citrus fruit         812
Name: Item, dtype: int64

In [152]:
transactions = Groceries.groupby(['Id','Date'])
transactions.count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Item
Id,Date,Unnamed: 2_level_1
1000,15-03-2015,4
1000,24-06-2014,3
1000,24-07-2015,2
1000,25-11-2015,2
1000,27-05-2015,2
...,...,...
4999,24-01-2015,6
4999,26-12-2015,2
5000,09-03-2014,2
5000,10-02-2015,3


In [153]:
#Calculating support of each item
support = (Groceries['Item'].value_counts()/14963*100)
support.head()

whole milk         16.72
other vegetables   12.68
rolls/buns         11.47
soda               10.12
yogurt              8.92
Name: Item, dtype: float64

In [154]:
fig = px.bar(x= x.index, y= x.values)
fig.update_layout(title_text= "Top 10 frequently sold products (Graphical Representation)", xaxis_title= "Products", yaxis_title="Count")
fig.show()


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.



In [155]:
# Exploring Higher sales by time of the year:
Groceries["Year"] = Groceries['Date'].str.split("-").str[-1]
Groceries["Month-Year"] = Groceries['Date'].str.split("-").str[1] + "-" + Groceries['Date'].str.split("-").str[-1]
Groceries.head()

Unnamed: 0,Id,Date,Item,Year,Month-Year
0,1808,21-07-2015,tropical fruit,2015,07-2015
1,2552,05-01-2015,whole milk,2015,01-2015
2,2300,19-09-2015,pip fruit,2015,09-2015
3,1187,12-12-2015,other vegetables,2015,12-2015
4,3037,01-02-2015,whole milk,2015,02-2015


In [156]:
fig1 = px.bar(Groceries["Month-Year"].value_counts(ascending=False), 
              orientation= "v", 
              color = Groceries["Month-Year"].value_counts(ascending=False),
               labels={'value':'Count', 'index':'Date','color':'Meter'})

fig1.update_layout(title_text="Exploring higher sales by the date")

fig1.show()


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.



In [157]:
products = Groceries['Item'].unique()

In [158]:
#one hot encoding the products:

dummy = pd.get_dummies(Groceries['Item'])
Groceries.drop(['Item'], inplace =True, axis=1)

Groceries = Groceries.join(dummy)

Groceries.head()

Unnamed: 0,Id,Date,Year,Month-Year,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,1808,21-07-2015,2015,07-2015,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2552,05-01-2015,2015,01-2015,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,2300,19-09-2015,2015,09-2015,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1187,12-12-2015,2015,12-2015,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3037,01-02-2015,2015,02-2015,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [159]:
# Transaction: If a customer bought multiple products in one day, it will be considered as 1 transaction:

Groceries1 = Groceries.groupby(['Id', 'Date'])[products[:]].sum()
Groceries1 = Groceries1.reset_index()[products]

print("New Dimension", Groceries1.shape)
Groceries1.head()

New Dimension (14963, 167)


Unnamed: 0,tropical fruit,whole milk,pip fruit,other vegetables,rolls/buns,pot plants,citrus fruit,beef,frankfurter,chicken,...,flower (seeds),rice,tea,salad dressing,specialty vegetables,pudding powder,ready soups,make up remover,toilet cleaner,preservation products
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [160]:
#Replacing all non-zero values with the name of the product:

def product_names(x):
    for product in products:
        if x[product] >0:
            x[product] = product
    return x

Groceries1 = Groceries1.apply(product_names, axis=1)
Groceries1.head()

Unnamed: 0,tropical fruit,whole milk,pip fruit,other vegetables,rolls/buns,pot plants,citrus fruit,beef,frankfurter,chicken,...,flower (seeds),rice,tea,salad dressing,specialty vegetables,pudding powder,ready soups,make up remover,toilet cleaner,preservation products
0,0,whole milk,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,whole milk,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [161]:
print("Total Number of Transactions:", len(Groceries1))

Total Number of Transactions: 14963


In [162]:
#Removing Zeros, Extracting the list of items bought per customer

x = Groceries1.values
x = [sub[~(sub==0)].tolist() for sub in x if sub [sub != 0].tolist()]
transactions = x
transactions[0:10]

[['whole milk', 'yogurt', 'sausage', 'semi-finished bread'],
 ['whole milk', 'pastry', 'salty snack'],
 ['canned beer', 'misc. beverages'],
 ['sausage', 'hygiene articles'],
 ['soda', 'pickled vegetables'],
 ['frankfurter', 'curd'],
 ['whole milk', 'rolls/buns', 'sausage'],
 ['whole milk', 'soda'],
 ['beef', 'white bread'],
 ['frankfurter', 'soda', 'whipped/sour cream']]

In [163]:
rules = apriori(transactions, min_support = 0.00030, min_confidence = 0.05, min_lift = 3, max_length = 2, target = "rules")
association_results = list(rules)
print(association_results[0])

TypeError: apriori() got an unexpected keyword argument 'min_confidence'