# Market Basket Analysis Using Apriori Algorithm

Importing necessary libraries

In [79]:
!pip install apyori



In [80]:
import numpy as np
import pandas as pd
import plotly.express as px
import apyori
from apyori import apriori

Loading the dataset

In [81]:
df = pd.read_csv('Groceries_dataset.csv')
df.head(10)

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk
5,4941,14-02-2015,rolls/buns
6,4501,08-05-2015,other vegetables
7,3803,23-12-2015,pot plants
8,2762,20-03-2015,whole milk
9,4119,12-02-2015,tropical fruit


Let's first see the top 10 selling products

In [82]:
print("Top 10 Frequent Sold Products")
x = df['itemDescription'].value_counts().sort_values(ascending=False)[:10]
fig = px.bar(x=x.index, y=x.values)
fig.update_layout(title_text='Top 10 Frequently Sold Products', xaxis_title='Products', yaxis_title='Count')
fig.show()

Top 10 Frequent Sold Products


In [83]:
df['Year'] = df['Date'].str.split("-").str[-1]
df['Month-Year'] = df['Date'].str.split("-").str[1] + "-" + df['Date'].str.split("-").str[-1]
df.head()

Unnamed: 0,Member_number,Date,itemDescription,Year,Month-Year
0,1808,21-07-2015,tropical fruit,2015,07-2015
1,2552,05-01-2015,whole milk,2015,01-2015
2,2300,19-09-2015,pip fruit,2015,09-2015
3,1187,12-12-2015,other vegetables,2015,12-2015
4,3037,01-02-2015,whole milk,2015,02-2015


In [84]:
# Making a new column nd extracting Year

df['Year'] = df['Date'].str.split("-").str[-1]

df['Month-Year'] = df['Date'].str.split("-").str[1] + "-" + df['Date'].str.split("-").str[-1]

fig_1 = px.bar(df['Month-Year'].value_counts(ascending=False),
    orientation ="v",
    color = df['Month-Year'].value_counts(ascending=False),
    labels = {'value' : 'Count', 'index' : 'Date', 'color' : 'Meter'})
    

fig_1.update_layout(title_text='Higher Sales by Date')
fig_1.show()

From the observation above. we could see that:
   - Milk and other vegetables have been bought the most.
   - Most shopping takes place in August and September meanwhile least shopping has done in February and March.

One hot encoding the products

In [85]:
df.isnull().any()

Member_number      False
Date               False
itemDescription    False
Year               False
Month-Year         False
dtype: bool

In [86]:
products = df['itemDescription'].unique()

In [87]:
dummy = pd.get_dummies(df['itemDescription'])
df.drop(['itemDescription'], inplace=True, axis=1)

df = df.join(dummy)

df.head()

Unnamed: 0,Member_number,Date,Year,Month-Year,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,1808,21-07-2015,2015,07-2015,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2552,05-01-2015,2015,01-2015,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,2300,19-09-2015,2015,09-2015,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1187,12-12-2015,2015,12-2015,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3037,01-02-2015,2015,02-2015,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [88]:
df.shape

(38765, 171)

If a customer has bought multiple items in one day

In [89]:
df1 = df.groupby(['Member_number', 'Date'])[products[:]].sum()
df1 = df1.reset_index()[products]

print("New Dimension", df1.shape)
df1.head()

New Dimension (14963, 167)


Unnamed: 0,tropical fruit,whole milk,pip fruit,other vegetables,rolls/buns,pot plants,citrus fruit,beef,frankfurter,chicken,...,flower (seeds),rice,tea,salad dressing,specialty vegetables,pudding powder,ready soups,make up remover,toilet cleaner,preservation products
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [90]:
# Replacing all non-zero values with the name of products

def product_names(x):
    for product in products:
        if x[product] > 0:
            x[product] = product
    return x

df1 = df1.apply(product_names, axis=1)
df1.head()

Unnamed: 0,tropical fruit,whole milk,pip fruit,other vegetables,rolls/buns,pot plants,citrus fruit,beef,frankfurter,chicken,...,flower (seeds),rice,tea,salad dressing,specialty vegetables,pudding powder,ready soups,make up remover,toilet cleaner,preservation products
0,0,whole milk,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,whole milk,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [91]:
print('Total number of Transactions:', len(df1))

Total number of Transactions: 14963


In [92]:
# Removing zeros, extracting the list of items bought per customer

x = df1.values
x = [sub[~(sub==0)].tolist() for sub in x if sub [sub != 0].tolist()]
transactions = x
transactions[0:10]

[['whole milk', 'yogurt', 'sausage', 'semi-finished bread'],
 ['whole milk', 'pastry', 'salty snack'],
 ['canned beer', 'misc. beverages'],
 ['sausage', 'hygiene articles'],
 ['soda', 'pickled vegetables'],
 ['frankfurter', 'curd'],
 ['whole milk', 'rolls/buns', 'sausage'],
 ['whole milk', 'soda'],
 ['beef', 'white bread'],
 ['frankfurter', 'soda', 'whipped/sour cream']]

# Implementing Apriori Algorithm

In [137]:
rules = apriori(transactions, min_support=0.00030, min_confidence=0.05, min_left=3, max_length=2, target='rules')

association_result = list(rules)

print(association_result[0][2])


[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'bottled water'}), confidence=0.06068301811134131, lift=1.0)]


In [143]:
for item in association_result:
    items = list(item)
    print('Item', items)
    
    confidence = list(item)
    print('Confidence', item[1])
    
    stats = list(item)
    print('Statistics', item[2])
    

    d = {
        'Items' : items,
        'Confidence' : confidence,
        'Statistics' : stats
    }

    df = pd.DataFrame(d)

    df.head()
    

Item [frozenset({'bottled water'}), 0.06068301811134131, [OrderedStatistic(items_base=frozenset(), items_add=frozenset({'bottled water'}), confidence=0.06068301811134131, lift=1.0)]]
Confidence 0.06068301811134131
Statistics [OrderedStatistic(items_base=frozenset(), items_add=frozenset({'bottled water'}), confidence=0.06068301811134131, lift=1.0)]
Item [frozenset({'citrus fruit'}), 0.05313105660629553, [OrderedStatistic(items_base=frozenset(), items_add=frozenset({'citrus fruit'}), confidence=0.05313105660629553, lift=1.0)]]
Confidence 0.05313105660629553
Statistics [OrderedStatistic(items_base=frozenset(), items_add=frozenset({'citrus fruit'}), confidence=0.05313105660629553, lift=1.0)]
Item [frozenset({'other vegetables'}), 0.12210118291786407, [OrderedStatistic(items_base=frozenset(), items_add=frozenset({'other vegetables'}), confidence=0.12210118291786407, lift=1.0)]]
Confidence 0.12210118291786407
Statistics [OrderedStatistic(items_base=frozenset(), items_add=frozenset({'other ve

Item [frozenset({'seasonal products', 'whole milk'}), 0.0006683151774376796, [OrderedStatistic(items_base=frozenset({'seasonal products'}), items_add=frozenset({'whole milk'}), confidence=0.09433962264150943, lift=0.5973778136203579)]]
Confidence 0.0006683151774376796
Statistics [OrderedStatistic(items_base=frozenset({'seasonal products'}), items_add=frozenset({'whole milk'}), confidence=0.09433962264150943, lift=0.5973778136203579)]
Item [frozenset({'yogurt', 'seasonal products'}), 0.0005346521419501437, [OrderedStatistic(items_base=frozenset({'seasonal products'}), items_add=frozenset({'yogurt'}), confidence=0.07547169811320756, lift=0.8788194699361281)]]
Confidence 0.0005346521419501437
Statistics [OrderedStatistic(items_base=frozenset({'seasonal products'}), items_add=frozenset({'yogurt'}), confidence=0.07547169811320756, lift=0.8788194699361281)]
Item [frozenset({'soda', 'semi-finished bread'}), 0.0008688097306689834, [OrderedStatistic(items_base=frozenset({'semi-finished bread'})