# Market basket analysis

## Introduction

Market basket analysis is the study of customer purchasing behavior: it looks for items that tend to be bought together. For example, customers who buy products A and B may be more likely to also buy product C.

Our tutorial: https://www.kaggle.com/code/khusheekapoor/market-basket-analysis-in-python/notebook

Our data: https://www.kaggle.com/datasets/mittalvasu95/the-bread-basket

In [16]:
# Data download from Kaggle
# should be used once

# Install Kaggle API
%pip install kaggle

# Download the dataset from Kaggle
!kaggle datasets download -d mittalvasu95/the-bread-basket

# Unzip the downloaded file
import zipfile
with zipfile.ZipFile('the-bread-basket.zip', 'r') as zip_ref:
	zip_ref.extractall('.')

Note: you may need to restart the kernel to use updated packages.
Dataset URL: https://www.kaggle.com/datasets/mittalvasu95/the-bread-basket
License(s): CC0-1.0
the-bread-basket.zip: Skipping, found more recently modified local copy (use --force to force download)


In [17]:
# Import libraries

import warnings

import networkx as nx
import pandas as pd
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
from pyvis.network import Network

# Silence every warning for the rest of the kernel session.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings from the
# imported libraries — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [18]:
# Read the raw purchase records from the extracted CSV
data = pd.read_csv('bread basket.csv')

# Preview the first five rows
data.head(5)

Unnamed: 0,Transaction,Item,date_time,period_day,weekday_weekend
0,1,Bread,30-10-2016 09:58,morning,weekend
1,2,Scandinavian,30-10-2016 10:05,morning,weekend
2,2,Scandinavian,30-10-2016 10:05,morning,weekend
3,3,Hot chocolate,30-10-2016 10:07,morning,weekend
4,3,Jam,30-10-2016 10:07,morning,weekend


## Exploratory Data Analysis

In [19]:
# Summarize the raw frame: column names, non-null counts, dtypes and memory use
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20507 entries, 0 to 20506
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Transaction      20507 non-null  int64 
 1   Item             20507 non-null  object
 2   date_time        20507 non-null  object
 3   period_day       20507 non-null  object
 4   weekday_weekend  20507 non-null  object
dtypes: int64(1), object(4)
memory usage: 801.2+ KB


In [20]:
# Number of purchased-item rows per period of the day
data.value_counts(subset='period_day')

period_day
afternoon    11569
morning       8404
evening        520
night           14
Name: count, dtype: int64

In [21]:
# Number of purchased-item rows on weekdays versus weekends
data.value_counts(subset='weekday_weekend')

weekday_weekend
weekday    12807
weekend     7700
Name: count, dtype: int64

## Data Transformation

In [22]:
# Collapse the item-level rows into one list of items per transaction
basket_data = data.groupby(['Transaction']).agg({'Item': list})

# Keep one row of context (weekday/weekend flag, period of day) per
# transaction and attach its item list through the Transaction index
trans_data = (
    data[['Transaction', 'weekday_weekend', 'period_day']]
    .drop_duplicates()
    .set_index('Transaction')
    .join(basket_data)
    .reset_index()
)

# NOTE: `data` is rebound to the per-transaction frame here, which no longer
# has a 'Transaction' column. Re-running this cell therefore requires
# reloading the CSV first.
data = trans_data[['Item', 'weekday_weekend', 'period_day']]

data

Unnamed: 0,Item,weekday_weekend,period_day
0,[Bread],weekend,morning
1,"[Scandinavian, Scandinavian]",weekend,morning
2,"[Hot chocolate, Jam, Cookies]",weekend,morning
3,[Muffin],weekend,morning
4,"[Coffee, Pastry, Bread]",weekend,morning
...,...,...,...
9460,[Bread],weekend,afternoon
9461,"[Truffles, Tea, Spanish Brunch, Christmas common]",weekend,afternoon
9462,"[Muffin, Tacos/Fajita, Coffee, Tea]",weekend,afternoon
9463,"[Coffee, Pastry]",weekend,afternoon


In [23]:
# One-hot encode the baskets: learn the set of distinct items and turn every
# transaction's item list into a row of True/False flags
te = TransactionEncoder()
te_ary = te.fit_transform(data['Item'])
bdata = pd.DataFrame(te_ary, columns=te.columns_)

# weekday_weekend and period_day are deliberately not joined in:
# bdata holds item columns only.

bdata

Unnamed: 0,Adjustment,Afternoon with the baker,Alfajores,Argentina Night,Art Tray,Bacon,Baguette,Bakewell,Bare Popcorn,Basket,...,The BART,The Nomad,Tiffin,Toast,Truffles,Tshirt,Valentine's card,Vegan Feast,Vegan mincepie,Victorian Sponge
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9460,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9461,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
9462,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9463,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## Apriori Algorithm

In [24]:
# Dimensions of the encoded basket table as (rows, columns):
# one row per transaction, one column per distinct item
bdata.shape

(9465, 94)

In [25]:
# Mine the itemsets that reach a minimum support of 0.01 (1% of transactions),
# record how many items each one contains, and order them from most to
# least frequent with a fresh 0..n-1 index
frequent_itemsets = (
    apriori(bdata, min_support=0.01, use_colnames=True)
    .assign(length=lambda itemsets_df: itemsets_df['itemsets'].map(len))
    .sort_values('support', ascending=False)
    .reset_index(drop=True)
)
frequent_itemsets

Unnamed: 0,support,itemsets,length
0,0.478394,(Coffee),1
1,0.327205,(Bread),1
2,0.142631,(Tea),1
3,0.103856,(Cake),1
4,0.090016,"(Coffee, Bread)",2
...,...,...,...
56,0.010565,(Hearty & Seasonal),1
57,0.010460,(Salad),1
58,0.010354,"(Alfajores, Bread)",2
59,0.010037,"(Coffee, Bread, Cake)",3


In [26]:
# Five single-item itemsets with the highest support
frequent_itemsets.loc[frequent_itemsets['length'].eq(1)].sort_values('support', ascending=False).head(5)

Unnamed: 0,support,itemsets,length
0,0.478394,(Coffee),1
1,0.327205,(Bread),1
2,0.142631,(Tea),1
3,0.103856,(Cake),1
5,0.086107,(Pastry),1


In [27]:
# Five itemsets of two or more items with the highest support
frequent_itemsets.loc[frequent_itemsets['length'].gt(1)].sort_values('support', ascending=False).head(5)

Unnamed: 0,support,itemsets,length
4,0.090016,"(Coffee, Bread)",2
9,0.054728,"(Coffee, Cake)",2
11,0.049868,"(Coffee, Tea)",2
12,0.047544,"(Pastry, Coffee)",2
17,0.038246,"(Coffee, Sandwich)",2


In [28]:
# Generate association rules from the frequent itemsets, keeping every rule
# whose support is at least 1%. The filter metric is support, not confidence.
# num_itemsets is the number of transactions in the encoded basket table,
# passed by keyword instead of as a bare positional value.
rules = association_rules(
    frequent_itemsets,
    num_itemsets=len(bdata),
    metric='support',
    min_threshold=0.01,
)

# Show the 10 rules with the highest confidence
rules.sort_values(by='confidence', ascending=False).reset_index(drop=True).head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(Toast),(Coffee),0.033597,0.478394,0.023666,0.704403,1.472431,1.0,0.007593,1.764582,0.332006,0.048464,0.433293,0.376936
1,(Spanish Brunch),(Coffee),0.018172,0.478394,0.010882,0.598837,1.251766,1.0,0.002189,1.300235,0.204851,0.022406,0.230908,0.310792
2,(Medialuna),(Coffee),0.061807,0.478394,0.035182,0.569231,1.189878,1.0,0.005614,1.210871,0.170091,0.069665,0.174148,0.321387
3,(Pastry),(Coffee),0.086107,0.478394,0.047544,0.552147,1.154168,1.0,0.006351,1.164682,0.146161,0.091968,0.141396,0.325764
4,(Alfajores),(Coffee),0.036344,0.478394,0.019651,0.540698,1.130235,1.0,0.002264,1.135648,0.119574,0.039693,0.119446,0.290888
5,(Juice),(Coffee),0.038563,0.478394,0.020602,0.534247,1.11675,1.0,0.002154,1.119919,0.108738,0.041507,0.107078,0.288656
6,(Sandwich),(Coffee),0.071844,0.478394,0.038246,0.532353,1.112792,1.0,0.003877,1.115384,0.109205,0.074701,0.103448,0.30615
7,(Cake),(Coffee),0.103856,0.478394,0.054728,0.526958,1.101515,1.0,0.005044,1.102664,0.10284,0.103745,0.093105,0.320679
8,(Scone),(Coffee),0.034548,0.478394,0.018067,0.522936,1.093107,1.0,0.001539,1.093366,0.088224,0.036507,0.085393,0.28035
9,(Cookies),(Coffee),0.054411,0.478394,0.028209,0.518447,1.083723,1.0,0.002179,1.083174,0.0817,0.055905,0.076787,0.288707


In [30]:
# Rules that occur in at least 2% of transactions and whose lift is above 1
rules.loc[rules['support'].ge(0.02) & rules['lift'].gt(1.0)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
2,(Coffee),(Cake),0.478394,0.103856,0.054728,0.114399,1.101515,1.0,0.005044,1.011905,0.176684,0.103745,0.011765,0.320679
3,(Cake),(Coffee),0.103856,0.478394,0.054728,0.526958,1.101515,1.0,0.005044,1.102664,0.10284,0.103745,0.093105,0.320679
6,(Pastry),(Coffee),0.086107,0.478394,0.047544,0.552147,1.154168,1.0,0.006351,1.164682,0.146161,0.091968,0.141396,0.325764
7,(Coffee),(Pastry),0.478394,0.086107,0.047544,0.099382,1.154168,1.0,0.006351,1.01474,0.256084,0.091968,0.014526,0.325764
8,(Coffee),(Sandwich),0.478394,0.071844,0.038246,0.079947,1.112792,1.0,0.003877,1.008807,0.194321,0.074701,0.008731,0.30615
9,(Sandwich),(Coffee),0.071844,0.478394,0.038246,0.532353,1.112792,1.0,0.003877,1.115384,0.109205,0.074701,0.103448,0.30615
10,(Coffee),(Medialuna),0.478394,0.061807,0.035182,0.073542,1.189878,1.0,0.005614,1.012667,0.305936,0.069665,0.012509,0.321387
11,(Medialuna),(Coffee),0.061807,0.478394,0.035182,0.569231,1.189878,1.0,0.005614,1.210871,0.170091,0.069665,0.174148,0.321387
12,(Coffee),(Hot chocolate),0.478394,0.05832,0.029583,0.061837,1.060311,1.0,0.001683,1.003749,0.109048,0.058333,0.003735,0.284542
13,(Hot chocolate),(Coffee),0.05832,0.478394,0.029583,0.507246,1.060311,1.0,0.001683,1.058553,0.060403,0.058333,0.055314,0.284542


In [32]:
# Rules whose antecedent is exactly {Coffee}: the five with the highest confidence
rules[rules['antecedents'].map(lambda antecedent: antecedent == {'Coffee'})].sort_values('confidence', ascending=False).head(5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(Coffee),(Bread),0.478394,0.327205,0.090016,0.188163,0.575059,1.0,-0.066517,0.828731,-0.58621,0.125794,-0.206665,0.231634
2,(Coffee),(Cake),0.478394,0.103856,0.054728,0.114399,1.101515,1.0,0.005044,1.011905,0.176684,0.103745,0.011765,0.320679
4,(Coffee),(Tea),0.478394,0.142631,0.049868,0.10424,0.73084,1.0,-0.018366,0.957142,-0.413856,0.08731,-0.044777,0.226935
7,(Coffee),(Pastry),0.478394,0.086107,0.047544,0.099382,1.154168,1.0,0.006351,1.01474,0.256084,0.091968,0.014526,0.325764
8,(Coffee),(Sandwich),0.478394,0.071844,0.038246,0.079947,1.112792,1.0,0.003877,1.008807,0.194321,0.074701,0.008731,0.30615


In [33]:
# Rules whose antecedent is exactly {Bread}: the five with the highest confidence
rules[rules['antecedents'].map(lambda antecedent: antecedent == {'Bread'})].sort_values('confidence', ascending=False).head(5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
1,(Bread),(Coffee),0.327205,0.478394,0.090016,0.275105,0.575059,1.0,-0.066517,0.719561,-0.523431,0.125794,-0.389737,0.231634
15,(Bread),(Pastry),0.327205,0.086107,0.02916,0.089119,1.034977,1.0,0.000985,1.003306,0.050231,0.075908,0.003296,0.213884
18,(Bread),(Tea),0.327205,0.142631,0.028104,0.08589,0.602181,1.0,-0.018566,0.937927,-0.495438,0.063621,-0.066181,0.141463
24,(Bread),(Cake),0.327205,0.103856,0.023349,0.071359,0.687097,1.0,-0.010633,0.965006,-0.403653,0.057269,-0.036263,0.148091
36,(Bread),(Sandwich),0.327205,0.071844,0.01701,0.051986,0.723596,1.0,-0.006498,0.979053,-0.362147,0.044524,-0.021395,0.144375


## Visualization

In [15]:
# Render the mined rules as an interactive item graph.
# networkx (nx) and pyvis (Network) come from the notebook's import cell.

# Create an undirected networkx graph: a rule A -> B and its reverse B -> A
# share a single edge
G = nx.Graph()

# Add one node per item that appears in any frequent itemset
G.add_nodes_from(
    item
    for itemset in frequent_itemsets['itemsets']
    for item in itemset
)

# Connect every antecedent item to every consequent item of each rule,
# storing the rule's support as the edge weight
for antecedents, consequents, support in zip(
    rules['antecedents'], rules['consequents'], rules['support']
):
    for antecedent in antecedents:
        for consequent in consequents:
            G.add_edge(antecedent, consequent, weight=support)

# Wrap the graph in a pyvis network configured for notebook display
net = Network(notebook=True)
net.from_nx(G)

# Write the interactive graph to rules_graph.html and show it
net.show("rules_graph.html")

rules_graph.html
