### Market Basket Analysis: Unsupervised Learning 


### Loading Libraries

In [23]:
# Libraries
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules

### Processing Data and Loading

In [24]:
# import into colab
from google.colab import files
uploaded = files.upload()


Saving groceries.csv to groceries (1).csv


In [25]:
# Read the dataset
df = pd.read_csv('groceries.csv')


In [26]:
#df info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38765 entries, 0 to 38764
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Member_number    38765 non-null  int64 
 1   Date             38765 non-null  object
 2   itemDescription  38765 non-null  object
dtypes: int64(1), object(2)
memory usage: 908.7+ KB


In [27]:
# df head
df.head()

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


In [28]:
# Change date to date object
df['Date'] = pd.to_datetime(df.Date)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38765 entries, 0 to 38764
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Member_number    38765 non-null  int64         
 1   Date             38765 non-null  datetime64[ns]
 2   itemDescription  38765 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 908.7+ KB


In [30]:
# Sorting by member_number and date
df = df.sort_values(by = ['Member_number', 'Date'])
df.head()

Unnamed: 0,Member_number,Date,itemDescription
13331,1000,2014-06-24,whole milk
29480,1000,2014-06-24,pastry
32851,1000,2014-06-24,salty snack
4843,1000,2015-03-15,sausage
8395,1000,2015-03-15,whole milk


In [31]:
# Dropping dupicates
df = df.drop_duplicates()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38006 entries, 13331 to 34885
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Member_number    38006 non-null  int64         
 1   Date             38006 non-null  datetime64[ns]
 2   itemDescription  38006 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 1.2+ MB


In [32]:
# adding quantity variable
df['Quantity'] = 1 
df.head()

Unnamed: 0,Member_number,Date,itemDescription,Quantity
13331,1000,2014-06-24,whole milk,1
29480,1000,2014-06-24,pastry,1
32851,1000,2014-06-24,salty snack,1
4843,1000,2015-03-15,sausage,1
8395,1000,2015-03-15,whole milk,1


In [35]:
# Creating transactions 
df = df.pivot_table(index = ['Member_number', 'Date'],
                    columns = 'itemDescription', values = 'Quantity', aggfunc = 'sum').fillna(0)
df.head()

Unnamed: 0_level_0,itemDescription,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
Member_number,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1000,2014-06-24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1000,2015-03-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1000,2015-05-27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000,2015-07-24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000,2015-11-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# resetting index
df = df.reset_index()
df.head()

itemDescription,Member_number,Date,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,1000,2014-06-24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1000,2015-03-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,1000,2015-05-27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1000,2015-07-24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1000,2015-11-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
# removing member_number and date
df = df.iloc[:, 4:]
df.head()

itemDescription,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,beverages,bottled beer,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Association Analysis

In [38]:
# create frequent itemsets
pd.options.display.max_rows = None
itemssets = apriori(df, min_support = 0.01, use_colnames= True)
itemssets


Unnamed: 0,support,itemsets
0,0.03395,(beef)
1,0.021787,(berries)
2,0.016574,(beverages)
3,0.045312,(bottled beer)
4,0.060683,(bottled water)
5,0.037626,(brown bread)
6,0.03522,(butter)
7,0.017577,(butter milk)
8,0.014369,(candy)
9,0.046916,(canned beer)


In [None]:
# create frequent itemsets
pd.options.display.max_rows = None
itemssets = apriori(df, min_support = 0.1, use_colnames= True)
itemssets


Unnamed: 0,support,itemsets
0,0.394,(ChildBks)
1,0.23825,(YouthBks)
2,0.4155,(CookBks)
3,0.25475,(DoItYBks)
4,0.20475,(RefBks)
5,0.223,(ArtBks)
6,0.26675,(GeogBks)
7,0.1075,(ItalCook)
8,0.1475,"(ChildBks, YouthBks)"
9,0.242,"(ChildBks, CookBks)"


In [None]:
# Extract Rules Based on Support 
rules = association_rules(itemssets, metric = 'support', min_threshold= 0.2)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(ChildBks),(CookBks),0.394,0.4155,0.242,0.614213,1.478251,0.078293,1.515086
1,(CookBks),(ChildBks),0.4155,0.394,0.242,0.582431,1.478251,0.078293,1.451256


In [None]:
# Extract Rules Based on lift 
rules = association_rules(itemssets, metric = 'lift', min_threshold= 0.5)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(ChildBks),(YouthBks),0.394,0.23825,0.1475,0.374365,1.571314,0.053629,1.217564
1,(YouthBks),(ChildBks),0.23825,0.394,0.1475,0.619098,1.571314,0.053629,1.590959
2,(ChildBks),(CookBks),0.394,0.4155,0.242,0.614213,1.478251,0.078293,1.515086
3,(CookBks),(ChildBks),0.4155,0.394,0.242,0.582431,1.478251,0.078293,1.451256
4,(ChildBks),(DoItYBks),0.394,0.25475,0.1615,0.409898,1.609022,0.061129,1.262918
5,(DoItYBks),(ChildBks),0.25475,0.394,0.1615,0.633955,1.609022,0.061129,1.655534
6,(ChildBks),(RefBks),0.394,0.20475,0.12825,0.325508,1.589781,0.047579,1.179035
7,(RefBks),(ChildBks),0.20475,0.394,0.12825,0.626374,1.589781,0.047579,1.621941
8,(ChildBks),(ArtBks),0.394,0.223,0.10525,0.267132,1.197901,0.017388,1.060218
9,(ArtBks),(ChildBks),0.223,0.394,0.10525,0.471973,1.197901,0.017388,1.147669


In [None]:
# Extract Rules Based on conviction
rules = association_rules(itemssets, metric = 'conviction', min_threshold= 0.2)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(ChildBks),(YouthBks),0.394,0.23825,0.1475,0.374365,1.571314,0.053629,1.217564
1,(YouthBks),(ChildBks),0.23825,0.394,0.1475,0.619098,1.571314,0.053629,1.590959
2,(ChildBks),(CookBks),0.394,0.4155,0.242,0.614213,1.478251,0.078293,1.515086
3,(CookBks),(ChildBks),0.4155,0.394,0.242,0.582431,1.478251,0.078293,1.451256
4,(ChildBks),(DoItYBks),0.394,0.25475,0.1615,0.409898,1.609022,0.061129,1.262918
5,(DoItYBks),(ChildBks),0.25475,0.394,0.1615,0.633955,1.609022,0.061129,1.655534
6,(ChildBks),(RefBks),0.394,0.20475,0.12825,0.325508,1.589781,0.047579,1.179035
7,(RefBks),(ChildBks),0.20475,0.394,0.12825,0.626374,1.589781,0.047579,1.621941
8,(ChildBks),(ArtBks),0.394,0.223,0.10525,0.267132,1.197901,0.017388,1.060218
9,(ArtBks),(ChildBks),0.223,0.394,0.10525,0.471973,1.197901,0.017388,1.147669


In [None]:
# Sort values 
rules.sort_values(by = ('lift'), ascending = False).head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
45,(RefBks),"(ChildBks, CookBks)",0.20475,0.242,0.1035,0.505495,2.08882,0.05395,1.532844
40,"(ChildBks, CookBks)",(RefBks),0.242,0.20475,0.1035,0.427686,2.08882,0.05395,1.389534
29,"(ChildBks, CookBks)",(YouthBks),0.242,0.23825,0.12,0.495868,2.081292,0.062344,1.511012
32,(YouthBks),"(ChildBks, CookBks)",0.23825,0.242,0.12,0.503673,2.081292,0.062344,1.527218
34,"(ChildBks, CookBks)",(DoItYBks),0.242,0.25475,0.12775,0.527893,2.072198,0.066101,1.57856


In [None]:
# Sort values 
rules.sort_values(by = ('leverage'), ascending = False).head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
2,(ChildBks),(CookBks),0.394,0.4155,0.242,0.614213,1.478251,0.078293,1.515086
3,(CookBks),(ChildBks),0.4155,0.394,0.242,0.582431,1.478251,0.078293,1.451256
34,"(ChildBks, CookBks)",(DoItYBks),0.242,0.25475,0.12775,0.527893,2.072198,0.066101,1.57856
39,(DoItYBks),"(ChildBks, CookBks)",0.25475,0.242,0.12775,0.501472,2.072198,0.066101,1.520476
19,(DoItYBks),(CookBks),0.25475,0.4155,0.16875,0.662414,1.594258,0.062901,1.731411


In [None]:
# Sort values 
rules.sort_values(by = ('conviction'), ascending = False).head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
28,"(ChildBks, YouthBks)",(CookBks),0.1475,0.4155,0.12,0.813559,1.958025,0.058714,3.135045
41,"(ChildBks, RefBks)",(CookBks),0.12825,0.4155,0.1035,0.807018,1.94228,0.050212,3.028773
35,"(ChildBks, DoItYBks)",(CookBks),0.1615,0.4155,0.12775,0.791022,1.903783,0.060647,2.796941
36,"(CookBks, DoItYBks)",(ChildBks),0.16875,0.394,0.12775,0.757037,1.921414,0.061262,2.494207
30,"(YouthBks, CookBks)",(ChildBks),0.161,0.394,0.12,0.745342,1.89173,0.056566,2.379659


In [None]:
# Sort values 
rules.sort_values(by = ('support'), ascending = False).head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
2,(ChildBks),(CookBks),0.394,0.4155,0.242,0.614213,1.478251,0.078293,1.515086
3,(CookBks),(ChildBks),0.4155,0.394,0.242,0.582431,1.478251,0.078293,1.451256
18,(CookBks),(DoItYBks),0.4155,0.25475,0.16875,0.406137,1.594258,0.062901,1.254919
19,(DoItYBks),(CookBks),0.25475,0.4155,0.16875,0.662414,1.594258,0.062901,1.731411
4,(ChildBks),(DoItYBks),0.394,0.25475,0.1615,0.409898,1.609022,0.061129,1.262918


In [None]:
# Extract rules based on lift
rules = association_rules(itemssets, metric = 'lift', min_threshold= 1)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(ChildBks),(YouthBks),0.394,0.23825,0.1475,0.374365,1.571314,0.053629,1.217564
1,(YouthBks),(ChildBks),0.23825,0.394,0.1475,0.619098,1.571314,0.053629,1.590959
2,(ChildBks),(CookBks),0.394,0.4155,0.242,0.614213,1.478251,0.078293,1.515086
3,(CookBks),(ChildBks),0.4155,0.394,0.242,0.582431,1.478251,0.078293,1.451256
4,(ChildBks),(DoItYBks),0.394,0.25475,0.1615,0.409898,1.609022,0.061129,1.262918
5,(DoItYBks),(ChildBks),0.25475,0.394,0.1615,0.633955,1.609022,0.061129,1.655534
6,(ChildBks),(RefBks),0.394,0.20475,0.12825,0.325508,1.589781,0.047579,1.179035
7,(RefBks),(ChildBks),0.20475,0.394,0.12825,0.626374,1.589781,0.047579,1.621941
8,(ChildBks),(ArtBks),0.394,0.223,0.10525,0.267132,1.197901,0.017388,1.060218
9,(ArtBks),(ChildBks),0.223,0.394,0.10525,0.471973,1.197901,0.017388,1.147669


In [None]:
# Extract rules based on conviction
rules = association_rules(itemssets, metric = 'conviction', min_threshold= 1.5)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(YouthBks),(ChildBks),0.23825,0.394,0.1475,0.619098,1.571314,0.053629,1.590959
1,(ChildBks),(CookBks),0.394,0.4155,0.242,0.614213,1.478251,0.078293,1.515086
2,(DoItYBks),(ChildBks),0.25475,0.394,0.1615,0.633955,1.609022,0.061129,1.655534
3,(RefBks),(ChildBks),0.20475,0.394,0.12825,0.626374,1.589781,0.047579,1.621941
4,(YouthBks),(CookBks),0.23825,0.4155,0.161,0.675761,1.62638,0.062007,1.802681
5,(DoItYBks),(CookBks),0.25475,0.4155,0.16875,0.662414,1.594258,0.062901,1.731411
6,(RefBks),(CookBks),0.20475,0.4155,0.13975,0.68254,1.642695,0.054676,1.841175
7,"(ChildBks, YouthBks)",(CookBks),0.1475,0.4155,0.12,0.813559,1.958025,0.058714,3.135045
8,"(ChildBks, CookBks)",(YouthBks),0.242,0.23825,0.12,0.495868,2.081292,0.062344,1.511012
9,"(YouthBks, CookBks)",(ChildBks),0.161,0.394,0.12,0.745342,1.89173,0.056566,2.379659


In [None]:
# Sort values 
rules.sort_values(by = ('support'), ascending = False).head(6)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
1,(ChildBks),(CookBks),0.394,0.4155,0.242,0.614213,1.478251,0.078293,1.515086
5,(DoItYBks),(CookBks),0.25475,0.4155,0.16875,0.662414,1.594258,0.062901,1.731411
2,(DoItYBks),(ChildBks),0.25475,0.394,0.1615,0.633955,1.609022,0.061129,1.655534
4,(YouthBks),(CookBks),0.23825,0.4155,0.161,0.675761,1.62638,0.062007,1.802681
0,(YouthBks),(ChildBks),0.23825,0.394,0.1475,0.619098,1.571314,0.053629,1.590959
6,(RefBks),(CookBks),0.20475,0.4155,0.13975,0.68254,1.642695,0.054676,1.841175
