In [2]:
# for basic operations
import numpy as np
import pandas as pd

# for visualizations
import matplotlib.pyplot as plt
import squarify
import seaborn as sns
plt.style.use('fivethirtyeight')

# for defining path
import os


# for market basket analysis
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [3]:
# Importing the dataset : 
data = pd.read_csv('Market_Basket_Optimisation.csv', header = None)


In [4]:
data.shape

(7501, 20)

In [5]:

data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


In [6]:
# checkng the tail of the data

data.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
7496,butter,light mayo,fresh bread,,,,,,,,,,,,,,,,,
7497,burgers,frozen vegetables,eggs,french fries,magazines,green tea,,,,,,,,,,,,,,
7498,chicken,,,,,,,,,,,,,,,,,,,
7499,escalope,green tea,,,,,,,,,,,,,,,,,,
7500,eggs,frozen smoothie,yogurt cake,low fat yogurt,,,,,,,,,,,,,,,,


In [7]:
# checking the random entries in the data

data.sample(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
2008,ham,herb & pepper,ground beef,chocolate,light cream,frozen smoothie,champagne,fresh bread,,,,,,,,,,,,
6483,cookies,,,,,,,,,,,,,,,,,,,
1376,french fries,brownies,,,,,,,,,,,,,,,,,,
3421,chocolate,ground beef,mineral water,eggs,whole wheat rice,,,,,,,,,,,,,,,
3783,champagne,,,,,,,,,,,,,,,,,,,
6966,muffins,,,,,,,,,,,,,,,,,,,
4580,muffins,escalope,white wine,green tea,,,,,,,,,,,,,,,,
1074,burgers,grated cheese,,,,,,,,,,,,,,,,,,
2768,burgers,ham,spaghetti,salmon,eggs,,,,,,,,,,,,,,,
7421,cookies,,,,,,,,,,,,,,,,,,,


In [8]:
# let's describe the dataset

data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
count,7501,5747,4389,3345,2529,1864,1369,981,654,395,256,154,87,47,25,8,4,4,3,1
unique,115,117,115,114,110,106,102,98,88,80,66,50,43,28,19,8,3,3,3,1
top,mineral water,mineral water,mineral water,mineral water,green tea,french fries,green tea,green tea,green tea,green tea,low fat yogurt,green tea,green tea,green tea,magazines,sparkling water,frozen smoothie,protein bar,spinach,olive oil
freq,577,484,375,201,153,107,96,67,57,31,22,15,8,4,3,1,2,2,1,1


In [None]:
# making each customers shopping items an identical list
trans = []
for i in range(0, 7501):
    trans.append([str(data.values[i,j]) for j in range(0, 20)])

# conveting it into an numpy array
trans = np.array(trans)

# checking the shape of the array
print(trans.shape)

In [14]:
# Using Transaction encoder 

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
data = te.fit_transform(trans)
data = pd.DataFrame(data, columns = te.columns_)

# getting the shape of the data
data.shape

(7500, 121)

In [15]:
import warnings
warnings.filterwarnings('ignore')

# getting correlations for 121 items would be messy 
# so let's reduce the items from 121 to 50

data = data.loc[:, ['mineral water', 'burgers', 'turkey', 'chocolate', 'frozen vegetables', 'spaghetti',
                    'shrimp', 'grated cheese', 'eggs', 'cookies', 'french fries', 'herb & pepper', 'ground beef',
                    'tomatoes', 'milk', 'escalope', 'fresh tuna', 'red wine', 'ham', 'cake', 'green tea',
                    'whole wheat pasta', 'pancakes', 'soup', 'muffins', 'energy bar', 'olive oil', 'champagne', 
                    'avocado', 'pepper', 'butter', 'parmesan cheese', 'whole wheat rice', 'low fat yogurt', 
                    'chicken', 'vegetables mix', 'pickles', 'meatballs', 'frozen smoothie', 'yogurt cake']]

# checking the shape
data.shape

(7500, 40)

In [16]:
# let's check the columns

data.columns

Index(['mineral water', 'burgers', 'turkey', 'chocolate', 'frozen vegetables',
       'spaghetti', 'shrimp', 'grated cheese', 'eggs', 'cookies',
       'french fries', 'herb & pepper', 'ground beef', 'tomatoes', 'milk',
       'escalope', 'fresh tuna', 'red wine', 'ham', 'cake', 'green tea',
       'whole wheat pasta', 'pancakes', 'soup', 'muffins', 'energy bar',
       'olive oil', 'champagne', 'avocado', 'pepper', 'butter',
       'parmesan cheese', 'whole wheat rice', 'low fat yogurt', 'chicken',
       'vegetables mix', 'pickles', 'meatballs', 'frozen smoothie',
       'yogurt cake'],
      dtype='object')

In [17]:
# getting the head of the data

data.head()

Unnamed: 0,mineral water,burgers,turkey,chocolate,frozen vegetables,spaghetti,shrimp,grated cheese,eggs,cookies,...,butter,parmesan cheese,whole wheat rice,low fat yogurt,chicken,vegetables mix,pickles,meatballs,frozen smoothie,yogurt cake
0,False,True,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,True,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False


In [19]:
# Applying priori : 
from mlxtend.frequent_patterns import apriori

#Now, let us return the items and itemsets with at least 5% support:
apriori(data, min_support = 0.01, use_colnames = True)


Unnamed: 0,support,itemsets
0,0.238267,(mineral water)
1,0.087200,(burgers)
2,0.062533,(turkey)
3,0.163867,(chocolate)
4,0.095333,(frozen vegetables)
...,...,...
206,0.010133,"(ground beef, eggs, mineral water)"
207,0.013067,"(eggs, milk, mineral water)"
208,0.011067,"(ground beef, milk, mineral water)"
209,0.010533,"(chocolate, eggs, spaghetti)"


In [21]:
# Selecting and Filtering the Results : 
frequent_itemsets = apriori(data, min_support = 0.05, use_colnames=True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
frequent_itemsets

Unnamed: 0,support,itemsets,length
0,0.238267,(mineral water),1
1,0.0872,(burgers),1
2,0.062533,(turkey),1
3,0.163867,(chocolate),1
4,0.095333,(frozen vegetables),1
5,0.174133,(spaghetti),1
6,0.071333,(shrimp),1
7,0.0524,(grated cheese),1
8,0.179733,(eggs),1
9,0.0804,(cookies),1


In [22]:
# getting th item sets with length = 2 and support more han 10%

frequent_itemsets[ (frequent_itemsets['length'] == 2) &
                   (frequent_itemsets['support'] >= 0.01) ]

Unnamed: 0,support,itemsets,length
24,0.052667,"(chocolate, mineral water)",2
25,0.059733,"(spaghetti, mineral water)",2
26,0.050933,"(eggs, mineral water)",2


In [23]:
# getting th item sets with length = 2 and support more han 10%

frequent_itemsets[ (frequent_itemsets['length'] == 1) &
                   (frequent_itemsets['support'] >= 0.01) ]

Unnamed: 0,support,itemsets,length
0,0.238267,(mineral water),1
1,0.0872,(burgers),1
2,0.062533,(turkey),1
3,0.163867,(chocolate),1
4,0.095333,(frozen vegetables),1
5,0.174133,(spaghetti),1
6,0.071333,(shrimp),1
7,0.0524,(grated cheese),1
8,0.179733,(eggs),1
9,0.0804,(cookies),1


In [24]:
#Association Mining : 

In [25]:
frequent_itemsets[ frequent_itemsets['itemsets'] == {'eggs', 'mineral water'} ]


Unnamed: 0,support,itemsets,length
26,0.050933,"(eggs, mineral water)",2


In [26]:
frequent_itemsets[ frequent_itemsets['itemsets'] == {'mineral water'} ]


Unnamed: 0,support,itemsets,length
0,0.238267,(mineral water),1


In [27]:
frequent_itemsets[ frequent_itemsets['itemsets'] == {'milk'} ]


Unnamed: 0,support,itemsets,length
13,0.1296,(milk),1


In [28]:
frequent_itemsets[ frequent_itemsets['itemsets'] == {'chicken'} ]

Unnamed: 0,support,itemsets,length
22,0.06,(chicken),1


In [29]:
frequent_itemsets[ frequent_itemsets['itemsets'] == {'frozen vegetables'} ]


Unnamed: 0,support,itemsets,length
4,0.095333,(frozen vegetables),1


In [30]:
frequent_itemsets[ frequent_itemsets['itemsets'] == {'chocolate'} ]


Unnamed: 0,support,itemsets,length
3,0.163867,(chocolate),1
