In [175]:
# Import Libraries
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules
import matplotlib.pyplot as plt
from pyECLAT import ECLAT

In [176]:
# Read the dataset 
df = pd.read_csv('retail_dataset.csv', sep=',')

# Print the first 5 rows
df.head(5)

Unnamed: 0,0,1,2,3,4,5,6
0,Bread,Wine,Eggs,Meat,Cheese,Pencil,Diaper
1,Bread,Cheese,Meat,Diaper,Wine,Milk,Pencil
2,Cheese,Meat,Eggs,Milk,Wine,,
3,Cheese,Meat,Eggs,Milk,Wine,,
4,Meat,Pencil,Wine,,,,


In [177]:
# Find the unique items in the table
items = set()
for col in df:
    items.update(df[col].unique())
print(items)

{'Meat', 'Cheese', 'Bread', 'Wine', nan, 'Pencil', 'Bagel', 'Eggs', 'Diaper', 'Milk'}


In [178]:
itemset = set(items)
encoded_vals = []
for index, row in df.iterrows():
    rowset = set(row) 
    labels = {}
    uncommons = list(itemset - rowset)
    commons = list(itemset.intersection(rowset))
    for uc in uncommons:
        labels[uc] = 0
    for com in commons:
        labels[com] = 1
    encoded_vals.append(labels)
encoded_vals[0]
ohe_df = pd.DataFrame(encoded_vals)

fpgrowth

In [179]:
%%time
# Applying fpgrowth
freq_items = fpgrowth(ohe_df, min_support=0.1, use_colnames=True)

# Mining association rules
fpgrowth_rules = association_rules(freq_items, metric="confidence", min_threshold=0.5)

# sets
a = set(['Milk', 'Meat', 'Cheese'])
egg = set(['Eggs'])
meat = set(['Meat'])
milk = set(['Milk'])
cheese = set(['Cheese'])

egg_meat = set(['Eggs', 'Meat'])
egg_cheese = set(['Eggs', 'Cheese'])
meat_cheese = set(['Meat', 'Cheese'])
egg_meat_cheese = set(['Eggs', 'Meat', 'Cheese'])

sum = 0
idx = -1

for idx, values in freq_items.items():
    if idx == 'itemsets':
        val_list = values.tolist()
        for i, v in enumerate(val_list):
            if v == a:
                idx_a = i
            elif v == egg or v == meat or v == cheese or v == egg_cheese_meat:
                sum += freq_items.at[i, 'support']
            elif v == egg_meat or v == egg_cheese or v == meat_cheese or v == egg_cheese_meat:
                sum -= freq_items.at[i, 'support']


print("Rate of Milk, Meat, Cheese being bought together: ", freq_items.at[idx_a,'support'])
print("Percentage of Customers who buy Eggs, Meat, Cheese: ", sum * 100)


Rate of Milk, Meat, Cheese being bought together:  0.20317460317460317
Percentage of Customers who buy Eggs, Meat, Cheese:  74.28571428571428
CPU times: total: 15.6 ms
Wall time: 20 ms




ECLAT

In [180]:
attrCount = len(df.count())
i = 0
for col in df.columns[:attrCount].tolist():
    df.rename(columns={col : i}, inplace=True)
    i += 1

In [181]:
%%time

# Init ECLAT
eclat = ECLAT(df)

# Apply ECLAT
x, y = eclat.fit(min_support=0.1, min_combination=1, max_combination=7, separator=' ')

# sets
a = set(['Milk', 'Meat', 'Cheese'])
egg = set(['Eggs'])
meat = set(['Meat'])
milk = set(['Milk'])
cheese = set(['Cheese'])

egg_meat = set(['Eggs', 'Meat'])
egg_cheese = set(['Eggs', 'Cheese'])
meat_cheese = set(['Meat', 'Cheese'])
egg_meat_cheese = set(['Eggs', 'Meat', 'Cheese'])

sum = 0
idx = -1

for k, v in y.items():
    s = set(k.split())
    if s == a:
        support = v
    elif s == egg or s == meat or s == cheese or s == egg_meat_cheese:
        sum += v
    elif s == egg_meat or s == egg_cheese or s == meat_cheese:
        sum -= v

print("Rate of Milk, Meat, Cheese being bought together: ", support)
print("Percentage of Customers who buy Eggs, Meat, Cheese: ", sum * 100)

Combination 1 by 1


9it [00:00, 204.48it/s]


Combination 2 by 2


36it [00:00, 163.58it/s]


Combination 3 by 3


84it [00:00, 192.62it/s]


Combination 4 by 4


126it [00:00, 260.26it/s]


Combination 5 by 5


126it [00:00, 220.22it/s]


Combination 6 by 6


84it [00:00, 218.68it/s]


Combination 7 by 7


36it [00:00, 160.67it/s]

Rate of Milk, Meat, Cheese being bought together:  0.20317460317460317
Percentage of Customers who buy Eggs, Meat, Cheese:  74.28571428571428
CPU times: total: 2.48 s
Wall time: 2.43 s



