# Содержание
<ol>
<li><a href='#preparing'>Подготовка данных</a></li>
<li><a href='#settings'>Настройка параметров</a></li>
<li><a href='#apriori'>Apriori</a></li>
<li><a href='#eclat'>Eclat</a></li>
<li><a href='#fpgrowth'>FP-Growth</a></li>
<li><a href='#comparing'>Сравнение результатов</a></li>
<li><a href='#links'>Источники</a></li>
</ol>

In [1]:
import time
from collections import defaultdict

import numpy as np
import pandas as pd

from mlxtend.frequent_patterns import apriori, fpgrowth
from mlxtend.frequent_patterns import association_rules
from pyECLAT import ECLAT, Example1, Example2

In [33]:
def prepare_to_eclat_preprocess_df(table):
    transactions_values = defaultdict(list)
    columns = table.columns
    for i, row in table.iterrows():
        for col in columns:
            if row[col]:
                transactions_values[i].append(col)
                
    return pd.DataFrame.from_dict(transactions_values, orient='index')

# Подготовка данных <a name='preparing'></a>

Source: [basket_analysis](https://www.kaggle.com/datasets/ahmtcnbs/datasets-for-appiori/data)

In [34]:
basket_dataset = pd.read_csv('./basket_analysis.csv', index_col='Unnamed: 0')
basket_dataset.head()

Unnamed: 0,Apple,Bread,Butter,Cheese,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Sugar,Unicorn,Yogurt,chocolate
0,False,True,False,False,True,True,False,True,False,False,False,False,True,False,True,True
1,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
2,True,False,True,False,False,True,False,True,False,True,False,False,False,False,True,True
3,False,False,True,True,False,True,False,False,False,True,True,True,False,False,False,False
4,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [35]:
basket_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 999 entries, 0 to 998
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   Apple         999 non-null    bool 
 1   Bread         999 non-null    bool 
 2   Butter        999 non-null    bool 
 3   Cheese        999 non-null    bool 
 4   Corn          999 non-null    bool 
 5   Dill          999 non-null    bool 
 6   Eggs          999 non-null    bool 
 7   Ice cream     999 non-null    bool 
 8   Kidney Beans  999 non-null    bool 
 9   Milk          999 non-null    bool 
 10  Nutmeg        999 non-null    bool 
 11  Onion         999 non-null    bool 
 12  Sugar         999 non-null    bool 
 13  Unicorn       999 non-null    bool 
 14  Yogurt        999 non-null    bool 
 15  chocolate     999 non-null    bool 
dtypes: bool(16)
memory usage: 23.4 KB


In [36]:
basket_transactions = prepare_to_eclat_preprocess_df(basket_dataset)
basket_transactions.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,Bread,Corn,Dill,Ice cream,Sugar,Yogurt,chocolate,,,,,,
1,Milk,,,,,,,,,,,,
2,Apple,Butter,Dill,Ice cream,Milk,Yogurt,chocolate,,,,,,
3,Butter,Cheese,Dill,Milk,Nutmeg,Onion,,,,,,,
4,Apple,Bread,,,,,,,,,,,


ECLAT Example

In [6]:
first_test_transactions = Example1().get()
second_test_transactions = Example2().get()

Bread table
Source: [Bread table](https://github.com/Roh1702/Association-Mining-Rule-from-Scratch)

In [7]:
bread_dataset = pd.read_excel('./raw_bread.xlsx')
bread_dataset = bread_dataset.drop_duplicates()
bread_dataset = bread_dataset['Date,Time,Transaction,Item'].str.split(',', n = 3, expand = True)
bread_dataset.columns = ['Date', 'Time', 'Transaction', 'Item']
bread_dataset.head()

Unnamed: 0,Date,Time,Transaction,Item
0,2016-10-30,09:58:11,1,Bread
1,2016-10-30,10:05:34,2,Scandinavian
3,2016-10-30,10:07:57,3,Hot chocolate
4,2016-10-30,10:07:57,3,Jam
5,2016-10-30,10:07:57,3,Cookies


In [8]:
bread_dataset = pd.crosstab(index= bread_dataset['Transaction'], columns= bread_dataset['Item']).astype(bool)
bread_dataset = bread_dataset.drop(['NONE'], axis = 1)
bread_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9531 entries, 1 to 999
Data columns (total 94 columns):
 #   Column                         Non-Null Count  Dtype
---  ------                         --------------  -----
 0   Adjustment                     9531 non-null   bool 
 1   Afternoon with the baker       9531 non-null   bool 
 2   Alfajores                      9531 non-null   bool 
 3   Argentina Night                9531 non-null   bool 
 4   Art Tray                       9531 non-null   bool 
 5   Bacon                          9531 non-null   bool 
 6   Baguette                       9531 non-null   bool 
 7   Bakewell                       9531 non-null   bool 
 8   Bare Popcorn                   9531 non-null   bool 
 9   Basket                         9531 non-null   bool 
 10  Bowl Nic Pitt                  9531 non-null   bool 
 11  Bread                          9531 non-null   bool 
 12  Bread Pudding                  9531 non-null   bool 
 13  Brioche and salami      

In [9]:
bread_dataset.head()

Item,Adjustment,Afternoon with the baker,Alfajores,Argentina Night,Art Tray,Bacon,Baguette,Bakewell,Bare Popcorn,Basket,...,The BART,The Nomad,Tiffin,Toast,Truffles,Tshirt,Valentine's card,Vegan Feast,Vegan mincepie,Victorian Sponge
Transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
100,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1000,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1001,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False


In [10]:
bread_transactions = prepare_to_eclat_preprocess_df(bread_dataset)
bread_transactions.index = list(range(len(bread_transactions)))
bread_transactions.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,Bread,,,,,,,,,
1,Medialuna,Scandinavian,,,,,,,,
2,Bread,,,,,,,,,
3,Chimichurri Oil,Scandinavian,,,,,,,,
4,Bread,Truffles,,,,,,,,


Init Eclat

In [37]:
test_dataframe = basket_transactions  # choose *_transactions

In [38]:
eclat_inst = ECLAT(test_dataframe)
test_dataframe = eclat_inst.df_bin.astype(bool).drop([None], axis=1, errors='ignore')
eclat_inst.df_bin = test_dataframe
columns = sorted(test_dataframe.columns)
test_dataframe = test_dataframe[columns]
test_dataframe.head()

Unnamed: 0,Apple,Bread,Butter,Cheese,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Sugar,Unicorn,Yogurt,chocolate
0,False,True,False,False,True,True,False,True,False,False,False,False,True,False,True,True
1,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
2,True,False,True,False,False,True,False,True,False,True,False,False,False,False,True,True
3,False,False,True,True,False,True,False,False,False,True,True,True,False,False,False,False
4,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False


# Settings <a name='settings'></a>

In [39]:
min_support = 0.15
metric = 'lift'
min_threshold = 0.5

# Apriori <a name='apriori'></a>

Source: [rasbt/mlxtend](https://github.com/rasbt/mlxtend)
*Alternative: [ymoch/apyori](https://github.com/ymoch/apyori)*

In [40]:
def get_apriori_itemsets(data, min_supp=0.15, use_colnames=True):
    return apriori(data, min_support=min_supp, use_colnames=use_colnames)

In [240]:
%timeit get_apriori_itemsets(basket_dataset)

4.96 ms ± 141 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [43]:
apriori_itemsets = get_apriori_itemsets(test_dataframe, min_supp=0.1)
apriori_itemsets.sort_values(by='support', ascending=False)

Unnamed: 0,support,itemsets
15,0.421421,(chocolate)
2,0.420420,(Butter)
14,0.420420,(Yogurt)
7,0.410410,(Ice cream)
12,0.409409,(Sugar)
...,...,...
136,0.100100,"(Sugar, Apple, Butter)"
155,0.100100,"(Cheese, Nutmeg, Onion)"
149,0.100100,"(Yogurt, Nutmeg, Butter)"
166,0.100100,"(Milk, Nutmeg, Kidney Beans)"


# Eclat <a name='eclat'></a>

Source: [pyECLAT](https://github.com/jeffrichardchemistry/pyECLAT)
[Использование вместе с mlxtend](https://github.com/rasbt/mlxtend/discussions/959)

In [18]:
def get_eclat_support(eclat_instance: ECLAT, min_supp=0.15):
    _, ECLAT_supports_list = eclat_instance.fit(min_support=min_supp, verbose=False)
    return ECLAT_supports_list

In [138]:
%timeit get_eclat_support(eclat_inst)

1.98 s ± 296 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
ECLAT_supports_list = get_eclat_support(eclat_inst, min_supp=min_support)
eclat_itemsets = pd.DataFrame(ECLAT_supports_list.items(), columns=['itemsets', 'support'])
eclat_itemsets.sort_values(by='support', ascending=False)

Unnamed: 0,itemsets,support
14,chocolate,0.421421
2,Yogurt,0.420420
15,Butter,0.420420
9,Ice cream,0.410410
0,Sugar,0.409409
...,...,...
105,Eggs & Bread,0.157157
72,Dill & Eggs,0.157157
101,Eggs & Ice cream,0.157157
102,Eggs & Apple,0.156156


In [20]:
eclat_itemsets['itemsets'] = eclat_itemsets['itemsets'].map(lambda row: tuple(i.strip() for i in row.split('&')))
eclat_itemsets.sort_values(by='support', ascending=False)

Unnamed: 0,itemsets,support
14,"(chocolate,)",0.421421
2,"(Yogurt,)",0.420420
15,"(Butter,)",0.420420
9,"(Ice cream,)",0.410410
0,"(Sugar,)",0.409409
...,...,...
105,"(Eggs, Bread)",0.157157
72,"(Dill, Eggs)",0.157157
101,"(Eggs, Ice cream)",0.157157
102,"(Eggs, Apple)",0.156156


# FP-Growth <a name='fpgrowth'></a>

Source: [rasbt/mlxtend](https://github.com/rasbt/mlxtend)

In [21]:
def get_fpgrowth_itemsets(data, min_supp=0.15, use_colnames=True):
    return fpgrowth(data, min_support=min_supp, use_colnames=use_colnames)

In [22]:
%timeit get_fpgrowth_itemsets(basket_dataset)

52.9 ms ± 17.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [24]:
fpgrowth_itemsets = get_fpgrowth_itemsets(test_dataframe, min_supp=min_support)
fpgrowth_itemsets.sort_values(by='support', ascending=False)

Unnamed: 0,support,itemsets
0,0.421421,(chocolate)
8,0.420420,(Butter)
1,0.420420,(Yogurt)
2,0.410410,(Ice cream)
3,0.409409,(Sugar)
...,...,...
129,0.157157,"(Eggs, Ice cream)"
52,0.157157,"(Bread, Eggs)"
135,0.157157,"(Eggs, Dill)"
75,0.156156,"(Eggs, Apple)"


# Сравнение результатов (Algorithm -- Time) <a name='comparing'></a>

| Algorithm\Dataset | basket_analysis   | Example 1         | Example 2          | Bread table       |
|-------------------|-------------------|-------------------|--------------------|-------------------|
| Apriori           | 6.30 ms ± 1.25 ms | 8.01 ms ± 1.34 ms | 6.84 ms ± 1.58 ms  | 6.24 ms ± 1.95 ms |
| Eclat             | 1.81 s ± 190.0 ms | 401 ms ± 238 ms   | 369 ms ± 115 ms    | 75.8 ms ± 10.6 ms |
| FP-Growth         | 45.0 ms ± 5.06 ms | 3.08 ms ± 1.51 ms | 25.5 ms ± 6.44 ms  | 71.5 ms ± 24.1 ms |  

In [25]:
def time_counter_rule_calculation(dataset, alg, min_supp = 0.001, logs = True):
    start_time = 0
    total_time = 0
    itemsets = dict()
    
    match alg:
        case 'apriori':
            start_time = time.time()
            itemsets = get_apriori_itemsets(dataset, min_supp=min_supp, use_colnames=True)
            total_time = time.time() - start_time
            if logs: print("Computed Apriori")
        case 'eclat':
            eclat_df = prepare_to_eclat_preprocess_df(dataset)
            eclat_df.index = list(range(len(eclat_df)))
            eclat = ECLAT(eclat_df)
            start_time = time.time()
            itemsets = get_eclat_support(eclat, min_supp)
            total_time = time.time() - start_time
            itemsets = pd.DataFrame(itemsets.items(), columns=['itemsets', 'support'])
            itemsets['itemsets'] = itemsets['itemsets'].map(lambda row: tuple(i.strip() for i in row.split('&')))
            if logs: print("Computed Eclat")
        case 'fpg':
            start_time = time.time()
            itemsets = get_fpgrowth_itemsets(dataset, min_supp=min_supp, use_colnames=True)
            total_time = time.time() - start_time
            if logs: print("Computed FP-Growth")
    
    itemsets['number_of_items'] = itemsets['itemsets'].apply(lambda x: len(x))
    
    return itemsets, total_time
    

In [26]:
def compare_algs(dataset):
    apriori_itemsets, apriori_exec_time = time_counter_rule_calculation(dataset, 'apriori', logs=False)
    eclat_itemsets, eclat_exec_time = time_counter_rule_calculation(dataset, 'eclat', logs=False)
    fpgrowth_itemsets, fpgrowth_exec_time = time_counter_rule_calculation(dataset, 'fpg', logs=False)
    print(
        f'Apriori:\t{round(apriori_exec_time, 3)} seconds\n'
        f'Eclat:\t\t{round(eclat_exec_time, 3)} seconds\n'
        f'FP-Growth:\t{round(fpgrowth_exec_time, 3)} seconds\n'
    )

In [117]:
compare_algs(basket_dataset)

Apriori:	2.044 seconds
Eclat:		2.908 seconds
FP-Growth:	0.971 seconds


In [27]:
total_itemsets_dict = defaultdict(list)
for i, row in apriori_itemsets.iterrows():
    total_itemsets_dict[tuple(row['itemsets'])].append(row['support'])
for i, row in eclat_itemsets.iterrows():
    total_itemsets_dict[row['itemsets']].append(row['support'])
for i, row in eclat_itemsets.iterrows():
    total_itemsets_dict[tuple(row['itemsets'])].append(row['support'])

total_itemsets = pd.DataFrame(total_itemsets_dict.values(), columns=['apriori_support', 'eclat_support', 'fpgrowth_support'])
total_itemsets['itemsets'] = total_itemsets_dict.keys()
total_itemsets['compare'] = np.where(
    (total_itemsets['apriori_support'] == total_itemsets['eclat_support']) & 
    (total_itemsets['apriori_support'] == total_itemsets['fpgrowth_support']), 
    True, False
)
total_itemsets.fillna(-1, inplace=True)
total_itemsets['existed'] = np.where(
    (total_itemsets['apriori_support'] != -1) & 
    (total_itemsets['eclat_support'] != -1) & 
    (total_itemsets['fpgrowth_support'] != -1), 
    True, False
)
total_itemsets

Unnamed: 0,apriori_support,eclat_support,fpgrowth_support,itemsets,compare,existed
0,0.383383,0.383383,0.383383,"(Apple,)",True,True
1,0.384384,0.384384,0.384384,"(Bread,)",True,True
2,0.420420,0.420420,0.420420,"(Butter,)",True,True
3,0.404404,0.404404,0.404404,"(Cheese,)",True,True
4,0.407407,0.407407,0.407407,"(Corn,)",True,True
...,...,...,...,...,...,...
183,0.186186,0.186186,-1.000000,"(Apple, Corn)",False,False
184,0.154154,0.154154,-1.000000,"(Apple, Bread)",False,False
185,0.174174,0.174174,-1.000000,"(Corn, Bread)",False,False
186,0.178178,0.178178,-1.000000,"(Onion, Bread)",False,False


In [28]:
total_itemsets.tail()

Unnamed: 0,apriori_support,eclat_support,fpgrowth_support,itemsets,compare,existed
183,0.186186,0.186186,-1.0,"(Apple, Corn)",False,False
184,0.154154,0.154154,-1.0,"(Apple, Bread)",False,False
185,0.174174,0.174174,-1.0,"(Corn, Bread)",False,False
186,0.178178,0.178178,-1.0,"(Onion, Bread)",False,False
187,0.196196,0.196196,-1.0,"(Onion, chocolate)",False,False


In [29]:
print(
    f'Count matched supports: {len(total_itemsets[total_itemsets["compare"] == True])}\n'
    f'Count existed supports: {len(total_itemsets[total_itemsets["existed"] == True])}\n'
    f'Count existed apriori: {len(total_itemsets[total_itemsets["apriori_support"] != -1])}\n'
    f'Count existed eclat: {len(total_itemsets[total_itemsets["eclat_support"] != -1])}\n'
    f'Count existed fp-growth: {len(total_itemsets[total_itemsets["fpgrowth_support"] != -1])}\n'
    f'Total supports count: {len(total_itemsets)}')

Count matched supports: 84
Count existed supports: 84
Count existed apriori: 188
Count existed eclat: 136
Count existed fp-growth: 84
Total supports count: 188


In [30]:
apriori_rules = association_rules(apriori_itemsets, metric=metric, min_threshold=min_threshold)
apriori_rules.sort_values(by='lift', ascending=False).head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
208,(Milk),(chocolate),0.405405,0.421421,0.211211,0.520988,1.236263,0.040365,1.207857,0.321413
209,(chocolate),(Milk),0.421421,0.405405,0.211211,0.501188,1.236263,0.040365,1.192021,0.33031
93,(Kidney Beans),(Cheese),0.408408,0.404404,0.2002,0.490196,1.212143,0.035038,1.168284,0.295838
92,(Cheese),(Kidney Beans),0.404404,0.408408,0.2002,0.49505,1.212143,0.035038,1.171583,0.293849
210,(Nutmeg),(Onion),0.401401,0.403403,0.195195,0.486284,1.205454,0.033269,1.161336,0.284727


In [31]:
eclat_rules = association_rules(eclat_itemsets, metric=metric, min_threshold=min_threshold)
eclat_rules.sort_values(by='lift', ascending=False).head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
54,(Milk),(chocolate),0.405405,0.421421,0.211211,0.520988,1.236263,0.040365,1.207857,0.321413
55,(chocolate),(Milk),0.421421,0.405405,0.211211,0.501188,1.236263,0.040365,1.192021,0.33031
93,(Kidney Beans),(Cheese),0.408408,0.404404,0.2002,0.490196,1.212143,0.035038,1.168284,0.295838
92,(Cheese),(Kidney Beans),0.404404,0.408408,0.2002,0.49505,1.212143,0.035038,1.171583,0.293849
160,(Nutmeg),(Onion),0.401401,0.403403,0.195195,0.486284,1.205454,0.033269,1.161336,0.284727


In [32]:
fp_rules = association_rules(fpgrowth_itemsets, metric=metric, min_threshold=min_threshold)
fp_rules.sort_values(by='lift', ascending=False).head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
78,(Milk),(chocolate),0.405405,0.421421,0.211211,0.520988,1.236263,0.040365,1.207857,0.321413
79,(chocolate),(Milk),0.421421,0.405405,0.211211,0.501188,1.236263,0.040365,1.192021,0.33031
138,(Cheese),(Kidney Beans),0.404404,0.408408,0.2002,0.49505,1.212143,0.035038,1.171583,0.293849
139,(Kidney Beans),(Cheese),0.408408,0.404404,0.2002,0.490196,1.212143,0.035038,1.168284,0.295838
162,(Nutmeg),(Onion),0.401401,0.403403,0.195195,0.486284,1.205454,0.033269,1.161336,0.284727


# Использованная литература <a name='links'></a>

https://analyticsindiamag.com/guide-to-association-rule-mining-from-scratch/
https://habr.com/ru/companies/ods/articles/353502/
https://loginom.ru/blog/fpg
https://www.kaggle.com/code/keitazoumana/comparative-analysis-between-apriori-and-fp-growth