In [8]:
import os
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np 
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
import warnings

warnings.filterwarnings('ignore')

In [9]:
path = os.path.join(os.getcwd(), 'DATA/Groceries_dataset.csv')

In [18]:
df = pd.read_csv(path)
df

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk
...,...,...,...
38760,4471,08-10-2014,sliced cheese
38761,2022,23-02-2014,candy
38762,1097,16-04-2014,cake bar
38763,1510,03-12-2014,fruit/vegetable juice


In [42]:
df['itemDescription'].value_counts()

itemDescription
whole milk               2502
other vegetables         1898
rolls/buns               1716
soda                     1514
yogurt                   1334
                         ... 
rubbing alcohol             5
bags                        4
baby cosmetics              3
kitchen utensil             1
preservation products       1
Name: count, Length: 167, dtype: int64

In [51]:
transactions = df.groupby(['Member_number', 'itemDescription'])['Date'].count().unstack().reset_index().fillna(0).set_index('Member_number')
transactions = transactions.astype('int')
transactions

itemDescription,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
Member_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,1,0
1001,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,1,0,2,0,0
1002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
4998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4999,0,0,0,0,0,0,0,0,0,2,...,0,0,0,1,0,0,0,0,1,0


In [60]:
encoder = TransactionEncoder()
transactions_encoded = encoder.fit_transform(transactions)
transactions_encoded

array([[ True, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [ True, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [62]:
df_transactions_encoded = pd.DataFrame(transactions_encoded, columns=encoder.columns_)
df_transactions_encoded

Unnamed: 0,Unnamed: 1,(,),-,.,/,H,I,T,U,...,p,q,r,s,t,u,v,w,y,z
0,True,False,False,False,False,False,False,True,False,False,...,True,False,True,True,True,True,False,False,False,False
1,False,False,False,True,False,False,True,False,True,True,...,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,False,False,...,False,False,True,True,False,False,True,False,False,False
3,True,False,False,False,True,False,False,False,False,False,...,False,False,True,True,True,False,False,True,False,False
4,True,False,False,False,False,False,False,False,False,False,...,False,False,False,True,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3893,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3894,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3895,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3896,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [58]:
frequent_itemsets = apriori(df_transactions_encoded, min_support=0.01, use_colnames=True)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.025398,( )
1,0.026680,(a)
2,0.010005,(b)
3,0.020780,(c)
4,0.014366,(d)
...,...,...
100,0.010005,"(i, e, s)"
101,0.010005,"(n, r, e)"
102,0.012314,"(r, e, s)"
103,0.010005,"(r, e, t)"


In [59]:
# Виведення частих наборів
print("Часті набори:")
print(frequent_itemsets)

# Генерація асоціативних правил
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.2)

# Виведення асоціативних правил
print("\nАсоціативні правила:")
print(rules)

Часті набори:
      support      itemsets
0    0.025398           ( )
1    0.026680           (a)
2    0.010005           (b)
3    0.020780           (c)
4    0.014366           (d)
..        ...           ...
100  0.010005     (i, e, s)
101  0.010005     (n, r, e)
102  0.012314     (r, e, s)
103  0.010005     (r, e, t)
104  0.011801  ( , r, a, e)

[105 rows x 2 columns]

Асоціативні правила:
    antecedents consequents  antecedent support  consequent support   support  \
0           ( )         (a)            0.025398            0.026680  0.018214   
1           (a)         ( )            0.026680            0.025398  0.018214   
2           ( )         (c)            0.025398            0.020780  0.016162   
3           (c)         ( )            0.020780            0.025398  0.016162   
4           ( )         (d)            0.025398            0.014366  0.011544   
..          ...         ...                 ...                 ...       ...   
341      (a, e)      ( , r)          