In [None]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth
import json

In [None]:
with open('../data/processed/basket_item_sets.json', 'r') as f:
    transactions = json.load(f)
    
transactions

[['Gourmet brewed coffee'],
 ['Brewed Chai tea'],
 ['Hot chocolate'],
 ['Drip coffee'],
 ['Brewed Chai tea', 'Scone'],
 ['Drip coffee'],
 ['Gourmet brewed coffee'],
 ['Barista Espresso'],
 ['Hot chocolate'],
 ['Brewed Chai tea'],
 ['Gourmet brewed coffee'],
 ['Brewed Black tea'],
 ['Brewed Chai tea'],
 ['Barista Espresso'],
 ['Brewed Green tea', 'Scone'],
 ['Brewed herbal tea'],
 ['Hot chocolate'],
 ['Hot chocolate'],
 ['Gourmet brewed coffee'],
 ['Brewed Chai tea', 'Biscotti'],
 ['Brewed Chai tea'],
 ['Barista Espresso'],
 ['Brewed herbal tea', 'Biscotti'],
 ['Brewed herbal tea', 'Pastry'],
 ['Barista Espresso'],
 ['Barista Espresso'],
 ['Organic brewed coffee'],
 ['Organic brewed coffee'],
 ['Gourmet brewed coffee'],
 ['Brewed Chai tea'],
 ['Organic brewed coffee'],
 ['Brewed Chai tea'],
 ['Barista Espresso'],
 ['Brewed herbal tea'],
 ['Brewed herbal tea'],
 ['Drip coffee'],
 ['Hot chocolate', 'Scone'],
 ['Brewed Chai tea', 'Scone'],
 ['Brewed herbal tea'],
 ['Brewed Chai tea'],
 ['B

In [3]:
te = TransactionEncoder()

te_ary = te.fit(transactions).transform(transactions)

df = pd.DataFrame(te_ary, columns=te.columns_)

df

Unnamed: 0,Barista Espresso,Biscotti,Black tea,Brewed Black tea,Brewed Chai tea,Brewed Green tea,Brewed herbal tea,Chai tea,Clothing,Drinking Chocolate,...,Housewares,Organic Beans,Organic Chocolate,Organic brewed coffee,Pastry,Premium Beans,Premium brewed coffee,Regular syrup,Scone,Sugar free syrup
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116785,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
116786,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
116787,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
116788,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## FP-Growth

The **Frequent Pattern Growth** algorithm is an efficient alternative to Apriori for mining frequent itemsets. Unlike Apriori, FP-Growth avoids candidate generation by using a **divide-and-conquer** approach and a data structure called the **FP-tree**.

The algorithm begins by constructing the FP-tree from the transaction database, where items are arranged in a frequency-descending order to maximize tree compression. Each node in the tree represents an item, and paths represent itemsets.

Once the tree is built, FP-Growth extracts frequent itemsets by recursively partitioning the tree into conditional FP-trees, which represent the dataset projected onto specific frequent items. The process continues until all frequent itemsets are identified.

FP-Growth is particularly well-suited for large datasets with many transactions, as it avoids the computational overhead of repeatedly scanning the database, among the main drawbacks of Apriori.

In [31]:
tidsets = fpgrowth(df, min_support=0.003)

In [14]:
tidsets

Unnamed: 0,support,itemsets
0,0.144704,(13)
1,0.146811,(4)
2,0.098108,(17)
3,0.072429,(10)
4,0.086352,(27)
5,0.139473,(0)
6,0.097046,(3)
7,0.048523,(5)
8,0.096207,(6)
9,0.048848,(1)


For better readability, we added the names of the items based on their id.

In [34]:
itemset_names = []
for itemset in tidsets['itemsets']:
    itemset_names.append(', '.join([te.columns_[index] for index in itemset]))

In [39]:
tidsets = pd.concat([tidsets, pd.DataFrame(itemset_names, columns=['labels'])], axis=1)

In [40]:
tidsets

Unnamed: 0,support,itemsets,labels
0,0.144704,(13),Gourmet brewed coffee
1,0.146811,(4),Brewed Chai tea
2,0.098108,(17),Hot chocolate
3,0.072429,(10),Drip coffee
4,0.086352,(27),Scone
5,0.139473,(0),Barista Espresso
6,0.097046,(3),Brewed Black tea
7,0.048523,(5),Brewed Green tea
8,0.096207,(6),Brewed herbal tea
9,0.048848,(1),Biscotti


In [43]:
tidsets

Unnamed: 0,support,itemsets,labels
0,0.144704,(13),Gourmet brewed coffee
1,0.146811,(4),Brewed Chai tea
2,0.098108,(17),Hot chocolate
3,0.072429,(10),Drip coffee
4,0.086352,(27),Scone
5,0.139473,(0),Barista Espresso
6,0.097046,(3),Brewed Black tea
7,0.048523,(5),Brewed Green tea
8,0.096207,(6),Brewed herbal tea
9,0.048848,(1),Biscotti


### Exporting model


In [44]:
tidsets.to_csv('../models/fp-growth-results.csv')