In [None]:
#!/bin/bash
!kaggle datasets download crawlfeeds/tesco-uk-groceries-dataset
!unzip tesco-uk-groceries-dataset

Dataset URL: https://www.kaggle.com/datasets/crawlfeeds/tesco-uk-groceries-dataset
License(s): CC0-1.0
Downloading tesco-uk-groceries-dataset.zip to /content
  0% 0.00/135k [00:00<?, ?B/s]
100% 135k/135k [00:00<00:00, 36.1MB/s]
Archive:  tesco-uk-groceries-dataset.zip
  inflating: tesco_groceries_dataset.csv  


In [None]:
!pip install mlxtend
!pip install graphviz



In [None]:
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth, apriori
from mlxtend.preprocessing import TransactionEncoder
from collections import defaultdict
from graphviz import Digraph

In [None]:
# The download cell extracts the CSV into the working directory, so read it
# relative to that directory instead of hard-coding Colab's /content path.
df = pd.read_csv("tesco_groceries_dataset.csv")
df.columns

Index(['name', 'url', 'sku', 'gtin13', 'price', 'currency', 'availability',
       'description', 'brand', 'breadcrumbs', 'images', 'avg_rating',
       'reviews_count', 'pack_size', 'ingredients', 'storage_details',
       'product_origin', 'percentage_alcohol', 'serving_size', 'nutrition',
       'uniq_id', 'scraped_at'],
      dtype='object')

In [None]:
# FP-tree (frequent-pattern tree) built in two passes over a list of transactions
class FPTree:
    """Minimal FP-tree over a list of transactions.

    The tree is stored as nested dicts of the form
    ``{item: {'count': int, 'children': {...}}}``. ``header_table`` maps each
    item to the list of tree nodes (the same dict objects) that carry it.

    Args:
        min_support: Minimum fraction of transactions an item must appear in
            to be considered frequent.
    """

    def __init__(self, min_support=0.5):
        self.min_support = min_support
        self.header_table = defaultdict(list)  # item -> list of tree nodes holding that item
        self.tree = None  # populated by build_tree()

    # Pass 1: Create item frequency table and prune infrequent items
    def create_header_table(self, dataset):
        """Count item support and keep only the frequent items.

        Args:
            dataset: Sized collection of transactions, each an iterable of
                hashable items.

        Returns:
            dict mapping each frequent item to the number of transactions
            that contain it.
        """
        item_counts = defaultdict(int)
        for transaction in dataset:
            # Support counts transactions, not occurrences: an item repeated
            # inside one transaction must be counted once. dict.fromkeys
            # de-duplicates while keeping first-seen order deterministic.
            for item in dict.fromkeys(transaction):
                item_counts[item] += 1
        threshold = self.min_support * len(dataset)
        return {item: count for item, count in item_counts.items() if count >= threshold}

    # Pass 2: Build the FP-tree based on frequent items
    def build_tree(self, dataset, header_table):
        """Build the FP-tree from ``dataset`` using only frequent items.

        Args:
            dataset: Iterable of transactions (iterables of hashable items).
            header_table: Mapping of frequent item -> support count, as
                returned by ``create_header_table``.
        """
        self.tree = {}
        # Start from a clean slate so rebuilding does not accumulate stale links
        self.header_table.clear()
        for transaction in dataset:
            # Drop infrequent items: they must not appear in an FP-tree
            frequent_items = [item for item in dict.fromkeys(transaction) if item in header_table]
            # Most frequent first; ties are broken by the item's string form so
            # every transaction uses the same global order and shares prefixes
            frequent_items.sort(key=lambda item: (-header_table[item], str(item)))
            self._insert_tree(frequent_items, self.tree)

    def _insert_tree(self, items, tree, count=1):
        """Insert one ordered item path below ``tree``, adding ``count`` to each node.

        Args:
            items: Items of a single transaction, already filtered and ordered.
            tree: The ``children`` dict to insert into (the root dict at top level).
            count: Amount added to every node along the path.
        """
        for item in items:
            node = tree.get(item)
            if node is None:
                node = {'count': count, 'children': {}}
                tree[item] = node
                # Link each node exactly once, at creation time
                self.header_table[item].append(node)
            else:
                node['count'] += count
            tree = node['children']

    # Function to get the FP-Tree
    def get_tree(self):
        """Return the nested-dict tree (``None`` before ``build_tree`` is called)."""
        return self.tree

    # Function to get header table
    def get_header_table(self):
        """Return the mapping of item -> list of tree nodes holding that item."""
        return self.header_table

    # Function to visualize the FP-Tree using Graphviz
    def visualize_tree(self, filename='fp_tree', view=True):
        """Render the FP-tree to a PNG with Graphviz.

        Args:
            filename: Base name of the rendered output.
            view: Passed through to ``Digraph.render``; when true the result
                is opened with the default viewer.

        Returns:
            The ``Digraph`` object, so a notebook can display it inline.

        Raises:
            ValueError: If the tree has not been built yet.
        """
        if self.tree is None:
            raise ValueError("FP-tree has not been built; call build_tree() first")

        dot = Digraph(comment='FP-Tree', format='png', engine='dot')

        # Create the root node (initial node is "root")
        dot.node('root', label='Root')

        next_id = 0

        def add_nodes(tree, parent_name):
            nonlocal next_id
            for item, data in tree.items():
                # Use a synthetic id rather than the item text: Graphviz treats
                # ':' inside a node name as a port separator, so raw item
                # strings are not safe identifiers.
                next_id += 1
                node_name = f"node{next_id}"
                dot.node(node_name, label=f"{item} ({data['count']})")

                # Create an edge from the parent node to the item node
                dot.edge(parent_name, node_name)

                add_nodes(data['children'], node_name)

        # Start the recursive tree visualization
        add_nodes(self.tree, 'root')

        # Render the tree (saves the image and, if requested, opens it)
        dot.render(filename, view=view)
        return dot

In [None]:
# Treat every product row as one transaction made of its SKU and availability flag
transactions = df[['sku', 'availability']].astype(str)

# One-hot encode the transactions into the boolean matrix mlxtend's FP-Growth expects
te = TransactionEncoder()
te_ary = te.fit_transform(transactions.values)

# Wrap the boolean matrix in a DataFrame labelled with the encoder's item columns
encoded_df = pd.DataFrame(data=te_ary, columns=te.columns_)

# Mine itemsets with a minimum support of 0.1
frequent_itemsets = fpgrowth(
    encoded_df,
    min_support=0.1,
    use_colnames=True,
)

# Print the mined itemsets to check which patterns were found
print("Frequent Itemsets:\n", frequent_itemsets)

Frequent Itemsets:
     support   itemsets
0  0.906907  (InStock)


In [None]:
columns_of_interest = ['sku', 'availability', 'brand']
# Each row becomes one transaction; missing values are replaced by the literal 'Unknown'
dataset = df.loc[:, columns_of_interest].fillna('Unknown').to_numpy().tolist()

# Minimum support threshold for an item to count as frequent
fp_tree = FPTree(min_support=0.4)

# Step 1: first pass - collect the frequent items and their counts
header_table = fp_tree.create_header_table(dataset)
print(f"Header Table (Frequent Items): {header_table}")

# Step 2: second pass - insert every transaction into the FP-tree
fp_tree.build_tree(dataset, header_table)

# Step 3: show the nested-dict tree and the per-item node links
print("\nFP-Tree:", fp_tree.get_tree(), sep="\n")
print("\nHeader Table (Linked Nodes):", fp_tree.get_header_table(), sep="\n")

# Step 4: render the FP-tree with Graphviz
fp_tree.visualize_tree()

Header Table (Frequent Items): {'InStock': 302}

FP-Tree:
{305829059: {'count': 1, 'children': {'OutOfStock': {'count': 1, 'children': {'BIRDS EYE': {'count': 1, 'children': {}}}}}}, 'InStock': {'count': 302, 'children': {311932942: {'count': 1, 'children': {'IGLOO': {'count': 1, 'children': {}}}}, 257522449: {'count': 1, 'children': {'SCHWARTZ': {'count': 1, 'children': {}}}}, 299555755: {'count': 1, 'children': {'HEARTY FOOD CO.': {'count': 1, 'children': {}}}}, 260691710: {'count': 1, 'children': {'TESCO': {'count': 1, 'children': {}}}}, 255250290: {'count': 1, 'children': {'DISARONNO': {'count': 1, 'children': {}}}}, 299960380: {'count': 1, 'children': {'LITTLE LIFE': {'count': 1, 'children': {}}}}, 308462937: {'count': 1, 'children': {'CRUSHA': {'count': 1, 'children': {}}}}, 309493789: {'count': 1, 'children': {'BISTO': {'count': 1, 'children': {}}}}, 305737088: {'count': 1, 'children': {'TESCO FINEST': {'count': 1, 'children': {}}}}, 311264010: {'count': 1, 'children': {'DOVE': 

dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.46104 to fit
