In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from copy import deepcopy
tqdm.pandas()

df = pd.read_csv('./Groceries_dataset.csv')

df.insert(0, 'tid', df.apply(lambda row: f'{row["Member_number"]} {row["Date"]}', axis='columns'))
df.drop(['Member_number', 'Date'], axis='columns', inplace=True)
display(df.head(3))
print(f'Number of raw transactions: {len(df)}')

Unnamed: 0,tid,itemDescription
0,1808 21-07-2015,tropical fruit
1,2552 05-01-2015,whole milk
2,2300 19-09-2015,pip fruit


Number of raw transactions: 38765


In [2]:
items = df['itemDescription'].drop_duplicates().to_list()
tids = df['tid'].drop_duplicates().to_list()
print(f'Kinds of items: {len(items)}')
print(f'Number of tids: {len(tids)}')

Kinds of items: 167
Number of tids: 14963


In [3]:
binary = np.zeros((len(tids), len(items)), dtype=int)
binary = pd.DataFrame(binary, index=tids, columns=items, dtype=int)

def counter(row): binary.loc[row[0], row[1]] = 1

df.progress_apply(counter, axis='columns')

display(binary.head(3))

  0%|          | 0/38765 [00:00<?, ?it/s]

Unnamed: 0,tropical fruit,whole milk,pip fruit,other vegetables,rolls/buns,pot plants,citrus fruit,beef,frankfurter,chicken,...,flower (seeds),rice,tea,salad dressing,specialty vegetables,pudding powder,ready soups,make up remover,toilet cleaner,preservation products
1808 21-07-2015,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2552 05-01-2015,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2300 19-09-2015,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
MIN_SUP = 10

abandoned = {}
over_min_sup = {}

def check_support(row): return int(sum(row) == len(row))

for it_idx in tqdm(range(len(items))):
    item = [items[it_idx]]
    sup  = binary[item].apply(check_support, axis='columns').sum()

    key = ', '.join(item)
    if key in abandoned.keys(): continue

    if sup >= MIN_SUP:
        over_min_sup[key] = 0
    else:
        abandoned[key] = 0
        continue

    for it_jdx in range(len(items)):
        print(f'progressing {it_jdx:3d} / {len(items):3d}', end='\r')
        item.append(items[it_jdx])
        item.sort()
        key = ', '.join(item)

        if key in abandoned.keys():
            item.remove(items[it_jdx])
            continue
        elif key in over_min_sup.keys():
            continue

        sup = binary[item].apply(check_support, axis='columns').sum()

        if sup >= MIN_SUP:
            over_min_sup[key] = 0
        else:
            abandoned[key] = 0
            item.remove(items[it_jdx])
            continue

print(f'Number of itemsets that exceed MIN_SUP: {len(over_min_sup)}')

  0%|          | 0/167 [00:00<?, ?it/s]

Number of itemsets that exceed MIN_SUP: 411


In [6]:
import pickle

with open('./result.pkl', 'wb') as file:
    pickle.dump(over_min_sup, file)

In [7]:
with open('./result.pkl', 'rb') as file:
    result = pickle.load(file)

    print(f'List of items that exceed MIN_SUP: {list(result.keys())}')

List of items that exceed MIN_SUP: ['tropical fruit', 'tropical fruit, tropical fruit', 'tropical fruit, tropical fruit, whole milk', 'other vegetables, tropical fruit, tropical fruit, whole milk', 'whole milk', 'tropical fruit, whole milk', 'tropical fruit, whole milk, whole milk', 'other vegetables, tropical fruit, whole milk, whole milk', 'pip fruit', 'pip fruit, tropical fruit', 'pip fruit, pip fruit, tropical fruit', 'other vegetables', 'other vegetables, tropical fruit', 'other vegetables, tropical fruit, whole milk', 'other vegetables, other vegetables, tropical fruit, whole milk', 'rolls/buns', 'rolls/buns, tropical fruit', 'rolls/buns, tropical fruit, whole milk', 'rolls/buns, rolls/buns, tropical fruit, whole milk', 'pot plants', 'pot plants, whole milk', 'pot plants, pot plants, whole milk', 'citrus fruit', 'citrus fruit, tropical fruit', 'citrus fruit, citrus fruit, tropical fruit', 'beef', 'beef, tropical fruit', 'beef, beef, tropical fruit', 'frankfurter', 'frankfurter, t