In [20]:
import pandas as pd
from collections import defaultdict

In [21]:
df = pd.read_csv("Online Retail.xlsx.csv")

We have loaded the dataset


In [22]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/10 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/10 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/10 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/10 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/10 8:26,3.39,17850.0,United Kingdom


In [23]:
total = df["Quantity"].sum()
print(total)

5176450


In [24]:
grouped_df = df.groupby('InvoiceNo').agg({'StockCode': lambda x: ', '.join(x.astype(str))})

In [25]:
grouped_df = grouped_df.reset_index()
grouped_df.head()

Unnamed: 0,InvoiceNo,StockCode
0,536365,"85123A, 71053, 84406B, 84029G, 84029E, 22752, ..."
1,536366,"22633, 22632"
2,536367,"84879, 22745, 22748, 22749, 22310, 84969, 2262..."
3,536368,"22960, 22913, 22912, 22914"
4,536369,21756


Till this point we have grouped the database in each invoice number


Below is the code for apriori algorithm in which we are generating the candidate sets and then on the basis of the  the support value we are filtering the candidate set .

The algorithm will terminate when there will be no member in the candidate set.

In [26]:

def create_itemsets(transactions):
    itemsets = defaultdict(int)
    for transaction in transactions:
        for item in transaction:
            itemsets[item] += 1
    return itemsets

def filter_itemsets(itemsets, min_support):
    filtered_itemsets = {item: freq for item, freq in itemsets.items() if freq >= min_support}
    return filtered_itemsets

def generate_candidates(itemsets, k):

    candidates = set()
    for itemset1 in itemsets:
        for itemset2 in itemsets:
            if itemset1 != itemset2:
                union = tuple(sorted(set(itemset1) | set(itemset2)))
                if len(union) == k:
                    candidates.add(union)
    return candidates

def calculate_support(transactions, itemsets):

    support = defaultdict(int)
    for transaction in transactions:
        for itemset in itemsets:
            if set(itemset).issubset(transaction):
                support[itemset] += 1
    return support

def apriori_algorithm(transactions, min_support):

    itemsets = create_itemsets(transactions)
    frequent_itemsets = filter_itemsets(itemsets, min_support)
    k = 2
    while True:
        candidates = generate_candidates(frequent_itemsets, k)
        candidates_support = calculate_support(transactions, candidates)
        frequent_candidates = filter_itemsets(candidates_support, min_support)
        if not frequent_candidates:
            break
        frequent_itemsets.update(frequent_candidates)
        k += 1
    return frequent_itemsets

Now we will run our code

In [27]:
frequent_itemsets = apriori_algorithm(grouped_df['StockCode'],min_support=50000)

In [28]:
print(frequent_itemsets)

{'8': 210898, '5': 180005, '1': 296053, '2': 828325, '3': 259035, ',': 516009, ' ': 516046, '7': 180372, '0': 197322, '4': 186057, '6': 155713, '9': 201222}
