All the libraries, modules, imports and constants used in this program 

In [7]:
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth, apriori, association_rules
from time import time

FILE_INPUT = "online_retail.csv"
USE_DICT_OVER_SET = False
USE_FPGROWTH = True
USE_APRIORI = False

This functions reads from the given file creating a dictionary of invoices:
<ol>
<li>skips the first row
<li>if the row starts with "C" is a cancellation -> to be skipped</li>
<li>split the row and convert it to integer</li>
<li>add the invoiceCode if not present to the data dictionary</li>
<li>if not present add the product code and description to the other dictionary</li>
<li>finally match invoice number with product number</li>
</ol>
Returns:
<ol>
<li>The dictionary #Invoice : set(#products)</li>
<li>The dictionary #products : description  or the set of #products</li>
</ol>

In [8]:
def read_file(FILE_INPUT:str) -> [dict[int:str], dict[str:str]|set[str]]: # type: ignore
    invoices = dict()
    products = dict() if USE_DICT_OVER_SET else set()
    
    with open(FILE_INPUT, "r", encoding='UTF-8') as file:
        columns = file.readline().strip().split(",")

        for row in file:    
            try:
                row = row.strip().split(",")
                row[0] = int(row[0])
                if len(row[1]) <= 1 or len(row[2]) <= 1:
                    raise ValueError 
            except ValueError:
                continue

            if row[0] not in invoices:
                invoices[row[0]] = set()

            if USE_DICT_OVER_SET:
                if row[1] not in products:
                    products[row[1]] = row[2] 
            else:
                products.add(row[2]) # use 1 for product id

            invoices[row[0]].add(row[2]) # use 1 for product id

    return [invoices, products, columns]

This functions creates a matrix starting from the products and the invoices:
- The rows are the invoices (which are sorted in ascending order)
- The columns are the products code (sorted in ascending order)
- The element (i, j) of the matrix is 1 if the product i is present in the invoice j, 0 otherwise

The matrix is then returned as a pd data frame

In [9]:
def create_matrix(invoices:dict[int:set[str]], invoice_ids:list[int], product_ids:list[str])->list[list[bool]]:            
    return pd.DataFrame(data=[[product in invoices[invoice] for product in product_ids] for invoice in invoice_ids],
                        columns=product_ids, index=invoice_ids) 

This function operates just like the previously defined function, although it's much faster due to dictionary lookup

In [10]:
def moreEfficientCreateMatrix(invoices:dict[int:set[str]], invoice_ids:list[int], product_ids:set[str]) -> list[list[bool]]:
    product_ids = {v:k for k,v in enumerate(product_ids)}
    matrix = [[False] * len(product_ids)] * len(invoice_ids)

    for i in range(len(invoice_ids)):
        for item in invoices[invoice_ids[i]]:
            matrix[i][product_ids[item]] = True

    return pd.DataFrame(data=matrix, columns=product_ids, index=invoice_ids)
    

This is the main function of the program that will have all the main instructions and function calls of the whole program

In [11]:
def main()->None:
    
    [invoices, product, columns] = read_file(FILE_INPUT)
    #dataFrame = create_matrix(invoices, sorted(invoices.keys()), sorted(product.keys()) if USE_DICT_OVER_SET else sorted(product))
    dataFrame = moreEfficientCreateMatrix(invoices, sorted(invoices.keys()), sorted(product))
    display(dataFrame)
    

    if USE_FPGROWTH:
        fp = fpgrowth(df=dataFrame, min_support=0.02, use_colnames=True)
        #print("items present = ", len(fp))
        #print(fp.to_string())

    print(fp[fp['itemsets'].map(len) > 1])
    
    if USE_APRIORI:
        ap = apriori(df=dataFrame, min_support=0.05, use_colnames=True)
        print("items apriori = ", len(ap))
        print(ap.to_string())
        
    asr = association_rules(df=fp if USE_FPGROWTH else ap, metric='confidence', min_threshold=0.85)
    print(len(asr))
    print(asr.to_string())    
    
        
main()

Unnamed: 0,4 PURPLE FLOCK DINNER CANDLES,50'S CHRISTMAS GIFT BAG LARGE,DOLLY GIRL BEAKER,I LOVE LONDON MINI BACKPACK,I LOVE LONDON MINI RUCKSACK,NINE DRAWER OFFICE TIDY,OVAL WALL MIRROR DIAMANTE,RED SPOT GIFT BAG LARGE,SET 2 TEA TOWELS I LOVE LONDON,SPACEBOY BABY GIFT SET,...,wrongly coded 20713,wrongly coded 23343,wrongly coded-23343,wrongly marked,wrongly marked 23343,wrongly marked carton 22804,wrongly marked. 23343 in box,wrongly sold (22719) barcode,wrongly sold as sets,wrongly sold sets
536365,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
536366,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
536367,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
536368,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
536369,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581583,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
581584,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
581585,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
581586,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True


KeyboardInterrupt: 