All the libraries, modules, imports and constants used in this program 

In [114]:
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth, apriori, association_rules
from timeit import timeit

# Path of the CSV file that main() passes to read_file().
FILE_INPUT = "online_retail.csv"
# True: read_file() stores products as a dict (product code -> description).
# False: read_file() stores products as a plain set of product codes.
USE_DICT_OVER_SET = False
# Switches for the frequent-itemset miners run in main(). When both are True,
# both miners run but the association rules are built from the FP-Growth result.
USE_FPGROWTH = False
USE_APRIORI = True

This function reads the given file and builds a dictionary of invoices:
<ol>
<li>skips the first row</li>
<li>if the row starts with "C" it is a cancellation -> it is skipped</li>
<li>split the row on commas and convert the invoice number (first column) to an integer</li>
<li>add the invoiceCode if not present to the data dictionary</li>
<li>if not present, add the product code to the product collection (a dictionary code : description when USE_DICT_OVER_SET is True, otherwise a set of codes)</li>
<li>finally match invoice number with product number</li>
</ol>
Returns:
<ol>
<li>The dictionary #Invoice : set(#products)</li>
<li>The dictionary #products : description (or, when USE_DICT_OVER_SET is False, the set of #products)</li>
</ol>

In [115]:
def read_file(FILE_INPUT:str) -> [dict[int:str], dict[str:str]|set[str]]: # type: ignore
    invoices = dict()
    products = dict() if USE_DICT_OVER_SET else set()
    
    with open(FILE_INPUT, "r", encoding='UTF-8') as file:
        file.readline()

        for row in file:    
            try:
                row = row.strip().split(",")
                row[0] = int(row[0])
                if len(row[1]) <= 1 or len(row[2]) <= 1:
                    raise ValueError 
            except ValueError:
                continue

            if row[0] not in invoices:
                invoices[row[0]] = set()

            if USE_DICT_OVER_SET:
                if row[1] not in products:
                    products[row[1]] = row[2]
            else:
                products.add(row[1])

            invoices[row[0]].add(row[1])

    return [invoices, products]

This function creates a matrix starting from the products and the invoices:
- The rows are the invoices (which are sorted in ascending order)
- The columns are the product codes (sorted in ascending order)
- The element (i, j) of the matrix is True (1) if the product j is present in the invoice i, False (0) otherwise

The matrix is then returned as a pd data frame

In [116]:
def create_matrix(invoices: dict[int, set[str]], invoice_ids: list[int], product_ids: list[str]) -> pd.DataFrame:
    """Build the boolean invoice-by-product membership table.

    Args:
        invoices: Mapping from invoice number to the set of product codes on
            that invoice.  Every id in ``invoice_ids`` must be a key.
        invoice_ids: Invoice numbers used as the row index, in the given order.
        product_ids: Product codes used as the column labels, in the given order.

    Returns:
        A DataFrame indexed by ``invoice_ids`` with one column per entry of
        ``product_ids``; cell ``(invoice, product)`` is ``True`` when that
        product code is in ``invoices[invoice]`` and ``False`` otherwise.

    Raises:
        KeyError: If an id in ``invoice_ids`` is missing from ``invoices``.
    """
    rows = [
        [product in invoices[invoice] for product in product_ids]
        for invoice in invoice_ids
    ]
    return pd.DataFrame(data=rows, columns=product_ids, index=invoice_ids)

This is the main function of the program that will have all the main instructions and function calls of the whole program

In [117]:
def main() -> None:
    """Run the whole pipeline: load invoices, mine itemsets, print rules.

    Reads ``FILE_INPUT`` with ``read_file``, turns the result into a boolean
    invoice-by-product DataFrame with ``create_matrix``, mines frequent
    itemsets (minimum support 0.02) with FP-Growth and/or Apriori depending
    on ``USE_FPGROWTH`` / ``USE_APRIORI``, and finally derives association
    rules with ``min_threshold=0.15``.  All results are printed.

    When both miners are enabled, both itemset tables are printed and the
    rules are derived from the FP-Growth result.

    Raises:
        ValueError: If neither ``USE_FPGROWTH`` nor ``USE_APRIORI`` is set,
            since there would be no itemsets to derive rules from.
    """
    # Fail early with a clear message instead of an UnboundLocalError later.
    if not (USE_FPGROWTH or USE_APRIORI):
        raise ValueError("No miner selected: set USE_FPGROWTH and/or USE_APRIORI to True")

    invoices, product = read_file(FILE_INPUT)
    # sorted() over a dict iterates its keys, so this works for both the
    # dict and the set form of the product collection.
    dataFrame = create_matrix(invoices, sorted(invoices), sorted(product))

    if USE_FPGROWTH:
        fp = fpgrowth(df=dataFrame, min_support=0.02, use_colnames=True)
        print("items present = ", len(fp))
        print(fp.to_string())

    if USE_APRIORI:
        ap = apriori(df=dataFrame, min_support=0.02, use_colnames=True)
        print("items apriori = ", len(ap))
        print(ap.to_string())

    asr = association_rules(df=fp if USE_FPGROWTH else ap, min_threshold=0.15)
    print(len(asr))
    print(asr.to_string())
    
    
# Run the whole pipeline when this cell/script is executed.
main()

items apriori =  367
      support                itemsets
0    0.025300                 (15036)
1    0.022272                (15056N)
2    0.021881                 (20676)
3    0.034043                 (20685)
4    0.025642                 (20711)
5    0.042151                 (20712)
6    0.032431                 (20713)
7    0.020123                 (20717)
8    0.028084                 (20718)
9    0.040783                 (20719)
10   0.035264                 (20723)
11   0.050503                 (20724)
12   0.076438                 (20725)
13   0.049722                 (20726)
14   0.062176                 (20727)
15   0.056169                 (20728)
16   0.042981                 (20914)
17   0.025496                 (20971)
18   0.030087                 (20972)
19   0.042542                 (21034)
20   0.048159                 (21080)
21   0.020905                 (21086)
22   0.025740                 (21094)
23   0.022028                 (21154)
24   0.023689                