In [1]:
!pip install fpgrowth_py

Collecting fpgrowth_py
  Downloading fpgrowth_py-1.0.0-py3-none-any.whl (5.6 kB)
Installing collected packages: fpgrowth-py
Successfully installed fpgrowth-py-1.0.0


### Load the dataset

In [3]:
import pandas as pd

# data location:
path = '../data/'
file_name = 'Online Retail.xlsx'
sheet_name = 'Online Retail'
# read excel file
df = pd.read_excel(path + file_name, sheet_name=sheet_name)

df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,`,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France


### Filter top three countries

In [32]:
unique_invoices = df.groupby(["InvoiceNo", "Country"]).size().reset_index()

# Top 3 countries by unique invoices
top_three_countries = unique_invoices['Country'].value_counts().nlargest(3)

top_three_countries = top_three_countries.index.values

top_three_countries

array(['United Kingdom', 'Germany', 'France'], dtype=object)

In [38]:
filtered_df = df.loc[df['Country'].isin(top_three_countries)]

filtered_df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,`,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France


### Drop Unnecessary columns

In [40]:
filtered_df.drop(columns=['Quantity', "InvoiceDate", "UnitPrice", "CustomerID", "Country"], inplace=True)

filtered_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.drop(columns=['Quantity', "InvoiceDate", "UnitPrice", "CustomerID", "Country"], inplace=True)


Unnamed: 0,InvoiceNo,StockCode,Description
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER
1,536365,71053,WHITE METAL LANTERN
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.
...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE


### Prepare data for the FP growth algorithm

In [82]:
grouped_invoices = filtered_df.set_index("Description").groupby("InvoiceNo").groups

grouped_invoices_list = [list(map(str, v.values)) for k,v in grouped_invoices.items()]

grouped_invoices_list[:3]

[['WHITE HANGING HEART T-LIGHT HOLDER',
  'WHITE METAL LANTERN',
  'CREAM CUPID HEARTS COAT HANGER',
  'KNITTED UNION FLAG HOT WATER BOTTLE',
  'RED WOOLLY HOTTIE WHITE HEART.',
  'SET 7 BABUSHKA NESTING BOXES',
  'GLASS STAR FROSTED T-LIGHT HOLDER'],
 ['HAND WARMER UNION JACK', 'HAND WARMER RED POLKA DOT'],
 ['ASSORTED COLOUR BIRD ORNAMENT',
  "POPPY'S PLAYHOUSE BEDROOM ",
  "POPPY'S PLAYHOUSE KITCHEN",
  'FELTCRAFT PRINCESS CHARLOTTE DOLL',
  'IVORY KNITTED MUG COSY ',
  'BOX OF 6 ASSORTED COLOUR TEASPOONS',
  'BOX OF VINTAGE JIGSAW BLOCKS ',
  'BOX OF VINTAGE ALPHABET BLOCKS',
  'HOME BUILDING BLOCK WORD',
  'LOVE BUILDING BLOCK WORD',
  'RECIPE BOX WITH METAL HEART',
  'DOORMAT NEW ENGLAND']]

### FP Growth algorithm and results

In [106]:
from fpgrowth_py import fpgrowth

freqItemSet, rules = fpgrowth(grouped_invoices_list, minSupRatio=0.009, minConf=0.6)

In [109]:
print(f"{len(freqItemSet)} frequent sets of items found.")

2078 frequent sets of items found.


In [108]:
# We will just print the most intereseting sets (the ones with size bigger than 2)

filtered_sets = sorted(list(filter(lambda x: len(x) > 2,  freqItemSet)), key=len, reverse=True)

print("FREQUENTLY ORDERED TOGETHER")

for item_set in filtered_sets:
    print("\n")
    print("\n  - ".join(list(item_set)))

FREQUENTLY ORDERED TOGETHER


HERB MARKER PARSLEY
  - HERB MARKER MINT
  - HERB MARKER THYME
  - HERB MARKER BASIL
  - HERB MARKER ROSEMARY


HERB MARKER PARSLEY
  - HERB MARKER ROSEMARY
  - HERB MARKER MINT
  - HERB MARKER THYME
  - HERB MARKER BASIL


CHARLOTTE BAG SUKI DESIGN
  - STRAWBERRY CHARLOTTE BAG
  - RED RETROSPOT CHARLOTTE BAG
  - CHARLOTTE BAG PINK POLKADOT
  - WOODLAND CHARLOTTE BAG


HERB MARKER MINT
  - HERB MARKER THYME
  - HERB MARKER PARSLEY
  - HERB MARKER BASIL


HERB MARKER THYME
  - HERB MARKER PARSLEY
  - HERB MARKER BASIL
  - HERB MARKER ROSEMARY


HERB MARKER MINT
  - HERB MARKER ROSEMARY
  - HERB MARKER PARSLEY
  - HERB MARKER BASIL


HERB MARKER MINT
  - HERB MARKER THYME
  - HERB MARKER PARSLEY
  - HERB MARKER ROSEMARY


SET OF 3 WOODEN HEART DECORATIONS
  - SET OF 3 WOODEN SLEIGH DECORATIONS
  - SET OF 3 WOODEN STOCKING DECORATION
  - SET OF 3 WOODEN TREE DECORATIONS


REGENCY TEA PLATE ROSES 
  - GREEN REGENCY TEACUP AND SAUCER
  - PINK REGENCY TEACUP AND