In [1]:
!pip install -q kaggle

In [2]:
import os
os.environ["KAGGLE_CONFIG_DIR"] = '/content/'

In [4]:
!chmod 600 /content/kaggle.json

In [5]:
!kaggle datasets download -d mashlyn/online-retail-ii-uci

Dataset URL: https://www.kaggle.com/datasets/mashlyn/online-retail-ii-uci
License(s): CC0-1.0
online-retail-ii-uci.zip: Skipping, found more recently modified local copy (use --force to force download)


In [6]:
!unzip online-retail-ii-uci.zip

Archive:  online-retail-ii-uci.zip
  inflating: online_retail_II.csv    


# Data Preparation

In [8]:
import pandas as pd

In [9]:
df = pd.read_csv('/content/online_retail_II.csv')

In [11]:
df.dropna(inplace=True)
invalid_codes = [item for item in set(df['StockCode']) if not item[0].isdigit()]
df = df[~df['StockCode'].isin(invalid_codes)]

In [24]:
invalid_codes

['C2',
 'D',
 'SP1002',
 'ADJUST',
 'DOT',
 'M',
 'BANK CHARGES',
 'POST',
 'ADJUST2',
 'TEST001',
 'CRUK',
 'PADS',
 'TEST002']

In [12]:
france_df = df[df["Country"] == "France"]
france_df

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
71,489439,22065,CHRISTMAS PUDDING TRINKET POT,12,2009-12-01 09:28:00,1.45,12682.0,France
72,489439,22138,BAKING SET 9 PIECE RETROSPOT,9,2009-12-01 09:28:00,4.95,12682.0,France
73,489439,22139,RETRO SPOT TEA SET CERAMIC 11 PC,9,2009-12-01 09:28:00,4.95,12682.0,France
74,489439,22352,LUNCHBOX WITH CUTLERY RETROSPOT,12,2009-12-01 09:28:00,2.55,12682.0,France
75,489439,85014A,BLACK/BLUE DOTS RUFFLED UMBRELLA,3,2009-12-01 09:28:00,5.95,12682.0,France
...,...,...,...,...,...,...,...,...
1067365,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France
1067366,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
1067367,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
1067368,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France


In [23]:
transactions = france_df.groupby("Invoice")["StockCode"].apply(tuple)
transactions

Unnamed: 0_level_0,StockCode
Invoice,Unnamed: 1_level_1
489439,"(22065, 22138, 22139, 22352, 85014A, 85014B, 1..."
489557,"(21439, 16169D, 16169C, 21017, 21731, 22352, 2..."
489883,"(22139, 22138, 20696, 85089, 21249, 21791, 852..."
490139,"(22275, 85183A, 85183B, 21100, 22125, 22114, 2..."
490152,"(16169P, 20679, 20685, 20749, 20750, 20940, 21..."
...,...
C579192,"(23320, 23318, 22959, 22900, 22818, 22809, 224..."
C579532,"(22890,)"
C579562,"(23084, 21731)"
C580263,"(70007, 85175, 84821, 84819, 84817, 72586, 222..."


In [26]:
trans = transactions.to_list()
print(len(trans))

701


In [29]:
min_sup = 0.05*len(transactions)
print(min_sup)

35.050000000000004


In [28]:
!pip install efficient-apriori

Collecting efficient-apriori
  Downloading efficient_apriori-2.0.6-py3-none-any.whl.metadata (6.7 kB)
Downloading efficient_apriori-2.0.6-py3-none-any.whl (14 kB)
Installing collected packages: efficient-apriori
Successfully installed efficient-apriori-2.0.6


In [30]:
from efficient_apriori import apriori

In [31]:
itemsets, rules = apriori(trans, min_support=0.05, min_confidence=0.5)

In [32]:
len(itemsets)

3

In [34]:
itemsets
# First index is combinition of stockcode, second is frequency

{1: {('22138',): 62,
  ('22139',): 59,
  ('22352',): 119,
  ('22333',): 39,
  ('21731',): 132,
  ('20749',): 52,
  ('21212',): 82,
  ('21238',): 42,
  ('85099B',): 68,
  ('20719',): 60,
  ('20712',): 52,
  ('20682',): 48,
  ('20724',): 84,
  ('20750',): 109,
  ('21559',): 97,
  ('21156',): 47,
  ('21121',): 59,
  ('22090',): 62,
  ('15056BL',): 36,
  ('22027',): 57,
  ('22029',): 46,
  ('22197',): 40,
  ('22303',): 45,
  ('22356',): 52,
  ('21086',): 87,
  ('21094',): 80,
  ('21080',): 84,
  ('84997D',): 39,
  ('20726',): 74,
  ('22326',): 112,
  ('22328',): 79,
  ('21843',): 38,
  ('20725',): 97,
  ('21936',): 43,
  ('22382',): 65,
  ('22383',): 44,
  ('21987',): 36,
  ('22367',): 37,
  ('22385',): 36,
  ('22432',): 40,
  ('22467',): 36,
  ('21915',): 39,
  ('22554',): 97,
  ('22551',): 80,
  ('22556',): 106,
  ('22555',): 51,
  ('22423',): 69,
  ('22437',): 36,
  ('22492',): 60,
  ('22435',): 38,
  ('22636',): 40,
  ('22634',): 43,
  ('22620',): 40,
  ('22629',): 80,
  ('22631',): 54

In [35]:
rules

[{20719} -> {20724},
 {22356} -> {20724},
 {22382} -> {20725},
 {20749} -> {20750},
 {21086} -> {21080},
 {21080} -> {21086},
 {21094} -> {21080},
 {21080} -> {21094},
 {21094} -> {21086},
 {21086} -> {21094},
 {22352} -> {21559},
 {21559} -> {22352},
 {22328} -> {22326},
 {22554} -> {22551},
 {22551} -> {22554},
 {22556} -> {22551},
 {22551} -> {22556},
 {22556} -> {22554},
 {22554} -> {22556},
 {22555} -> {22556},
 {22630} -> {22629},
 {22629} -> {22630},
 {22631} -> {22629},
 {21086, 21094} -> {21080},
 {21080, 21094} -> {21086},
 {21080, 21086} -> {21094},
 {21094} -> {21080, 21086},
 {21086} -> {21080, 21094},
 {21080} -> {21086, 21094},
 {22554, 22556} -> {22551},
 {22551, 22556} -> {22554},
 {22551, 22554} -> {22556},
 {22551} -> {22554, 22556}]

In [40]:
stock_df = df[["StockCode", "Description"]].drop_duplicates(ignore_index=True).set_index("StockCode")
stock_df

Unnamed: 0_level_0,Description
StockCode,Unnamed: 1_level_1
85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS
79323P,PINK CHERRY LIGHTS
79323W,WHITE CHERRY LIGHTS
22041,"RECORD FRAME 7"" SINGLE SIZE"
21232,STRAWBERRY CERAMIC TRINKET BOX
...,...
21175,GIN AND TONIC DIET METAL SIGN
23561,SET OF 6 RIBBONS PARTY
90014C,SILVER AND BLACK ORBIT NECKLACE
85123A,CREAM HANGING HEART T-LIGHT HOLDER


In [41]:
for i, rule in enumerate(rules):
  print(f'Rule: {i+1}')
  left = ' / '.join(stock_df.loc[list(rule.lhs)].values.reshape(-1))
  right = ' / '.join(stock_df.loc[list(rule.rhs)].values.reshape(-1))

  print(f'{left} -> {right}')
  print(f'Supp: {rule.support}, Confidence: {rule.confidence}, Lift: {rule.lift}, Conviction: {rule.conviction}')
  print()

Rule: 1
WOODLAND CHARLOTTE BAG -> RED SPOTTY CHARLOTTE BAG / RED RETROSPOT CHARLOTTE BAG
Supp: 0.05420827389443652, Confidence: 0.6333333333333333, Lift: 5.285317460317461, Conviction: 2.400466858970066

Rule: 2
CHARLOTTE BAG , PINK/WHITE SPOTS / CHARLOTTE BAG PINK WITH WHITE SPOTS / CHARLOTTE BAG PINK POLKADOT -> RED SPOTTY CHARLOTTE BAG / RED RETROSPOT CHARLOTTE BAG
Supp: 0.05135520684736091, Confidence: 0.6923076923076923, Lift: 5.777472527472527, Conviction: 2.8605563387773714

Rule: 3
LUNCHBAG SPACEBOY DESIGN  / LUNCH BAG SPACEBOY DESIGN  -> LUNCH BAG RED SPOTTY / LUNCH BAG RED RETROSPOT
Supp: 0.05135520684736091, Confidence: 0.5538461538461539, Lift: 4.00253766851705, Conviction: 1.931231241674625

Rule: 4
ASSORTED COLOUR MINI CASES ->  RED/WHITE DOT MINI CASES / RED/WHITE DOT MINI CASES / RED RETROSPOT MINI CASES
Supp: 0.052781740370898715, Confidence: 0.7115384615384616, Lift: 4.576040931545519, Conviction: 2.9276271890900745

Rule: 5
SET/6 RED SPOTTY PAPER CUPS -> SET/20 RED S