# Running on local instance and connect to Colab, as Colab's runtimes are difficult to setup with older versions.

In [1]:
import sys, pycaret
print(sys.version)
print("PyCaret:", pycaret.__version__)
from pycaret.arules import *
print("ARules OK")

3.8.20 (default, Oct  3 2024, 15:19:54) [MSC v.1929 64 bit (AMD64)]
PyCaret: 2.3.5
ARules OK


## Load data and clean transactions

In [99]:
import pandas as pd
import numpy as np
import os, re

# Try typical file names present in popular mirrors
candidates = [
    "online_retail_II.csv",
]
f = next((p for p in candidates if os.path.exists(p)), None)
assert f is not None, "Put the Online Retail file in the project directory."

df_raw = pd.read_csv(f, encoding="latin1")

# print(df_full.shape)

# df_raw = df_full.sample(n=50000, random_state=42)

print(df_raw.shape)



df = df_raw.copy()
df.columns = [c.strip().replace(" ", "") for c in df.columns]  # safe names
df.head(3)



(1067371, 8)


Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,CustomerID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom


In [100]:
# Standardize common column names across variants
rename_map = {
    'Invoice':'Invoice','StockCode':'StockCode','Description':'Description',
    'Quantity':'Quantity','InvoiceDate':'InvoiceDate','Price':'Price',
    'CustomerID':'CustomerID','Country':'Country'
}
# Handle alternate casing/spaces
col_lower = {c.lower(): c for c in df.columns}
def col(name):  # find the matching column regardless of casing/spacing
    return col_lower.get(name.lower())

df = df.rename(columns={col(k): v for k, v in rename_map.items() if col(k)})

# Basic cleaning
df = df[df['Quantity'] > 0]                        # positive quantities only
df = df[df['Description'].notna()]                 # drop missing item names
df = df[~df['Invoice'].astype(str).str.startswith('C')]  # drop cancellations

df = df[['Invoice','StockCode','Description','Quantity','InvoiceDate','Price']]
df.reset_index(drop=True, inplace=True)
df.head(5)


Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25


In [101]:
import pandas as pd
import numpy as np

# 1) Basic health: drop invoices with <2 unique items (globally)
items_per_tx = df.groupby('Invoice')['StockCode'].nunique()
df2 = df[df['Invoice'].isin(items_per_tx[items_per_tx >= 2].index)].copy()

# 2) Item frequency (by number of transactions)
tx_per_item = df2.groupby('StockCode')['Invoice'].nunique().sort_values(ascending=False)

# 3) Quick co-occurrence "degree"
#   For speed, sample invoices if your dataset is huge; otherwise compute on filtered df
grp = df2.drop_duplicates(['Invoice','StockCode']).groupby('Invoice')['StockCode'].apply(list)

from collections import Counter, defaultdict
deg = Counter()
for items in grp:
    # increment degree for each item by (basket_size - 1)
    for it in items:
        deg[it] += (len(items) - 1)

deg_s = pd.Series(deg).sort_values(ascending=False)

# 4) Pick ~100 items by combining frequency + co-occurrence degree
top_freq = tx_per_item.head(150)
top_deg  = deg_s.head(150)
candidates = set(top_freq.index).intersection(set(top_deg.index))
if len(candidates) < 100:
    # fallback: union then cut to 100 by freq
    candidates = list(set(top_freq.index).union(set(top_deg.index)))
    candidates = candidates[:100]
else:
    candidates = list(pd.Index(list(candidates)).map(str))[:100]  # ensure list length ~100

len(candidates), candidates[:10]


(100,
 ['22086',
  '20728',
  '22659',
  '20726',
  '22469',
  '20713',
  '20725',
  '21481',
  '22149',
  '22909'])

In [102]:
# 5) Filter to those ~100 items and keep invoices that contain at least 2 of them
df_small = df2[df2['StockCode'].isin(candidates)].copy()
k = df_small.groupby('Invoice')['StockCode'].nunique()
df_small = df_small[df_small['Invoice'].isin(k[k >= 2].index)].copy()

# Optional: reduce dominance by very large baskets (cap to first 10 unique items per invoice)
def cap_items(g, cap=10):
    uniq = g['StockCode'].drop_duplicates().head(cap)
    return g[g['StockCode'].isin(uniq)]

df_small = df_small.groupby('Invoice', group_keys=False).apply(cap_items, cap=10)

n_tx = df_small['Invoice'].nunique()
n_items = df_small['StockCode'].nunique()
n_tx, n_items


(24458, 100)

In [118]:
seed = list(tx_per_item.head(10).index)
neighbors = (deg_s.loc[deg_s.index.difference(seed)]
             .head(200).index)  # candidates
# Keep neighbors that co-occur with seeds at least X times
pair_ok = set()
seed_set = set(seed)
for items in grp:
    s = set(items)
    if len(s & seed_set) >= 1:
        pair_ok |= (s - seed_set)

# build final list
rest = [it for it in neighbors if it in pair_ok]
candidates = seed + rest[:90]  # ~100 total

In [119]:
df_demo = df[df['StockCode'].isin(candidates)].copy()

# keep only invoices that contain at least 2 of those items
tx_size = df_demo.groupby('Invoice')['StockCode'].nunique()
df_demo = df_demo[df_demo['Invoice'].isin(tx_size[tx_size >= 2].index)]

print("Transactions:", df_demo['Invoice'].nunique())
print("Unique items:", df_demo['StockCode'].nunique())


Transactions: 10174
Unique items: 100


In [120]:
n_tx = df_demo['Invoice'].nunique()
min_support = max(25 / n_tx, 0.002)  # at least ~25 transactions or 0.2%
print("Auto min_support:", round(min_support, 4))


Auto min_support: 0.0025


## PyCaret ARules â€” setup (transactional schema) and Rules

In [121]:
from pycaret.arules import *

exp = setup(
    data=df_demo,
    transaction_id='Invoice',
    item_id='StockCode'
)

rules = create_model(
    metric='lift',
    threshold=1.0,         # minimum lift of 1
    min_support=min_support,
    round=4
)
print("Number of rules:", rules.shape[0])
rules.head(10)


Description,Value
session_id,8150.0
# Transactions,10174.0
# Items,100.0
Ignore Items,


Number of rules: 31116


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,"(16016, 16015)",(16014),0.0039,0.0069,0.0028,0.7,101.74,0.0027,3.3104
1,(16014),"(16016, 16015)",0.0069,0.0039,0.0028,0.4,101.74,0.0027,1.6601
2,"(16014, 16015)",(16016),0.0033,0.0088,0.0028,0.8235,93.0954,0.0027,5.6165
3,(16016),"(16014, 16015)",0.0088,0.0033,0.0028,0.3111,93.0954,0.0027,1.4468
4,(16015),"(16016, 16014)",0.0058,0.0051,0.0028,0.4746,92.8527,0.0027,1.8935
5,"(16016, 16014)",(16015),0.0051,0.0058,0.0028,0.5385,92.8527,0.0027,2.1541
6,(16014),(16016),0.0069,0.0088,0.0051,0.7429,83.9759,0.0051,3.8545
7,(16016),(16014),0.0088,0.0069,0.0051,0.5778,83.9759,0.0051,2.3521
8,(16015),(16014),0.0058,0.0069,0.0033,0.5763,83.7569,0.0033,2.3438
9,(16014),(16015),0.0069,0.0058,0.0033,0.4857,83.7569,0.0033,1.9332


In [122]:
rules_sorted = rules.sort_values('lift', ascending=False)
rules_sorted.head(10)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,"(16016, 16015)",(16014),0.0039,0.0069,0.0028,0.7,101.74,0.0027,3.3104
1,(16014),"(16016, 16015)",0.0069,0.0039,0.0028,0.4,101.74,0.0027,1.6601
2,"(16014, 16015)",(16016),0.0033,0.0088,0.0028,0.8235,93.0954,0.0027,5.6165
3,(16016),"(16014, 16015)",0.0088,0.0033,0.0028,0.3111,93.0954,0.0027,1.4468
4,(16015),"(16016, 16014)",0.0058,0.0051,0.0028,0.4746,92.8527,0.0027,1.8935
5,"(16016, 16014)",(16015),0.0051,0.0058,0.0028,0.5385,92.8527,0.0027,2.1541
6,(16014),(16016),0.0069,0.0088,0.0051,0.7429,83.9759,0.0051,3.8545
7,(16016),(16014),0.0088,0.0069,0.0051,0.5778,83.9759,0.0051,2.3521
8,(16015),(16014),0.0058,0.0069,0.0033,0.5763,83.7569,0.0033,2.3438
9,(16014),(16015),0.0069,0.0058,0.0033,0.4857,83.7569,0.0033,1.9332


## Filter rules for business use

In [124]:
min_sup, min_conf, min_lift = 0.02, 0.35, 3.0

rules_useful = (
    rules
    .query("support >= @min_sup and confidence >= @min_conf and lift >= @min_lift")
    .copy()
)

# Remove postage-like consequents just in case
mask_boring = rules_useful['consequents'].astype(str).str.contains('POSTAGE|CARRIAGE|SAMPLES', case=False, na=False)
rules_useful = rules_useful[~mask_boring]

rules_useful = rules_useful.sort_values(['lift','confidence','support'], ascending=False)
rules_useful.head(15)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
6699,(15056P),(15056N),0.031,0.0893,0.0223,0.7206,8.0657,0.0195,3.2597
9122,(15056BL),(15056N),0.0726,0.0893,0.0437,0.6022,6.7397,0.0372,2.289
9123,(15056N),(15056BL),0.0893,0.0726,0.0437,0.4895,6.7397,0.0372,1.8168
