In [12]:
# === Cell 1: Install lightweight deps (works on Py 3.11) ===
!pip -q install mlxtend pandas

import pandas as pd, numpy as np, os, glob
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
print("Ready.")


Ready.


In [13]:
# === Cell 2: Load Groceries dataset (robust) ===
DATA_DIR = "/kaggle/input/groceries" if os.path.exists("/kaggle/input/groceries") else "/content"
files = glob.glob(os.path.join(DATA_DIR, "*.csv"))
if not files:
    # Fallback: public CSV (each row is a basket, comma-separated)
    url = "https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/groceries.csv"
    df = pd.read_csv(url, header=None)
else:
    df = pd.read_csv(files[0])

print("Raw shape:", df.shape)
df.head()


Raw shape: (9835, 33)


Unnamed: 0,Item(s),Item 1,Item 2,Item 3,Item 4,Item 5,Item 6,Item 7,Item 8,Item 9,...,Item 23,Item 24,Item 25,Item 26,Item 27,Item 28,Item 29,Item 30,Item 31,Item 32
0,4,citrus fruit,semi-finished bread,margarine,ready soups,,,,,,...,,,,,,,,,,
1,3,tropical fruit,yogurt,coffee,,,,,,,...,,,,,,,,,,
2,1,whole milk,,,,,,,,,...,,,,,,,,,,
3,4,pip fruit,yogurt,cream cheese,meat spreads,,,,,,...,,,,,,,,,,
4,4,other vegetables,whole milk,condensed milk,long life bakery product,,,,,,...,,,,,,,,,,


In [14]:
# 3) Build transactions list
# Many variants have columns like Member_number, Date, itemDescription
# Group by (member, date) when available; otherwise treat each row as its own basket.
cols = {c.lower(): c for c in df.columns}

if "member_number" in cols and "date" in cols and "itemdescription" in cols:
    baskets = (df.groupby([cols["member_number"], cols["date"]])[cols["itemdescription"]]
                 .apply(list)
                 .tolist())
else:
    # Fallback: one item per row
    item_col = next((c for c in df.columns if "item" in c.lower() or "desc" in c.lower()), None)
    assert item_col, "Could not find an item/description column."
    baskets = [[str(x)] for x in df[item_col].dropna().astype(str)]

print("Num baskets:", len(baskets))
print("Example basket:", baskets[0][:5])


Num baskets: 9835
Example basket: ['4']


In [15]:
# 4) One-hot encode transactions
te = TransactionEncoder()
arr = te.fit(baskets).transform(baskets, sparse=True)
baskets_ohe = pd.DataFrame.sparse.from_spmatrix(arr, columns=te.columns_)
baskets_ohe.iloc[:5, :10]


Unnamed: 0,1,10,11,12,13,14,15,16,17,18
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0


In [16]:
# 5) Frequent itemsets + rules (Apriori)
itemsets = apriori(baskets_ohe, min_support=0.003, use_colnames=True)
rules = association_rules(itemsets, metric="confidence", min_threshold=0.3)
rules = rules.sort_values(["lift","confidence","support"], ascending=False)

print("Itemsets:", itemsets.shape, "Rules:", rules.shape)
rules.head(10)


Itemsets: (16, 2) Rules: (0, 14)


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski


In [17]:
# 6) Save results
out_path = "/kaggle/working/groceries_association_rules.csv"
rules.to_csv(out_path, index=False)
print("Saved:", out_path)
!ls -lh /kaggle/working


Saved: /kaggle/working/groceries_association_rules.csv
total 24K
drwxr-xr-x 2 root root 4.0K Nov  2 19:50 cudf
drwxr-xr-x 2 root root 4.0K Nov  2 19:50 cuml
drwxr-xr-x 2 root root 4.0K Nov  2 19:50 cupy
-rw-r--r-- 1 root root  166 Nov  2 20:54 groceries_association_rules.csv
-rw-r--r-- 1 root root  313 Nov  2 20:53 logs.log
drwxr-xr-x 2 root root 4.0K Nov  2 19:50 rmm
