Step 2: Frequent Itemsets (single items) — support >= 5%
Works with either:
 - transactions (list of lists) from preprocessing, OR
 - one-hot CSV saved from preprocessing

In [None]:
import pandas as pd
from collections import Counter
from typing import List

In [None]:
# ---------- Config ----------
# Minimum support, expressed as a fraction of all transactions (0.05 = 5%).
MIN_SUPPORT = 0.05
# Placeholder; overwritten with the transaction count once the input has been loaded.
total_transactions = None
# Input paths (change if needed)
# One-hot CSV: rows = transactions, columns = items.
one_hot_csv_path = r"D:\DATA SCIENCE\ASSIGNMENTS\10 association rules\Association Rules\basket_one_hot.csv"
# Fallback input: when the CSV above is missing, a `transactions` variable must already
# exist in memory (from preprocessing). If it does not, set it here:
# transactions = [...]   # list of lists, each inner list = items (lowercased)
# ----------------------------

In [None]:
def frequent_items_from_transactions(transactions: List[List[str]], min_support: float):
    """
    Find the single items whose support reaches ``min_support``.

    The support of an item is the fraction of transactions that contain it;
    an item repeated inside one transaction is counted once for that transaction.

    Returns a tuple ``(df, n)``:
      - ``df``: DataFrame with columns item | support | count, ordered by
        support (highest first) and limited to items with support >= min_support
      - ``n``: number of transactions that were counted
    """
    n_tx = len(transactions)

    # Tally, per item, the number of transactions in which it occurs.
    occurrences = Counter()
    for basket in transactions:
        occurrences.update(set(basket))

    records = [(name, hits / n_tx, hits) for name, hits in occurrences.items()]
    table = pd.DataFrame(records, columns=["item", "support", "count"])
    table = table.sort_values("support", ascending=False)
    table = table.reset_index(drop=True)

    is_frequent = table["support"] >= min_support
    return table[is_frequent], n_tx

In [None]:
def frequent_items_from_onehot(one_hot_df: pd.DataFrame, min_support: float):
    """
    Derive single-item supports from a one-hot DataFrame
    (rows = transactions, cols = items holding 0/1).

    Returns a tuple ``(df, n)``:
      - ``df``: DataFrame with columns item | support | count, ordered by
        support (highest first) and limited to items with support >= min_support
      - ``n``: number of rows (transactions) in ``one_hot_df``
    """
    n_rows = len(one_hot_df.index)

    # A column's sum is the number of transactions containing that item.
    column_totals = one_hot_df.sum(axis=0)

    table = pd.DataFrame({
        "item": column_totals.index,
        "support": (column_totals / n_rows).values,
        "count": column_totals.astype(int).values,
    })
    ranked = table.sort_values("support", ascending=False).reset_index(drop=True)
    return ranked[ranked["support"] >= min_support], n_rows

In [None]:
# -------------------------
# Try to read one-hot CSV first; fallback to transactions variable if not available
# -------------------------
import os

In [None]:
# Prefer the one-hot CSV on disk; otherwise fall back to a `transactions` list in memory.
if os.path.exists(one_hot_csv_path):
    print("Loading one-hot CSV from:", one_hot_csv_path)
    # index_col=False: no CSV column is used as the row index, so every column is an item.
    # If your CSV was saved with an index column, you may need index_col=0 — adjust if necessary.
    onehot = pd.read_csv(one_hot_csv_path, index_col=False)
    # Blank cells are treated as "item absent", then everything is cast to integers.
    onehot = onehot.fillna(0).astype(int)
    freq_df, total_transactions = frequent_items_from_onehot(onehot, MIN_SUPPORT)
else:
    print("One-hot CSV not found at the path. Looking for 'transactions' variable in memory...")
    try:
        # Only the name lookup is guarded: a NameError raised while counting must
        # not be misreported as "No input found".
        transactions  # type: ignore
    except NameError as exc:
        raise RuntimeError("No input found: place 'transactions' variable in memory or save one-hot CSV at one_hot_csv_path.") from exc
    else:
        # Use the transactions list produced earlier in the preprocessing step
        freq_df, total_transactions = frequent_items_from_transactions(transactions, MIN_SUPPORT)  # type: ignore

In [None]:
# Pretty print top results
import math

print(f"\nTotal transactions used: {total_transactions}")
# The count cutoff is the smallest whole count whose support reaches MIN_SUPPORT,
# so the product must be rounded UP: truncating would advertise a count whose
# support is below the threshold (5% of 30 transactions is 1.5, so the cutoff is 2, not 1).
# round(..., 9) absorbs float noise that could push an exact product just above a whole number.
print(f"Items with support >= {MIN_SUPPORT*100:.1f}% (count >= {math.ceil(round(MIN_SUPPORT * total_transactions, 9))}):\n")
pd.set_option("display.max_rows", None)
print(freq_df.to_string(index=False))

In [None]:
# Save the frequent single-item list to CSV for embedding in your assignment.
# index=False: the DataFrame's row index is not written to the file.
out_csv = r"D:\DATA SCIENCE\ASSIGNMENTS\10 association rules\Association Rules\frequent_items_single.csv"
freq_df.to_csv(out_csv, index=False)
print(f"\nSaved frequent single-item list to: {out_csv}")

Quick tips:
- To keep only the top-k items, use freq_df.head(k) (freq_df is already sorted by support, highest first).
- To change the threshold, edit MIN_SUPPORT in the Config cell at the top and re-run the cells below it.

In [None]:
"""
Step 2b: Pairwise frequent itemsets + association rules (support, confidence, lift)
- Works from one-hot DataFrame (preferred) or transactions list.
- Only considers pairs made from single items that meet min_support (Apriori pruning).
"""

In [None]:
import pandas as pd
from itertools import combinations
from typing import List, Tuple
import os

In [None]:
# -------------------- Config --------------------
# Applied twice: to single items (pruning) and to each candidate pair.
MIN_SUPPORT = 0.05          # minimum support threshold (fraction of transactions)
MIN_CONFIDENCE = 0.0       # optional: filter rules with confidence >= this (0 to 1). Set 0 to keep all.
# Input: one-hot CSV (rows = transactions, columns = items).
ONE_HOT_CSV = r"D:\DATA SCIENCE\ASSIGNMENTS\10 association rules\Association Rules\basket_one_hot.csv"
# Output: the generated pairwise rules are written here.
OUT_RULES_CSV = r"D:\DATA SCIENCE\ASSIGNMENTS\10 association rules\Association Rules\pairwise_rules.csv"
# ------------------------------------------------

In [None]:
def load_onehot_or_transactions(one_hot_path: str):
    """
    Pick the input source for rule mining.

    If a file exists at ``one_hot_path`` it is read as a one-hot CSV (blank
    cells become 0, all values cast to int). Otherwise the module-level
    ``transactions`` variable is used.

    Returns: (onehot_df, transactions_list) — exactly one of the two is None,
    depending on which source was found.
    Raises: RuntimeError when the file is absent and ``transactions`` is undefined.
    """
    if not os.path.exists(one_hot_path):
        try:
            # Resolved as a global; expected to be a list of lists (from preprocessing).
            in_memory = transactions  # type: ignore
        except NameError:
            raise RuntimeError("No input found. Provide basket_one_hot.csv at ONE_HOT_CSV path or ensure 'transactions' exists in memory.")
        return None, in_memory

    frame = pd.read_csv(one_hot_path, index_col=False)
    # Force integer values; missing cells count as "item absent".
    frame = frame.fillna(0).astype(int)
    return frame, None

In [None]:
def compute_single_item_supports_from_onehot(onehot: pd.DataFrame) -> Tuple[pd.DataFrame, int]:
    """
    Compute the support and count of every item column in a one-hot DataFrame.

    Parameters:
        onehot: rows = transactions, columns = items (0/1 values).

    Returns:
        (df, n) where ``df`` is indexed by item (the column labels of ``onehot``)
        with columns support | count, and ``n`` is the number of rows.
    """
    n = onehot.shape[0]
    # A column's sum is the number of transactions containing that item;
    # computed once and reused for both output columns.
    totals = onehot.sum(axis=0)
    return pd.DataFrame({"support": totals / n, "count": totals.astype(int)}), n

In [None]:
def compute_single_item_supports_from_tx(transactions: List[List[str]]) -> Tuple[pd.DataFrame, int]:
    """
    Compute the support and count of every item in a list-of-lists of transactions.

    An item repeated inside one transaction is counted once for that transaction.

    Returns:
        (df, n) where ``df`` is indexed by ``item`` with columns support | count,
        and ``n`` is the number of transactions.
    """
    from collections import Counter

    n = len(transactions)
    # Number of transactions containing each item.
    tally = Counter(itm for tx in transactions for itm in set(tx))

    table = pd.DataFrame({
        "item": list(tally),
        "support": [hits / n for hits in tally.values()],
        "count": list(tally.values()),
    })
    return table.set_index("item"), n

In [None]:
def generate_pairwise_rules_from_onehot(onehot: pd.DataFrame, min_support: float, min_confidence: float):
    """
    Build pairwise association rules (A -> B and B -> A) from a one-hot DataFrame.

    Only items whose own support reaches ``min_support`` are paired (Apriori
    pruning), and a pair produces rules only if the pair's support also reaches
    ``min_support``. For a pair (A, B):
        support    = fraction of rows where A == 1 and B == 1
        confidence = support(A, B) / support(antecedent)
        lift       = support(A, B) / (support(A) * support(B))

    Parameters:
        onehot: rows = transactions, columns = items (0/1 values).
        min_support: minimum support for single items and for pairs.
        min_confidence: a rule is kept only if its confidence >= this value.

    Returns:
        (rules_df, frequent_items, n)
        - rules_df: columns antecedent | consequent | support | confidence |
          lift | pair_count, sorted by lift then confidence (both descending).
          When no pair qualifies, an empty DataFrame with these columns.
        - frequent_items: single-item support/count table restricted to
          frequent items, sorted by support (descending).
        - n: number of transactions (rows).
    """
    rule_columns = ["antecedent", "consequent", "support", "confidence", "lift", "pair_count"]

    single_df, n = compute_single_item_supports_from_onehot(onehot)
    # Keep items that meet min_support (Apriori)
    frequent_items = single_df[single_df["support"] >= min_support].copy()
    frequent_items = frequent_items.sort_values("support", ascending=False)
    items = frequent_items.index.tolist()

    # One presence mask per frequent item, built once instead of once per pair.
    present = {item: onehot[item] == 1 for item in items}

    rules = []
    for a, b in combinations(items, 2):
        # pair support = share of transactions where both a and b = 1
        pair_count = int((present[a] & present[b]).sum())
        pair_support = pair_count / n
        if pair_support < min_support:
            continue

        support_a = frequent_items.loc[a, "support"]
        support_b = frequent_items.loc[b, "support"]
        # The zero guards only matter when min_support <= 0 lets a never-seen item through.
        conf_a_b = pair_support / support_a if support_a > 0 else 0.0
        conf_b_a = pair_support / support_b if support_b > 0 else 0.0
        lift = pair_support / (support_a * support_b) if (support_a * support_b) > 0 else 0.0

        # Emit both directions; lift is symmetric, confidence is not.
        for antecedent, consequent, confidence in ((a, b, conf_a_b), (b, a, conf_b_a)):
            if confidence >= min_confidence:
                rules.append({
                    "antecedent": antecedent,
                    "consequent": consequent,
                    "support": pair_support,
                    "confidence": confidence,
                    "lift": lift,
                    "pair_count": pair_count,
                })

    # Explicit columns: with zero rules a bare DataFrame has no "lift" column
    # and sort_values would raise KeyError.
    rules_df = pd.DataFrame(rules, columns=rule_columns)
    if not rules_df.empty:
        rules_df = rules_df.sort_values(by=["lift", "confidence"], ascending=False).reset_index(drop=True)
    return rules_df, frequent_items, n

In [None]:
def generate_pairwise_rules_from_tx(transactions: List[List[str]], min_support: float, min_confidence: float):
    """
    Build pairwise association rules (A -> B and B -> A) from a list of transactions.

    Only items whose own support reaches ``min_support`` are paired (Apriori
    pruning), and a pair produces rules only if the pair's support also reaches
    ``min_support``. For a pair (A, B):
        support    = fraction of transactions containing both A and B
        confidence = support(A, B) / support(antecedent)
        lift       = support(A, B) / (support(A) * support(B))

    Parameters:
        transactions: list of lists; each inner list holds the items of one basket.
        min_support: minimum support for single items and for pairs.
        min_confidence: a rule is kept only if its confidence >= this value.

    Returns:
        (rules_df, freq_df, n)
        - rules_df: columns antecedent | consequent | support | confidence |
          lift | pair_count, sorted by lift then confidence (both descending).
          When no pair qualifies, an empty DataFrame with these columns.
        - freq_df: frequent single items indexed by ``item`` with columns
          count | support, sorted by support (descending).
        - n: number of transactions.
    """
    rule_columns = ["antecedent", "consequent", "support", "confidence", "lift", "pair_count"]
    n = len(transactions)

    # Map each item to the set of transaction indices that contain it.
    item_to_tids = {}
    for tid, tx in enumerate(transactions):
        for itm in set(tx):
            item_to_tids.setdefault(itm, set()).add(tid)

    # Single-item supports, then the frequent items in a fixed (alphabetical) order.
    single_support = {itm: len(tids) / n for itm, tids in item_to_tids.items()}
    frequent_items = sorted(itm for itm, sup in single_support.items() if sup >= min_support)

    rules = []
    for a, b in combinations(frequent_items, 2):
        # Transactions containing both items = intersection of their tid sets.
        pair_count = len(item_to_tids[a] & item_to_tids[b])
        pair_support = pair_count / n
        if pair_support < min_support:
            continue

        support_a = single_support[a]
        support_b = single_support[b]
        conf_a_b = pair_support / support_a if support_a > 0 else 0.0
        conf_b_a = pair_support / support_b if support_b > 0 else 0.0
        lift = pair_support / (support_a * support_b) if (support_a * support_b) > 0 else 0.0

        # Emit both directions; lift is symmetric, confidence is not.
        for antecedent, consequent, confidence in ((a, b, conf_a_b), (b, a, conf_b_a)):
            if confidence >= min_confidence:
                rules.append({
                    "antecedent": antecedent,
                    "consequent": consequent,
                    "support": pair_support,
                    "confidence": confidence,
                    "lift": lift,
                    "pair_count": pair_count,
                })

    # Explicit columns: with zero rules a bare DataFrame has no "lift" column
    # and sort_values would raise KeyError.
    rules_df = pd.DataFrame(rules, columns=rule_columns)
    if not rules_df.empty:
        rules_df = rules_df.sort_values(by=["lift", "confidence"], ascending=False).reset_index(drop=True)

    # Convert frequent_items to DataFrame for convenience
    freq_df = pd.DataFrame(
        [(itm, len(item_to_tids[itm]), single_support[itm]) for itm in frequent_items],
        columns=["item", "count", "support"],
    ).set_index("item").sort_values("support", ascending=False)
    return rules_df, freq_df, n

In [None]:
# -------------------- Run --------------------
# The second return value is deliberately NOT bound to `transactions`: when the
# CSV exists the loader returns None in that slot, which would overwrite a
# `transactions` list already in memory and break any later use of it.
# When the CSV is missing, the loader hands back the global `transactions`
# object itself, so code that refers to `transactions` keeps working.
onehot, _loaded_transactions = load_onehot_or_transactions(ONE_HOT_CSV)

In [None]:
# Dispatch to the rule generator that matches whichever input was loaded.
if onehot is None:
    print("Using 'transactions' variable in memory as input.")
    rules_df, freq_items_df, total_tx = generate_pairwise_rules_from_tx(transactions, MIN_SUPPORT, MIN_CONFIDENCE)
else:
    print("Using one-hot CSV as input.")
    rules_df, freq_items_df, total_tx = generate_pairwise_rules_from_onehot(onehot, MIN_SUPPORT, MIN_CONFIDENCE)

In [None]:
# Summarise the run: transaction count, number of frequent single items, and the
# first ten rules (rules_df arrives sorted by lift, then confidence, both descending).
print(f"Total transactions: {total_tx}")
print(f"Frequent single items (support >= {MIN_SUPPORT}): {len(freq_items_df)}\n")
print("Top 10 pairwise rules by lift:")
# Sets the pandas "display.max_rows" option to 20.
pd.set_option("display.max_rows", 20)
print(rules_df.head(10).to_string(index=False))

In [None]:
# Save to CSV
# All rules are written (not only the ten printed above); index=False omits the row index.
rules_df.to_csv(OUT_RULES_CSV, index=False)
print(f"\nSaved pairwise rules to: {OUT_RULES_CSV}")

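Step 1: Data Preprocessing for Association Rule Mining
(Run this section before the Step 2 sections above: it creates the `transactions` list and the basket_one_hot.csv file that Step 2 reads.)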

In [None]:
import pandas as pd

In [None]:
def load_transactions_from_file(path: str, col_index: int = 0) -> list:
    """
    Load Excel/CSV file and return a list of transactions (list of lists).

    Each cell of the chosen column is one basket written as a comma-separated
    string. Items are stripped, lowercased and de-duplicated (first occurrence
    kept, order preserved).

    Parameters:
        path: file to read; ``.xls``/``.xlsx`` go through ``pd.read_excel``,
            anything else through ``pd.read_csv``.
        col_index: position of the column that holds the basket strings.

    Returns:
        One list of item strings per row of the file. A missing (NaN) cell
        yields an empty basket, so the number of transactions still matches
        the number of rows.

    NOTE(review): both readers are called with their default header handling,
    so the first row of the file is consumed as column names and is not
    returned as a transaction — confirm the file really has a header row.
    """
    lowered = path.lower()
    if lowered.endswith(('.xls', '.xlsx')):
        # openpyxl only reads the newer .xlsx format; for legacy .xls let
        # pandas select a suitable engine instead of forcing one that fails.
        engine = "openpyxl" if lowered.endswith('.xlsx') else None
        df = pd.read_excel(path, engine=engine)
    else:
        df = pd.read_csv(path)

    raw_col = df.iloc[:, col_index]

    transactions = []
    for cell, is_missing in zip(raw_col, raw_col.isna()):
        if is_missing:
            # Casting NaN to str would create a bogus item called "nan".
            transactions.append([])
            continue
        # Split by comma, strip spaces, lowercase; drop empty fragments
        items = [itm.strip().lower() for itm in str(cell).split(',') if itm.strip()]
        # Remove duplicates inside a basket (dict keys keep first-seen order)
        transactions.append(list(dict.fromkeys(items)))

    return transactions

In [None]:
def transactions_to_ohe(transactions):
    """
    Convert list-of-lists (transactions) to one-hot encoded DataFrame.

    Parameters:
        transactions: list of lists; each inner list holds the items of one basket.

    Returns:
        DataFrame with one row per transaction (index 0..len-1, in input order)
        and one column per distinct item; a cell is 1 if the transaction
        contains the item and 0 otherwise. Empty transactions are kept as
        all-zero rows.
    """
    tx_df = pd.DataFrame({'tid': range(len(transactions)), 'items': transactions})
    # One row per (transaction, item); empty baskets explode to NaN and are dropped here.
    tx_exploded = tx_df.explode('items').dropna(subset=['items'])
    counts = pd.crosstab(tx_exploded['tid'], tx_exploded['items'])
    # Restore transactions that had no items so row count == number of transactions.
    counts = counts.reindex(range(len(transactions)), fill_value=0)
    # crosstab counts occurrences, so an item repeated inside one basket would
    # produce 2, 3, ...; collapse to presence/absence to keep the table one-hot.
    return (counts > 0).astype(int)

In [None]:
# -------------------------
# Run preprocessing
# -------------------------
# Source file with the raw baskets (change if needed).
filepath = r"D:\DATA SCIENCE\ASSIGNMENTS\10 association rules\Association Rules\Online Retail.xlsx"

In [None]:
# Parse the source file into a list of baskets; col_index is left at its default of 0,
# so the first column is read.
transactions = load_transactions_from_file(filepath)
print(f"Total transactions loaded: {len(transactions)}")

In [None]:
# Quick peek at first 5 transactions
# (the slice simply yields fewer entries when fewer than 5 were loaded)
for i, t in enumerate(transactions[:5]):
    print(f"{i}: {t}")

In [None]:
# Build the one-hot matrix: one row per transaction, one column per distinct item.
basket_ohe = transactions_to_ohe(transactions)
print("\nOne-hot encoded basket shape:", basket_ohe.shape)
print("\nFirst 5 rows:")
print(basket_ohe.head())

In [None]:
# Save the one-hot encoded dataset if needed
# index=False leaves the row index out of the file, so the CSV holds item columns only;
# this matches the readers in this notebook, which load it with index_col=False.
basket_ohe.to_csv(r"D:\DATA SCIENCE\ASSIGNMENTS\10 association rules\Association Rules\basket_one_hot.csv", index=False)
print("\nSaved one-hot file to basket_one_hot.csv")