Import necessary libraries

In [24]:
import os
from collections import Counter
from itertools import combinations

import numpy as np
import pandas as pd

Read datasets

In [18]:
# Load the three CSV inputs in a fixed order:
#   transaction         <- sales_transaction_items.csv
#   product             <- products.csv
#   transaction_details <- sales_transactions.csv
transaction, product, transaction_details = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Datasets/sales_transaction_items.csv',
        '../Datasets/products.csv',
        '../Datasets/sales_transactions.csv',
    )
)

Calculate Lift (Apriori Association)

In [3]:
# Group products by transaction_id to create baskets
# (a Series indexed by transaction_id whose values are sets of product ids).
baskets = transaction.groupby('transaction_id')['product_id'].apply(set)

# Get all unique products
products = set(transaction['product_id'])

# Calculate total number of transactions
num_transactions = len(baskets)

# Count, in a single pass over the baskets, how many baskets contain each
# product and how many contain each unordered pair. Pairs are keyed as
# (smaller_id, larger_id), which is exactly the (p1, p2) ordering the loop
# below looks up. This replaces a full scan of every basket for every pair.
product_counts = Counter()
pair_counts = Counter()
for basket in baskets:
    members = sorted(basket)
    product_counts.update(members)
    pair_counts.update(combinations(members, 2))

# Precompute support (fraction of baskets containing the product) for each product
product_support = {p: product_counts[p] / num_transactions for p in products}

# Calculate lift for each product pair:
#   lift(p1, p2) = support(p1 and p2) / (support(p1) * support(p2))
# The iteration order over `products` is kept as-is because the results are
# later combined with the Yule's Q results purely by position.
lift_results = []
for p1 in products:
    for p2 in products:
        if p1 >= p2:  # Avoid duplicate pairs and self-pairs
            continue
        # Support for both products together (a Counter returns 0 for pairs never seen)
        support_both = pair_counts[(p1, p2)] / num_transactions
        # Calculate lift; -1 is the "undefined" sentinel that the combine step maps to None
        if product_support[p1] > 0 and product_support[p2] > 0:
            lift = support_both / (product_support[p1] * product_support[p2])
            lift_results.append((p1, p2, lift))
        else:
            lift_results.append((p1, p2, -1))

Calculate Yule's Q

In [4]:
# Yule's Q for every unordered product pair, from the 2x2 basket contingency table:
#   n11: baskets containing both p1 and p2
#   n10: baskets containing p1 but not p2
#   n01: baskets containing p2 but not p1
#   n00: baskets containing neither
#   Q = (n11 * n00 - n10 * n01) / (n11 * n00 + n10 * n01)
#
# The counts are rebuilt here from `baskets` in one pass, so this cell depends
# only on `baskets` and `products`. The other three table cells follow from
# the single and joint counts by inclusion-exclusion, instead of four full
# scans over all baskets for every pair.
total_baskets = len(baskets)
single_counts = Counter()
joint_counts = Counter()
for basket in baskets:
    members = sorted(basket)
    single_counts.update(members)
    # Keys are (smaller_id, larger_id), matching the (p1, p2) lookups below.
    joint_counts.update(combinations(members, 2))

yule_results = []
for p1 in products:
    for p2 in products:
        if p1 >= p2:  # Avoid duplicate pairs and self-pairs
            continue
        n11 = joint_counts[(p1, p2)]
        n10 = single_counts[p1] - n11
        n01 = single_counts[p2] - n11
        n00 = total_baskets - single_counts[p1] - single_counts[p2] + n11
        denominator = n11 * n00 + n10 * n01
        if denominator == 0:
            # Q is undefined when both diagonal products vanish.
            q = None
        else:
            q = (n11 * n00 - n10 * n01) / denominator
        yule_results.append((p1, p2, q))

Combine the results

In [5]:
# Combine lift_results and yule_results into a DataFrame and save as ProductTransactionScores.csv
#
# The two lists are matched by position, so first verify that they really
# describe the same pairs in the same order. Without the checks, zip() would
# silently truncate to the shorter list or attach a Yule's Q value to the
# wrong product pair (e.g. after re-running only one of the two cells).
if len(lift_results) != len(yule_results):
    raise ValueError(
        f"lift_results has {len(lift_results)} rows but yule_results has "
        f"{len(yule_results)}; re-run both score cells before combining."
    )

combined = []
for (p1, p2, lift), (q1, q2, yuleq) in zip(lift_results, yule_results):
    if (p1, p2) != (q1, q2):
        raise ValueError(
            f"Pair mismatch while combining scores: lift row is {(p1, p2)!r} "
            f"but Yule's Q row is {(q1, q2)!r}."
        )
    # -1 is the sentinel for an undefined lift; store it as a missing value.
    lift = None if lift == -1 else lift
    combined.append({'ProductID1': p1, 'ProductID2': p2, 'Lift': lift, 'YuleQ': yuleq})

df_combined = pd.DataFrame(combined)
df_combined.to_csv('../Datasets/ProductTransactionScores.csv', index=False)
print(df_combined.head())

  ProductID1 ProductID2  Lift  YuleQ
0      P1055      P1343   0.0   -1.0
1      P1055      P1305   0.0   -1.0
2      P1055      P1192   0.0   -1.0
3      P1055      P1059   0.0   -1.0
4      P1055      P1345   0.0   -1.0


Compute pairwise product scores (co-occurrence correlation and phi coefficient) for each store

In [26]:
# --- Per-store pairwise product scores ---
# For every store, each unordered pair of products (i, j) sold in that store gets:
#
#   correlation : Pearson correlation (np.corrcoef) between row i and row j of
#                 the store's co-occurrence matrix; NaN when either row has
#                 zero standard deviation.
#   Yulephi     : phi coefficient of the 2x2 basket contingency table
#                     phi = (n11 * n00 - n10 * n01) / sqrt(n1. * n0. * n.1 * n.0)
#                 where
#                     n11: number of baskets containing both i and j
#                     n10: number of baskets containing i but not j
#                     n01: number of baskets containing j but not i
#                     n00: number of baskets containing neither i nor j
#                     n1. = n11 + n10   (baskets with i)
#                     n0. = n01 + n00   (baskets without i)
#                     n.1 = n11 + n01   (baskets with j)
#                     n.0 = n10 + n00   (baskets without j)
#                 Left empty when the denominator is 0.
#
# One CSV per store is written to STORE_OUTPUT_DIR as "<store_id>.csv".

SCORE_COLUMNS = ["product1", "product2", "correlation", "Yulephi"]
STORE_OUTPUT_DIR = "../Datasets/Stores_level"


def _store_pair_scores(store_items):
    """Score every unordered product pair for one store.

    Parameters
    ----------
    store_items : pandas.DataFrame
        Rows for a single store; the "transaction_id" and "product_id"
        columns are read.

    Returns
    -------
    pandas.DataFrame
        One row per product pair with the columns in ``SCORE_COLUMNS``.
        Empty (columns only) when the store has fewer than two products.
    """
    # Local name on purpose: the notebook-level `baskets` (all transactions)
    # must not be overwritten by per-store data.
    store_baskets = store_items.groupby("transaction_id")["product_id"].apply(set).tolist()
    prods = list(pd.unique(store_items["product_id"]))
    n_prods = len(prods)
    if n_prods < 2:
        return pd.DataFrame(columns=SCORE_COLUMNS)

    prod_to_idx = {prod: idx for idx, prod in enumerate(prods)}

    # co_matrix[i, j] (i != j) = number of baskets containing both products;
    # the diagonal stays 0. basket_counts[i] = number of baskets containing i.
    co_matrix = np.zeros((n_prods, n_prods))
    basket_counts = [0] * n_prods
    for basket in store_baskets:
        idxs = [prod_to_idx[prod] for prod in basket if prod in prod_to_idx]
        for i in idxs:
            basket_counts[i] += 1
            for j in idxs:
                if i != j:
                    co_matrix[i, j] += 1

    num_baskets = len(store_baskets)
    # Whether each co-occurrence row varies at all; computed once per row
    # instead of once per pair.
    row_varies = [np.std(row) > 0 for row in co_matrix]

    out = []
    for i in range(n_prods - 1):
        for j in range(i + 1, n_prods):
            if row_varies[i] and row_varies[j]:
                corr = np.corrcoef(co_matrix[i], co_matrix[j])[0, 1]
            else:
                corr = np.nan

            # Contingency table from the counts gathered above (exact integer
            # arithmetic), rather than re-scanning every basket for every pair.
            n_i = basket_counts[i]
            n_j = basket_counts[j]
            n11 = int(co_matrix[i, j])
            n10 = n_i - n11
            n01 = n_j - n11
            n00 = num_baskets - n_i - n_j + n11

            # float() first: np.sqrt cannot take a Python int beyond 64 bits,
            # which the product of four marginals can reach for large stores.
            denominator = np.sqrt(float(n_i * n_j * (num_baskets - n_i) * (num_baskets - n_j)))
            if denominator == 0:
                yulephi = None
            else:
                yulephi = ((n11 * n00) - (n10 * n01)) / denominator

            out.append((prods[i], prods[j], corr, yulephi))

    return pd.DataFrame(out, columns=SCORE_COLUMNS)


items = transaction
txns = transaction_details

# Merge to get store_id for each item
items = items.merge(txns[["transaction_id", "store_id"]], on="transaction_id", how="inner")

# Make sure the output folder exists so to_csv does not fail on a fresh checkout.
os.makedirs(STORE_OUTPUT_DIR, exist_ok=True)

for store_id, g in items.groupby("store_id"):
    df_out = _store_pair_scores(g)
    df_out.to_csv(os.path.join(STORE_OUTPUT_DIR, f"{store_id}.csv"), index=False)