### Preparation

Import libraries

In [7]:
import pandas as pd
import numpy as np
from itertools import combinations

Read dataset

In [2]:
products = pd.read_csv('../NewData/products.csv')
transactions = pd.read_csv('../NewData/sales_transactions.csv')

### Calculate Transaction Score

Calculate Lift (Association Rule)

In [3]:
# Group products by transaction_id to create baskets
baskets = transactions.groupby('transaction_id')['product_id'].apply(set)

# Get all unique products
products = set(transactions['product_id'])

# Calculate total number of transactions
num_transactions = len(baskets)

# Precompute support for each product
product_support = {p: sum([p in basket for basket in baskets]) / num_transactions for p in products}

# Calculate lift for each product pair
lift_results = []
for p1 in products:
    for p2 in products:
        if p1 >= p2:  # Avoid duplicate pairs and self-pairs
            continue
        # Support for both products together
        support_both = sum([(p1 in basket) and (p2 in basket) for basket in baskets]) / num_transactions
        # Calculate lift
        if product_support[p1] > 0 and product_support[p2] > 0:
            lift = support_both / (product_support[p1] * product_support[p2])
            lift_results.append((p1, p2, lift))
        else:
            lift_results.append((p1, p2, -1))

Calculate Yule's Q

In [4]:
yule_results = []
for p1 in products:
    for p2 in products:
        if p1 >= p2:  # Avoid duplicate pairs and self-pairs
            continue
        a = sum([(p1 in basket) and (p2 in basket) for basket in baskets])
        b = sum([(p1 in basket) and (p2 not in basket) for basket in baskets])
        c = sum([(p1 not in basket) and (p2 in basket) for basket in baskets])
        d = sum([(p1 not in basket) and (p2 not in basket) for basket in baskets])
        denominator = a * d + b * c
        if denominator == 0:
            q = None
        else:
            q = (a * d - b * c) / denominator
        yule_results.append((p1, p2, q))

Calculate Correlation

In [8]:
correlation_results = []

product_list = sorted(list(products))   # deterministic order
m = len(product_list)

# Transaction × Product incidence matrix (0/1)
X = pd.DataFrame(0, index=baskets.index, columns=product_list, dtype=int)
for t_id, items in baskets.items():
    if items:  # guard in case of empty basket
        X.loc[t_id, list(items)] = 1

# Product × Product co-occurrence counts
N = X.T.dot(X)  # DataFrame with index/cols = product_list

# Row means μ_i over k ≠ i
diag = N.to_numpy().diagonal()
mu = (N.sum(axis=1) - diag) / max(m - 1, 1)

# Center rows by μ_i
C = N.sub(mu, axis=0)

# ρ_ij = sum_{k≠i,j}(n_ik-μ_i)(n_jk-μ_j) / sqrt( sum_{k≠i}(n_ik-μ_i)^2 * sum_{k≠j}(n_jk-μ_j)^2 )
for i in range(m):
    for j in range(i + 1, m):
        pi, pj = product_list[i], product_list[j]

        # Numerator excludes i and j
        ci = C.loc[pi].drop(index=[pi, pj], errors='ignore')
        cj = C.loc[pj].drop(index=[pi, pj], errors='ignore')
        numerator = float((ci * cj).sum())

        # Denominator excludes i for row i, and j for row j
        si = float((C.loc[pi].drop(index=pi) ** 2).sum())
        sj = float((C.loc[pj].drop(index=pj) ** 2).sum())
        denominator = (si * sj) ** 0.5

        corr = (numerator / denominator) if denominator > 0 else None
        correlation_results.append((pi, pj, corr))

In [13]:
# Helper to wrap lists -> DataFrames
def to_df(lst, col):
    return pd.DataFrame(lst, columns=["p1", "p2", col])

# Combine all three metrics by product pair
scores = (
    to_df(lift_results, "lift")
    .merge(to_df(yule_results, "yule_q"), on=["p1", "p2"], how="outer")
    .merge(to_df(correlation_results, "rho"), on=["p1", "p2"], how="outer")
)

# --- Map each metric to substitutability in [0,1] ---
# Lift: lower lift ⇒ more substitutable → 1/(1+lift)
def lift_to_sub(x):
    if x is None or pd.isna(x) or x <= 0:
        return np.nan
    return 1.0 / (1.0 + float(x))

# Yule's Q: negative ⇒ more substitutable → (1 - q)/2
# ρ (correlation of common purchases): higher ⇒ more substitutable → (ρ + 1)/2
scores["lift_sub"] = scores["lift"].apply(lift_to_sub)
scores["yule_sub"] = ((1 - scores["yule_q"]) / 2).where(scores["yule_q"].notna())
scores["rho_sub"]  = ((scores["rho"] + 1) / 2).where(scores["rho"].notna())

# Keep within bounds
scores[["lift_sub", "yule_sub", "rho_sub"]] = scores[["lift_sub", "yule_sub", "rho_sub"]].clip(0, 1)

# --- Final transaction score: mean of available signals (skip NaNs) ---
scores["transaction_score"] = scores[["lift_sub", "yule_sub", "rho_sub"]].mean(axis=1, skipna=True)

# Optional: if all three are NaN for a pair, set a neutral default (e.g., 0.5)
scores["transaction_score"] = scores["transaction_score"].fillna(0.5)

# If you want a list[(p1, p2, score)]:
transaction_scores = list(scores[["p1", "p2", "transaction_score"]].itertuples(index=False, name=None))


In [18]:
transaction_scores_df = pd.DataFrame(transaction_scores, columns=['productid_1', 'productid_2', 'transaction_score'])
print(transaction_scores_df.head())
transaction_scores_df.to_csv('../NewData/transaction_scores.csv', index=False)

  productid_1 productid_2  transaction_score
0       P0001       P0002           0.397706
1       P0001       P0003           0.730883
2       P0001       P0004           0.742388
3       P0001       P0005           0.763156
4       P0001       P0006           0.395690
