Import Libraries

In [1]:
import os
from itertools import combinations
import numpy as np
import pandas as pd

Paths

In [2]:
# Input CSVs (paths are relative to the notebook's working directory).
PRODUCTS_PATH = '../NewData/products.csv'            # product catalogue, read in the "Load Data" cell
TX_PATH       = '../NewData/sales_transactions.csv'  # transaction lines (transaction_id, product_id, store_id)
# Output folder: the main loop writes one '<store_id>.csv' per store here.
OUT_DIR       = '../NewData/StoresLevel'
# Create the output folder up front; exist_ok=True makes re-running this cell safe.
os.makedirs(OUT_DIR, exist_ok=True)

Helpers

In [None]:
def canonicalize_pairs(df, a='product_id1', b='product_id2'):
    """Return a copy of ``df`` with order-independent pair keys ``p1`` and ``p2``.

    Columns ``a`` and ``b`` are cast to whitespace-stripped strings (in the
    copy only). For each row, ``p1`` receives the smaller and ``p2`` the larger
    of the two IDs, so (A, B) and (B, A) map to the same key.
    """
    result = df.copy()
    for col in (a, b):
        result[col] = result[col].astype(str).str.strip()

    # Row-wise sort puts the smaller ID in column 0 and the larger in column 1.
    ordered = np.sort(result[[a, b]].to_numpy(), axis=1)
    result['p1'], result['p2'] = ordered[:, 0], ordered[:, 1]
    return result

def compute_attribute_scores(products_df):
    """Compute an attribute-similarity score for every unordered product pair (global).

    The score is a weighted sum of:
      - 0.50 if both products share ``category``
      - 0.05 if both share ``grade``
      - 0.05 if both share ``material``
      - 0.15 if both share ``vehicle_type``
      - 0.15 if their comma-separated ``compatible_vehicle`` lists overlap
      - up to 0.10 for price closeness, ``1 - |price_a - price_b| / (max - min)``

    Parameters
    ----------
    products_df : pandas.DataFrame
        Must contain ``product_id``, ``category``, ``grade``, ``material``,
        ``vehicle_type``, ``compatible_vehicle`` and ``unit_price``.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``p1``, ``p2``, ``attribute_score``. ``p1``/``p2`` are the
        stripped string IDs in sorted order (same convention as
        ``canonicalize_pairs``). Empty (with these columns) when fewer than
        two products are given.
    """
    result_cols = ['p1', 'p2', 'attribute_score']

    def _vehicle_set(raw):
        # Non-string cells (e.g. NaN) mean "no compatibility information".
        if not isinstance(raw, str):
            return frozenset()
        # Skip empty tokens so "A," or "" never match each other on ''.
        return frozenset(tok.strip() for tok in raw.split(',') if tok.strip())

    fields = ['product_id', 'category', 'grade', 'material', 'vehicle_type', 'unit_price']
    # Plain dict records: cheap per-pair lookups, and immune to duplicate index
    # labels (label-based .loc would return a DataFrame for a repeated label).
    records = products_df[fields].to_dict('records')
    vehicles = [_vehicle_set(v) for v in products_df['compatible_vehicle']]

    # Global price range used to normalise the price-similarity term.
    min_price = products_df['unit_price'].min()
    max_price = products_df['unit_price'].max()
    price_range = (max_price - min_price) if pd.notna(max_price) and pd.notna(min_price) else 0.0

    def _score(a, va, b, vb):
        s = 0.5 if a['category'] == b['category'] else 0.0
        s += 0.05 if a['grade'] == b['grade'] else 0.0
        s += 0.05 if a['material'] == b['material'] else 0.0
        s += 0.15 if a['vehicle_type'] == b['vehicle_type'] else 0.0
        s += 0.15 if va & vb else 0.0
        # A missing price only drops the price term; it must not turn the
        # whole score into NaN and wipe out the categorical matches.
        price_a, price_b = a['unit_price'], b['unit_price']
        if price_range > 0 and pd.notna(price_a) and pd.notna(price_b):
            s += 0.1 * (1 - abs(price_a - price_b) / price_range)
        return float(s)

    pairs = []
    for (a, va), (b, vb) in combinations(zip(records, vehicles), 2):
        # Canonical key: stripped strings, smaller ID first.
        p1, p2 = sorted((str(a['product_id']).strip(), str(b['product_id']).strip()))
        pairs.append((p1, p2, _score(a, va, b, vb)))

    if not pairs:
        return pd.DataFrame(columns=result_cols)

    return pd.DataFrame(pairs, columns=result_cols).drop_duplicates()

def compute_transaction_scores_for_store(tx_store_df):
    """
    Compute transaction_score (0..1) for every product pair of a single store.

    Three signals are derived from basket co-occurrence and averaged
    (ignoring signals that are unavailable for a pair):
      - Lift → 1/(1+lift)
      - Yule's Q (smoothed with eps=1e-9) → (1 - Q)/2
      - ρ(common purchases) → (ρ+1)/2, where ρ compares the two products'
        centered co-occurrence rows over all *other* products
    If no signal is available the score falls back to 0.5.

    Parameters
    ----------
    tx_store_df : pandas.DataFrame
        One store's transaction lines with columns ``transaction_id`` and
        ``product_id``. Rows with a missing value in either column are ignored.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``p1``, ``p2``, ``transaction_score``. IDs are stripped strings
        with ``p1 < p2`` in string order, i.e. the same key convention as
        ``canonicalize_pairs``. Empty (with these columns) when there are
        fewer than two distinct products.

    NOTE(review): a pair that never co-occurs has lift == 0 and its lift
    signal is treated as unavailable (NaN) rather than 1.0. This is kept from
    the original implementation; confirm it is intended.
    """
    empty = pd.DataFrame(columns=['p1', 'p2', 'transaction_score'])
    if tx_store_df.empty:
        return empty

    # Normalise IDs to stripped strings *before* building the incidence matrix,
    # so the matrix axis, the product count and the emitted pair order all agree
    # (and match the lexicographic p1/p2 convention used for attribute pairs).
    tx = tx_store_df[['transaction_id', 'product_id']].dropna()
    tx = tx.assign(product_id=tx['product_id'].astype(str).str.strip())

    # Incidence X (transactions x products); crosstab sorts the product axis.
    X = pd.crosstab(tx['transaction_id'], tx['product_id']).astype(bool)
    m = X.shape[1]
    if m < 2:
        return empty

    ids = np.array([str(p) for p in X.columns], dtype=object)
    Xn = X.to_numpy(dtype=float)
    n_baskets = Xn.shape[0]

    # Co-occurrence counts N[i, j]; the diagonal holds per-product basket counts.
    N = Xn.T @ Xn
    n_i = np.diag(N).copy()
    si = n_i / n_baskets

    # All unordered pairs (i < j), in the same order as combinations(range(m), 2).
    iu, ju = np.triu_indices(m, k=1)

    # 2x2 contingency counts per pair: a=both, b=only i, c=only j, d=neither.
    a = N[iu, ju]
    b = n_i[iu] - a
    c = n_i[ju] - a
    d = n_baskets - (a + b + c)

    # Lift
    s_ab = a / n_baskets
    expected = si[iu] * si[ju]
    with np.errstate(divide='ignore', invalid='ignore'):
        lift = np.where(expected > 0, s_ab / expected, np.nan)
        lift_sub = np.where(lift > 0, 1.0 / (1.0 + lift), np.nan)

    # Yule's Q (smoothed so the denominator is never zero)
    eps = 1e-9
    concordant = (a + eps) * (d + eps)
    discordant = (b + eps) * (c + eps)
    q = (concordant - discordant) / (concordant + discordant)
    yule_sub = (1 - q) / 2.0

    # ρ(common purchases): rows of N centered by their off-diagonal mean.
    mu = (N.sum(axis=1) - n_i) / max(m - 1, 1)
    C_off = N - mu[:, None]
    # Zeroing the diagonal removes entry i from row i. In the product of rows
    # i and j this drops both k=i and k=j from the numerator, while each row's
    # sum of squares still excludes only its own diagonal entry.
    np.fill_diagonal(C_off, 0.0)
    row_ss = (C_off ** 2).sum(axis=1)
    num_rho = (C_off @ C_off.T)[iu, ju]
    den_rho = np.sqrt(row_ss[iu] * row_ss[ju])
    with np.errstate(divide='ignore', invalid='ignore'):
        rho = np.where(den_rho > 0, num_rho / den_rho, np.nan)
    rho = np.clip(rho, -1.0, 1.0)  # guard against floating-point overshoot
    rho_sub = (rho + 1) / 2.0

    # Final transaction score = mean of available signals (neutral 0.5 if none).
    signals = np.vstack([lift_sub, yule_sub, rho_sub])
    ts = np.nanmean(signals, axis=0)
    ts = np.where(np.isnan(ts), 0.5, ts)

    return pd.DataFrame({
        'p1': ids[iu],
        'p2': ids[ju],
        'transaction_score': ts.astype(float),
    })

Load Data

In [4]:
# Read product IDs as text from the start. Letting read_csv infer a numeric
# dtype and casting to str afterwards can lose leading zeros and can render the
# same ID differently in the two files (e.g. 123 vs 123.0 when one file has a
# missing value), which would silently break every join on product_id.
products_df = pd.read_csv(PRODUCTS_PATH, dtype={'product_id': str})
transactions_df = (
    pd.read_csv(
        TX_PATH,
        usecols=['transaction_id', 'product_id', 'store_id'],
        dtype={'product_id': str},
    )
    # A transaction line without a product ID cannot be attributed to a product;
    # drop it instead of turning it into a phantom product named 'nan'.
    .dropna(subset=['product_id'])
    # canonicalize_pairs strips IDs on the attribute side, so strip here too.
    .assign(product_id=lambda d: d['product_id'].astype(str).str.strip())
)
products_df['product_id'] = products_df['product_id'].astype(str).str.strip()

Compute Attribute Score

In [None]:
# Score every product pair once across the whole catalogue; the per-store loop
# below only filters this table down to the products each store actually sold.
attr_global = compute_attribute_scores(products_df)

Compute and Save Per-Store Substitute Scores

In [6]:
for store_id, tx_store in transactions_df.groupby('store_id'):
    # Product universe of this store = every product that appears in its transactions.
    store_products = set(tx_store['product_id'].astype(str).unique())

    # Keep only the global attribute pairs whose two products were both sold here.
    both_in_store = attr_global['p1'].isin(store_products) & attr_global['p2'].isin(store_products)
    attr = attr_global.loc[both_in_store].copy()

    # Co-purchase based score, computed from this store's baskets only.
    txn = compute_transaction_scores_for_store(tx_store[['transaction_id', 'product_id']].copy())

    # Left join keeps every attribute pair; pairs without a transaction score get NaN.
    df = attr.merge(txn, on=['p1', 'p2'], how='left')
    df = df.rename(columns={'p1': 'product_id1', 'p2': 'product_id2'})

    # Missing scores count as 0.0 before blending.
    for score_col in ('transaction_score', 'attribute_score'):
        df[score_col] = df[score_col].astype(float).fillna(0.0)

    # Blend: 60% attribute similarity, 40% transaction signal, bounded to [0, 1].
    blended = 0.6 * df['attribute_score'] + 0.4 * df['transaction_score']
    df['substitute_score'] = blended.clip(0, 1)

    # Fixed column order, one CSV per store.
    df = df[['product_id1', 'product_id2', 'attribute_score', 'transaction_score', 'substitute_score']]
    out_path = os.path.join(OUT_DIR, f'{store_id}.csv')
    df.to_csv(out_path, index=False)

    # Progress line with the number of pairs written.
    print(f"Saved store {store_id}: {df.shape[0]} pairs")


Saved store S001: 17766 pairs
Saved store S002: 18528 pairs
Saved store S003: 18721 pairs
Saved store S004: 18528 pairs
Saved store S005: 17020 pairs
Saved store S006: 17766 pairs
Saved store S007: 17955 pairs
Saved store S008: 16653 pairs
Saved store S009: 17766 pairs
Saved store S010: 17578 pairs
Saved store S011: 17391 pairs
Saved store S012: 15931 pairs
Saved store S013: 17020 pairs
Saved store S014: 17391 pairs
Saved store S015: 16653 pairs
