Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import itertools

Read datasets

In [2]:
# Load both input tables in one step. `transaction` supplies the
# transaction_id / product_id columns used to build baskets below;
# `product` is presumably the product catalogue (not used in the cells shown here).
transaction, product = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Datasets/sales_transaction_items.csv',
        '../Datasets/products.csv',
    )
)

Calculate pairwise association scores: Lift (association-rule measure) and Yule's Q

In [3]:
# Group products by transaction_id to create baskets
# (one set of distinct product_ids per transaction).
baskets = transaction.groupby('transaction_id')['product_id'].apply(set)

# Get all unique products
products = set(transaction['product_id'])

# Calculate total number of transactions
num_transactions = len(baskets)

# Count, in a single pass over the baskets, how many baskets contain each
# product and each unordered product pair. This replaces a full rescan of
# every basket for every product / product pair.
# Pair keys are stored as (smaller_id, larger_id) so they match the
# p1 < p2 pairs generated below; this assumes product ids are mutually
# comparable, which the `p1 >= p2` test below already requires.
product_counts = {}
pair_counts = {}
for basket in baskets:
    items = sorted(basket)
    for p in items:
        product_counts[p] = product_counts.get(p, 0) + 1
    for pair in itertools.combinations(items, 2):
        pair_counts[pair] = pair_counts.get(pair, 0) + 1

# Precompute support for each product (fraction of baskets containing it).
# Products that appear in no basket keep a support of 0.
product_support = {p: product_counts.get(p, 0) / num_transactions for p in products}

# Calculate lift for each product pair:
#   lift(p1, p2) = support(p1 and p2) / (support(p1) * support(p2))
# Each entry is (p1, p2, lift); -1 is a sentinel meaning "lift undefined"
# because one of the products has zero support.
lift_results = []
for p1 in products:
    for p2 in products:
        if p1 >= p2:  # Avoid duplicate pairs and self-pairs
            continue
        # Support for both products together (pairs never seen together count as 0)
        support_both = pair_counts.get((p1, p2), 0) / num_transactions
        # Calculate lift
        if product_support[p1] > 0 and product_support[p2] > 0:
            lift = support_both / (product_support[p1] * product_support[p2])
            lift_results.append((p1, p2, lift))
        else:
            lift_results.append((p1, p2, -1))

In [4]:
# Calculate Yule's Q for each product pair.
#
# For a pair (p1, p2) the baskets form a 2x2 contingency table:
#   n11 = baskets with both, n10 = only p1, n01 = only p2, n00 = neither
#   Q = (n11*n00 - n10*n01) / (n11*n00 + n10*n01)
#
# Instead of scanning every basket four times per pair, count single-product
# and pair occurrences once and derive the four cells from those counts.
total_baskets = len(baskets)
occurrence_counts = {}
cooccurrence_counts = {}
for basket in baskets:
    items = sorted(basket)  # sorted so pair keys are (smaller_id, larger_id)
    for p in items:
        occurrence_counts[p] = occurrence_counts.get(p, 0) + 1
    for pair in itertools.combinations(items, 2):
        cooccurrence_counts[pair] = cooccurrence_counts.get(pair, 0) + 1

# Each entry is (p1, p2, q); q is None when the denominator is zero.
# The pairs are generated by the same nested iteration over `products`
# as before, so the order of the result list is unchanged.
yule_results = []
for p1 in products:
    for p2 in products:
        if p1 >= p2:  # Avoid duplicate pairs and self-pairs
            continue
        n11 = cooccurrence_counts.get((p1, p2), 0)
        n10 = occurrence_counts.get(p1, 0) - n11  # p1 without p2
        n01 = occurrence_counts.get(p2, 0) - n11  # p2 without p1
        n00 = total_baskets - n11 - n10 - n01     # neither product
        denominator = n11 * n00 + n10 * n01
        if denominator == 0:
            q = None
        else:
            q = (n11 * n00 - n10 * n01) / denominator
        yule_results.append((p1, p2, q))


Combine the results

In [5]:
# Combine lift_results and yule_results into a DataFrame and save as ProductTransactionScores.csv
#
# The two score lists are joined on the (ProductID1, ProductID2) key rather
# than by list position, so a difference in length or ordering between them
# cannot silently attach a Yule's Q value to the wrong pair.
yule_by_pair = {(p1, p2): q for p1, p2, q in yule_results}

combined = []
for p1, p2, lift in lift_results:
    combined.append({
        'ProductID1': p1,
        'ProductID2': p2,
        # -1 is the "lift undefined" sentinel; store it as a missing value.
        'Lift': None if lift == -1 else lift,
        # Pairs with no Yule's Q entry get a missing value.
        'YuleQ': yule_by_pair.get((p1, p2)),
    })

# Explicit columns keep the CSV header stable even when there are no pairs.
df_combined = pd.DataFrame(combined, columns=['ProductID1', 'ProductID2', 'Lift', 'YuleQ'])
df_combined.to_csv('../Datasets/ProductTransactionScores.csv', index=False)
print(df_combined.head())

  ProductID1 ProductID2  Lift  YuleQ
0      P1097      P1131   0.0   -1.0
1      P1097      P1117   0.0   -1.0
2      P1097      P1189   0.0   -1.0
3      P1097      P1297   0.0   -1.0
4      P1097      P1102   0.0   -1.0
